2007-07-17 11:53:03 -05:00

528 lines
18 KiB
Python
Executable File

#!/usr/bin/env python
import xml.dom.minidom as dom
import cgi
import sys
# Style names that may appear in the input file, mapped to the "def:*" style
# id written as the map-to target.  A value of None marks a style name that
# has no direct counterpart.
default_styles = {
    'Comment':        'def:comment',
    'String':         'def:string',
    'Preprocessor':   'def:preprocessor',
    'Keyword':        'def:keyword',
    'Data Type':      'def:data-type',
    'Decimal':        'def:decimal',
    'Specials':       'def:specials',
    'Function':       'def:function',
    'Base-N Integer': 'def:base-n-integer',
    # Both capitalisations of "Floating point" are accepted.
    'Floating Point': 'def:floating-point',
    'Floating point': 'def:floating-point',
    # Catch-all styles without a counterpart.
    'Others':         None,
    'Other':          None,
    'Others 2':       None,
    'Others 3':       None,
}
def get_default_style(style):
    """Return the default style id that the named style should map to.

    Looks *style* up in ``default_styles`` and writes a warning to stderr
    for the cases that need manual attention after conversion:

    * the style is not in the table at all: ``None`` is returned, so the
      caller can emit the style without a ``map-to`` target instead of the
      whole conversion dying with a bare ``KeyError``;
    * the table maps the style to ``None``: the placeholder ``'foobar'`` is
      returned;
    * the style maps to ``'def:preprocessor'``: the mapping is returned
      unchanged, but flagged as probably wrong.

    Returns the mapped style id string, or ``None`` for an unknown style.
    """
    if style not in default_styles:
        sys.stderr.write("Warning: '%s' style is unknown, no default style is mapped to it. Fix it after converting\n" % (style,))
        return None

    map_to = default_styles[style]
    if map_to is None:
        sys.stderr.write("Warning: '%s' style is used, it is replaced with 'foobar'. Fix it after converting\n" % (style,))
        map_to = 'foobar'
    elif map_to == 'def:preprocessor':
        sys.stderr.write("Warning: '%s' style is used, it is highly unlikely this use is correct, check after converting\n" % (style,))
    return map_to
def escape_escape_char(ch):
    """Return *ch* in a form that can be embedded in a regular expression.

    A backslash is doubled, ``'@'`` is returned as is, and any other
    character raises ``RuntimeError``.
    """
    literal_chars = ['@']  # characters that need no escaping
    if ch != '\\':
        if ch not in literal_chars:
            raise RuntimeError("don't know how to escape '%s'" % (ch,))
        return ch
    return '\\\\'
def escape_regex(s):
    """Escape *s* so it can be written as XML element text.

    Replaces ``&``, ``<`` and ``>`` with the corresponding entities; quotes
    are left untouched.  ``xml.sax.saxutils.escape`` performs the same
    replacements as ``cgi.escape(s)`` but, unlike ``cgi.escape``, is still
    available in current Python releases.
    """
    # Imported locally so this function does not rely on the cgi module.
    from xml.sax.saxutils import escape
    return escape(s)
def normalize_id(id):
    """Turn a language or context name into an id string.

    A handful of names have hand-picked ids.  Every other name has the
    separators listed below replaced by ``'-'`` and is then lower-cased.
    """
    hand_picked = {
        "C#": "c-sharp",
        ".desktop": "desktop",
        ".ini": "ini",
        "C++ Line Comment": "cpp-line-comment",
        "Markup (inline)": "markup-inline",
        "Markup (block)": "markup-block",
    }
    if id in hand_picked:
        return hand_picked[id]

    # Order matters: ', ' must collapse to a single dash before the lone
    # ',' and ' ' replacements run.
    result = id
    for separator in (', ', '.', '*', ',', ' ', '/', '#'):
        result = result.replace(separator, '-')
    return result.lower()
class LangFile(object):
    """Language definition read from the input file, plus its serialiser.

    Holds the language-level attributes, the list of parsed contexts and
    the optional escape character, and renders them as a
    ``<language ... version="2.0">`` document through :meth:`format`.
    """

    def __init__(self, id, name, _name, section, _section, mimetypes, globs, filename):
        """Store the language attributes.

        The id is taken from ``id``, falling back to ``name`` and then
        ``_name``, and is passed through ``normalize_id``.  At least one of
        ``name``/``_name`` and one of ``section``/``_section`` must be
        non-empty (checked with ``assert``).
        """
        object.__init__(self)
        assert name or _name
        assert section or _section
        self.id = normalize_id(id or name or _name)
        self.name = name
        self._name = _name
        self.section = section
        self._section = _section
        self.mimetypes = mimetypes
        self.globs = globs
        self.filename = filename
        self.contexts = []
        self.escape_char = None

    @staticmethod
    def _escape_attr(value):
        """Escape *value* for use inside a double-quoted XML attribute."""
        from xml.sax.saxutils import escape
        return escape(value, {'"': '&quot;'})

    @staticmethod
    def _escape_text(value):
        """Escape *value* for use as XML element text."""
        from xml.sax.saxutils import escape
        return escape(value)

    def set_esc_char(self, char):
        """Remember the escape character declared by the input file."""
        self.escape_char = char

    def add_context(self, ctx):
        """Append a parsed context; contexts are emitted in insertion order."""
        self.contexts.append(ctx)

    def format_header(self, indent):
        """Return the XML declaration, the opening ``<language>`` tag and,
        when mimetypes or globs are set, the ``<metadata>`` element.

        Attribute values are XML-escaped so that names containing ``&``,
        ``<`` or ``"`` still produce well-formed output.
        """
        attr = self._escape_attr
        parts = ['<?xml version="1.0" encoding="UTF-8"?>\n<language id="%s"' % (attr(self.id),)]
        if self.name:
            parts.append(' name="%s"' % (attr(self.name),))
        else:
            parts.append(' _name="%s"' % (attr(self._name),))
        parts.append(' version="2.0"')
        if self.section:
            parts.append(' section="%s"' % (attr(self.section),))
        else:
            parts.append(' _section="%s"' % (attr(self._section),))
        parts.append('>\n')

        if self.mimetypes or self.globs:
            parts.append(indent + '<metadata>\n')
            if self.mimetypes:
                parts.append(2*indent + '<property name="mimetypes">%s</property>\n' % (self._escape_text(self.mimetypes),))
            if self.globs:
                parts.append(2*indent + '<property name="globs">%s</property>\n' % (self._escape_text(self.globs),))
            parts.append(indent + '</metadata>\n\n')
        return ''.join(parts)

    def format_footer(self, indent):
        """Return the closing ``</language>`` tag (*indent* is unused)."""
        return '</language>\n'

    def format_styles(self, indent):
        """Return the ``<styles>`` element, one ``<style>`` per style id.

        Styles are written in the order their id first appears among the
        contexts, so the output is reproducible.  When several contexts
        share a style id the last context's style name wins.
        ``get_default_style`` is consulted once per distinct style name so
        its warnings are not repeated.
        """
        attr = self._escape_attr
        styles = {}      # style id -> (style name, map-to target)
        order = []       # style ids in first-appearance order
        mapped = {}      # style name -> map-to target (lookup cache)
        for ctx in self.contexts:
            if ctx.style_name not in mapped:
                mapped[ctx.style_name] = get_default_style(ctx.style_name)
            if ctx.style not in styles:
                order.append(ctx.style)
            styles[ctx.style] = (ctx.style_name, mapped[ctx.style_name])

        parts = [indent + "<styles>\n"]
        for style_id in order:
            name, map_to = styles[style_id]
            if map_to:
                parts.append(indent*2 + '<style id="%s" _name="%s" map-to="%s"/>\n' % (attr(style_id), attr(name), attr(map_to)))
            else:
                parts.append(indent*2 + '<style id="%s" _name="%s"/>\n' % (attr(style_id), attr(name)))
        parts.append(indent + "</styles>\n\n")
        return ''.join(parts)

    def format_contexts(self, indent):
        """Return the ``<definitions>`` element.

        Contains, in order: the generated escape contexts (only when the
        escape character is set and is not a backslash), every parsed
        context, and a final context named after the language that
        includes all of them by reference.
        """
        attr = self._escape_attr
        parts = [indent + '<definitions>\n']

        # The escape contexts handed to every context do not change inside
        # the loop, so they are chosen once up front.
        if not self.escape_char:
            esc_ctx = None
            line_esc_ctx = None
        elif self.escape_char == '\\':
            esc_ctx = 'def:escape'
            line_esc_ctx = 'def:line-continue'
        else:
            esc_ctx = 'generated-escape'
            line_esc_ctx = 'generated-line-escape'
            char = escape_escape_char(self.escape_char)
            parts.append(indent*2 + '<context id="generated-escape">\n')
            parts.append(indent*3 + '<match>%s.</match>\n' % (char,))
            parts.append(indent*2 + '</context>\n')
            parts.append(indent*2 + '<context id="generated-line-escape">\n')
            parts.append(indent*3 + '<start>%s$</start>\n' % (char,))
            parts.append(indent*3 + '<end>^</end>\n')
            parts.append(indent*2 + '</context>\n')

        for ctx in self.contexts:
            parts.append(ctx.format(indent, esc_ctx, line_esc_ctx))

        parts.append(indent*2 + '<context id="%s">\n' % (attr(self.id),))
        parts.append(indent*3 + '<include>\n')
        for ctx in self.contexts:
            parts.append(indent*4 + '<context ref="%s"/>\n' % (attr(ctx.id),))
        parts.append(indent*3 + '</include>\n')
        parts.append(indent*2 + '</context>\n')
        parts.append(indent + '</definitions>\n')
        return ''.join(parts)

    def format(self, indent=' '):
        """Return the whole document: header, styles, contexts, footer."""
        return ''.join([
            self.format_header(indent),
            self.format_styles(indent),
            self.format_contexts(indent),
            self.format_footer(indent),
        ])
class Context(object):
    """Base class of every parsed context.

    Keeps the display name, the style and the id derived from the name.
    Subclasses override :meth:`format` to emit their own XML.
    """

    def __init__(self, name, _name, style):
        """Record the names and style; the id comes from ``normalize_id``
        applied to ``name`` or, failing that, ``_name``."""
        object.__init__(self)
        assert (name or _name) and style
        self.name, self._name = name, _name
        # The style id is the style name with spaces dashed and lower-cased.
        self.style_name = style
        self.style = style.replace(' ', '-').lower()
        self.id = normalize_id(name or _name)
        # Set to True by subclasses that can hold escape sequences.
        self.is_container = False

    def format(self, indent, esc_ctx, line_esc_ctx):
        """Fallback serialiser: prints a reminder and returns an empty
        ``<context>`` element carrying only the id."""
        print("Implement me: %s.format()" % (type(self).__name__,))
        return indent*2 + '<context id="%s"/>\n' % (self.id)

    def format_escape(self, indent, esc_ctx, line_esc_ctx):
        """Return an ``<include>`` element referencing both escape contexts.

        The result is the empty string unless this context is a container
        and *esc_ctx* is not ``None``.
        """
        if not self.is_container or esc_ctx is None:
            return ""
        reference = indent*4 + '<context ref="%s"/>\n'
        return ''.join([
            indent*3 + '<include>\n',
            reference % (esc_ctx,),
            reference % (line_esc_ctx,),
            indent*3 + '</include>\n',
        ])
class KeywordList(Context):
    """Context that matches any keyword from a fixed list."""

    def __init__(self, name, _name, style, keywords, case_sensitive,
                 match_empty_string_at_beginning,
                 match_empty_string_at_end,
                 beginning_regex, end_regex):
        """Store the keyword list together with its matching options."""
        Context.__init__(self, name, _name, style)
        self.keywords = keywords
        # NOTE(review): case_sensitive is stored but format() below never
        # reads it, so it has no effect on the generated XML.
        self.case_sensitive = case_sensitive
        self.match_empty_string_at_beginning = match_empty_string_at_beginning
        self.match_empty_string_at_end = match_empty_string_at_end
        self.beginning_regex = beginning_regex
        self.end_regex = end_regex

    def format(self, indent, esc_ctx, line_esc_ctx):
        """Return the ``<context>`` element with optional prefix/suffix and
        one ``<keyword>`` element per keyword."""
        inner = indent*3
        pieces = [indent*2 + '<context id="%s" style-ref="%s">\n' % (self.id, self.style)]

        # An explicit regex wins; otherwise an empty <prefix>/<suffix> is
        # written only when the matching match-empty-string flag is off.
        if self.beginning_regex:
            pieces.append(inner + '<prefix>%s</prefix>\n' % (escape_regex(self.beginning_regex),))
        elif not self.match_empty_string_at_beginning:
            pieces.append(inner + '<prefix></prefix>\n')

        if self.end_regex:
            pieces.append(inner + '<suffix>%s</suffix>\n' % (escape_regex(self.end_regex),))
        elif not self.match_empty_string_at_end:
            pieces.append(inner + '<suffix></suffix>\n')

        for keyword in self.keywords:
            pieces.append(inner + '<keyword>%s</keyword>\n' % (escape_regex(keyword),))

        pieces.append(self.format_escape(indent, esc_ctx, line_esc_ctx))
        pieces.append(indent*2 + '</context>\n')
        return ''.join(pieces)
class PatternItem(Context):
    """Context defined by a single regular expression."""

    def __init__(self, name, _name, style, pattern):
        """Store the (non-empty) pattern."""
        Context.__init__(self, name, _name, style)
        assert pattern
        self.pattern = pattern

    def format(self, indent, esc_ctx, line_esc_ctx):
        """Return the ``<context>`` element wrapping one ``<match>``."""
        opening = indent*2 + '<context id="%s" style-ref="%s">\n' % (self.id, self.style)
        match = indent*3 + '<match>%s</match>\n' % (escape_regex(self.pattern),)
        escapes = self.format_escape(indent, esc_ctx, line_esc_ctx)
        closing = indent*2 + '</context>\n'
        return opening + match + escapes + closing
class LineComment(Context):
    """Container context that starts at a regex and ends at the line end."""

    def __init__(self, name, _name, style, start):
        """Store the (non-empty) start regex and mark the context as a
        container."""
        Context.__init__(self, name, _name, style)
        assert start
        self.start = start
        self.is_container = True

    def format(self, indent, esc_ctx, line_esc_ctx):
        """Return the ``<context end-at-line-end="true">`` element with its
        ``<start>`` regex and the escape include, if any."""
        opening = indent*2 + '<context id="%s" style-ref="%s" end-at-line-end="true">\n' % (self.id, self.style)
        start = indent*3 + '<start>%s</start>\n' % (escape_regex(self.start),)
        escapes = self.format_escape(indent, esc_ctx, line_esc_ctx)
        closing = indent*2 + '</context>\n'
        return opening + start + escapes + closing
class BlockComment(Context):
    """Container context delimited by a start and an end regex."""

    def __init__(self, name, _name, style, start, end):
        """Store both (non-empty) delimiters and mark the context as a
        container."""
        Context.__init__(self, name, _name, style)
        assert start and end
        self.start, self.end = start, end
        self.is_container = True

    def format(self, indent, esc_ctx, line_esc_ctx):
        """Return the ``<context>`` element with ``<start>``, ``<end>`` and
        the escape include, if any."""
        inner = indent*3
        pieces = [
            indent*2 + '<context id="%s" style-ref="%s">\n' % (self.id, self.style),
            inner + '<start>%s</start>\n' % (escape_regex(self.start),),
            inner + '<end>%s</end>\n' % (escape_regex(self.end),),
            self.format_escape(indent, esc_ctx, line_esc_ctx),
            indent*2 + '</context>\n',
        ]
        return ''.join(pieces)
class String(Context):
    """Container context for string literals."""

    def __init__(self, name, _name, style, start, end, end_at_line_end):
        """Store the delimiters.

        An end regex that finishes with the two characters ``\\n`` has them
        removed and forces ``end_at_line_end`` on instead.
        """
        Context.__init__(self, name, _name, style)
        assert start and end
        self.start = start
        if end and end.endswith("\\n"):
            # Drop the trailing backslash + 'n' pair.
            self.end = end[:-2]
            self.end_at_line_end = True
        else:
            self.end = end
            self.end_at_line_end = end_at_line_end
        self.is_container = True

    def format(self, indent, esc_ctx, line_esc_ctx):
        """Return the ``<context>`` element; ``<start>``/``<end>`` are
        written only when the corresponding regex is non-empty."""
        inner = indent*3
        pieces = [indent*2 + '<context id="%s" style-ref="%s"' % (self.id, self.style)]
        if self.end_at_line_end:
            pieces.append(' end-at-line-end="true"')
        pieces.append('>\n')
        if self.start:
            pieces.append(inner + '<start>%s</start>\n' % (escape_regex(self.start),))
        if self.end:
            pieces.append(inner + '<end>%s</end>\n' % (escape_regex(self.end),))
        pieces.append(self.format_escape(indent, esc_ctx, line_esc_ctx))
        pieces.append(indent*2 + '</context>\n')
        return ''.join(pieces)
class SyntaxItem(Context):
    """Generic container context delimited by a start and an end regex."""

    def __init__(self, name, _name, style, start, end):
        """Store the delimiters.

        An end regex that finishes with the two characters ``\\n`` has them
        removed and the context is flagged to end at the line end.
        """
        Context.__init__(self, name, _name, style)
        assert start and end
        self.start = start
        if end and end.endswith("\\n"):
            # Drop the trailing backslash + 'n' pair.
            self.end = end[:-2]
            self.end_at_line_end = True
        else:
            self.end = end
            self.end_at_line_end = False
        self.is_container = True

    def format(self, indent, esc_ctx, line_esc_ctx):
        """Return the ``<context>`` element; ``<start>``/``<end>`` are
        written only when the corresponding regex is non-empty."""
        inner = indent*3
        pieces = [indent*2 + '<context id="%s" style-ref="%s"' % (self.id, self.style)]
        if self.end_at_line_end:
            pieces.append(' end-at-line-end="true"')
        pieces.append('>\n')
        if self.start:
            pieces.append(inner + '<start>%s</start>\n' % (escape_regex(self.start),))
        if self.end:
            pieces.append(inner + '<end>%s</end>\n' % (escape_regex(self.end),))
        pieces.append(self.format_escape(indent, esc_ctx, line_esc_ctx))
        pieces.append(indent*2 + '</context>\n')
        return ''.join(pieces)
def first_child(node):
    """Return the first child of *node* that is an element, or ``None``.

    Text, comment and other non-element children are skipped.
    """
    candidate = node.firstChild
    while candidate is not None:
        if candidate.nodeType == dom.Node.ELEMENT_NODE:
            return candidate
        candidate = candidate.nextSibling
    return None
def next_sibling(node):
    """Return the next sibling of *node* that is an element, or ``None``.

    Text, comment and other non-element siblings are skipped.
    """
    candidate = node.nextSibling
    while candidate is not None:
        if candidate.nodeType == dom.Node.ELEMENT_NODE:
            return candidate
        candidate = candidate.nextSibling
    return None
def parseLineComment(cur, name, _name, style):
    """Build a ``LineComment`` from the element *cur*.

    The first child element must be ``<start-regex>``; its text becomes the
    start regex.
    """
    start_node = first_child(cur)
    assert start_node is not None and start_node.tagName == "start-regex"
    start_regex = start_node.firstChild.nodeValue
    return LineComment(name, _name, style, start_regex)
def parseBlockComment(cur, name, _name, style):
    """Build a ``BlockComment`` from the element *cur*.

    Reads the text of the ``<start-regex>`` and ``<end-regex>`` child
    elements (the last occurrence of each wins); both are required.
    """
    regexes = {"start-regex": None, "end-regex": None}
    node = first_child(cur)
    while node is not None:
        if node.tagName in regexes:
            regexes[node.tagName] = node.firstChild.nodeValue
        node = next_sibling(node)

    assert regexes["start-regex"] is not None
    assert regexes["end-regex"] is not None
    return BlockComment(name, _name, style,
                        regexes["start-regex"], regexes["end-regex"])
def parseString(cur, name, _name, style):
    """Build a ``String`` from the element *cur*.

    ``end-at-line-end`` defaults to true when the attribute is absent or
    empty; otherwise only the values ``"TRUE"`` and ``"1"`` count as true.
    The ``<start-regex>`` and ``<end-regex>`` children are both required.
    """
    flag = cur.getAttribute("end-at-line-end")
    end_at_line_end = not flag or flag in ["TRUE", "1"]

    regexes = {"start-regex": None, "end-regex": None}
    node = first_child(cur)
    while node is not None:
        if node.tagName in regexes:
            regexes[node.tagName] = node.firstChild.nodeValue
        node = next_sibling(node)

    assert regexes["start-regex"] is not None
    assert regexes["end-regex"] is not None
    return String(name, _name, style,
                  regexes["start-regex"], regexes["end-regex"],
                  end_at_line_end)
def parseKeywordList(cur, name, _name, style):
    """Build a ``KeywordList`` from the element *cur*.

    The three boolean attributes default to true when absent or empty;
    otherwise only ``"TRUE"`` and ``"1"`` count as true.  The two regex
    attributes default to ``None``.  At least one ``<keyword>`` child is
    required.
    """
    def read_flag(attribute):
        # Absent/empty attribute -> True, else True only for "TRUE"/"1".
        value = cur.getAttribute(attribute)
        return not value or value in ["TRUE", "1"]

    case_sensitive = read_flag("case-sensitive")
    match_empty_string_at_beginning = read_flag("match-empty-string-at-beginning")
    match_empty_string_at_end = read_flag("match-empty-string-at-end")

    beginning_regex = cur.getAttribute("beginning-regex") or None
    end_regex = cur.getAttribute("end-regex") or None

    keywords = []
    node = first_child(cur)
    while node is not None:
        if node.tagName == "keyword":
            keywords.append(node.firstChild.nodeValue)
        node = next_sibling(node)
    assert keywords

    return KeywordList(name, _name, style, keywords, case_sensitive,
                       match_empty_string_at_beginning,
                       match_empty_string_at_end,
                       beginning_regex, end_regex)
def parsePatternItem(cur, name, _name, style):
    """Build a ``PatternItem`` from the element *cur*.

    The first child element must be ``<regex>``; its text becomes the
    pattern.
    """
    regex_node = first_child(cur)
    assert regex_node is not None and regex_node.tagName == "regex"
    pattern = regex_node.firstChild.nodeValue
    return PatternItem(name, _name, style, pattern)
def parseSyntaxItem(cur, name, _name, style):
    """Build a ``SyntaxItem`` from the element *cur*.

    Reads the text of the ``<start-regex>`` and ``<end-regex>`` child
    elements (the last occurrence of each wins); both are required.
    """
    regexes = {"start-regex": None, "end-regex": None}
    node = first_child(cur)
    while node is not None:
        if node.tagName in regexes:
            regexes[node.tagName] = node.firstChild.nodeValue
        node = next_sibling(node)

    assert regexes["start-regex"] is not None
    assert regexes["end-regex"] is not None
    return SyntaxItem(name, _name, style,
                      regexes["start-regex"], regexes["end-regex"])
def parseTag(cur):
    """Parse one child element of ``<language>`` into a context object.

    The element must carry a ``name`` or ``_name`` attribute; a missing
    ``style`` attribute defaults to ``"Normal"``.  An element with an
    unrecognised tag name is reported on stdout and yields ``None``.
    """
    name = cur.getAttribute("name")
    _name = cur.getAttribute("_name")
    assert name or _name
    style = cur.getAttribute("style") or "Normal"

    # Tag name -> function that builds the matching context.
    parsers = {
        "line-comment": parseLineComment,
        "block-comment": parseBlockComment,
        "string": parseString,
        "keyword-list": parseKeywordList,
        "pattern-item": parsePatternItem,
        "syntax-item": parseSyntaxItem,
    }
    tag = cur.tagName
    if tag not in parsers:
        print("Unknown tag: %s" % (tag,))
        return None
    return parsers[tag](cur, name, _name, style)
def parse_file(filename):
    """Parse the language file *filename* and return a ``LangFile``.

    The document element must be ``<language>`` and must have at least one
    child element.  An ``<escape-char>`` child sets the escape character;
    every other child is handed to ``parseTag``.  Children that
    ``parseTag`` does not recognise (it returns ``None`` for them) are
    skipped rather than stored, because a ``None`` entry in the context
    list would make the later formatting step fail.
    """
    doc = dom.parse(filename)
    root = doc.documentElement
    assert root.tagName == "language"

    lang_file = LangFile(root.getAttribute("id"),
                         root.getAttribute("name"),
                         root.getAttribute("_name"),
                         root.getAttribute("section"),
                         root.getAttribute("_section"),
                         root.getAttribute("mimetypes"),
                         root.getAttribute("globs"),
                         filename)

    node = first_child(root)
    assert node is not None
    while node is not None:
        if node.tagName == "escape-char":
            lang_file.set_esc_char(node.firstChild.nodeValue)
        else:
            ctx = parseTag(node)
            if ctx is not None:
                lang_file.add_context(ctx)
        node = next_sibling(node)
    return lang_file
# Script entry point: convert the file named on the command line and write
# the result to stdout.
if __name__ == '__main__':
    import sys

    if len(sys.argv) < 2:
        print("usage: %s LANG_FILE" % (sys.argv[0]))
        sys.exit(1)

    lang_file = parse_file(sys.argv[1])
    sys.stdout.write(lang_file.format())