-
util.py
# coding=utf-8
__author__ = 'twocold'


# Text block generators.
def lines(file):
    """Yield every line of *file*, then one extra newline.

    The extra newline is a sentinel blank line: it lets blocks() flush the
    final block even when the input does not end with an empty line.

    :param file: any iterable of text lines (an open file, a list, ...).
    """
    for line in file:
        yield line
    yield '\n'


def blocks(file):
    """Yield the blocks of *file*, one stripped string per block.

    A block is a run of consecutive non-blank lines. Runs are separated by
    one or more blank (whitespace-only) lines. The lines of a block are
    concatenated and surrounding whitespace is stripped from the result.

    :param file: any iterable of text lines.
    """
    block = []
    for line in lines(file):
        if line.strip():
            # Non-blank line: still inside the current block.
            block.append(line)
        elif block:
            # Blank line after collected content: the block is complete.
            yield ''.join(block).strip()
            block = []
-
handlers.py
# coding=utf-8
__author__ = 'twocold'


class Handler:
    """
    Object that handles the method calls made by the Parser.

    The parser calls start() and end() at the beginning and end of each
    block, passing the block name as the argument. sub() is used for
    regular-expression substitution: when called with a name such as
    'emphasis' it returns the matching substitution function.
    """

    def callback(self, perfix, name, *args):
        """Call the method named ``perfix + name`` if this object has one.

        :param perfix: method-name prefix such as 'start_', 'end_' or 'sub_'
            (the spelling of the parameter is kept for compatibility).
        :param name: block or filter name appended to the prefix.
        :param args: positional arguments forwarded to the method.
        :return: the method's return value, or None when no callable
            attribute with that name exists.
        """
        method = getattr(self, perfix + name, None)
        if callable(method):
            return method(*args)
        return None

    def start(self, name):
        """Invoke the ``start_<name>`` hook, if defined."""
        self.callback('start_', name)

    def end(self, name):
        """Invoke the ``end_<name>`` hook, if defined."""
        self.callback('end_', name)

    def sub(self, name):
        """Return a replacement function suitable for ``re.sub``.

        The returned function forwards the match object to the
        ``sub_<name>`` hook. When the hook is missing (or returns None) the
        matched text is returned unchanged, so unsupported markup is left
        as it was in the input.
        """
        def substitution(match):
            result = self.callback('sub_', name, match)
            if result is None:
                # No hook available: keep the whole match untouched.
                result = match.group(0)
            return result
        return substitution


class HTMLRenderer(Handler):
    """
    Concrete handler that produces HTML.

    The methods of HTMLRenderer are reached through the superclass
    start(), end() and sub() methods. They implement basic HTML tags.

    NOTE: text is written to standard output as-is; nothing here escapes
    HTML special characters.
    """

    # print(x) with a single argument behaves identically under the
    # Python 2 print statement and the Python 3 print function.

    def start_document(self):
        print('<html><head><title>...</title></head><body>')

    def end_document(self):
        print('</body></html>')

    def start_paragraph(self):
        print('<p>')

    def end_paragraph(self):
        print('</p>')

    def start_heading(self):
        print('<h2>')

    def end_heading(self):
        print('</h2>')

    def start_list(self):
        print('<ul>')

    def end_list(self):
        print('</ul>')

    def start_listitem(self):
        print('<li>')

    def end_listitem(self):
        print('</li>')

    def start_title(self):
        print('<h1>')

    def end_title(self):
        print('</h1>')

    def sub_emphasis(self, match):
        """Wrap the first captured group in ``<em>`` tags."""
        return '<em>%s</em>' % match.group(1)

    def sub_url(self, match):
        """Turn the first captured group into a hyperlink to itself."""
        return '<a href="%s">%s</a>' % (match.group(1), match.group(1))

    def sub_mail(self, match):
        """Turn the first captured group into a ``mailto:`` link."""
        return '<a href="mailto:%s">%s</a>' % (match.group(1), match.group(1))

    def feed(self, data):
        """Write a block's text to standard output."""
        print(data)
-
rules.py
# coding=utf-8
__author__ = 'twocold'


class Rule:
    """
    Base class for all rules.

    Subclasses provide a ``type`` class attribute (the block name passed to
    the handler) and a ``condition(block)`` method.
    """

    def action(self, block, handler):
        """Emit *block* wrapped in the start/end hooks for ``self.type``.

        :return: True, which Parser.parse treats as "stop trying further
            rules for this block".
        """
        handler.start(self.type)
        handler.feed(block)
        handler.end(self.type)
        return True


class HeadingRule(Rule):
    """
    A heading is a single line of at most 70 characters that does not end
    with a colon.
    """
    type = 'heading'

    def condition(self, block):
        return ('\n' not in block
                and len(block) <= 70
                and not block.endswith(':'))


class TitleRule(HeadingRule):
    """
    The title is the first block of the document, provided that it is a
    heading.
    """
    type = 'title'
    first = True

    def condition(self, block):
        # Only the very first block that is tested can ever be the title.
        if not self.first:
            return False
        self.first = False
        return HeadingRule.condition(self, block)


class ListItemRule(Rule):
    """
    A list item is a paragraph that begins with a hyphen. As part of the
    formatting, the hyphen is removed.
    """
    type = 'listitem'

    def condition(self, block):
        return block.startswith('-')

    def action(self, block, handler):
        handler.start(self.type)
        # Drop the leading hyphen and any whitespace around the item text.
        handler.feed(block[1:].strip())
        handler.end(self.type)
        return True


class ListRule(ListItemRule):
    """
    A list begins between a block that is not a list item and a following
    list item. It ends after the last consecutive list item.
    """
    type = 'list'
    inside = False

    def condition(self, block):
        # Every block must be inspected to notice where a list starts/ends.
        return True

    def action(self, block, handler):
        is_item = ListItemRule.condition(self, block)
        if not self.inside and is_item:
            handler.start(self.type)
            self.inside = True
        elif self.inside and not is_item:
            handler.end(self.type)
            self.inside = False
        # Returning False lets the remaining rules handle the block itself.
        return False


class ParagraphRule(Rule):
    """
    A paragraph is simply a block that is not covered by any other rule.
    """
    type = 'paragraph'

    def condition(self, block):
        return True
-
markup.py
# coding=utf-8
__author__ = 'twocold'
import re
import sys
from handlers import *
from util import *
from rules import *


class Parser:
    """
    A parser reads a text file, applies rules and filters to it, and
    drives a handler.
    """

    def __init__(self, handler):
        self.handler = handler
        self.rules = []
        self.filters = []

    def addRule(self, rule):
        """Append *rule*; rules are tried in the order they were added."""
        self.rules.append(rule)

    def addFilter(self, pattern, name):
        """Register a regex filter.

        :param pattern: regular expression applied to every block.
        :param name: filter name; each match is replaced by the result of
            the function returned by ``handler.sub(name)``.
        """
        def apply_filter(block, handler):
            return re.sub(pattern, handler.sub(name), block)
        self.filters.append(apply_filter)

    def parse(self, file):
        """Parse *file* block by block and feed the result to the handler.

        Every block first passes through all filters. Then the rules are
        tried in order: each rule whose condition() accepts the block has
        its action() run, and a true return value from action() stops rule
        processing for that block.
        """
        self.handler.start('document')
        for block in blocks(file):
            for apply_filter in self.filters:
                block = apply_filter(block, self.handler)
            for rule in self.rules:
                if rule.condition(block):
                    last = rule.action(block, self.handler)
                    if last:
                        break
        self.handler.end('document')


class BasicTextParser(Parser):
    """
    A concrete parser that adds its rules and filters in the constructor.
    """

    def __init__(self, handler):
        Parser.__init__(self, handler)
        self.addRule(ListRule())
        self.addRule(ListItemRule())
        self.addRule(TitleRule())
        self.addRule(HeadingRule())
        self.addRule(ParagraphRule())

        # *emphasised text*: group 1 is the text between the asterisks.
        self.addFilter(r'\*(.+?)\*', 'emphasis')
        # Plain http:// URLs made of letters, dots and slashes.
        self.addFilter(r'(http://[\.a-zA-Z/]+)', 'url')
        # Simple e-mail addresses: name@host, letters and dots only.
        self.addFilter(r'([\.a-zA-Z]+@[\.a-zA-Z]+[a-zA-Z]+)', 'mail')


# Script behaviour: render the text read from standard input as HTML on
# standard output.
handler = HTMLRenderer()
parser = BasicTextParser(handler)

parser.parse(sys.stdin)
时间: 2024-11-08 20:45:42