Commit 2123a11c authored by gaojiuli

init

parent e6ac1409
...@@ -5,7 +5,4 @@ Convert HTML to Markdown. ...@@ -5,7 +5,4 @@ Convert HTML to Markdown.
from tomd import Tomd from tomd import Tomd
Tomd('<h1>title</h1>').markdown Tomd('<h1>title</h1>').markdown
Tomd('<h1>title</h1>','h1').markdown
Tomd('https://github.com').markdown
Tomd('https://github.com','.title .content').markdown
``` ```
This diff is collapsed.
...@@ -7,18 +7,20 @@ MARKDOWN = { ...@@ -7,18 +7,20 @@ MARKDOWN = {
'h4': ('\n#### ', '\n'), 'h4': ('\n#### ', '\n'),
'h5': ('\n##### ', '\n'), 'h5': ('\n##### ', '\n'),
'h6': ('\n###### ', '\n'), 'h6': ('\n###### ', '\n'),
'p': ('\n', '\n'),
'p_with_out_class': ('\n', '\n'),
'code': ('`', '`'), 'code': ('`', '`'),
'ul': ('\n', '\n'), 'ul': ('', ''),
'ol': ('\n', '\n'), 'ol': ('', ''),
'li': ('*. ', ''), 'li': ('*. ', ''),
'blockquote': ('> ', '\n'), 'blockquote': ('> ', '\n'),
'em': ('**', '**'), 'em': ('**', '**'),
'a': ('[](', ')'), 'a': ('[](', ')'),
'img': ('![](', ')'), 'img': ('![](', ')'),
'block_code': ('\n```\n', '\n```\n'), 'block_code': ('\n```\n', '\n```\n'),
'span': ('', '') 'span': ('', ''),
'p': ('\n', '\n'),
'p_with_out_class': ('\n', '\n'),
'inline_p': ('', ''),
'inline_p_with_out_class': ('', '')
} }
BlOCK_ELEMENTS = { BlOCK_ELEMENTS = {
...@@ -29,23 +31,31 @@ BlOCK_ELEMENTS = { ...@@ -29,23 +31,31 @@ BlOCK_ELEMENTS = {
'h5': '<h5.*?>(.*?)</h5>', 'h5': '<h5.*?>(.*?)</h5>',
'h6': '<h6.*?>(.*?)</h6>', 'h6': '<h6.*?>(.*?)</h6>',
'p': '<p\s.*?>(.*?)</p>', 'p': '<p\s.*?>(.*?)</p>',
'p_with_out_class': '<p>(.*?)</p>', 'p_with_out_class': '<p>(.*?)</p>', # conflict with <pre>
'blockquote': '<blockquote.*?>(.*?)</blockquote>', 'blockquote': '<blockquote.*?>(.*?)</blockquote>',
'ul': '<ul.*?>(.*?)</ul>', 'ul': '<ul.*?>(.*?)</ul>',
'ol': '<ol.*?>(.*?)</ol>',
'block_code': '<pre.*?><code.*?>(.*?)</code></pre>', 'block_code': '<pre.*?><code.*?>(.*?)</code></pre>',
} }
INLINE_ELEMENTS = { INLINE_ELEMENTS = {
'inline_p': '<p\s.*?>(.*?)</p>',
'inline_p_with_out_class': '<p>(.*?)</p>',
'code': '<code.*?>(.*?)</code>', 'code': '<code.*?>(.*?)</code>',
'span': '<span.*?>(.*?)</span>', 'span': '<span.*?>(.*?)</span>',
'ul': '<ul.*?>(.*?)</ul>',
'ol': '<ol.*?>(.*?)</ol>', 'ol': '<ol.*?>(.*?)</ol>',
'li': '<li.*?>(.*?)</li>', 'li': '<li.*?>(.*?)</li>',
'img': '<img.*?>(.*?)</img>', 'img': '<img.*?>(.*?)</img>',
'a': '<a.*?>(.*?)</a>', 'a': '<a.*?>(.*?)</a>',
'em': '<em.*?>(.*?)</em>', 'em': '<em.*?>(.*?)</em>'
# 'pre': '<pre.*><code.*>(.*)</code></pre>',
} }
## pos < max_pos
DELETE_ELEMENTS = ['<span.*?>', '</span>', '<div.*?>', '</div>']
class Element: class Element:
def __init__(self, pos, content, tag): def __init__(self, pos, content, tag):
...@@ -75,16 +85,16 @@ class Tomd: ...@@ -75,16 +85,16 @@ class Tomd:
self._elements = [] self._elements = []
self._markdown = None self._markdown = None
self.parse_block() self.parse_block()
print(self._markdown) for index, element in enumerate(DELETE_ELEMENTS):
for element in self._elements: self._markdown = re.sub(element, '', self._markdown)
if len(element._result) > 1000:
print(element.__dict__)
def parse_block(self): def parse_block(self):
for tag, pattern in BlOCK_ELEMENTS.items(): for tag, pattern in BlOCK_ELEMENTS.items():
for m in re.finditer(pattern, self.html, re.I | re.S | re.M): for m in re.finditer(pattern, self.html, re.I | re.S | re.M):
element = Element(pos=m.start(), content=''.join(m.groups()), tag=tag) element = Element(pos=m.start(), content=''.join(m.groups()), tag=tag)
self._elements.append(element) self._elements.append(element)
self._elements.sort(key=lambda element: element.pos) self._elements.sort(key=lambda element: element.pos)
self._markdown = ''.join([str(e) for e in self._elements]) self._markdown = ''.join([str(e) for e in self._elements])
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment