Commit dd7f6f4a authored by zsinx6

Fixed conversion of single img tags that have no closing slash or closing tag

parent 963d5cd0
...@@ -2,7 +2,7 @@ from setuptools import find_packages, setup ...@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
setup( setup(
name="tomd", name="tomd",
version="0.1.3", version="0.1.4",
description="Convert HTML to Markdown.", description="Convert HTML to Markdown.",
author="Gaojiuli", author="Gaojiuli",
author_email="gaojiuli@gmail.com", author_email="gaojiuli@gmail.com",
......
...@@ -12,6 +12,7 @@ string = """ ...@@ -12,6 +12,7 @@ string = """
<a href="https://github.com">link</a> <a href="https://github.com">link</a>
<img src="https://github.com" class="dsad">img</img> <img src="https://github.com" class="dsad">img</img>
<img src="https://github.com" class="dsad"/> <img src="https://github.com" class="dsad"/>
<img src="https://github.com" class="dsad">
</p> </p>
<ul> <ul>
<li>1</li> <li>1</li>
......
import re, os import re
import os
import warnings
__all__ = ['Tomd', 'convert'] __all__ = ['Tomd', 'convert']
...@@ -55,7 +57,7 @@ BlOCK_ELEMENTS = { ...@@ -55,7 +57,7 @@ BlOCK_ELEMENTS = {
INLINE_ELEMENTS = { INLINE_ELEMENTS = {
'td': '<td.*?>((.|\n)*?)</td>', #td element may span lines 'td': '<td.*?>((.|\n)*?)</td>', # td element may span lines
'tr': '<tr.*?>((.|\n)*?)</tr>', 'tr': '<tr.*?>((.|\n)*?)</tr>',
'th': '<th.*?>(.*?)</th>', 'th': '<th.*?>(.*?)</th>',
'b': '<b.*?>(.*?)</b>', 'b': '<b.*?>(.*?)</b>',
...@@ -77,7 +79,8 @@ INLINE_ELEMENTS = { ...@@ -77,7 +79,8 @@ INLINE_ELEMENTS = {
'tbody': '<tbody.*?>((.|\n)*)</tbody>', 'tbody': '<tbody.*?>((.|\n)*)</tbody>',
} }
DELETE_ELEMENTS = ['<span.*?>', '</span>', '<div.*?>', '</div>','<br clear="none"/>', '<center.*?>', '</center>'] DELETE_ELEMENTS = ['<span.*?>', '</span>', '<div.*?>', '</div>', '<br clear="none"/>', '<center.*?>', '</center>']
class Element: class Element:
def __init__(self, start_pos, end_pos, content, tag, folder, is_block=False): def __init__(self, start_pos, end_pos, content, tag, folder, is_block=False):
...@@ -91,10 +94,7 @@ class Element: ...@@ -91,10 +94,7 @@ class Element:
self._result = None self._result = None
if self.is_block: if self.is_block:
# print "parsing tag:", self.tag, ", content: ", repr(self.content)
self.parse_inline() self.parse_inline()
# if self.tag != 'table':
# print "parsed:", self.tag, self.folder, ", content: ", repr(self.content)
def __str__(self): def __str__(self):
wrapper = MARKDOWN.get(self.tag) wrapper = MARKDOWN.get(self.tag)
...@@ -102,44 +102,43 @@ class Element: ...@@ -102,44 +102,43 @@ class Element:
return self._result return self._result
def parse_inline(self): def parse_inline(self):
self.content = self.content.replace('\r', '') #windows \r character self.content = self.content.replace('\r', '') # windows \r character
self.content = self.content.replace('\xc2\xa0', ' ') #no break space self.content = self.content.replace('\xc2\xa0', ' ') # no break space
self.content = self.content.replace('&quot;', '\"') #html quote mark self.content = self.content.replace('&quot;', '\"') # html quote mark
for m in re.finditer("<img(.*?)en_todo.*?>",self.content): for m in re.finditer("<img(.*?)en_todo.*?>", self.content):
#remove img and change to [ ] and [x] # remove img and change to [ ] and [x]
#evernote specific parsing # evernote specific parsing
imgSrc = re.search('src=".*?"',m.group()) imgSrc = re.search('src=".*?"', m.group())
imgLoc = imgSrc.group()[5:-1] #remove source and " " imgLoc = imgSrc.group()[5:-1] # remove source and " "
imgLoc = imgLoc.replace('\\', '/') #\\ folder slash rotate imgLoc = imgLoc.replace('\\', '/') # \\ folder slash rotate
if os.stat(self.folder + "/" + imgLoc).st_size < 250: if os.stat(self.folder + "/" + imgLoc).st_size < 250:
self.content = self.content.replace(m.group(),"[ ] ") self.content = self.content.replace(m.group(), "[ ] ")
else: else:
self.content = self.content.replace(m.group(),"[x] ") self.content = self.content.replace(m.group(), "[x] ")
# print self.content
if "e_" in self.tag: #evernote-specific parsing if "e_" in self.tag: # evernote-specific parsing
# if self.content != re.sub(BlOCK_ELEMENTS['table'], '\g<1>', self.content):
for m in re.finditer(BlOCK_ELEMENTS['table'], self.content, re.I | re.S | re.M): for m in re.finditer(BlOCK_ELEMENTS['table'], self.content, re.I | re.S | re.M):
#hmm can there only be one table? # hmm can there only be one table?
# print "AHHHH THERES A TABLE\n\n"
inner = Element(start_pos=m.start(), inner = Element(start_pos=m.start(),
end_pos=m.end(), end_pos=m.end(),
content=''.join(m.groups()), content=''.join(m.groups()),
tag='table',folder=self.folder, tag='table', folder=self.folder,
is_block=True) is_block=True)
self.content = inner.content self.content = inner.content
return #no need for further parsing ? return # no need for further parsing ?
# if no table, parse as usual # if no table, parse as usual
self.content = self.content.replace('<hr/>', '\n---\n') self.content = self.content.replace('<hr/>', '\n---\n')
self.content = self.content.replace('<br/>', '') self.content = self.content.replace('<br/>', '')
if self.tag == "table": #for removing tbody if self.tag == "table": # for removing tbody
self.content = re.sub(INLINE_ELEMENTS['tbody'], '\g<1>', self.content) self.content = re.sub(INLINE_ELEMENTS['tbody'], '\g<1>', self.content)
for tag, pattern in INLINE_ELEMENTS.items(): INLINE_ELEMENTS_LIST_KEYS = list(INLINE_ELEMENTS.keys())
# print "---now looking at", tag, pattern INLINE_ELEMENTS_LIST_KEYS.sort()
for tag in INLINE_ELEMENTS_LIST_KEYS:
pattern = INLINE_ELEMENTS[tag]
if tag == 'a': if tag == 'a':
self.content = re.sub(pattern, '[\g<2>](\g<1>)', self.content) self.content = re.sub(pattern, '[\g<2>](\g<1>)', self.content)
...@@ -147,6 +146,8 @@ class Element: ...@@ -147,6 +146,8 @@ class Element:
self.content = re.sub(pattern, '![\g<2>](\g<1>)', self.content) self.content = re.sub(pattern, '![\g<2>](\g<1>)', self.content)
elif tag == 'img_single': elif tag == 'img_single':
self.content = re.sub(pattern, '![](\g<1>)', self.content) self.content = re.sub(pattern, '![](\g<1>)', self.content)
elif tag == 'img_single_no_close':
self.content = re.sub(pattern, '![](\g<1>)', self.content)
elif self.tag == 'ul' and tag == 'li': elif self.tag == 'ul' and tag == 'li':
self.content = re.sub(pattern, '- \g<1>', self.content) self.content = re.sub(pattern, '- \g<1>', self.content)
elif self.tag == 'ol' and tag == 'li': elif self.tag == 'ol' and tag == 'li':
...@@ -157,13 +158,11 @@ class Element: ...@@ -157,13 +158,11 @@ class Element:
self.content = re.sub(pattern, '|\g<1>', self.content.replace('\n', '')) self.content = re.sub(pattern, '|\g<1>', self.content.replace('\n', ''))
elif self.tag == 'tr' and tag == 'td': elif self.tag == 'tr' and tag == 'td':
self.content = re.sub(pattern, '|\g<1>|', self.content.replace('\n', '')) self.content = re.sub(pattern, '|\g<1>|', self.content.replace('\n', ''))
self.content = self.content.replace("||","|") #end of column also needs a pipe self.content = self.content.replace("||", "|") # end of column also needs a pipe
# print "---converting, td remove duplicate:", tag, self.content
elif self.tag == 'table' and tag == 'td': elif self.tag == 'table' and tag == 'td':
self.content = re.sub(pattern, '|\g<1>|', self.content) self.content = re.sub(pattern, '|\g<1>|', self.content)
self.content = self.content.replace("||","|") #end of column also needs a pipe self.content = self.content.replace("||", "|") # end of column also needs a pipe
self.content = self.content.replace('|\n\n', '|\n') #replace double new line self.content = self.content.replace('|\n\n', '|\n') # replace double new line
# print "---converting, td remove duplicate:", tag, self.content
self.construct_table() self.construct_table()
else: else:
wrapper = MARKDOWN.get(tag) wrapper = MARKDOWN.get(tag)
...@@ -177,39 +176,36 @@ class Element: ...@@ -177,39 +176,36 @@ class Element:
# this function, after self.content has gained | for table entries, # this function, after self.content has gained | for table entries,
# adds the |---| in markdown to create a proper table # adds the |---| in markdown to create a proper table
temp = self.content.split('\n',3) temp = self.content.split('\n', 3)
for elt in temp: for elt in temp:
if elt != "": if elt != "":
count = elt.count("|") #count number of pipes count = elt.count("|") # count number of pipes
break break
pipe = "\n|" #beginning \n for safety pipe = "\n|" # beginning \n for safety
for i in xrange(count-1): for i in range(count - 1):
pipe += "---|" pipe += "---|"
pipe += "\n" pipe += "\n"
self.content = pipe + pipe + self.content + "\n" #TODO: column titles? self.content = pipe + pipe + self.content + "\n" # TODO: column titles?
self.content = self.content.replace('|\n\n', '|\n') #replace double new line self.content = self.content.replace('|\n\n', '|\n') # replace double new line
self.content = self.content.replace("<br/>\n","<br/>") #end of column also needs a pipe self.content = self.content.replace("<br/>\n", "<br/>") # end of column also needs a pipe
class Tomd: class Tomd:
def __init__(self, html='', folder='',file='',options=None): def __init__(self, html='', folder='', file='', options=None):
self.html = html #actual data self.html = html # actual data
self.folder = folder self.folder = folder
self.file = file self.file = file
self.options = options # haven't been implemented yet self.options = options # haven't been implemented yet
self._markdown = self.convert(self.html,self.options) self._markdown = self.convert(self.html, self.options)
def convert(self, html="", options=None): def convert(self, html="", options=None):
if html == "": if html == "":
html = self.html html = self.html
#main function here # main function here
elements = [] elements = []
for tag, pattern in BlOCK_ELEMENTS.items(): for tag, pattern in BlOCK_ELEMENTS.items():
# print "pattern is", pattern, "tag", tag
for m in re.finditer(pattern, html, re.I | re.S | re.M): for m in re.finditer(pattern, html, re.I | re.S | re.M):
# now m contains the pattern without the tag # now m contains the pattern without the tag
# if tag == "e_p":
# print "found", tag, m.groups(), "start", m.start(), "end", m.end(), self.folder
element = Element(start_pos=m.start(), element = Element(start_pos=m.start(),
end_pos=m.end(), end_pos=m.end(),
content=''.join(m.groups()), content=''.join(m.groups()),
...@@ -224,10 +220,6 @@ class Tomd: ...@@ -224,10 +220,6 @@ class Tomd:
elements.remove(e) elements.remove(e)
if can_append: if can_append:
elements.append(element) elements.append(element)
# print "\n\n\ndone with convert, element is"
# for e in elements:
# print repr(str(e))
# print "---"
elements.sort(key=lambda element: element.start_pos) elements.sort(key=lambda element: element.start_pos)
self._markdown = ''.join([str(e) for e in elements]) self._markdown = ''.join([str(e) for e in elements])
...@@ -240,19 +232,19 @@ class Tomd: ...@@ -240,19 +232,19 @@ class Tomd:
self.convert(self.html, self.options) self.convert(self.html, self.options)
return self._markdown return self._markdown
def export(self,folder=False): def export(self, folder=False):
if len(self.file) < 1: if len(self.file) < 1:
warnings.warn("file not specified, renamed to tmp.md") warnings.warn("file not specified, renamed to tmp.md")
file = "tmp.md" file = "tmp.md"
else: else:
file = self.file.replace('.html','.md') #rename to md file = self.file.replace('.html', '.md') # rename to md
if len(self.folder) < 2: if len(self.folder) < 2:
warnings.warn("folder not specified, will save to pwd") warnings.warn("folder not specified, will save to pwd")
elif not folder: elif not folder:
file = self.folder + '/' + file file = self.folder + '/' + file
else: #if folder is specified else: # if folder is specified
file = folder + '/' + file file = folder + '/' + file
f = open(file,'w') f = open(file, 'w')
f.write(self._markdown) f.write(self._markdown)
f.close() f.close()
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment