Fixed conversion of a single <img> tag that has no closing slash or closing tag

dd7f6f4a · zsinx6 · 963d5cd0 · dd7f6f4a · dd7f6f4a · dd7f6f4a
Commit dd7f6f4a authored Nov 07, 2017 by zsinx6
Hide whitespace changes
Inline Side-by-side

Showing with 54 additions and 61 deletions

setup.py setup.py +1 -1

test_tomd.py test_tomd.py +1 -0

tomd.py tomd.py +52 -60

No files found.
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@ from setuptools import find_packages, setup

 setup(
    name="tomd",
-    version="0.1.3",
+    version="0.1.4",
    description="Convert HTML to Markdown.",
    author="Gaojiuli",
    author_email="gaojiuli@gmail.com",

--- a/test_tomd.py
+++ b/test_tomd.py
@@ -12,6 +12,7 @@ string = """
 <a href="https://github.com">link</a>
 <img src="https://github.com" class="dsad">img</img>
 <img src="https://github.com" class="dsad"/>
+<img src="https://github.com" class="dsad">
 </p>
 <ul>
 <li>1</li>

--- a/tomd.py
+++ b/tomd.py
-import re, os
+import re
+import os
+import warnings

 __all__ = ['Tomd', 'convert']

@@ -55,7 +57,7 @@ BlOCK_ELEMENTS = {


 INLINE_ELEMENTS = {
-    'td': '<td.*?>((.|\n)*?)</td>', #td element may span lines
+    'td': '<td.*?>((.|\n)*?)</td>',  # td element may span lines
    'tr': '<tr.*?>((.|\n)*?)</tr>',
    'th': '<th.*?>(.*?)</th>',
    'b': '<b.*?>(.*?)</b>',
@@ -77,7 +79,8 @@ INLINE_ELEMENTS = {
    'tbody': '<tbody.*?>((.|\n)*)</tbody>',
 }

-DELETE_ELEMENTS = ['<span.*?>', '</span>', '<div.*?>', '</div>','<br clear="none"/>', '<center.*?>', '</center>']
+DELETE_ELEMENTS = ['<span.*?>', '</span>', '<div.*?>', '</div>', '<br clear="none"/>', '<center.*?>', '</center>']
+

 class Element:
    def __init__(self, start_pos, end_pos, content, tag, folder, is_block=False):
@@ -91,10 +94,7 @@ class Element:
        self._result = None

        if self.is_block:
-            # print "parsing tag:", self.tag, ", content: ", repr(self.content)
            self.parse_inline()
-            # if self.tag != 'table':
-                # print "parsed:", self.tag, self.folder, ", content: ", repr(self.content)

    def __str__(self):
        wrapper = MARKDOWN.get(self.tag)
@@ -102,44 +102,43 @@ class Element:
        return self._result

    def parse_inline(self):
-        self.content = self.content.replace('\r', '') #windows \r character
-        self.content = self.content.replace('\xc2\xa0', ' ') #no break space
-        self.content = self.content.replace('&quot;', '\"') #html quote mark
-
-        for m in re.finditer("<img(.*?)en_todo.*?>",self.content):
-            #remove img and change to [ ] and [x]
-            #evernote specific parsing
-            imgSrc = re.search('src=".*?"',m.group())
-            imgLoc = imgSrc.group()[5:-1] #remove source and " "
-            imgLoc = imgLoc.replace('\\', '/') #\\ folder slash rotate
+        self.content = self.content.replace('\r', '')  # windows \r character
+        self.content = self.content.replace('\xc2\xa0', ' ')  # no break space
+        self.content = self.content.replace('&quot;', '\"')  # html quote mark
+
+        for m in re.finditer("<img(.*?)en_todo.*?>", self.content):
+            # remove img and change to [ ] and [x]
+            # evernote specific parsing
+            imgSrc = re.search('src=".*?"', m.group())
+            imgLoc = imgSrc.group()[5:-1]  # remove source and " "
+            imgLoc = imgLoc.replace('\\', '/')  # \\ folder slash rotate
            if os.stat(self.folder + "/" + imgLoc).st_size < 250:
-                self.content = self.content.replace(m.group(),"[ ] ")
+                self.content = self.content.replace(m.group(), "[ ] ")
            else:
-                self.content = self.content.replace(m.group(),"[x] ")
-        # print self.content
+                self.content = self.content.replace(m.group(), "[x] ")

-        if "e_" in self.tag: #evernote-specific parsing
-            # if self.content != re.sub(BlOCK_ELEMENTS['table'], '\g<1>', self.content):
+        if "e_" in self.tag:  # evernote-specific parsing
            for m in re.finditer(BlOCK_ELEMENTS['table'], self.content, re.I | re.S | re.M):
-                #hmm can there only be one table?
-                # print "AHHHH THERES A TABLE\n\n"
+                # hmm can there only be one table?
                inner = Element(start_pos=m.start(),
-                                  end_pos=m.end(),
-                                  content=''.join(m.groups()),
-                                  tag='table',folder=self.folder,
-                                  is_block=True)
+                                end_pos=m.end(),
+                                content=''.join(m.groups()),
+                                tag='table', folder=self.folder,
+                                is_block=True)
                self.content = inner.content
-                return #no need for further parsing ?
+                return  # no need for further parsing ?

            # if no table, parse as usual
            self.content = self.content.replace('<hr/>', '\n---\n')
            self.content = self.content.replace('<br/>', '')

-        if self.tag == "table": #for removing tbody
+        if self.tag == "table":  # for removing tbody
            self.content = re.sub(INLINE_ELEMENTS['tbody'], '\g<1>', self.content)

-        for tag, pattern in INLINE_ELEMENTS.items():
-            # print "---now looking at", tag, pattern
+        INLINE_ELEMENTS_LIST_KEYS = list(INLINE_ELEMENTS.keys())
+        INLINE_ELEMENTS_LIST_KEYS.sort()
+        for tag in INLINE_ELEMENTS_LIST_KEYS:
+            pattern = INLINE_ELEMENTS[tag]

            if tag == 'a':
                self.content = re.sub(pattern, '[\g<2>](\g<1>)', self.content)
@@ -147,6 +146,8 @@ class Element:
                self.content = re.sub(pattern, '![\g<2>](\g<1>)', self.content)
            elif tag == 'img_single':
                self.content = re.sub(pattern, '![](\g<1>)', self.content)
+            elif tag == 'img_single_no_close':
+                self.content = re.sub(pattern, '![](\g<1>)', self.content)
            elif self.tag == 'ul' and tag == 'li':
                self.content = re.sub(pattern, '- \g<1>', self.content)
            elif self.tag == 'ol' and tag == 'li':
@@ -157,19 +158,17 @@ class Element:
                self.content = re.sub(pattern, '|\g<1>', self.content.replace('\n', ''))
            elif self.tag == 'tr' and tag == 'td':
                self.content = re.sub(pattern, '|\g<1>|', self.content.replace('\n', ''))
-                self.content = self.content.replace("||","|") #end of column also needs a pipe
-                # print "---converting, td remove duplicate:", tag, self.content
+                self.content = self.content.replace("||", "|")  # end of column also needs a pipe
            elif self.tag == 'table' and tag == 'td':
                self.content = re.sub(pattern, '|\g<1>|', self.content)
-                self.content = self.content.replace("||","|") #end of column also needs a pipe
-                self.content = self.content.replace('|\n\n', '|\n') #replace double new line
-                # print "---converting, td remove duplicate:", tag, self.content
+                self.content = self.content.replace("||", "|")  # end of column also needs a pipe
+                self.content = self.content.replace('|\n\n', '|\n')  # replace double new line
                self.construct_table()
            else:
                wrapper = MARKDOWN.get(tag)
                self.content = re.sub(pattern, '{}\g<1>{}'.format(wrapper[0], wrapper[1]), self.content)

-        if self.tag == "e_p" and self.content[-1:] != '\n' and len(self.content) > 2: 
+        if self.tag == "e_p" and self.content[-1:] != '\n' and len(self.content) > 2:
            # focusing on div, add new line if not there (and if content is long enough)
            self.content += '\n'

@@ -177,39 +176,36 @@ class Element:
        # this function, after self.content has gained | for table entries,
        # adds the |---| in markdown to create a proper table

-        temp = self.content.split('\n',3)
+        temp = self.content.split('\n', 3)
        for elt in temp:
            if elt != "":
-                count = elt.count("|") #count number of pipes
+                count = elt.count("|")  # count number of pipes
                break
-        pipe = "\n|" #beginning \n for safety
-        for i in xrange(count-1):
+        pipe = "\n|"  # beginning \n for safety
+        for i in range(count - 1):
            pipe += "---|"
        pipe += "\n"
-        self.content = pipe + pipe + self.content + "\n" #TODO: column titles?
-        self.content = self.content.replace('|\n\n', '|\n') #replace double new line
-        self.content = self.content.replace("<br/>\n","<br/>") #end of column also needs a pipe
+        self.content = pipe + pipe + self.content + "\n"  # TODO: column titles?
+        self.content = self.content.replace('|\n\n', '|\n')  # replace double new line
+        self.content = self.content.replace("<br/>\n", "<br/>")  # end of column also needs a pipe


 class Tomd:
-    def __init__(self, html='', folder='',file='',options=None):
-        self.html = html #actual data
+    def __init__(self, html='', folder='', file='', options=None):
+        self.html = html  # actual data
        self.folder = folder
        self.file = file
-        self.options = options # haven't been implemented yet
-        self._markdown = self.convert(self.html,self.options)
+        self.options = options  # haven't been implemented yet
+        self._markdown = self.convert(self.html, self.options)

    def convert(self, html="", options=None):
        if html == "":
            html = self.html
-        #main function here
+        # main function here
        elements = []
        for tag, pattern in BlOCK_ELEMENTS.items():
-            # print "pattern is", pattern, "tag", tag
            for m in re.finditer(pattern, html, re.I | re.S | re.M):
                # now m contains the pattern without the tag
-                # if tag == "e_p":
-                # print "found", tag, m.groups(), "start", m.start(), "end", m.end(), self.folder
                element = Element(start_pos=m.start(),
                                  end_pos=m.end(),
                                  content=''.join(m.groups()),
@@ -224,10 +220,6 @@ class Tomd:
                        elements.remove(e)
                if can_append:
                    elements.append(element)
-        # print "\n\n\ndone with convert, element is"
-        # for e in elements:
-        #     print repr(str(e))
-        # print "---"
        elements.sort(key=lambda element: element.start_pos)
        self._markdown = ''.join([str(e) for e in elements])

@@ -240,19 +232,19 @@ class Tomd:
        self.convert(self.html, self.options)
        return self._markdown

-    def export(self,folder=False):
+    def export(self, folder=False):
        if len(self.file) < 1:
            warnings.warn("file not specified, renamed to tmp.md")
            file = "tmp.md"
        else:
-            file = self.file.replace('.html','.md') #rename to md
+            file = self.file.replace('.html', '.md')  # rename to md
        if len(self.folder) < 2:
            warnings.warn("folder not specified, will save to pwd")
        elif not folder:
            file = self.folder + '/' + file
-        else: #if folder is specified
+        else:  # if folder is specified
            file = folder + '/' + file
-        f = open(file,'w')
+        f = open(file, 'w')
        f.write(self._markdown)
        f.close()