Merge pull request #2 from qandrew/master

add support for tables

Merge pull request #2 from qandrew/master
add support for tables
b1ad8182 · JiuLi Gao · GitHub · d90b10c4 · 65fbba81 · b1ad8182
Commit b1ad8182 authored Aug 16, 2017 by JiuLi Gao Committed by GitHub Aug 16, 2017
Show whitespace changes
Inline Side-by-side

Showing with 285 additions and 16 deletions

ex.md ex.md +101 -0

main.py main.py +60 -0

tomd.py tomd.py +124 -16

No files found.
--- a/ex.md
+++ b/ex.md
+# h1
+
+## h2
+
+### h3
+
+#### h4
+
+##### h5
+
+###### h6
+
+paragraph
+[link](https://github.com)
+![img](https://github.com)
+
+
+- 1
+- 2
+- 3
+
+1. 1
+1. 2
+1. 3
+
+> blockquote
+
+`inline code`
+
+```
+block code
+```
+
+
+~~del~~
+**bold**
+*italic*
+***bold italic***
+**em**
+**strong**
+
+
+---
+
+
+|th1|th2
+|------
+|td|td
+|td|td
+
+
+# h1
+
+## h2
+
+### h3
+
+#### h4
+
+##### h5
+
+###### h6
+
+paragraph
+[link](https://github.com)
+![img](https://github.com)
+
+
+- 1
+- 2
+- 3
+
+1. 1
+1. 2
+1. 3
+
+> blockquote
+
+`inline code`
+
+```
+block code
+```
+
+
+~~del~~
+**bold**
+*italic*
+***bold italic***
+**em**
+**strong**
+
+
+---
+
+
+|th1|th2
+|------
+|td|td
+|td|td
+
--- a/main.py
+++ b/main.py
+# June 27 2017
+# Andrew Xia
+# main program (for testing stuff)
+
+import tomd
+
+# FOLDER = "/home/andrew/Documents/Evernote_170625/"
+FOLDER = "/home/andrew/Documents/Github/evernote-analysis"
+# FILE = "Week11.html"
+FILE = "pensive.html"
+CONTENT = ""
+
+FOLDER = "/home/andrew/Documents/Evernote_170625/Log/2016"
+FILE = "Week 10 37 to 313.html"
+
+# read the whole note in one call; the with-block closes the file handle
+with open(FOLDER + "/" + FILE) as f:
+  CONTENT = f.read()
+
+# CONTENT = """
+# <p>For <em>Implementing</em>, working with <a href="http://chiraag.scripts.mit.edu/wiki/start" target="_blank"><strong>Chiraag Juvekar</strong></a> and <a href="http://www-mtl.mit.edu/~anantha/" target="_blank"><strong>Prof. Anantha Chandrakasan</strong></a>.</p>
+
+# <h3 id="abstract">Abstract</h3>
+
+# <p>Having an
+# is </p>
+# <hr/>
+# <p>My paper can be found <a href="/files/superUROP.pdf"><strong>here</strong></a></p>
+# """
+
+# CONTENT = """
+# <table bgcolor="#D4DDE5" border="0">
+# <tr><td><b>Created:</b></td><td><i>10/31/2011 8:59 PM</i></td></tr>
+# <tr><td><b>Updated:</b></td><td><i>5/12/2012 5:42 PM</i></td></tr>
+# <tr><td><b>Tags:</b></td><td><i>birthday</i></td></tr>
+# </table>
+# """
+
+# CONTENT = """<table border="1" cellpadding="2" cellspacing="0" style="font-size: 13px;" width="100%"><tbody><tr><td valign="top">Day (Sleep)</td><td valign="top">Internet</td><td valign="top">SAT Plan/Actual</td><td valign="top">PRIMES</td><td valign="top">Homework</td><td valign="top">Athletics</td></tr><tr><td valign="top">Monday<br/>
+# 7hr</td><td valign="top">2.30hr<br/>
+# 4LOL</td><td valign="top"><br/></td><td valign="top">0.05min<br/>
+# Email, </td><td valign="top">Dartmouth</td><td valign="top"><br/></td></tr><tr><td valign="top">Tuesday<br/>
+# 10hr</td><td valign="top">1.50hr<br/>
+# LOL</td><td valign="top">:SAT K8</td><td valign="top">1.00hr<br/>
+# 7Zip Data</td><td valign="top"><br/></td><td valign="top">Swim</td></tr><tr><td valign="top">Wednesday<br/>
+# 11hr</td><td valign="top">1.54hr<br/>
+# LOL</td><td valign="top">SAT 36</td><td valign="top">1.10hr<br/>
+# HIV Data, trying excel convert</td><td valign="top">College</td><td valign="top">Swim</td></tr><tr><td valign="top">Thursday<br/>
+# 10hr</td><td valign="top">1.37min<br/>
+# 4LOL</td><td valign="top">SAT 36, 35</td><td valign="top"><br/></td><td valign="top"><br/></td><td valign="top">Swim</td></tr><tr><td valign="top">Friday<br/>
+# 10hr</td><td valign="top">0.30min</td><td valign="top"><br/></td><td valign="top"><br/></td><td valign="top">Driving Lesson</td><td valign="top">Swim</td></tr><tr><td valign="top">Saturday<br/>
+# 8hr</td><td valign="top">2.10hr<br/>
+# 3LOL</td><td valign="top"><br/></td><td valign="top"><br/></td><td valign="top"><br/></td><td valign="top">Swim Meet</td></tr><tr><td valign="top">Sunday<br/>
+# 9hr</td><td valign="top">1.08hr<br/>
+# 2LOL</td><td valign="top"><br/></td><td valign="top"><br/></td><td valign="top"><br/></td><td valign="top">Swim Meet</td></tr></tbody></table>
+# """
+
+converter = tomd.Tomd(CONTENT,FOLDER,FILE)
+# print(converter.markdown())
+
--- a/tomd.py
+++ b/tomd.py
-import re
+import re, os

 __all__ = ['Tomd', 'convert']

@@ -24,13 +24,18 @@ MARKDOWN = {
    'inline_p_with_out_class': ('', ''),
    'b': ('**', '**'),
    'i': ('*', '*'),
+    'em': ('*', '*'),
    'del': ('~~', '~~'),
    'hr': ('\n---', '\n\n'),
    'thead': ('\n', '|------\n'),
    'tbody': ('\n', '\n'),
    'td': ('|', ''),
    'th': ('|', ''),
-    'tr': ('', '\n')
+    'tr': ('', '\n'),
+    'table': ('', '\n'),
+
+    #evernote
+    'e_p': ('', '\n')
 }

 BlOCK_ELEMENTS = {
@@ -48,12 +53,16 @@ BlOCK_ELEMENTS = {
    'p': '<p\s.*?>(.*?)</p>',
    'p_with_out_class': '<p>(.*?)</p>',
    'thead': '<thead.*?>(.*?)</thead>',
-    'tr': '<tr>(.*?)</tr>'
+    # 'tr': '<tr>(.*?)</tr>',
+    'table': '<table.*?>(.*?)</table>', #assume that table must be around tr
+
+    # evernote
+    'e_p': '<div.*?>(.*?)</div>' #div for paragraph ?
 }

 INLINE_ELEMENTS = {
-    'td': '<td>(.*?)</td>',
-    'tr': '<tr>(.*?)</tr>',
+    'td': '<td.*?>((.|\n)*?)</td>', #td element may span lines
+    'tr': '<tr>((.|\n)*?)</tr>',
    'th': '<th>(.*?)</th>',
    'b': '<b>(.*?)</b>',
    'i': '<i>(.*?)</i>',
@@ -68,24 +77,28 @@ INLINE_ELEMENTS = {
    'img': '<img.*?src="(.*?)".*?>(.*?)</img>',
    'a': '<a.*?href="(.*?)".*?>(.*?)</a>',
    'em': '<em.*?>(.*?)</em>',
-    'strong': '<strong.*?>(.*?)</strong>'
+    'strong': '<strong.*?>(.*?)</strong>',
+    'tbody': '<tbody.*?>((.|\n)*)</tbody>',
 }

-DELETE_ELEMENTS = ['<span.*?>', '</span>', '<div.*?>', '</div>']
-
+DELETE_ELEMENTS = ['<span.*?>', '</span>', '<div.*?>', '</div>','<br clear="none"/>']

 class Element:
-    def __init__(self, start_pos, end_pos, content, tag, is_block=False):
+    def __init__(self, start_pos, end_pos, content, tag, folder, is_block=False):
        self.start_pos = start_pos
        self.end_pos = end_pos
        self.content = content
        self._elements = []
        self.is_block = is_block
        self.tag = tag
+        self.folder = folder
        self._result = None

        if self.is_block:
+            # print "parsing tag:", self.tag, ", content: ", repr(self.content)
            self.parse_inline()
+            # if self.tag != 'table':
+                # print "parsed:", self.tag, self.folder, ", content: ", repr(self.content)

    def __str__(self):
        wrapper = MARKDOWN.get(self.tag)
@@ -93,7 +106,44 @@ class Element:
        return self._result

    def parse_inline(self):
+        self.content = self.content.replace('\r', '') #windows \r character
+        self.content = self.content.replace('\xc2\xa0', ' ') #no break space
+        self.content = self.content.replace('&quot;', '\"') #html quote mark
+
+        for m in re.finditer("<img(.*?)en_todo.*?>",self.content):
+            #remove img and change to [ ] and [x]
+            #evernote specific parsing
+            imgSrc = re.search('src=".*?"',m.group())
+            imgLoc = imgSrc.group()[5:-1] #remove source and " "
+            imgLoc = imgLoc.replace('\\', '/') #\\ folder slash rotate
+            if os.stat(self.folder + "/" + imgLoc).st_size < 250:
+                self.content = self.content.replace(m.group(),"[ ] ")
+            else:
+                self.content = self.content.replace(m.group(),"[x] ")
+        # print self.content
+
+        if "e_" in self.tag: #evernote-specific parsing
+            # if self.content != re.sub(BlOCK_ELEMENTS['table'], '\g<1>', self.content):
+            for m in re.finditer(BlOCK_ELEMENTS['table'], self.content, re.I | re.S | re.M):
+                #hmm can there only be one table?
+                # print "AHHHH THERES A TABLE\n\n"
+                inner = Element(start_pos=m.start(),
+                                  end_pos=m.end(),
+                                  content=''.join(m.groups()),
+                                  tag='table',folder=self.folder,
+                                  is_block=True)
+                self.content = inner.content
+                return #no need for further parsing ?
+
+            # if no table, parse as usual
+            self.content = self.content.replace('<hr/>', '\n---\n')
+            self.content = self.content.replace('<br/>', '')
+
+        if self.tag == "table": #for removing tbody
+            self.content = re.sub(INLINE_ELEMENTS['tbody'], '\g<1>', self.content)
+
        for tag, pattern in INLINE_ELEMENTS.items():
+            # print "---now looking at", tag, pattern

            if tag == 'a':
                self.content = re.sub(pattern, '[\g<2>](\g<1>)', self.content)
@@ -108,26 +158,65 @@ class Element:
            elif self.tag == 'tr' and tag == 'th':
                self.content = re.sub(pattern, '|\g<1>', self.content.replace('\n', ''))
            elif self.tag == 'tr' and tag == 'td':
-                self.content = re.sub(pattern, '|\g<1>', self.content.replace('\n', ''))
+                self.content = re.sub(pattern, '|\g<1>|', self.content.replace('\n', ''))
+                self.content = self.content.replace("||","|") #end of column also needs a pipe
+                # print "---converting, td remove duplicate:", tag, self.content
+            elif self.tag == 'table' and tag == 'td':
+                self.content = re.sub(pattern, '|\g<1>|', self.content)
+                self.content = self.content.replace("||","|") #end of column also needs a pipe
+                self.content = self.content.replace('|\n\n', '|\n') #replace double new line
+                # print "---converting, td remove duplicate:", tag, self.content
+                self.construct_table()
            else:
                wrapper = MARKDOWN.get(tag)
                self.content = re.sub(pattern, '{}\g<1>{}'.format(wrapper[0], wrapper[1]), self.content)

+        if self.tag == "e_p" and self.content[-1:] != '\n' and len(self.content) > 2: 
+            # focusing on div, add new line if not there (and if content is long enough)
+            self.content += '\n'
+
+    def construct_table(self):
+        """Prepend markdown separator rows to the pipe-delimited table content.
+
+        Expects self.content to already hold '|'-delimited cells; the column
+        count is the pipe count of the first non-empty chunk, minus one.
+        """
+        chunks = [c for c in self.content.split('\n', 3) if c != ""]
+        if not chunks:
+            return  # no non-empty chunk to measure; leave content untouched
+        # string repetition instead of an xrange loop: same result on Python 2 and 3
+        pipe = "\n|" + "---|" * (chunks[0].count("|") - 1) + "\n"
+        # rule emitted twice; presumably the first stands in for a header row (see TODO)
+        self.content = pipe + pipe + self.content + "\n"  # TODO: column titles?
+        self.content = self.content.replace('|\n\n', '|\n')  # collapse blank line after a row
+        # drop the newline after a <br/> so a multi-line cell stays on one row
+        self.content = self.content.replace("<br/>\n", "<br/>")
+

 class Tomd:
-    def __init__(self, html='', options=None):
-        self.html = html
-        self.options = options
-        self._markdown = ''
+    def __init__(self, html='', folder='',file='',options=None):
+        self.html = html #actual data
+        self.folder = folder
+        self.file = file
+        self.options = options # haven't been implemented yet
+        self._markdown = self.convert(self.html,self.options)

-    def convert(self, html, options=None):
+    def convert(self, html="", options=None):
+        if html == "":
+            html = self.html
+        #main function here
        elements = []
        for tag, pattern in BlOCK_ELEMENTS.items():
+            # print "pattern is", pattern, "tag", tag
            for m in re.finditer(pattern, html, re.I | re.S | re.M):
+                # now m contains the pattern without the tag
+                # if tag == "e_p":
+                # print "found", tag, m.groups(), "start", m.start(), "end", m.end(), self.folder
                element = Element(start_pos=m.start(),
                                  end_pos=m.end(),
                                  content=''.join(m.groups()),
                                  tag=tag,
+                                  folder=self.folder,
                                  is_block=True)
                can_append = True
                for e in elements:
@@ -137,7 +226,10 @@ class Tomd:
                        elements.remove(e)
                if can_append:
                    elements.append(element)
-
+        # print "\n\n\ndone with convert, element is"
+        # for e in elements:
+        #     print repr(str(e))
+        # print "---"
        elements.sort(key=lambda element: element.start_pos)
        self._markdown = ''.join([str(e) for e in elements])

@@ -150,6 +242,22 @@ class Tomd:
        self.convert(self.html, self.options)
        return self._markdown

+    def export(self, folder=False):
+        """Write the converted markdown to <folder>/<file with .md extension>.
+
+        An explicit `folder` wins over self.folder; with neither, save to the cwd.
+        """
+        import warnings  # local import: not among the module-level imports shown
+        name = self.file.replace('.html', '.md') if self.file else "tmp.md"
+        if not self.file:
+            warnings.warn("file not specified, renamed to tmp.md")
+        target = folder or (self.folder if len(self.folder) >= 2 else '')
+        if not target:
+            warnings.warn("folder not specified, will save to pwd")
+        path = target + '/' + name if target else name
+        with open(path, 'w') as f:  # closes the handle even if write() raises
+            f.write(self._markdown)
+

 _inst = Tomd()
 convert = _inst.convert