Revert "add support for tables"

b35f4fd4 · JiuLi Gao · GitHub · b1ad8182 · b1ad8182 · b1ad8182
Commit b35f4fd4 authored Aug 16, 2017 by JiuLi Gao Committed by GitHub Aug 16, 2017
Hide whitespace changes
Inline Side-by-side

Showing with 16 additions and 285 deletions

ex.md ex.md +0 -101

main.py main.py +0 -60

tomd.py tomd.py +16 -124

No files found.
--- a/ex.md
+++ b/ex.md
-# h1
-## h2
-### h3
-#### h4
-##### h5
-###### h6
-paragraph
-[link](https://github.com)
-![img](https://github.com)
- 1
- 2
- 3
-1. 1
-1. 2
-1. 3
-> blockquote
-`inline code`
-```
-block code
-```
-~~del~~
-**bold**
-*italic*
-***bold italic***
-**em**
-**strong**
---
-|th1|th2
-|------
-|td|td
-|td|td
-# h1
-## h2
-### h3
-#### h4
-##### h5
-###### h6
-paragraph
-[link](https://github.com)
-![img](https://github.com)
- 1
- 2
- 3
-1. 1
-1. 2
-1. 3
-> blockquote
-`inline code`
-```
-block code
-```
-~~del~~
-**bold**
-*italic*
-***bold italic***
-**em**
-**strong**
---
-|th1|th2
-|------
-|td|td
-|td|td
--- a/main.py
+++ b/main.py
-# June 27 2017
-# Andrew Xia
-# main program (for testing stuff)
-import tomd
-# FOLDER = "/home/andrew/Documents/Evernote_170625/"
-FOLDER = "/home/andrew/Documents/Github/evernote-analysis"
-# FILE = "Week11.html"
-FILE = "pensive.html"
-CONTENT = ""
-FOLDER = "/home/andrew/Documents/Evernote_170625/Log/2016"
-FILE = "Week 10 37 to 313.html"
-f = open(FOLDER + "/" + FILE)
-for line in f:
-  CONTENT += line
-# CONTENT = """
-# <p>For <em>Implementing</em>, working with <a href="http://chiraag.scripts.mit.edu/wiki/start" target="_blank"><strong>Chiraag Juvekar</strong></a> and <a href="http://www-mtl.mit.edu/~anantha/" target="_blank"><strong>Prof. Anantha Chandrakasan</strong></a>.</p>
-# <h3 id="abstract">Abstract</h3>
-# <p>Having an
-# is </p>
-# <hr/>
-# <p>My paper can be found <a href="/files/superUROP.pdf"><strong>here</strong></a></p>
-# """
-# CONTENT = """
-# <table bgcolor="#D4DDE5" border="0">
-# <tr><td><b>Created:</b></td><td><i>10/31/2011 8:59 PM</i></td></tr>
-# <tr><td><b>Updated:</b></td><td><i>5/12/2012 5:42 PM</i></td></tr>
-# <tr><td><b>Tags:</b></td><td><i>birthday</i></td></tr>
-# </table>
-# """
-# CONTENT = """<table border="1" cellpadding="2" cellspacing="0" style="font-size: 13px;" width="100%"><tbody><tr><td valign="top">Day (Sleep)</td><td valign="top">Internet</td><td valign="top">SAT Plan/Actual</td><td valign="top">PRIMES</td><td valign="top">Homework</td><td valign="top">Athletics</td></tr><tr><td valign="top">Monday<br/>
-# 7hr</td><td valign="top">2.30hr<br/>
-# 4LOL</td><td valign="top"><br/></td><td valign="top">0.05min<br/>
-# Email, </td><td valign="top">Dartmouth</td><td valign="top"><br/></td></tr><tr><td valign="top">Tuesday<br/>
-# 10hr</td><td valign="top">1.50hr<br/>
-# LOL</td><td valign="top">:SAT K8</td><td valign="top">1.00hr<br/>
-# 7Zip Data</td><td valign="top"><br/></td><td valign="top">Swim</td></tr><tr><td valign="top">Wednesday<br/>
-# 11hr</td><td valign="top">1.54hr<br/>
-# LOL</td><td valign="top">SAT 36</td><td valign="top">1.10hr<br/>
-# HIV Data, trying excel convert</td><td valign="top">College</td><td valign="top">Swim</td></tr><tr><td valign="top">Thursday<br/>
-# 10hr</td><td valign="top">1.37min<br/>
-# 4LOL</td><td valign="top">SAT 36, 35</td><td valign="top"><br/></td><td valign="top"><br/></td><td valign="top">Swim</td></tr><tr><td valign="top">Friday<br/>
-# 10hr</td><td valign="top">0.30min</td><td valign="top"><br/></td><td valign="top"><br/></td><td valign="top">Driving Lesson</td><td valign="top">Swim</td></tr><tr><td valign="top">Saturday<br/>
-# 8hr</td><td valign="top">2.10hr<br/>
-# 3LOL</td><td valign="top"><br/></td><td valign="top"><br/></td><td valign="top"><br/></td><td valign="top">Swim Meet</td></tr><tr><td valign="top">Sunday<br/>
-# 9hr</td><td valign="top">1.08hr<br/>
-# 2LOL</td><td valign="top"><br/></td><td valign="top"><br/></td><td valign="top"><br/></td><td valign="top">Swim Meet</td></tr></tbody></table>
-# """
-converter = tomd.Tomd(CONTENT,FOLDER,FILE)
-# print(converter.markdown())
--- a/tomd.py
+++ b/tomd.py
-import re, os
+import re
 __all__ = ['Tomd', 'convert']
@@ -24,18 +24,13 @@ MARKDOWN = {
    'inline_p_with_out_class': ('', ''),
    'b': ('**', '**'),
    'i': ('*', '*'),
-    'em': ('*', '*'),
    'del': ('~~', '~~'),
    'hr': ('\n---', '\n\n'),
    'thead': ('\n', '|------\n'),
    'tbody': ('\n', '\n'),
    'td': ('|', ''),
    'th': ('|', ''),
-    'tr': ('', '\n'),
+    'tr': ('', '\n')
-    'table': ('', '\n'),
-    #evernote
-    'e_p': ('', '\n')
 }
 BlOCK_ELEMENTS = {
@@ -53,16 +48,12 @@ BlOCK_ELEMENTS = {
    'p': '<p\s.*?>(.*?)</p>',
    'p_with_out_class': '<p>(.*?)</p>',
    'thead': '<thead.*?>(.*?)</thead>',
-    # 'tr': '<tr>(.*?)</tr>',
+    'tr': '<tr>(.*?)</tr>'
-    'table': '<table.*?>(.*?)</table>', #assume that table must be around tr
-    # evernote
-    'e_p': '<div.*?>(.*?)</div>' #div for paragraph ?
 }
 INLINE_ELEMENTS = {
-    'td': '<td.*?>((.|\n)*?)</td>', #td element may span lines
+    'td': '<td>(.*?)</td>',
-    'tr': '<tr>((.|\n)*?)</tr>',
+    'tr': '<tr>(.*?)</tr>',
    'th': '<th>(.*?)</th>',
    'b': '<b>(.*?)</b>',
    'i': '<i>(.*?)</i>',
@@ -77,28 +68,24 @@ INLINE_ELEMENTS = {
    'img': '<img.*?src="(.*?)".*?>(.*?)</img>',
    'a': '<a.*?href="(.*?)".*?>(.*?)</a>',
    'em': '<em.*?>(.*?)</em>',
-    'strong': '<strong.*?>(.*?)</strong>',
+    'strong': '<strong.*?>(.*?)</strong>'
-    'tbody': '<tbody.*?>((.|\n)*)</tbody>',
 }
-DELETE_ELEMENTS = ['<span.*?>', '</span>', '<div.*?>', '</div>','<br clear="none"/>']
+DELETE_ELEMENTS = ['<span.*?>', '</span>', '<div.*?>', '</div>']
 class Element:
-    def __init__(self, start_pos, end_pos, content, tag, folder, is_block=False):
+    def __init__(self, start_pos, end_pos, content, tag, is_block=False):
        self.start_pos = start_pos
        self.end_pos = end_pos
        self.content = content
        self._elements = []
        self.is_block = is_block
        self.tag = tag
-        self.folder = folder
        self._result = None
        if self.is_block:
-            # print "parsing tag:", self.tag, ", content: ", repr(self.content)
            self.parse_inline()
-            # if self.tag != 'table':
-                # print "parsed:", self.tag, self.folder, ", content: ", repr(self.content)
    def __str__(self):
        wrapper = MARKDOWN.get(self.tag)
@@ -106,44 +93,7 @@ class Element:
        return self._result
    def parse_inline(self):
-        self.content = self.content.replace('\r', '') #windows \r character
-        self.content = self.content.replace('\xc2\xa0', ' ') #no break space
-        self.content = self.content.replace('&quot;', '\"') #html quote mark
-        for m in re.finditer("<img(.*?)en_todo.*?>",self.content):
-            #remove img and change to [ ] and [x]
-            #evernote specific parsing
-            imgSrc = re.search('src=".*?"',m.group())
-            imgLoc = imgSrc.group()[5:-1] #remove source and " "
-            imgLoc = imgLoc.replace('\\', '/') #\\ folder slash rotate
-            if os.stat(self.folder + "/" + imgLoc).st_size < 250:
-                self.content = self.content.replace(m.group(),"[ ] ")
-            else:
-                self.content = self.content.replace(m.group(),"[x] ")
-        # print self.content
-        if "e_" in self.tag: #evernote-specific parsing
-            # if self.content != re.sub(BlOCK_ELEMENTS['table'], '\g<1>', self.content):
-            for m in re.finditer(BlOCK_ELEMENTS['table'], self.content, re.I | re.S | re.M):
-                #hmm can there only be one table?
-                # print "AHHHH THERES A TABLE\n\n"
-                inner = Element(start_pos=m.start(),
-                                  end_pos=m.end(),
-                                  content=''.join(m.groups()),
-                                  tag='table',folder=self.folder,
-                                  is_block=True)
-                self.content = inner.content
-                return #no need for further parsing ?
-            # if no table, parse as usual
-            self.content = self.content.replace('<hr/>', '\n---\n')
-            self.content = self.content.replace('<br/>', '')
-        if self.tag == "table": #for removing tbody
-            self.content = re.sub(INLINE_ELEMENTS['tbody'], '\g<1>', self.content)
        for tag, pattern in INLINE_ELEMENTS.items():
-            # print "---now looking at", tag, pattern
            if tag == 'a':
                self.content = re.sub(pattern, '[\g<2>](\g<1>)', self.content)
@@ -158,65 +108,26 @@ class Element:
            elif self.tag == 'tr' and tag == 'th':
                self.content = re.sub(pattern, '|\g<1>', self.content.replace('\n', ''))
            elif self.tag == 'tr' and tag == 'td':
-                self.content = re.sub(pattern, '|\g<1>|', self.content.replace('\n', ''))
+                self.content = re.sub(pattern, '|\g<1>', self.content.replace('\n', ''))
-                self.content = self.content.replace("||","|") #end of column also needs a pipe
-                # print "---converting, td remove duplicate:", tag, self.content
-            elif self.tag == 'table' and tag == 'td':
-                self.content = re.sub(pattern, '|\g<1>|', self.content)
-                self.content = self.content.replace("||","|") #end of column also needs a pipe
-                self.content = self.content.replace('|\n\n', '|\n') #replace double new line
-                # print "---converting, td remove duplicate:", tag, self.content
-                self.construct_table()
            else:
                wrapper = MARKDOWN.get(tag)
                self.content = re.sub(pattern, '{}\g<1>{}'.format(wrapper[0], wrapper[1]), self.content)
-        if self.tag == "e_p" and self.content[-1:] != '\n' and len(self.content) > 2: 
-            # focusing on div, add new line if not there (and if content is long enough)
-            self.content += '\n'
-    def construct_table(self):
-        # this function, after self.content has gained | for table entries,
-        # adds the |---| in markdown to create a proper table
-        temp = self.content.split('\n',3)
-        for elt in temp:
-            if elt != "":
-                count = elt.count("|") #count number of pipes
-                break
-        pipe = "\n|" #beginning \n for safety
-        for i in xrange(count-1):
-            pipe += "---|"
-        pipe += "\n"
-        self.content = pipe + pipe + self.content + "\n" #TODO: column titles?
-        self.content = self.content.replace('|\n\n', '|\n') #replace double new line
-        self.content = self.content.replace("<br/>\n","<br/>") #end of column also needs a pipe
 class Tomd:
-    def __init__(self, html='', folder='',file='',options=None):
+    def __init__(self, html='', options=None):
-        self.html = html #actual data
+        self.html = html
-        self.folder = folder
+        self.options = options
-        self.file = file
+        self._markdown = ''
-        self.options = options # haven't been implemented yet
-        self._markdown = self.convert(self.html,self.options)
-    def convert(self, html="", options=None):
+    def convert(self, html, options=None):
-        if html == "":
-            html = self.html
-        #main function here
        elements = []
        for tag, pattern in BlOCK_ELEMENTS.items():
-            # print "pattern is", pattern, "tag", tag
            for m in re.finditer(pattern, html, re.I | re.S | re.M):
-                # now m contains the pattern without the tag
-                # if tag == "e_p":
-                # print "found", tag, m.groups(), "start", m.start(), "end", m.end(), self.folder
                element = Element(start_pos=m.start(),
                                  end_pos=m.end(),
                                  content=''.join(m.groups()),
                                  tag=tag,
-                                  folder=self.folder,
                                  is_block=True)
                can_append = True
                for e in elements:
@@ -226,10 +137,7 @@ class Tomd:
                        elements.remove(e)
                if can_append:
                    elements.append(element)
-        # print "\n\n\ndone with convert, element is"
-        # for e in elements:
-        #     print repr(str(e))
-        # print "---"
        elements.sort(key=lambda element: element.start_pos)
        self._markdown = ''.join([str(e) for e in elements])
@@ -242,22 +150,6 @@ class Tomd:
        self.convert(self.html, self.options)
        return self._markdown
-    def export(self,folder=False):
-        if len(self.file) < 1:
-            warnings.warn("file not specified, renamed to tmp.md")
-            file = "tmp.md"
-        else:
-            file = self.file.replace('.html','.md') #rename to md
-        if len(self.folder) < 2:
-            warnings.warn("folder not specified, will save to pwd")
-        elif not folder:
-            file = self.folder + '/' + file
-        else: #if folder is specified
-            file = folder + '/' + file
-        f = open(file,'w')
-        f.write(self._markdown)
-        f.close()
 _inst = Tomd()
 convert = _inst.convert