tomd support for evernote checkbox

3540279d · Andrew Xia · d1f207f5 · 3540279d · 3540279d
Commit 3540279d authored Jul 07, 2017 by Andrew Xia
Hide whitespace changes
Inline Side-by-side

Showing with 135 additions and 11 deletions

ex.md ex.md +101 -0

tomd.py tomd.py +34 -11

No files found.
--- a/ex.md
+++ b/ex.md
+# h1
+## h2
+### h3
+#### h4
+##### h5
+###### h6
+paragraph
+[link](https://github.com)
+![img](https://github.com)
+- 1
+- 2
+- 3
+1. 1
+1. 2
+1. 3
+> blockquote
+`inline code`
+```
+block code
+```
+~~del~~
+**bold**
+*italic*
+***bold italic***
+**em**
+**strong**
+---
+|th1|th2
+|------
+|td|td
+|td|td
+# h1
+## h2
+### h3
+#### h4
+##### h5
+###### h6
+paragraph
+[link](https://github.com)
+![img](https://github.com)
+- 1
+- 2
+- 3
+1. 1
+1. 2
+1. 3
+> blockquote
+`inline code`
+```
+block code
+```
+~~del~~
+**bold**
+*italic*
+***bold italic***
+**em**
+**strong**
+---
+|th1|th2
+|------
+|td|td
+|td|td
--- a/tomd.py
+++ b/tomd.py
-import re
+import re, os
 __all__ = ['Tomd', 'convert']
@@ -56,7 +56,7 @@ BlOCK_ELEMENTS = {
    'table': '<table.*?>(.*?)</table>', #assume that table must be around tr
    # evernote
-    'e_p': '<div style="font-size.*?>(.*?)</div>' #div for paragraph ?
+    'e_p': '<div.*?>(.*?)</div>' #div for paragraph ?
 }
 INLINE_ELEMENTS = {
@@ -80,23 +80,24 @@ INLINE_ELEMENTS = {
    'tbody': '<tbody.*?>((.|\n)*)</tbody>',
 }
-DELETE_ELEMENTS = ['<span.*?>', '</span>', '<div.*?>', '</div>']
+DELETE_ELEMENTS = ['<span.*?>', '</span>', '<div.*?>', '</div>','<br clear="none"/>']
 class Element:
-    def __init__(self, start_pos, end_pos, content, tag, is_block=False):
+    def __init__(self, start_pos, end_pos, content, tag, folder, is_block=False):
        self.start_pos = start_pos
        self.end_pos = end_pos
        self.content = content
        self._elements = []
        self.is_block = is_block
        self.tag = tag
+        self.folder = folder
        self._result = None
        if self.is_block:
-            print "parsing tag:", self.tag, ", content: ", repr(self.content)
+            # print "parsing tag:", self.tag, ", content: ", repr(self.content)
            self.parse_inline()
-            print "parsed:", self.tag, ", content: ", self.content
+            if self.tag != 'table':
+                print "parsed:", self.tag, self.folder, ", content: ", repr(self.content)
    def __str__(self):
        wrapper = MARKDOWN.get(self.tag)
@@ -105,6 +106,19 @@ class Element:
    def parse_inline(self):
        self.content = self.content.replace('\r', '') #windows \r character
+        self.content = self.content.replace('&quot;', '\"') #html quote mark
+        for m in re.finditer("<img(.*?)en_todo.*?>",self.content):
+            #remove img and change to [ ] and [x]
+            #evernote specific parsing
+            imgSrc = re.search('src=".*?"',m.group())
+            imgLoc = imgSrc.group()[5:-1] #remove source and " "
+            imgLoc = imgLoc.replace('\\', '/') #\\ folder slash rotate
+            if os.stat(self.folder + "/" + imgLoc).st_size < 250:
+                self.content = self.content.replace(m.group(),"[ ] ")
+            else:
+                self.content = self.content.replace(m.group(),"[x] ")
+        # print self.content
        if "e_" in self.tag: #evernote-specific parsing
            # if self.content != re.sub(BlOCK_ELEMENTS['table'], '\g<1>', self.content):
@@ -114,7 +128,7 @@ class Element:
                inner = Element(start_pos=m.start(),
                                  end_pos=m.end(),
                                  content=''.join(m.groups()),
-                                  tag='table',
+                                  tag='table',folder=self.folder,
                                  is_block=True)
                self.content = inner.content
                return #no need for further parsing ?
@@ -154,6 +168,10 @@ class Element:
            else:
                wrapper = MARKDOWN.get(tag)
                self.content = re.sub(pattern, '{}\g<1>{}'.format(wrapper[0], wrapper[1]), self.content)
+        # if self.tag == "e_p" and self.content[-2:] != '\n': #div, add new line if not there
+        #     self.content += '\n'
    def construct_table(self):
        # this function, after self.content has gained | for table entries,
        # adds the |---| in markdown to create a proper table
@@ -173,12 +191,16 @@ class Element:
 class Tomd:
-    def __init__(self, html='', options=None):
+    def __init__(self, html='', folder='',file='',options=None):
        self.html = html #actual data
+        self.folder = folder
+        self.file = file
        self.options = options # haven't been implemented yet
        self._markdown = ''
-    def convert(self, html, options=None):
+    def convert(self, html="", options=None):
+        if html == "":
+            html = self.html
        #main function here
        elements = []
        for tag, pattern in BlOCK_ELEMENTS.items():
@@ -186,11 +208,12 @@ class Tomd:
            for m in re.finditer(pattern, html, re.I | re.S | re.M):
                # now m contains the pattern without the tag
                # if tag == "e_p":
-                print "found", tag, m.groups(), "start", m.start(), "end", m.end()
+                print "found", tag, m.groups(), "start", m.start(), "end", m.end(), self.folder
                element = Element(start_pos=m.start(),
                                  end_pos=m.end(),
                                  content=''.join(m.groups()),
                                  tag=tag,
+                                  folder=self.folder,
                                  is_block=True)
                can_append = True
                for e in elements: