Commit 3540279d authored by Andrew Xia's avatar Andrew Xia

tomd support for evernote checkbox

parent d1f207f5
# h1
## h2
### h3
#### h4
##### h5
###### h6
paragraph
[link](https://github.com)
![img](https://github.com)
- 1
- 2
- 3
1. 1
1. 2
1. 3
> blockquote
`inline code`
```
block code
```
~~del~~
**bold**
*italic*
***bold italic***
**em**
**strong**
---
|th1|th2
|------
|td|td
|td|td
# h1
## h2
### h3
#### h4
##### h5
###### h6
paragraph
[link](https://github.com)
![img](https://github.com)
- 1
- 2
- 3
1. 1
1. 2
1. 3
> blockquote
`inline code`
```
block code
```
~~del~~
**bold**
*italic*
***bold italic***
**em**
**strong**
---
|th1|th2
|------
|td|td
|td|td
import re import re, os
__all__ = ['Tomd', 'convert'] __all__ = ['Tomd', 'convert']
...@@ -56,7 +56,7 @@ BlOCK_ELEMENTS = { ...@@ -56,7 +56,7 @@ BlOCK_ELEMENTS = {
'table': '<table.*?>(.*?)</table>', #assume that table must be around tr 'table': '<table.*?>(.*?)</table>', #assume that table must be around tr
# evernote # evernote
'e_p': '<div style="font-size.*?>(.*?)</div>' #div for paragraph ? 'e_p': '<div.*?>(.*?)</div>' #div for paragraph ?
} }
INLINE_ELEMENTS = { INLINE_ELEMENTS = {
...@@ -80,23 +80,24 @@ INLINE_ELEMENTS = { ...@@ -80,23 +80,24 @@ INLINE_ELEMENTS = {
'tbody': '<tbody.*?>((.|\n)*)</tbody>', 'tbody': '<tbody.*?>((.|\n)*)</tbody>',
} }
DELETE_ELEMENTS = ['<span.*?>', '</span>', '<div.*?>', '</div>'] DELETE_ELEMENTS = ['<span.*?>', '</span>', '<div.*?>', '</div>','<br clear="none"/>']
class Element: class Element:
def __init__(self, start_pos, end_pos, content, tag, is_block=False): def __init__(self, start_pos, end_pos, content, tag, folder, is_block=False):
self.start_pos = start_pos self.start_pos = start_pos
self.end_pos = end_pos self.end_pos = end_pos
self.content = content self.content = content
self._elements = [] self._elements = []
self.is_block = is_block self.is_block = is_block
self.tag = tag self.tag = tag
self.folder = folder
self._result = None self._result = None
if self.is_block: if self.is_block:
print "parsing tag:", self.tag, ", content: ", repr(self.content) # print "parsing tag:", self.tag, ", content: ", repr(self.content)
self.parse_inline() self.parse_inline()
print "parsed:", self.tag, ", content: ", self.content if self.tag != 'table':
print "parsed:", self.tag, self.folder, ", content: ", repr(self.content)
def __str__(self): def __str__(self):
wrapper = MARKDOWN.get(self.tag) wrapper = MARKDOWN.get(self.tag)
...@@ -105,6 +106,19 @@ class Element: ...@@ -105,6 +106,19 @@ class Element:
def parse_inline(self): def parse_inline(self):
self.content = self.content.replace('\r', '') #windows \r character self.content = self.content.replace('\r', '') #windows \r character
self.content = self.content.replace('&quot;', '\"') #html quote mark
for m in re.finditer("<img(.*?)en_todo.*?>",self.content):
#remove img and change to [ ] and [x]
#evernote specific parsing
imgSrc = re.search('src=".*?"',m.group())
imgLoc = imgSrc.group()[5:-1] #remove source and " "
imgLoc = imgLoc.replace('\\', '/') #\\ folder slash rotate
if os.stat(self.folder + "/" + imgLoc).st_size < 250:
self.content = self.content.replace(m.group(),"[ ] ")
else:
self.content = self.content.replace(m.group(),"[x] ")
# print self.content
if "e_" in self.tag: #evernote-specific parsing if "e_" in self.tag: #evernote-specific parsing
# if self.content != re.sub(BlOCK_ELEMENTS['table'], '\g<1>', self.content): # if self.content != re.sub(BlOCK_ELEMENTS['table'], '\g<1>', self.content):
...@@ -114,7 +128,7 @@ class Element: ...@@ -114,7 +128,7 @@ class Element:
inner = Element(start_pos=m.start(), inner = Element(start_pos=m.start(),
end_pos=m.end(), end_pos=m.end(),
content=''.join(m.groups()), content=''.join(m.groups()),
tag='table', tag='table',folder=self.folder,
is_block=True) is_block=True)
self.content = inner.content self.content = inner.content
return #no need for further parsing ? return #no need for further parsing ?
...@@ -154,6 +168,10 @@ class Element: ...@@ -154,6 +168,10 @@ class Element:
else: else:
wrapper = MARKDOWN.get(tag) wrapper = MARKDOWN.get(tag)
self.content = re.sub(pattern, '{}\g<1>{}'.format(wrapper[0], wrapper[1]), self.content) self.content = re.sub(pattern, '{}\g<1>{}'.format(wrapper[0], wrapper[1]), self.content)
# if self.tag == "e_p" and self.content[-2:] != '\n': #div, add new line if not there
# self.content += '\n'
def construct_table(self): def construct_table(self):
# this function, after self.content has gained | for table entries, # this function, after self.content has gained | for table entries,
# adds the |---| in markdown to create a proper table # adds the |---| in markdown to create a proper table
...@@ -173,12 +191,16 @@ class Element: ...@@ -173,12 +191,16 @@ class Element:
class Tomd: class Tomd:
def __init__(self, html='', options=None): def __init__(self, html='', folder='',file='',options=None):
self.html = html #actual data self.html = html #actual data
self.folder = folder
self.file = file
self.options = options # haven't been implemented yet self.options = options # haven't been implemented yet
self._markdown = '' self._markdown = ''
def convert(self, html, options=None): def convert(self, html="", options=None):
if html == "":
html = self.html
#main function here #main function here
elements = [] elements = []
for tag, pattern in BlOCK_ELEMENTS.items(): for tag, pattern in BlOCK_ELEMENTS.items():
...@@ -186,11 +208,12 @@ class Tomd: ...@@ -186,11 +208,12 @@ class Tomd:
for m in re.finditer(pattern, html, re.I | re.S | re.M): for m in re.finditer(pattern, html, re.I | re.S | re.M):
# now m contains the pattern without the tag # now m contains the pattern without the tag
# if tag == "e_p": # if tag == "e_p":
print "found", tag, m.groups(), "start", m.start(), "end", m.end() print "found", tag, m.groups(), "start", m.start(), "end", m.end(), self.folder
element = Element(start_pos=m.start(), element = Element(start_pos=m.start(),
end_pos=m.end(), end_pos=m.end(),
content=''.join(m.groups()), content=''.join(m.groups()),
tag=tag, tag=tag,
folder=self.folder,
is_block=True) is_block=True)
can_append = True can_append = True
for e in elements: for e in elements:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment