Commit f57d0482 authored by p

Elements with parameters

parents 49d84459 b1ad8182
# h1
## h2
### h3
#### h4
##### h5
###### h6
paragraph
[link](https://github.com)
![img](https://github.com)
- 1
- 2
- 3
1. 1
1. 2
1. 3
> blockquote
`inline code`
```
block code
```
~~del~~
**bold**
*italic*
***bold italic***
**em**
**strong**
---
|th1|th2
|------
|td|td
|td|td
# h1
## h2
### h3
#### h4
##### h5
###### h6
paragraph
[link](https://github.com)
![img](https://github.com)
- 1
- 2
- 3
1. 1
1. 2
1. 3
> blockquote
`inline code`
```
block code
```
~~del~~
**bold**
*italic*
***bold italic***
**em**
**strong**
---
|th1|th2
|------
|td|td
|td|td
# June 27 2017
# Andrew Xia
# main program (for testing stuff)
import tomd
# Input selection: FOLDER is the directory that holds the exported note and
# FILE is the HTML file name inside it. Earlier candidates stay commented out
# so they can be switched back in while testing; they were overwritten before
# any use, so only the last pair ever took effect.
# FOLDER = "/home/andrew/Documents/Evernote_170625/"
# FOLDER = "/home/andrew/Documents/Github/evernote-analysis"
# FILE = "Week11.html"
# FILE = "pensive.html"
FOLDER = "/home/andrew/Documents/Evernote_170625/Log/2016"
FILE = "Week 10 37 to 313.html"

# Read the whole file in one call. The context manager closes the handle even
# if reading fails, and a single read avoids growing CONTENT line by line.
with open(FOLDER + "/" + FILE) as f:
    CONTENT = f.read()
# CONTENT = """
# <p>For <em>Implementing</em>, working with <a href="http://chiraag.scripts.mit.edu/wiki/start" target="_blank"><strong>Chiraag Juvekar</strong></a> and <a href="http://www-mtl.mit.edu/~anantha/" target="_blank"><strong>Prof. Anantha Chandrakasan</strong></a>.</p>
# <h3 id="abstract">Abstract</h3>
# <p>Having an
# is </p>
# <hr/>
# <p>My paper can be found <a href="/files/superUROP.pdf"><strong>here</strong></a></p>
# """
# CONTENT = """
# <table bgcolor="#D4DDE5" border="0">
# <tr><td><b>Created:</b></td><td><i>10/31/2011 8:59 PM</i></td></tr>
# <tr><td><b>Updated:</b></td><td><i>5/12/2012 5:42 PM</i></td></tr>
# <tr><td><b>Tags:</b></td><td><i>birthday</i></td></tr>
# </table>
# """
# CONTENT = """<table border="1" cellpadding="2" cellspacing="0" style="font-size: 13px;" width="100%"><tbody><tr><td valign="top">Day (Sleep)</td><td valign="top">Internet</td><td valign="top">SAT Plan/Actual</td><td valign="top">PRIMES</td><td valign="top">Homework</td><td valign="top">Athletics</td></tr><tr><td valign="top">Monday<br/>
# 7hr</td><td valign="top">2.30hr<br/>
# 4LOL</td><td valign="top"><br/></td><td valign="top">0.05min<br/>
# Email, </td><td valign="top">Dartmouth</td><td valign="top"><br/></td></tr><tr><td valign="top">Tuesday<br/>
# 10hr</td><td valign="top">1.50hr<br/>
# LOL</td><td valign="top">:SAT K8</td><td valign="top">1.00hr<br/>
# 7Zip Data</td><td valign="top"><br/></td><td valign="top">Swim</td></tr><tr><td valign="top">Wednesday<br/>
# 11hr</td><td valign="top">1.54hr<br/>
# LOL</td><td valign="top">SAT 36</td><td valign="top">1.10hr<br/>
# HIV Data, trying excel convert</td><td valign="top">College</td><td valign="top">Swim</td></tr><tr><td valign="top">Thursday<br/>
# 10hr</td><td valign="top">1.37min<br/>
# 4LOL</td><td valign="top">SAT 36, 35</td><td valign="top"><br/></td><td valign="top"><br/></td><td valign="top">Swim</td></tr><tr><td valign="top">Friday<br/>
# 10hr</td><td valign="top">0.30min</td><td valign="top"><br/></td><td valign="top"><br/></td><td valign="top">Driving Lesson</td><td valign="top">Swim</td></tr><tr><td valign="top">Saturday<br/>
# 8hr</td><td valign="top">2.10hr<br/>
# 3LOL</td><td valign="top"><br/></td><td valign="top"><br/></td><td valign="top"><br/></td><td valign="top">Swim Meet</td></tr><tr><td valign="top">Sunday<br/>
# 9hr</td><td valign="top">1.08hr<br/>
# 2LOL</td><td valign="top"><br/></td><td valign="top"><br/></td><td valign="top"><br/></td><td valign="top">Swim Meet</td></tr></tbody></table>
# """
# Build the converter from the loaded HTML; the source folder and file name
# are passed along with it.
converter = tomd.Tomd(html=CONTENT, folder=FOLDER, file=FILE)
# print(converter.markdown())
import re import re, os
__all__ = ['Tomd', 'convert'] __all__ = ['Tomd', 'convert']
...@@ -30,7 +30,9 @@ MARKDOWN = { ...@@ -30,7 +30,9 @@ MARKDOWN = {
'tbody': ('\n', '\n'), 'tbody': ('\n', '\n'),
'td': ('|', ''), 'td': ('|', ''),
'th': ('|', ''), 'th': ('|', ''),
'tr': ('', '\n') 'tr': ('', '\n'),
'table': ('', '\n'),
'e_p': ('', '\n')
} }
BlOCK_ELEMENTS = { BlOCK_ELEMENTS = {
...@@ -51,9 +53,10 @@ BlOCK_ELEMENTS = { ...@@ -51,9 +53,10 @@ BlOCK_ELEMENTS = {
'tr': '<tr.*?>(.*?)</tr>' 'tr': '<tr.*?>(.*?)</tr>'
} }
INLINE_ELEMENTS = { INLINE_ELEMENTS = {
'td': '<td.*?>(.*?)</td>', 'td': '<td.*?>((.|\n)*?)</td>', #td element may span lines
'tr': '<tr.*?>(.*?)</tr>', 'tr': '<tr.*?>((.|\n)*?)</tr>',
'th': '<th.*?>(.*?)</th>', 'th': '<th.*?>(.*?)</th>',
'b': '<b.*?>(.*?)</b>', 'b': '<b.*?>(.*?)</b>',
'i': '<i.*?>(.*?)</i>', 'i': '<i.*?>(.*?)</i>',
...@@ -68,24 +71,28 @@ INLINE_ELEMENTS = { ...@@ -68,24 +71,28 @@ INLINE_ELEMENTS = {
'img': '<img.*?src="(.*?)".*?>(.*?)</img>', 'img': '<img.*?src="(.*?)".*?>(.*?)</img>',
'a': '<a.*?href="(.*?)".*?>(.*?)</a>', 'a': '<a.*?href="(.*?)".*?>(.*?)</a>',
'em': '<em.*?>(.*?)</em>', 'em': '<em.*?>(.*?)</em>',
'strong': '<strong.*?>(.*?)</strong>' 'strong': '<strong.*?>(.*?)</strong>',
'tbody': '<tbody.*?>((.|\n)*)</tbody>',
} }
DELETE_ELEMENTS = ['<span.*?>', '</span>', '<div.*?>', '</div>'] DELETE_ELEMENTS = ['<span.*?>', '</span>', '<div.*?>', '</div>','<br clear="none"/>']
class Element: class Element:
def __init__(self, start_pos, end_pos, content, tag, is_block=False): def __init__(self, start_pos, end_pos, content, tag, folder, is_block=False):
self.start_pos = start_pos self.start_pos = start_pos
self.end_pos = end_pos self.end_pos = end_pos
self.content = content self.content = content
self._elements = [] self._elements = []
self.is_block = is_block self.is_block = is_block
self.tag = tag self.tag = tag
self.folder = folder
self._result = None self._result = None
if self.is_block: if self.is_block:
# print "parsing tag:", self.tag, ", content: ", repr(self.content)
self.parse_inline() self.parse_inline()
# if self.tag != 'table':
# print "parsed:", self.tag, self.folder, ", content: ", repr(self.content)
def __str__(self): def __str__(self):
wrapper = MARKDOWN.get(self.tag) wrapper = MARKDOWN.get(self.tag)
...@@ -93,7 +100,44 @@ class Element: ...@@ -93,7 +100,44 @@ class Element:
return self._result return self._result
def parse_inline(self): def parse_inline(self):
self.content = self.content.replace('\r', '') #windows \r character
self.content = self.content.replace('\xc2\xa0', ' ') #no break space
self.content = self.content.replace('&quot;', '\"') #html quote mark
for m in re.finditer("<img(.*?)en_todo.*?>",self.content):
#remove img and change to [ ] and [x]
#evernote specific parsing
imgSrc = re.search('src=".*?"',m.group())
imgLoc = imgSrc.group()[5:-1] #remove source and " "
imgLoc = imgLoc.replace('\\', '/') #\\ folder slash rotate
if os.stat(self.folder + "/" + imgLoc).st_size < 250:
self.content = self.content.replace(m.group(),"[ ] ")
else:
self.content = self.content.replace(m.group(),"[x] ")
# print self.content
if "e_" in self.tag: #evernote-specific parsing
# if self.content != re.sub(BlOCK_ELEMENTS['table'], '\g<1>', self.content):
for m in re.finditer(BlOCK_ELEMENTS['table'], self.content, re.I | re.S | re.M):
#hmm can there only be one table?
# print "AHHHH THERES A TABLE\n\n"
inner = Element(start_pos=m.start(),
end_pos=m.end(),
content=''.join(m.groups()),
tag='table',folder=self.folder,
is_block=True)
self.content = inner.content
return #no need for further parsing ?
# if no table, parse as usual
self.content = self.content.replace('<hr/>', '\n---\n')
self.content = self.content.replace('<br/>', '')
if self.tag == "table": #for removing tbody
self.content = re.sub(INLINE_ELEMENTS['tbody'], '\g<1>', self.content)
for tag, pattern in INLINE_ELEMENTS.items(): for tag, pattern in INLINE_ELEMENTS.items():
# print "---now looking at", tag, pattern
if tag == 'a': if tag == 'a':
self.content = re.sub(pattern, '[\g<2>](\g<1>)', self.content) self.content = re.sub(pattern, '[\g<2>](\g<1>)', self.content)
...@@ -108,26 +152,65 @@ class Element: ...@@ -108,26 +152,65 @@ class Element:
elif self.tag == 'tr' and tag == 'th': elif self.tag == 'tr' and tag == 'th':
self.content = re.sub(pattern, '|\g<1>', self.content.replace('\n', '')) self.content = re.sub(pattern, '|\g<1>', self.content.replace('\n', ''))
elif self.tag == 'tr' and tag == 'td': elif self.tag == 'tr' and tag == 'td':
self.content = re.sub(pattern, '|\g<1>', self.content.replace('\n', '')) self.content = re.sub(pattern, '|\g<1>|', self.content.replace('\n', ''))
self.content = self.content.replace("||","|") #end of column also needs a pipe
# print "---converting, td remove duplicate:", tag, self.content
elif self.tag == 'table' and tag == 'td':
self.content = re.sub(pattern, '|\g<1>|', self.content)
self.content = self.content.replace("||","|") #end of column also needs a pipe
self.content = self.content.replace('|\n\n', '|\n') #replace double new line
# print "---converting, td remove duplicate:", tag, self.content
self.construct_table()
else: else:
wrapper = MARKDOWN.get(tag) wrapper = MARKDOWN.get(tag)
self.content = re.sub(pattern, '{}\g<1>{}'.format(wrapper[0], wrapper[1]), self.content) self.content = re.sub(pattern, '{}\g<1>{}'.format(wrapper[0], wrapper[1]), self.content)
if self.tag == "e_p" and self.content[-1:] != '\n' and len(self.content) > 2:
# focusing on div, add new line if not there (and if content is long enough)
self.content += '\n'
def construct_table(self):
    """Prepend Markdown separator rows to the pipe-delimited table content.

    Meant to run after the table cells in ``self.content`` have been wrapped
    in ``|``. The number of pipes on the first non-empty line determines the
    column count; two ``|---|`` rows are then placed in front of the content
    (there is no real header row yet) and doubled newlines after a row are
    collapsed. ``self.content`` is updated in place and nothing is returned.
    Content without any non-empty line is left untouched.
    """
    # Use the first non-empty line as the reference row. Every line is
    # scanned (not just a fixed-size prefix) so the column count stays
    # correct when the content starts with several blank lines.
    first_row = next(
        (row for row in self.content.split('\n') if row != ""), None)
    if first_row is None:
        return  # nothing to build a table from

    # A row such as "|a|b|" carries one more pipe than it has columns.
    # String repetition yields "" for a non-positive count and works on both
    # Python 2 and Python 3.
    columns = first_row.count("|") - 1
    separator = "\n|" + "---|" * columns + "\n"  # leading \n for safety

    # TODO: column titles? For now both the header and the divider are dashes.
    self.content = separator + separator + self.content + "\n"
    # Collapse the doubled newline that can follow a row.
    self.content = self.content.replace('|\n\n', '|\n')
    # Drop the newline after a <br/> so the text stays on one line.
    self.content = self.content.replace("<br/>\n", "<br/>")
class Tomd: class Tomd:
def __init__(self, html='', options=None): def __init__(self, html='', folder='',file='',options=None):
self.html = html self.html = html #actual data
self.options = options self.folder = folder
self._markdown = '' self.file = file
self.options = options # haven't been implemented yet
self._markdown = self.convert(self.html,self.options)
def convert(self, html, options=None): def convert(self, html="", options=None):
if html == "":
html = self.html
#main function here
elements = [] elements = []
for tag, pattern in BlOCK_ELEMENTS.items(): for tag, pattern in BlOCK_ELEMENTS.items():
# print "pattern is", pattern, "tag", tag
for m in re.finditer(pattern, html, re.I | re.S | re.M): for m in re.finditer(pattern, html, re.I | re.S | re.M):
# now m contains the pattern without the tag
# if tag == "e_p":
# print "found", tag, m.groups(), "start", m.start(), "end", m.end(), self.folder
element = Element(start_pos=m.start(), element = Element(start_pos=m.start(),
end_pos=m.end(), end_pos=m.end(),
content=''.join(m.groups()), content=''.join(m.groups()),
tag=tag, tag=tag,
folder=self.folder,
is_block=True) is_block=True)
can_append = True can_append = True
for e in elements: for e in elements:
...@@ -137,7 +220,10 @@ class Tomd: ...@@ -137,7 +220,10 @@ class Tomd:
elements.remove(e) elements.remove(e)
if can_append: if can_append:
elements.append(element) elements.append(element)
# print "\n\n\ndone with convert, element is"
# for e in elements:
# print repr(str(e))
# print "---"
elements.sort(key=lambda element: element.start_pos) elements.sort(key=lambda element: element.start_pos)
self._markdown = ''.join([str(e) for e in elements]) self._markdown = ''.join([str(e) for e in elements])
...@@ -150,6 +236,22 @@ class Tomd: ...@@ -150,6 +236,22 @@ class Tomd:
self.convert(self.html, self.options) self.convert(self.html, self.options)
return self._markdown return self._markdown
def export(self, folder=False):
    """Write the converted Markdown to a ``.md`` file.

    The output name is ``self.file`` with ``.html`` replaced by ``.md``;
    ``tmp.md`` is used (with a warning) when no file name is set.

    Args:
        folder: Directory to write into. When falsy, ``self.folder`` is used;
            if that is not set either, a warning is issued and the file is
            written relative to the current working directory.
    """
    # Imported locally so the method does not rely on a module-level import.
    import warnings

    if len(self.file) < 1:
        warnings.warn("file not specified, renamed to tmp.md")
        target = "tmp.md"
    else:
        target = self.file.replace('.html', '.md')  # rename to md

    if folder:
        # An explicitly requested folder wins, even when self.folder is unset.
        target = folder + '/' + target
    elif len(self.folder) < 2:
        warnings.warn("folder not specified, will save to pwd")
    else:
        target = self.folder + '/' + target

    # The context manager closes the file even if the write fails.
    with open(target, 'w') as out:
        out.write(self._markdown)
_inst = Tomd() _inst = Tomd()
convert = _inst.convert convert = _inst.convert
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment