Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
T
tomd
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Registry
Registry
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Jobs
Commits
Open sidebar
Чумбаев Максим
tomd
Commits
dd7f6f4a
Commit
dd7f6f4a
authored
Nov 07, 2017
by
zsinx6
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fixed img single tag without closing
parent
963d5cd0
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
54 additions
and
61 deletions
+54
-61
setup.py
setup.py
+1
-1
test_tomd.py
test_tomd.py
+1
-0
tomd.py
tomd.py
+52
-60
No files found.
setup.py
View file @
dd7f6f4a
...
...
@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
setup
(
name
=
"tomd"
,
version
=
"0.1.
3
"
,
version
=
"0.1.
4
"
,
description
=
"Convert HTML to Markdown."
,
author
=
"Gaojiuli"
,
author_email
=
"gaojiuli@gmail.com"
,
...
...
test_tomd.py
View file @
dd7f6f4a
...
...
@@ -12,6 +12,7 @@ string = """
<a href="https://github.com">link</a>
<img src="https://github.com" class="dsad">img</img>
<img src="https://github.com" class="dsad"/>
<img src="https://github.com" class="dsad">
</p>
<ul>
<li>1</li>
...
...
tomd.py
View file @
dd7f6f4a
import
re
,
os
import
re
import
os
import
warnings
__all__
=
[
'Tomd'
,
'convert'
]
...
...
@@ -55,7 +57,7 @@ BlOCK_ELEMENTS = {
INLINE_ELEMENTS
=
{
'td'
:
'<td.*?>((.|
\n
)*?)</td>'
,
#
td element may span lines
'td'
:
'<td.*?>((.|
\n
)*?)</td>'
,
#
td element may span lines
'tr'
:
'<tr.*?>((.|
\n
)*?)</tr>'
,
'th'
:
'<th.*?>(.*?)</th>'
,
'b'
:
'<b.*?>(.*?)</b>'
,
...
...
@@ -77,7 +79,8 @@ INLINE_ELEMENTS = {
'tbody'
:
'<tbody.*?>((.|
\n
)*)</tbody>'
,
}
DELETE_ELEMENTS
=
[
'<span.*?>'
,
'</span>'
,
'<div.*?>'
,
'</div>'
,
'<br clear="none"/>'
,
'<center.*?>'
,
'</center>'
]
DELETE_ELEMENTS
=
[
'<span.*?>'
,
'</span>'
,
'<div.*?>'
,
'</div>'
,
'<br clear="none"/>'
,
'<center.*?>'
,
'</center>'
]
class
Element
:
def
__init__
(
self
,
start_pos
,
end_pos
,
content
,
tag
,
folder
,
is_block
=
False
):
...
...
@@ -91,10 +94,7 @@ class Element:
self
.
_result
=
None
if
self
.
is_block
:
# print "parsing tag:", self.tag, ", content: ", repr(self.content)
self
.
parse_inline
()
# if self.tag != 'table':
# print "parsed:", self.tag, self.folder, ", content: ", repr(self.content)
def
__str__
(
self
):
wrapper
=
MARKDOWN
.
get
(
self
.
tag
)
...
...
@@ -102,44 +102,43 @@ class Element:
return
self
.
_result
def
parse_inline
(
self
):
self
.
content
=
self
.
content
.
replace
(
'
\r
'
,
''
)
#
windows \r character
self
.
content
=
self
.
content
.
replace
(
'
\xc2\xa0
'
,
' '
)
#
no break space
self
.
content
=
self
.
content
.
replace
(
'"'
,
'
\"
'
)
#
html quote mark
for
m
in
re
.
finditer
(
"<img(.*?)en_todo.*?>"
,
self
.
content
):
#remove img and change to [ ] and [x]
#evernote specific parsing
imgSrc
=
re
.
search
(
'src=".*?"'
,
m
.
group
())
imgLoc
=
imgSrc
.
group
()[
5
:
-
1
]
#
remove source and " "
imgLoc
=
imgLoc
.
replace
(
'
\\
'
,
'/'
)
#
\\ folder slash rotate
self
.
content
=
self
.
content
.
replace
(
'
\r
'
,
''
)
#
windows \r character
self
.
content
=
self
.
content
.
replace
(
'
\xc2\xa0
'
,
' '
)
#
no break space
self
.
content
=
self
.
content
.
replace
(
'"'
,
'
\"
'
)
#
html quote mark
for
m
in
re
.
finditer
(
"<img(.*?)en_todo.*?>"
,
self
.
content
):
#
remove img and change to [ ] and [x]
#
evernote specific parsing
imgSrc
=
re
.
search
(
'src=".*?"'
,
m
.
group
())
imgLoc
=
imgSrc
.
group
()[
5
:
-
1
]
#
remove source and " "
imgLoc
=
imgLoc
.
replace
(
'
\\
'
,
'/'
)
#
\\ folder slash rotate
if
os
.
stat
(
self
.
folder
+
"/"
+
imgLoc
)
.
st_size
<
250
:
self
.
content
=
self
.
content
.
replace
(
m
.
group
(),
"[ ] "
)
self
.
content
=
self
.
content
.
replace
(
m
.
group
(),
"[ ] "
)
else
:
self
.
content
=
self
.
content
.
replace
(
m
.
group
(),
"[x] "
)
# print self.content
self
.
content
=
self
.
content
.
replace
(
m
.
group
(),
"[x] "
)
if
"e_"
in
self
.
tag
:
#evernote-specific parsing
# if self.content != re.sub(BlOCK_ELEMENTS['table'], '\g<1>', self.content):
if
"e_"
in
self
.
tag
:
# evernote-specific parsing
for
m
in
re
.
finditer
(
BlOCK_ELEMENTS
[
'table'
],
self
.
content
,
re
.
I
|
re
.
S
|
re
.
M
):
#hmm can there only be one table?
# print "AHHHH THERES A TABLE\n\n"
# hmm can there only be one table?
inner
=
Element
(
start_pos
=
m
.
start
(),
end_pos
=
m
.
end
(),
content
=
''
.
join
(
m
.
groups
()),
tag
=
'table'
,
folder
=
self
.
folder
,
is_block
=
True
)
end_pos
=
m
.
end
(),
content
=
''
.
join
(
m
.
groups
()),
tag
=
'table'
,
folder
=
self
.
folder
,
is_block
=
True
)
self
.
content
=
inner
.
content
return
#
no need for further parsing ?
return
#
no need for further parsing ?
# if no table, parse as usual
self
.
content
=
self
.
content
.
replace
(
'<hr/>'
,
'
\n
---
\n
'
)
self
.
content
=
self
.
content
.
replace
(
'<br/>'
,
''
)
if
self
.
tag
==
"table"
:
#
for removing tbody
if
self
.
tag
==
"table"
:
#
for removing tbody
self
.
content
=
re
.
sub
(
INLINE_ELEMENTS
[
'tbody'
],
'
\
g<1>'
,
self
.
content
)
for
tag
,
pattern
in
INLINE_ELEMENTS
.
items
():
# print "---now looking at", tag, pattern
INLINE_ELEMENTS_LIST_KEYS
=
list
(
INLINE_ELEMENTS
.
keys
())
INLINE_ELEMENTS_LIST_KEYS
.
sort
()
for
tag
in
INLINE_ELEMENTS_LIST_KEYS
:
pattern
=
INLINE_ELEMENTS
[
tag
]
if
tag
==
'a'
:
self
.
content
=
re
.
sub
(
pattern
,
'[
\
g<2>](
\
g<1>)'
,
self
.
content
)
...
...
@@ -147,6 +146,8 @@ class Element:
self
.
content
=
re
.
sub
(
pattern
,
'![
\
g<2>](
\
g<1>)'
,
self
.
content
)
elif
tag
==
'img_single'
:
self
.
content
=
re
.
sub
(
pattern
,
'![](
\
g<1>)'
,
self
.
content
)
elif
tag
==
'img_single_no_close'
:
self
.
content
=
re
.
sub
(
pattern
,
'![](
\
g<1>)'
,
self
.
content
)
elif
self
.
tag
==
'ul'
and
tag
==
'li'
:
self
.
content
=
re
.
sub
(
pattern
,
'-
\
g<1>'
,
self
.
content
)
elif
self
.
tag
==
'ol'
and
tag
==
'li'
:
...
...
@@ -157,19 +158,17 @@ class Element:
self
.
content
=
re
.
sub
(
pattern
,
'|
\
g<1>'
,
self
.
content
.
replace
(
'
\n
'
,
''
))
elif
self
.
tag
==
'tr'
and
tag
==
'td'
:
self
.
content
=
re
.
sub
(
pattern
,
'|
\
g<1>|'
,
self
.
content
.
replace
(
'
\n
'
,
''
))
self
.
content
=
self
.
content
.
replace
(
"||"
,
"|"
)
#end of column also needs a pipe
# print "---converting, td remove duplicate:", tag, self.content
self
.
content
=
self
.
content
.
replace
(
"||"
,
"|"
)
# end of column also needs a pipe
elif
self
.
tag
==
'table'
and
tag
==
'td'
:
self
.
content
=
re
.
sub
(
pattern
,
'|
\
g<1>|'
,
self
.
content
)
self
.
content
=
self
.
content
.
replace
(
"||"
,
"|"
)
#end of column also needs a pipe
self
.
content
=
self
.
content
.
replace
(
'|
\n\n
'
,
'|
\n
'
)
#replace double new line
# print "---converting, td remove duplicate:", tag, self.content
self
.
content
=
self
.
content
.
replace
(
"||"
,
"|"
)
# end of column also needs a pipe
self
.
content
=
self
.
content
.
replace
(
'|
\n\n
'
,
'|
\n
'
)
# replace double new line
self
.
construct_table
()
else
:
wrapper
=
MARKDOWN
.
get
(
tag
)
self
.
content
=
re
.
sub
(
pattern
,
'{}
\
g<1>{}'
.
format
(
wrapper
[
0
],
wrapper
[
1
]),
self
.
content
)
if
self
.
tag
==
"e_p"
and
self
.
content
[
-
1
:]
!=
'
\n
'
and
len
(
self
.
content
)
>
2
:
if
self
.
tag
==
"e_p"
and
self
.
content
[
-
1
:]
!=
'
\n
'
and
len
(
self
.
content
)
>
2
:
# focusing on div, add new line if not there (and if content is long enough)
self
.
content
+=
'
\n
'
...
...
@@ -177,39 +176,36 @@ class Element:
# this function, after self.content has gained | for table entries,
# adds the |---| in markdown to create a proper table
temp
=
self
.
content
.
split
(
'
\n
'
,
3
)
temp
=
self
.
content
.
split
(
'
\n
'
,
3
)
for
elt
in
temp
:
if
elt
!=
""
:
count
=
elt
.
count
(
"|"
)
#
count number of pipes
count
=
elt
.
count
(
"|"
)
#
count number of pipes
break
pipe
=
"
\n
|"
#
beginning \n for safety
for
i
in
xrange
(
count
-
1
):
pipe
=
"
\n
|"
#
beginning \n for safety
for
i
in
range
(
count
-
1
):
pipe
+=
"---|"
pipe
+=
"
\n
"
self
.
content
=
pipe
+
pipe
+
self
.
content
+
"
\n
"
#
TODO: column titles?
self
.
content
=
self
.
content
.
replace
(
'|
\n\n
'
,
'|
\n
'
)
#
replace double new line
self
.
content
=
self
.
content
.
replace
(
"<br/>
\n
"
,
"<br/>"
)
#
end of column also needs a pipe
self
.
content
=
pipe
+
pipe
+
self
.
content
+
"
\n
"
#
TODO: column titles?
self
.
content
=
self
.
content
.
replace
(
'|
\n\n
'
,
'|
\n
'
)
#
replace double new line
self
.
content
=
self
.
content
.
replace
(
"<br/>
\n
"
,
"<br/>"
)
#
end of column also needs a pipe
class
Tomd
:
def
__init__
(
self
,
html
=
''
,
folder
=
''
,
file
=
''
,
options
=
None
):
self
.
html
=
html
#
actual data
def
__init__
(
self
,
html
=
''
,
folder
=
''
,
file
=
''
,
options
=
None
):
self
.
html
=
html
#
actual data
self
.
folder
=
folder
self
.
file
=
file
self
.
options
=
options
# haven't been implemented yet
self
.
_markdown
=
self
.
convert
(
self
.
html
,
self
.
options
)
self
.
options
=
options
# haven't been implemented yet
self
.
_markdown
=
self
.
convert
(
self
.
html
,
self
.
options
)
def
convert
(
self
,
html
=
""
,
options
=
None
):
if
html
==
""
:
html
=
self
.
html
#main function here
#
main function here
elements
=
[]
for
tag
,
pattern
in
BlOCK_ELEMENTS
.
items
():
# print "pattern is", pattern, "tag", tag
for
m
in
re
.
finditer
(
pattern
,
html
,
re
.
I
|
re
.
S
|
re
.
M
):
# now m contains the pattern without the tag
# if tag == "e_p":
# print "found", tag, m.groups(), "start", m.start(), "end", m.end(), self.folder
element
=
Element
(
start_pos
=
m
.
start
(),
end_pos
=
m
.
end
(),
content
=
''
.
join
(
m
.
groups
()),
...
...
@@ -224,10 +220,6 @@ class Tomd:
elements
.
remove
(
e
)
if
can_append
:
elements
.
append
(
element
)
# print "\n\n\ndone with convert, element is"
# for e in elements:
# print repr(str(e))
# print "---"
elements
.
sort
(
key
=
lambda
element
:
element
.
start_pos
)
self
.
_markdown
=
''
.
join
([
str
(
e
)
for
e
in
elements
])
...
...
@@ -240,19 +232,19 @@ class Tomd:
self
.
convert
(
self
.
html
,
self
.
options
)
return
self
.
_markdown
def
export
(
self
,
folder
=
False
):
def
export
(
self
,
folder
=
False
):
if
len
(
self
.
file
)
<
1
:
warnings
.
warn
(
"file not specified, renamed to tmp.md"
)
file
=
"tmp.md"
else
:
file
=
self
.
file
.
replace
(
'.html'
,
'.md'
)
#
rename to md
file
=
self
.
file
.
replace
(
'.html'
,
'.md'
)
#
rename to md
if
len
(
self
.
folder
)
<
2
:
warnings
.
warn
(
"folder not specified, will save to pwd"
)
elif
not
folder
:
file
=
self
.
folder
+
'/'
+
file
else
:
#
if folder is specified
else
:
#
if folder is specified
file
=
folder
+
'/'
+
file
f
=
open
(
file
,
'w'
)
f
=
open
(
file
,
'w'
)
f
.
write
(
self
.
_markdown
)
f
.
close
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment