Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
T
tomd
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Registry
Registry
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Jobs
Commits
Open sidebar
Чумбаев Максим
tomd
Commits
dd7f6f4a
Commit
dd7f6f4a
authored
Nov 07, 2017
by
zsinx6
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fixed img single tag without closing
parent
963d5cd0
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
54 additions
and
61 deletions
+54
-61
setup.py
setup.py
+1
-1
test_tomd.py
test_tomd.py
+1
-0
tomd.py
tomd.py
+52
-60
No files found.
setup.py
View file @
dd7f6f4a
...
@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
...
@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
setup
(
setup
(
name
=
"tomd"
,
name
=
"tomd"
,
version
=
"0.1.
3
"
,
version
=
"0.1.
4
"
,
description
=
"Convert HTML to Markdown."
,
description
=
"Convert HTML to Markdown."
,
author
=
"Gaojiuli"
,
author
=
"Gaojiuli"
,
author_email
=
"gaojiuli@gmail.com"
,
author_email
=
"gaojiuli@gmail.com"
,
...
...
test_tomd.py
View file @
dd7f6f4a
...
@@ -12,6 +12,7 @@ string = """
...
@@ -12,6 +12,7 @@ string = """
<a href="https://github.com">link</a>
<a href="https://github.com">link</a>
<img src="https://github.com" class="dsad">img</img>
<img src="https://github.com" class="dsad">img</img>
<img src="https://github.com" class="dsad"/>
<img src="https://github.com" class="dsad"/>
<img src="https://github.com" class="dsad">
</p>
</p>
<ul>
<ul>
<li>1</li>
<li>1</li>
...
...
tomd.py
View file @
dd7f6f4a
import
re
,
os
import
re
import
os
import
warnings
__all__
=
[
'Tomd'
,
'convert'
]
__all__
=
[
'Tomd'
,
'convert'
]
...
@@ -55,7 +57,7 @@ BlOCK_ELEMENTS = {
...
@@ -55,7 +57,7 @@ BlOCK_ELEMENTS = {
INLINE_ELEMENTS
=
{
INLINE_ELEMENTS
=
{
'td'
:
'<td.*?>((.|
\n
)*?)</td>'
,
#
td element may span lines
'td'
:
'<td.*?>((.|
\n
)*?)</td>'
,
#
td element may span lines
'tr'
:
'<tr.*?>((.|
\n
)*?)</tr>'
,
'tr'
:
'<tr.*?>((.|
\n
)*?)</tr>'
,
'th'
:
'<th.*?>(.*?)</th>'
,
'th'
:
'<th.*?>(.*?)</th>'
,
'b'
:
'<b.*?>(.*?)</b>'
,
'b'
:
'<b.*?>(.*?)</b>'
,
...
@@ -77,7 +79,8 @@ INLINE_ELEMENTS = {
...
@@ -77,7 +79,8 @@ INLINE_ELEMENTS = {
'tbody'
:
'<tbody.*?>((.|
\n
)*)</tbody>'
,
'tbody'
:
'<tbody.*?>((.|
\n
)*)</tbody>'
,
}
}
DELETE_ELEMENTS
=
[
'<span.*?>'
,
'</span>'
,
'<div.*?>'
,
'</div>'
,
'<br clear="none"/>'
,
'<center.*?>'
,
'</center>'
]
DELETE_ELEMENTS
=
[
'<span.*?>'
,
'</span>'
,
'<div.*?>'
,
'</div>'
,
'<br clear="none"/>'
,
'<center.*?>'
,
'</center>'
]
class
Element
:
class
Element
:
def
__init__
(
self
,
start_pos
,
end_pos
,
content
,
tag
,
folder
,
is_block
=
False
):
def
__init__
(
self
,
start_pos
,
end_pos
,
content
,
tag
,
folder
,
is_block
=
False
):
...
@@ -91,10 +94,7 @@ class Element:
...
@@ -91,10 +94,7 @@ class Element:
self
.
_result
=
None
self
.
_result
=
None
if
self
.
is_block
:
if
self
.
is_block
:
# print "parsing tag:", self.tag, ", content: ", repr(self.content)
self
.
parse_inline
()
self
.
parse_inline
()
# if self.tag != 'table':
# print "parsed:", self.tag, self.folder, ", content: ", repr(self.content)
def
__str__
(
self
):
def
__str__
(
self
):
wrapper
=
MARKDOWN
.
get
(
self
.
tag
)
wrapper
=
MARKDOWN
.
get
(
self
.
tag
)
...
@@ -102,44 +102,43 @@ class Element:
...
@@ -102,44 +102,43 @@ class Element:
return
self
.
_result
return
self
.
_result
def
parse_inline
(
self
):
def
parse_inline
(
self
):
self
.
content
=
self
.
content
.
replace
(
'
\r
'
,
''
)
#
windows \r character
self
.
content
=
self
.
content
.
replace
(
'
\r
'
,
''
)
#
windows \r character
self
.
content
=
self
.
content
.
replace
(
'
\xc2\xa0
'
,
' '
)
#
no break space
self
.
content
=
self
.
content
.
replace
(
'
\xc2\xa0
'
,
' '
)
#
no break space
self
.
content
=
self
.
content
.
replace
(
'"'
,
'
\"
'
)
#
html quote mark
self
.
content
=
self
.
content
.
replace
(
'"'
,
'
\"
'
)
#
html quote mark
for
m
in
re
.
finditer
(
"<img(.*?)en_todo.*?>"
,
self
.
content
):
for
m
in
re
.
finditer
(
"<img(.*?)en_todo.*?>"
,
self
.
content
):
#remove img and change to [ ] and [x]
#
remove img and change to [ ] and [x]
#evernote specific parsing
#
evernote specific parsing
imgSrc
=
re
.
search
(
'src=".*?"'
,
m
.
group
())
imgSrc
=
re
.
search
(
'src=".*?"'
,
m
.
group
())
imgLoc
=
imgSrc
.
group
()[
5
:
-
1
]
#
remove source and " "
imgLoc
=
imgSrc
.
group
()[
5
:
-
1
]
#
remove source and " "
imgLoc
=
imgLoc
.
replace
(
'
\\
'
,
'/'
)
#
\\ folder slash rotate
imgLoc
=
imgLoc
.
replace
(
'
\\
'
,
'/'
)
#
\\ folder slash rotate
if
os
.
stat
(
self
.
folder
+
"/"
+
imgLoc
)
.
st_size
<
250
:
if
os
.
stat
(
self
.
folder
+
"/"
+
imgLoc
)
.
st_size
<
250
:
self
.
content
=
self
.
content
.
replace
(
m
.
group
(),
"[ ] "
)
self
.
content
=
self
.
content
.
replace
(
m
.
group
(),
"[ ] "
)
else
:
else
:
self
.
content
=
self
.
content
.
replace
(
m
.
group
(),
"[x] "
)
self
.
content
=
self
.
content
.
replace
(
m
.
group
(),
"[x] "
)
# print self.content
if
"e_"
in
self
.
tag
:
#evernote-specific parsing
if
"e_"
in
self
.
tag
:
# evernote-specific parsing
# if self.content != re.sub(BlOCK_ELEMENTS['table'], '\g<1>', self.content):
for
m
in
re
.
finditer
(
BlOCK_ELEMENTS
[
'table'
],
self
.
content
,
re
.
I
|
re
.
S
|
re
.
M
):
for
m
in
re
.
finditer
(
BlOCK_ELEMENTS
[
'table'
],
self
.
content
,
re
.
I
|
re
.
S
|
re
.
M
):
#hmm can there only be one table?
# hmm can there only be one table?
# print "AHHHH THERES A TABLE\n\n"
inner
=
Element
(
start_pos
=
m
.
start
(),
inner
=
Element
(
start_pos
=
m
.
start
(),
end_pos
=
m
.
end
(),
end_pos
=
m
.
end
(),
content
=
''
.
join
(
m
.
groups
()),
content
=
''
.
join
(
m
.
groups
()),
tag
=
'table'
,
folder
=
self
.
folder
,
tag
=
'table'
,
folder
=
self
.
folder
,
is_block
=
True
)
is_block
=
True
)
self
.
content
=
inner
.
content
self
.
content
=
inner
.
content
return
#
no need for further parsing ?
return
#
no need for further parsing ?
# if no table, parse as usual
# if no table, parse as usual
self
.
content
=
self
.
content
.
replace
(
'<hr/>'
,
'
\n
---
\n
'
)
self
.
content
=
self
.
content
.
replace
(
'<hr/>'
,
'
\n
---
\n
'
)
self
.
content
=
self
.
content
.
replace
(
'<br/>'
,
''
)
self
.
content
=
self
.
content
.
replace
(
'<br/>'
,
''
)
if
self
.
tag
==
"table"
:
#
for removing tbody
if
self
.
tag
==
"table"
:
#
for removing tbody
self
.
content
=
re
.
sub
(
INLINE_ELEMENTS
[
'tbody'
],
'
\
g<1>'
,
self
.
content
)
self
.
content
=
re
.
sub
(
INLINE_ELEMENTS
[
'tbody'
],
'
\
g<1>'
,
self
.
content
)
for
tag
,
pattern
in
INLINE_ELEMENTS
.
items
():
INLINE_ELEMENTS_LIST_KEYS
=
list
(
INLINE_ELEMENTS
.
keys
())
# print "---now looking at", tag, pattern
INLINE_ELEMENTS_LIST_KEYS
.
sort
()
for
tag
in
INLINE_ELEMENTS_LIST_KEYS
:
pattern
=
INLINE_ELEMENTS
[
tag
]
if
tag
==
'a'
:
if
tag
==
'a'
:
self
.
content
=
re
.
sub
(
pattern
,
'[
\
g<2>](
\
g<1>)'
,
self
.
content
)
self
.
content
=
re
.
sub
(
pattern
,
'[
\
g<2>](
\
g<1>)'
,
self
.
content
)
...
@@ -147,6 +146,8 @@ class Element:
...
@@ -147,6 +146,8 @@ class Element:
self
.
content
=
re
.
sub
(
pattern
,
'![
\
g<2>](
\
g<1>)'
,
self
.
content
)
self
.
content
=
re
.
sub
(
pattern
,
'![
\
g<2>](
\
g<1>)'
,
self
.
content
)
elif
tag
==
'img_single'
:
elif
tag
==
'img_single'
:
self
.
content
=
re
.
sub
(
pattern
,
'![](
\
g<1>)'
,
self
.
content
)
self
.
content
=
re
.
sub
(
pattern
,
'![](
\
g<1>)'
,
self
.
content
)
elif
tag
==
'img_single_no_close'
:
self
.
content
=
re
.
sub
(
pattern
,
'![](
\
g<1>)'
,
self
.
content
)
elif
self
.
tag
==
'ul'
and
tag
==
'li'
:
elif
self
.
tag
==
'ul'
and
tag
==
'li'
:
self
.
content
=
re
.
sub
(
pattern
,
'-
\
g<1>'
,
self
.
content
)
self
.
content
=
re
.
sub
(
pattern
,
'-
\
g<1>'
,
self
.
content
)
elif
self
.
tag
==
'ol'
and
tag
==
'li'
:
elif
self
.
tag
==
'ol'
and
tag
==
'li'
:
...
@@ -157,19 +158,17 @@ class Element:
...
@@ -157,19 +158,17 @@ class Element:
self
.
content
=
re
.
sub
(
pattern
,
'|
\
g<1>'
,
self
.
content
.
replace
(
'
\n
'
,
''
))
self
.
content
=
re
.
sub
(
pattern
,
'|
\
g<1>'
,
self
.
content
.
replace
(
'
\n
'
,
''
))
elif
self
.
tag
==
'tr'
and
tag
==
'td'
:
elif
self
.
tag
==
'tr'
and
tag
==
'td'
:
self
.
content
=
re
.
sub
(
pattern
,
'|
\
g<1>|'
,
self
.
content
.
replace
(
'
\n
'
,
''
))
self
.
content
=
re
.
sub
(
pattern
,
'|
\
g<1>|'
,
self
.
content
.
replace
(
'
\n
'
,
''
))
self
.
content
=
self
.
content
.
replace
(
"||"
,
"|"
)
#end of column also needs a pipe
self
.
content
=
self
.
content
.
replace
(
"||"
,
"|"
)
# end of column also needs a pipe
# print "---converting, td remove duplicate:", tag, self.content
elif
self
.
tag
==
'table'
and
tag
==
'td'
:
elif
self
.
tag
==
'table'
and
tag
==
'td'
:
self
.
content
=
re
.
sub
(
pattern
,
'|
\
g<1>|'
,
self
.
content
)
self
.
content
=
re
.
sub
(
pattern
,
'|
\
g<1>|'
,
self
.
content
)
self
.
content
=
self
.
content
.
replace
(
"||"
,
"|"
)
#end of column also needs a pipe
self
.
content
=
self
.
content
.
replace
(
"||"
,
"|"
)
# end of column also needs a pipe
self
.
content
=
self
.
content
.
replace
(
'|
\n\n
'
,
'|
\n
'
)
#replace double new line
self
.
content
=
self
.
content
.
replace
(
'|
\n\n
'
,
'|
\n
'
)
# replace double new line
# print "---converting, td remove duplicate:", tag, self.content
self
.
construct_table
()
self
.
construct_table
()
else
:
else
:
wrapper
=
MARKDOWN
.
get
(
tag
)
wrapper
=
MARKDOWN
.
get
(
tag
)
self
.
content
=
re
.
sub
(
pattern
,
'{}
\
g<1>{}'
.
format
(
wrapper
[
0
],
wrapper
[
1
]),
self
.
content
)
self
.
content
=
re
.
sub
(
pattern
,
'{}
\
g<1>{}'
.
format
(
wrapper
[
0
],
wrapper
[
1
]),
self
.
content
)
if
self
.
tag
==
"e_p"
and
self
.
content
[
-
1
:]
!=
'
\n
'
and
len
(
self
.
content
)
>
2
:
if
self
.
tag
==
"e_p"
and
self
.
content
[
-
1
:]
!=
'
\n
'
and
len
(
self
.
content
)
>
2
:
# focusing on div, add new line if not there (and if content is long enough)
# focusing on div, add new line if not there (and if content is long enough)
self
.
content
+=
'
\n
'
self
.
content
+=
'
\n
'
...
@@ -177,39 +176,36 @@ class Element:
...
@@ -177,39 +176,36 @@ class Element:
# this function, after self.content has gained | for table entries,
# this function, after self.content has gained | for table entries,
# adds the |---| in markdown to create a proper table
# adds the |---| in markdown to create a proper table
temp
=
self
.
content
.
split
(
'
\n
'
,
3
)
temp
=
self
.
content
.
split
(
'
\n
'
,
3
)
for
elt
in
temp
:
for
elt
in
temp
:
if
elt
!=
""
:
if
elt
!=
""
:
count
=
elt
.
count
(
"|"
)
#
count number of pipes
count
=
elt
.
count
(
"|"
)
#
count number of pipes
break
break
pipe
=
"
\n
|"
#
beginning \n for safety
pipe
=
"
\n
|"
#
beginning \n for safety
for
i
in
xrange
(
count
-
1
):
for
i
in
range
(
count
-
1
):
pipe
+=
"---|"
pipe
+=
"---|"
pipe
+=
"
\n
"
pipe
+=
"
\n
"
self
.
content
=
pipe
+
pipe
+
self
.
content
+
"
\n
"
#
TODO: column titles?
self
.
content
=
pipe
+
pipe
+
self
.
content
+
"
\n
"
#
TODO: column titles?
self
.
content
=
self
.
content
.
replace
(
'|
\n\n
'
,
'|
\n
'
)
#
replace double new line
self
.
content
=
self
.
content
.
replace
(
'|
\n\n
'
,
'|
\n
'
)
#
replace double new line
self
.
content
=
self
.
content
.
replace
(
"<br/>
\n
"
,
"<br/>"
)
#
end of column also needs a pipe
self
.
content
=
self
.
content
.
replace
(
"<br/>
\n
"
,
"<br/>"
)
#
end of column also needs a pipe
class
Tomd
:
class
Tomd
:
def
__init__
(
self
,
html
=
''
,
folder
=
''
,
file
=
''
,
options
=
None
):
def
__init__
(
self
,
html
=
''
,
folder
=
''
,
file
=
''
,
options
=
None
):
self
.
html
=
html
#
actual data
self
.
html
=
html
#
actual data
self
.
folder
=
folder
self
.
folder
=
folder
self
.
file
=
file
self
.
file
=
file
self
.
options
=
options
# haven't been implemented yet
self
.
options
=
options
# haven't been implemented yet
self
.
_markdown
=
self
.
convert
(
self
.
html
,
self
.
options
)
self
.
_markdown
=
self
.
convert
(
self
.
html
,
self
.
options
)
def
convert
(
self
,
html
=
""
,
options
=
None
):
def
convert
(
self
,
html
=
""
,
options
=
None
):
if
html
==
""
:
if
html
==
""
:
html
=
self
.
html
html
=
self
.
html
#main function here
#
main function here
elements
=
[]
elements
=
[]
for
tag
,
pattern
in
BlOCK_ELEMENTS
.
items
():
for
tag
,
pattern
in
BlOCK_ELEMENTS
.
items
():
# print "pattern is", pattern, "tag", tag
for
m
in
re
.
finditer
(
pattern
,
html
,
re
.
I
|
re
.
S
|
re
.
M
):
for
m
in
re
.
finditer
(
pattern
,
html
,
re
.
I
|
re
.
S
|
re
.
M
):
# now m contains the pattern without the tag
# now m contains the pattern without the tag
# if tag == "e_p":
# print "found", tag, m.groups(), "start", m.start(), "end", m.end(), self.folder
element
=
Element
(
start_pos
=
m
.
start
(),
element
=
Element
(
start_pos
=
m
.
start
(),
end_pos
=
m
.
end
(),
end_pos
=
m
.
end
(),
content
=
''
.
join
(
m
.
groups
()),
content
=
''
.
join
(
m
.
groups
()),
...
@@ -224,10 +220,6 @@ class Tomd:
...
@@ -224,10 +220,6 @@ class Tomd:
elements
.
remove
(
e
)
elements
.
remove
(
e
)
if
can_append
:
if
can_append
:
elements
.
append
(
element
)
elements
.
append
(
element
)
# print "\n\n\ndone with convert, element is"
# for e in elements:
# print repr(str(e))
# print "---"
elements
.
sort
(
key
=
lambda
element
:
element
.
start_pos
)
elements
.
sort
(
key
=
lambda
element
:
element
.
start_pos
)
self
.
_markdown
=
''
.
join
([
str
(
e
)
for
e
in
elements
])
self
.
_markdown
=
''
.
join
([
str
(
e
)
for
e
in
elements
])
...
@@ -240,19 +232,19 @@ class Tomd:
...
@@ -240,19 +232,19 @@ class Tomd:
self
.
convert
(
self
.
html
,
self
.
options
)
self
.
convert
(
self
.
html
,
self
.
options
)
return
self
.
_markdown
return
self
.
_markdown
def
export
(
self
,
folder
=
False
):
def
export
(
self
,
folder
=
False
):
if
len
(
self
.
file
)
<
1
:
if
len
(
self
.
file
)
<
1
:
warnings
.
warn
(
"file not specified, renamed to tmp.md"
)
warnings
.
warn
(
"file not specified, renamed to tmp.md"
)
file
=
"tmp.md"
file
=
"tmp.md"
else
:
else
:
file
=
self
.
file
.
replace
(
'.html'
,
'.md'
)
#
rename to md
file
=
self
.
file
.
replace
(
'.html'
,
'.md'
)
#
rename to md
if
len
(
self
.
folder
)
<
2
:
if
len
(
self
.
folder
)
<
2
:
warnings
.
warn
(
"folder not specified, will save to pwd"
)
warnings
.
warn
(
"folder not specified, will save to pwd"
)
elif
not
folder
:
elif
not
folder
:
file
=
self
.
folder
+
'/'
+
file
file
=
self
.
folder
+
'/'
+
file
else
:
#
if folder is specified
else
:
#
if folder is specified
file
=
folder
+
'/'
+
file
file
=
folder
+
'/'
+
file
f
=
open
(
file
,
'w'
)
f
=
open
(
file
,
'w'
)
f
.
write
(
self
.
_markdown
)
f
.
write
(
self
.
_markdown
)
f
.
close
()
f
.
close
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment