implement text.split_html()

This commit is contained in:
Mike Fährmann 2018-05-27 15:00:41 +02:00
parent 53f36176fd
commit ae9a37a528
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
2 changed files with 35 additions and 0 deletions

View File

@ -43,6 +43,17 @@ def remove_html(txt):
return ""
def split_html(txt, sep=None):
    """Split a string at its HTML tags and return the text fragments.

    Every substring matching ``<[^>]+>`` is treated as a tag and removed;
    the pieces of text between tags are returned in their original order.
    Fragments that are empty or consist only of whitespace are dropped.
    Kept fragments are returned as-is (surrounding whitespace is preserved).

    'sep' is not used by the current implementation.

    Returns an empty list if re.split() rejects 'txt' with a TypeError
    (for example when 'txt' is None or a number).
    """
    try:
        # Only the split itself can raise the TypeError we want to absorb,
        # so keep the filtering step outside of the try body.
        parts = re.split(r"<[^>]+>", txt)
    except TypeError:
        return []
    return [part for part in parts if part and not part.isspace()]
def filename_from_url(url):
"""Extract the last part of an url to use as a filename"""
try:

View File

@ -64,6 +64,30 @@ class TestText(unittest.TestCase):
for value in INVALID:
self.assertEqual(f(value), "")
def test_split_html(self, f=text.split_html):
    # Verify split_html() on regular, empty, malformed and invalid input.
    words = ["Hello", "World."]
    nothing = []

    # (input, expected output) pairs, checked in order
    cases = (
        # standard usage
        ("", nothing),
        ("Hello World.", ["Hello World."]),
        (" Hello World. ", [" Hello World. "]),
        ("Hello<br/>World.", words),
        ("<div><b class='a'>Hello</b><i>World.</i></div>", words),
        # empty HTML
        ("<div></div>", nothing),
        (" <div> </div> ", nothing),
        # malformed HTML
        ("<div</div>", nothing),
        ("<div<Hello World.</div>", nothing),
    )
    for html, expected in cases:
        self.assertEqual(f(html), expected)

    # invalid arguments must yield an empty list
    for invalid in INVALID:
        self.assertEqual(f(invalid), nothing)
def test_filename_from_url(self, f=text.filename_from_url):
result = "filename.ext"