# Source code for ac_core.utils

from datetime import datetime
import time

import bs4

HTML_PARSER = 'lxml'

def remove_prefix(s: str, prefix: str) -> str:
    """Return ``s`` with the leading ``prefix`` stripped.

    Args:
        s: The string to strip.
        prefix: The text that ``s`` is required to start with.

    Returns:
        The part of ``s`` that follows ``prefix``.

    Raises:
        AssertionError: If ``s`` does not start with ``prefix``.
    """
    # Raise explicitly instead of using ``assert`` so the precondition is
    # still enforced when Python runs with ``-O`` (which strips asserts).
    if not s.startswith(prefix):
        raise AssertionError(f"{s!r} does not start with {prefix!r}")
    return s[len(prefix):]
def remove_suffix(s: str, suffix: str) -> str:
    """Return ``s`` with the trailing ``suffix`` stripped.

    Args:
        s: The string to strip.
        suffix: The text that ``s`` is required to end with.

    Returns:
        The part of ``s`` that precedes ``suffix``. An empty ``suffix``
        returns ``s`` unchanged.

    Raises:
        AssertionError: If ``s`` does not end with ``suffix``.
    """
    # Raise explicitly instead of using ``assert`` so the precondition is
    # still enforced when Python runs with ``-O`` (which strips asserts).
    if not s.endswith(suffix):
        raise AssertionError(f"{s!r} does not end with {suffix!r}")
    # Slice by an absolute end index: ``s[:-len(suffix)]`` collapses to
    # ``s[:0]`` (the empty string) when ``suffix`` is empty.
    return s[:len(s) - len(suffix)]
def time_str_2_timestamp(s: str) -> int:
    """Convert a ``+0900`` date-time string into a POSIX timestamp.

    Args:
        s: A date-time string in the form ``"%Y-%m-%d %H:%M:%S%z"`` whose
            UTC offset is ``+0900``, e.g. ``"2022-01-01 00:00:00+0900"``.

    Returns:
        The timestamp of that instant in whole seconds.

    Raises:
        AssertionError: If ``s`` does not end with ``+0900``.
        ValueError: If ``s`` does not match the expected format.
    """
    # Raise explicitly instead of using ``assert`` so the check is still
    # enforced when Python runs with ``-O`` (which strips asserts).
    if not s.endswith('+0900'):
        raise AssertionError(f"expected a '+0900' offset, got {s!r}")
    time_format = "%Y-%m-%d %H:%M:%S%z"
    # ``%z`` makes the parsed datetime timezone-aware, so ``timestamp()``
    # does not depend on the local timezone of the machine.
    return int(datetime.strptime(s, time_format).timestamp())
def get_direct_children_text(tag: bs4.Tag) -> str:
    """Collect the text nodes that are direct children of ``tag``.

    Text nested inside child tags is skipped, and each direct ``<br>``
    child contributes a newline. For example, this returns
    "A - Hello world " for a tag
    ``<h2>A - Hello world <a href="...">Editorial</a></h2>``.

    Args:
        tag: The element whose immediate children are inspected.

    Returns:
        The concatenation, in document order, of the direct string
        children of ``tag``, with ``'\\n'`` substituted for each direct
        ``<br>`` child.

    Raises:
        AssertionError: If ``tag`` is not a ``bs4.Tag``.
    """
    # Raise explicitly instead of using ``assert`` so the check is still
    # enforced when Python runs with ``-O`` (which strips asserts).
    if not isinstance(tag, bs4.Tag):
        raise AssertionError(f"expected bs4.Tag, got {type(tag).__name__}")

    pieces = []
    for child in tag.children:
        if isinstance(child, bs4.NavigableString):
            # NavigableString is a ``str`` subclass, so it can be joined
            # directly.
            pieces.append(child)
        elif isinstance(child, bs4.Tag) and child.name == 'br':
            pieces.append('\n')
        # Any other child (e.g. a nested tag) contributes no text.
    return ''.join(pieces)