# Source code for ac_core.problem

from dataclasses import dataclass
from logging import getLogger
import re
from typing import Dict, Iterator, List, Optional, Tuple

from bs4 import BeautifulSoup
import bs4
from ac_core.constant import _SITE_URL
from ac_core.modal.problem_test_case import ProblemTestCase
from ac_core.utils import HTML_PARSER, get_direct_children_text, remove_prefix, remove_suffix
from ac_core.utils.html_parse_helper import parse_start_end, parse_url

# Module-level logger, named after this module.
logger = getLogger(__name__)


class SampleParseError(RuntimeError):
  """Raised when the sample input/output blocks of a task page cannot be parsed.

    :param message: human readable description; defaults to ``'failed to parse samples'``
  """

  def __init__(self, message: str = 'failed to parse samples'):
    super().__init__(message)


@dataclass
class ProblemResult():
  """Structured data extracted from a problem page by :func:`parse_task`."""
  # part of the page title before ' - '
  id: str
  # value returned by ``parse_url`` for the page
  url: str
  # part of the page title after ' - '
  name: str
  # score parsed from the statement; ``None`` when it is absent or not a plain integer
  score: Optional[int]
  # sample cases found in the statement (empty when sample parsing failed)
  tests: List[ProblemTestCase]
  # text of the ``contest-title`` element
  contest_name: str
  # site URL joined with the ``href`` of the ``contest-title`` element
  contest_url: str
  # start/end values returned by ``parse_start_end`` -- NOTE(review): unit not visible here, confirm
  contest_start: int
  contest_end: int
  # memory limit in kilobytes (an "MB" value is multiplied by 1000)
  memory_limit_kb: int
  # time limit in milliseconds
  time_limit_msec: int


def _parse_score(soup: bs4.BeautifulSoup) -> Optional[int]:
  """Extract the problem score from the first paragraph of the task statement.

    :param soup: parsed problem page
    :returns: the score as an integer, or ``None`` when the page has no
      ``div#task-statement``, the first ``<p>`` does not start with ``配点 : ``,
      or the score text is not a plain integer
  """
  task_statement = soup.find('div', id='task-statement')
  if not isinstance(task_statement, bs4.Tag):
    # no task statement on the page: nothing to parse, and calling .find() on None would crash
    return None
  p = task_statement.find('p')  # first

  if isinstance(p, bs4.Tag) and p.text.startswith('配点 : '):
    score = remove_suffix(remove_prefix(p.text, '配点 : '), ' 点')
    try:
      return int(score)
    except ValueError:
      # some problems have scores like "<p>配点 : \(100\) 点</p>", not "<p>配点 : 100 点</p>"
      # example: https://atcoder.jp/contests/wupc2019/tasks/wupc2019_a
      pass
  return None


# Yields (title, inout, content) tuples, where inout is 0 for a sample input and 1 for a sample output
def _find_sample_tags(soup: BeautifulSoup) -> Iterator[Tuple[str, int, str]]:
  """Yield every sample block found in the task statement.

    Each yielded item is ``(title, inout, content)``: ``title`` is the header
    text left after removing the "Sample Input"/"Sample Output" prefix (stripped),
    ``inout`` is ``0`` for an input and ``1`` for an output, and ``content`` is
    the text of the ``<pre>`` tag.

    :raises SampleParseError: when a header contains one of the expected
      strings but does not start with it
  """
  # fix dup test case cause of lang-ja and lang-en
  lang_soup = soup.find('span', class_="lang-en")
  if lang_soup is None:
    lang_soup = soup.find('span', class_="lang-ja")
  if lang_soup is None:
    lang_soup = soup.find(id='task-statement')
  assert isinstance(lang_soup, bs4.Tag)

  input_strings = ('入力例', 'Sample Input')
  output_strings = ('出力例', 'Sample Output')
  expected_strings = input_strings + output_strings

  def h3_2_title_inout(s: str) -> Tuple[str, int]:
    # map a header text to (title, 0) for inputs or (title, 1) for outputs
    for prefix in input_strings:
      if s.startswith(prefix):
        return (s[len(prefix):].strip(), 0)
    for prefix in output_strings:
      if s.startswith(prefix):
        return (s[len(prefix):].strip(), 1)

    # report the text that was actually inspected, not an unrelated outer variable
    raise SampleParseError('Unknown input or output:' + s)

  def get_header(tag, expected_tag_name):
    # accept the tag only if it has the expected name and its sole string child mentions a sample marker
    if tag and tag.name == expected_tag_name and tag.string and any(s in tag.string for s in expected_strings):
      return tag
    return None

  for pre in lang_soup.find_all('pre'):
    logger.debug('pre tag: %s', str(pre))

    # the standard format: #task-statement h3+pre
    # used by AtCoder's JavaScript, sometimes used with .prettyprint
    # example: https://atcoder.jp/contests/abc114/tasks/abc114_d
    # NOTE: The AtCoder's JavaScript (at https://atcoder.jp/public/js/contest.js?v=201911110917 version) supports:
    #     -   "#task-statement h3+pre" format for Copy buttons of <h3> and <pre> tags
    #     -   "pre.prettyprint" format for Copy buttons of <pre> tags
    h3 = get_header(tag=pre.find_previous_sibling(), expected_tag_name='h3')
    if h3:
      yield h3_2_title_inout(h3.text) + (pre.text, )
      continue

    # an old format: #task-statement h3+section>pre:first-child
    # partially supported by AtCoder's JavaScript
    # NOTE: The relaxed format "#task-statement h3+section>pre" may cause false-positive. e.g. https://atcoder.jp/contests/abc003/tasks/abc003_4
    # NOTE: The format "h3+section>pre.prettyprint" sometimes cause false-negative. e.g. https://atcoder.jp/contests/tdpc/tasks/tdpc_fibonacci
    # example: https://atcoder.jp/contests/abc003/tasks/abc003_4
    if pre.find_previous_sibling() is None and pre.parent.name == 'section':
      h3 = get_header(tag=pre.parent.find_previous_sibling(), expected_tag_name='h3')
      if h3:
        yield h3_2_title_inout(h3.text) + (pre.text, )
        continue

    # a very old format: #task-statement p+pre.literal-block
    # entirely unsupported by AtCoder's JavaScript
    # example: https://atcoder.jp/contests/utpc2011/tasks/utpc2011_1
    if 'literal-block' in pre.attrs.get('class', []):
      p = get_header(tag=pre.find_previous_sibling(), expected_tag_name='p')
      if p:
        # pass the header text, like the <h3> branches do; passing the Tag itself breaks startswith()
        yield h3_2_title_inout(p.text) + (pre.text, )
        continue


def _parse_sample_cases(soup: BeautifulSoup) -> List[ProblemTestCase]:
  """Group the sample blocks of a page into test cases.

    Inputs and outputs sharing the same title are merged into one
    ``ProblemTestCase``. The result contains one entry per sample *input*
    encountered, in page order; a title that only has an output is not returned.

    :param soup: parsed problem page
    :returns: the test cases, ordered by appearance of their input
    :raises SampleParseError:
  """
  s_dict: Dict[str, ProblemTestCase] = {}
  # titles in the order their inputs were seen, recorded during the single pass
  # so the page does not have to be scanned a second time
  input_titles: List[str] = []

  for title, inout, content in _find_sample_tags(soup):
    if title not in s_dict:
      s_dict[title] = ProblemTestCase(title=title)
    if inout == 0:
      s_dict[title].input = content.lstrip()
      input_titles.append(title)
    elif inout == 1:
      s_dict[title].output = content.lstrip()
    else:
      # explicit raise so the check is kept even when asserts are disabled
      raise AssertionError('unexpected in/out marker: ' + repr(inout))

  return [s_dict[title] for title in input_titles]


def parse_task(html: str) -> ProblemResult:
  """parse problem page html to structured data

    :param html: the html source get from ``https://atcoder.jp/contests/{contest_id}/tasks/{problem_id}``
    :returns: the parsed problem; ``tests`` is empty when the samples could not
      be parsed (the error is logged instead of raised)
    :raises AssertionError: when the title, time limit, memory limit or
      contest title is missing or has an unexpected format

    :examples:

    .. code-block:: python

        import requests
        from ac_core.problem import parse_task

        r = requests.get('https://atcoder.jp/contests/abc260/tasks/abc260_a')
        if r.status_code == 200:
            print(parse_task(r.text))
  """
  soup = BeautifulSoup(html, HTML_PARSER)
  h2 = soup.find('span', class_='h2')
  assert isinstance(h2, bs4.Tag)

  alphabet, _, name = get_direct_children_text(h2).strip().partition(' - ')

  # split only on the first separator so any trailing text stays with the
  # memory-limit part, which is matched by prefix below
  time_limit, memory_limit = h2.find_next_sibling('p').text.strip().split(' / ', 1)
  for time_limit_prefix in ('実行時間制限: ', 'Time Limit: '):
    if time_limit.startswith(time_limit_prefix):
      break
  else:
    raise AssertionError('unexpected time limit format: ' + time_limit)
  if time_limit.endswith(' msec'):
    time_limit_msec = int(remove_suffix(remove_prefix(time_limit, time_limit_prefix), ' msec'))
  elif time_limit.endswith(' sec'):
    time_limit_msec = int(float(remove_suffix(remove_prefix(time_limit, time_limit_prefix), ' sec')) * 1000)
  else:
    raise AssertionError('unexpected time limit unit: ' + time_limit)

  # When login as the admin, a link is added after memory limit. See https://github.com/online-judge-tools/api-client/issues/90
  parsed_memory_limit = re.search(r'^(メモリ制限|Memory Limit): ([0-9.]+) (KB|MB)', memory_limit)
  assert parsed_memory_limit

  memory_limit_value = parsed_memory_limit.group(2)
  memory_limit_unit = parsed_memory_limit.group(3)
  # the result is expressed in kilobytes, matching ProblemResult.memory_limit_kb
  if memory_limit_unit == 'KB':
    memory_limit_kb = int(float(memory_limit_value))
  elif memory_limit_unit == 'MB':
    memory_limit_kb = int(float(memory_limit_value) * 1000)
  else:
    raise AssertionError('unexpected memory limit unit: ' + memory_limit_unit)

  # default to no samples so a parse failure is logged without leaving the name unbound
  tests_list: List[ProblemTestCase] = []
  try:
    tests_list = _parse_sample_cases(soup)
  except SampleParseError as e:
    logger.error(str(e))

  score = _parse_score(soup)
  url = parse_url(soup)
  contest_info = soup.find(class_="contest-title")
  assert isinstance(contest_info, bs4.Tag)
  contest_name = contest_info.text
  contest_url = _SITE_URL + contest_info.attrs["href"]
  start_time, end_time = parse_start_end(soup)

  return ProblemResult(
      id=alphabet,
      url=url,
      name=name,
      time_limit_msec=time_limit_msec,
      memory_limit_kb=memory_limit_kb,
      tests=tests_list,
      score=score,
      contest_name=contest_name,
      contest_url=contest_url,
      contest_start=start_time,
      contest_end=end_time,
  )