Source code for cltk.corpora.grc.tlg.parse_tlg_indices

"""For loading TLG .json files and searching, then pulling author ids."""

import json
import os
from typing import Optional

import regex
from regex import Pattern

from cltk.corpora.grc.tlg.author_date import MAP_DATE_TO_AUTHORS
from cltk.corpora.grc.tlg.author_epithet import MAP_EPITHET_TO_AUTHOR_IDS
from cltk.corpora.grc.tlg.author_female import AUTHOR_FEMALE
from cltk.corpora.grc.tlg.author_geo import AUTHOR_GEO
from cltk.corpora.grc.tlg.id_author import ID_AUTHOR
from cltk.corpora.grc.tlg.index_lists import ALL_TLG_INDICES
from cltk.corpora.grc.tlg.work_numbers import WORK_NUMBERS

# Contributors to this module.
__author__ = [
    "Kyle P. Johnson <>",
    "Stephen Margheim <>",
    "Martín Pozzi <>",
]
__license__ = "MIT License. See LICENSE."

# Absolute path of the directory that contains this module.
THIS_DIR = os.path.dirname(os.path.abspath(__file__))

def get_female_authors() -> set[str]:
    """Return the author ids listed in the female-authors index.

    The ids stored under the ``"Femina"`` key of ``AUTHOR_FEMALE`` are copied
    into a new ``set`` (note: a ``set`` carries no ordering).
    """
    female_ids = AUTHOR_FEMALE["Femina"]
    return set(female_ids)
def get_epithet_index() -> dict[str, set[str]]:
    """Build a mapping of epithet label to the set of author ids bearing it.

    A new dict is created on every call; each value is a fresh ``set`` built
    from the corresponding entry of ``MAP_EPITHET_TO_AUTHOR_IDS``.
    """
    return {
        epithet: set(author_ids)
        for epithet, author_ids in MAP_EPITHET_TO_AUTHOR_IDS.items()
    }
def get_epithets() -> list[str]:
    """Return every epithet label, sorted in ascending order."""
    labels = list(MAP_EPITHET_TO_AUTHOR_IDS)
    labels.sort()
    return labels
def select_authors_by_epithet(query: str) -> Optional[set[str]]:
    """Look up the author ids for an epithet by its exact, case-insensitive name.

    Args:
        query: Epithet label to look for; compared after ``str.casefold()``.

    Returns:
        A new (unordered) ``set`` of author ids for the first epithet whose
        case-folded label equals the case-folded ``query``, or ``None`` when
        no epithet matches.
    """
    # Fold the query once instead of on every loop iteration.
    wanted = query.casefold()
    for epithet, ids in MAP_EPITHET_TO_AUTHOR_IDS.items():
        if epithet.casefold() == wanted:
            return set(ids)
    # Made explicit: the original fell off the end and returned None
    # implicitly while advertising a plain ``set[str]`` return type.
    return None
def get_epithet_of_author(_id: str) -> Optional[str]:
    """Return the epithet label whose author-id collection contains ``_id``.

    Args:
        _id: Author id to look for.

    Returns:
        The label of the first epithet in ``MAP_EPITHET_TO_AUTHOR_IDS`` whose
        ids include ``_id``, or ``None`` when the id appears under no epithet.
    """
    for epithet, ids in MAP_EPITHET_TO_AUTHOR_IDS.items():
        if _id in ids:
            return epithet
    # Explicit "not found" result, mirroring ``get_date_of_author``.
    return None
def get_geo_index() -> dict[str, set[str]]:
    """Build a mapping of geographic name to the set of associated author ids.

    Returns:
        A new dict keyed by geography label; each value is a fresh ``set``
        built from the corresponding entry of ``AUTHOR_GEO``.
    """
    # Read from AUTHOR_GEO: the previous implementation iterated the epithet
    # index here (copy-paste slip) and therefore returned epithets, not
    # geographies.
    return {geo: set(author_ids) for geo, author_ids in AUTHOR_GEO.items()}
def get_geographies() -> list[str]:
    """Return every geography label found in ``AUTHOR_GEO``, sorted ascending."""
    labels = list(AUTHOR_GEO)
    labels.sort()
    return labels
def select_authors_by_geo(query: str) -> Optional[set[str]]:
    """Look up the author ids for a geography by its exact, case-insensitive name.

    Args:
        query: Geography label to look for; compared after ``str.casefold()``.

    Returns:
        A new (unordered) ``set`` of author ids for the first geography whose
        case-folded label equals the case-folded ``query``, or ``None`` when
        no geography matches.
    """
    # Fold the query once instead of on every loop iteration.
    wanted = query.casefold()
    for geo, ids in AUTHOR_GEO.items():
        if geo.casefold() == wanted:
            return set(ids)
    # Made explicit: the original returned None implicitly on a miss while
    # advertising a plain ``set[str]`` return type.
    return None
def get_geo_of_author(_id: str) -> Optional[str]:
    """Return the geography label whose author-id collection contains ``_id``.

    Args:
        _id: Author id to look for.

    Returns:
        The label of the first geography in ``AUTHOR_GEO`` whose ids include
        ``_id``, or ``None`` when the id appears under no geography.
    """
    for geo, ids in AUTHOR_GEO.items():
        if _id in ids:
            return geo
    # Explicit "not found" result, mirroring ``get_date_of_author``.
    return None
def get_lists() -> dict[str, dict[str, str]]:
    """Return the ``ALL_TLG_INDICES`` mapping itself (no copy is made)."""
    all_indices = ALL_TLG_INDICES
    return all_indices
def get_id_author() -> dict[str, str]:
    """Return the ``ID_AUTHOR`` id-to-author mapping itself (no copy is made)."""
    id_to_author = ID_AUTHOR
    return id_to_author
def select_id_by_name(query: str) -> list[tuple[str, str]]:
    """Find authors whose name matches a regular expression, ignoring case.

    Args:
        query: Regular expression (``regex`` module syntax, VERSION1) that is
            searched for anywhere in each author name.

    Returns:
        A list of ``(tlg_id, author_name)`` tuples, in the iteration order of
        the id-author index, for every author whose name contains a match.
    """
    # Case-insensitivity is requested through IGNORECASE rather than by
    # case-folding the pattern text: folding the pattern rewrites upper-case
    # escapes (e.g. ``\S`` -> ``\s``, ``\D`` -> ``\d``) and silently changes
    # what the expression means.
    comp: Pattern[str] = regex.compile(
        query, flags=regex.VERSION1 | regex.IGNORECASE
    )
    # Names are still case-folded before searching, as before; ``search`` is
    # enough because only the existence of a match matters.
    return [
        (_id, author)
        for _id, author in get_id_author().items()
        if comp.search(author.casefold())
    ]
def open_json(_file):
    """Load a JSON file and return the decoded object.

    Args:
        _file: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content (a dict for a
        top-level JSON object).
    """
    # Read explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(_file, encoding="utf-8") as f:
        return json.load(f)
def get_works_by_id(_id):
    """Return the ``WORK_NUMBERS`` entry stored under author id ``_id``.

    Raises whatever ``WORK_NUMBERS[_id]`` raises when the id is absent.
    """
    works = WORK_NUMBERS[_id]
    return works
def check_id(_id):
    """Return the ``ID_AUTHOR`` entry (author label) stored under ``_id``.

    Raises whatever ``ID_AUTHOR[_id]`` raises when the id is absent.
    """
    author_label = ID_AUTHOR[_id]
    return author_label
def get_date_author() -> dict[str, list[str]]:
    """Return the ``MAP_DATE_TO_AUTHORS`` mapping itself (no copy is made)."""
    date_to_authors = MAP_DATE_TO_AUTHORS
    return date_to_authors
def get_dates():
    """Return every date label of the date-author index, sorted ascending."""
    date_labels = list(get_date_author())
    date_labels.sort()
    return date_labels
def get_date_of_author(_id):
    """Return the date label whose author list contains ``_id``.

    The first matching label in the iteration order of the date-author index
    is returned; ``None`` is returned when no entry lists the id.
    """
    matching_dates = (
        date for date, ids in get_date_author().items() if _id in ids
    )
    return next(matching_dates, None)
def _get_epoch(_str) -> Optional[str]:
    """Classify a raw date string by epoch.

    Returns ``"ad"`` for strings starting with ``"A.D. "`` or ``"p. A.D. "``,
    ``"bc"`` for strings starting with ``<digits> B.C.`` (optionally preceded
    by ``"a."``), the string itself for ``"Incertum"`` / ``"Varia"``, and
    ``None`` for everything else.
    """
    if _str.startswith(("A.D. ", "p. A.D. ")):
        return "ad"

    plain_bc = regex.match(r"^[0-9]+ B\.C\. *", _str)
    if plain_bc or regex.match(r"^a\. *[0-9]+ B\.C\. *", _str):
        return "bc"

    if _str in ("Incertum", "Varia"):
        return _str

    # Deliberately unresolved ("?") forms also land here: "a. A.D. ..." and
    # "p. <digits> B.C. ..." yield no epoch.
    return None
def _check_number(_str) -> bool:
    """Tell whether the string is only a number, optionally followed by ``?``.

    Args:
        _str: Text to test; surrounding whitespace is ignored.

    Returns:
        ``True`` when the whole stripped text is one or more digits followed
        by zero or more ``?`` characters, ``False`` otherwise.
    """
    # The whole string must match: the previous prefix-only ``match`` also
    # accepted values such as "4 B.C.", which contradicts "only a number".
    return regex.fullmatch(r"[0-9]+\?*", _str.strip()) is not None
def _handle_splits(_str: str) -> dict[str, Optional[str]]:
    """Split a raw date into start/stop parts and classify each part's epoch.

    ``/`` is treated like ``-``. When the date contains a separator, the
    first and last segments become the start and stop; if one of them is a
    bare number, it borrows the other side's wording by replacing the number
    there (e.g. a bare start is substituted into the stop text). Without a
    separator, start and stop are both the whole string.

    Args:
        _str: Raw date label.

    Returns:
        A dict with keys ``start_raw``, ``stop_raw``, ``start_epoch`` and
        ``stop_epoch``; the epoch values come from ``_get_epoch``.
    """
    _str = _str.replace("/", "-")
    if "-" in _str:
        # Use the outermost segments so that dates with more than one
        # separator no longer fail with "too many values to unpack".
        segments = _str.split("-")
        start, stop = segments[0], segments[-1]
        if _check_number(start):
            # Bare start number: put it in place of the number(s) in `stop`.
            start = regex.sub(r"[0-9]+\?*", start, stop)
        elif _check_number(stop):
            # Bare stop number: put it in place of the number(s) in `start`.
            stop = regex.sub(r"[0-9]+\?*", stop, start)
    else:
        start = stop = _str
    return {
        "start_raw": start,
        "stop_raw": stop,
        "start_epoch": _get_epoch(start),
        "stop_epoch": _get_epoch(stop),
    }
def normalize_dates():
    """Experiment to make sense of TLG dates.

    Walks every date label of the date-author index, parses it with
    ``_handle_splits`` (except the special labels below) and prints the
    resulting dict.

    TODO: start here, parse everything with pass
    """
    # "Varia": give a homer-to-byz date (not implemented yet).
    # "Incertum": ? (not implemented yet).
    unparsed_labels = ("Varia", "Incertum")
    for tlg_date in get_date_author():
        date = {}
        if tlg_date not in unparsed_labels:
            date.update(_handle_splits(tlg_date))
        # NOTE(review): print is assumed to run once per label (loop level);
        # the collapsed source does not show its indentation -- confirm.
        print(date)