Source code for cltk.corpora.lat.phi.file_utils

"""Higher-level (i.e., user-friendly) functions for quickly reading
PHI5 data after it has been processed by ``TLGU()``.
"""

import os
from typing import Optional, Union

import regex
from regex import Pattern

from cltk.corpora.lat.phi.phi5_index import (
    MAP_PHI5_AUTHOR_ID_TO_NAME,
    MAP_PHI5_AUTHOR_ID_TO_WORKS_AND_NAME,
)
from cltk.utils.file_operations import make_cltk_path


# Editorial markup and junk left in TLGU output, removed in a single pass:
# end-of-line hyphenation ("-\n"), quotation marks, everything between {} or
# () on one line, stray brackets/symbols and all ASCII digits.
_PHI5_JUNK = regex.compile(
    r"-\n|«|»|\<|\>|\.\.\.|‘|’|_|{.+?}|\(.+?\)|\(|\)|“|#|%|⚔|&|=|/|\\|〚|†|『|⚖|–|˘|⚕|☾|◌|◄|►|⌐|⌊|⌋|≈|∷|≈|∞|”|[0-9]"
)
# Any run of whitespace (line breaks included) collapses to one space.
_PHI5_WHITESPACE = regex.compile(r"\s+")
# Characters dropped when ``rm_punctuation`` is requested.
_PHI5_PUNCTUATION = ",;:\"'?-!*[]{}"
# U+0301 COMBINING ACUTE ACCENT (UTF-8 bytes CC 81), emitted by TLGU.
_PHI5_COMBINING_ACUTE = "\u0301"


def phi5_plaintext_cleanup(
    text: str, rm_punctuation: bool = False, rm_periods: bool = False
) -> str:
    """Remove and substitute post-processing for Latin PHI5 text.

    Processing order:

    1. Strip markup/junk matched by the module-level junk pattern
       (hyphenated line ends, bracketed ``{...}`` / ``(...)`` spans,
       assorted symbols, digits).
    2. If ``rm_punctuation`` is true, drop ``, ; : " ' ? - ! * [ ] { }`` and
       the combining acute accent (U+0301); additionally drop ``.`` when
       ``rm_periods`` is also true.
    3. Collapse every whitespace run, including line breaks, to one space.

    Args:
        text: Plaintext produced from a PHI5 file.
        rm_punctuation: Whether to remove punctuation (step 2).
        rm_periods: Whether to remove periods too. Only honoured when
            ``rm_punctuation`` is true; on its own it has no effect.

    Returns:
        The cleaned text. Leading/trailing whitespace is collapsed to a
        single space, not trimmed.

    TODO: Surely more junk to pull out. Please submit bugs!
    """
    text = _PHI5_JUNK.sub("", text)

    if rm_punctuation:
        unwanted = _PHI5_PUNCTUATION + _PHI5_COMBINING_ACUTE
        if rm_periods:
            unwanted += "."
        # Apply the result unconditionally: a text made up solely of
        # punctuation must come back empty rather than untouched.
        text = text.translate(str.maketrans("", "", unwanted))

    # ``\s`` already covers ``\n``, so one substitution handles both the
    # line-break replacement and the whitespace squeeze.
    return _PHI5_WHITESPACE.sub(" ", text)


def assemble_phi5_author_filepaths() -> list[str]:
    """Build the expected plaintext filepath for every PHI5 author.

    One path is produced per key of ``MAP_PHI5_AUTHOR_ID_TO_NAME``, in the
    map's iteration order, by joining ``<author_id>.TXT`` onto the directory
    returned by ``make_cltk_path("lat/text/phi5/plaintext/")``.

    The paths are only assembled; nothing here checks that the files exist.

    Returns:
        A list of filepaths (presumably absolute, as resolved by
        ``make_cltk_path`` -- confirm against that helper).
    """
    plaintext_dir: str = make_cltk_path("lat/text/phi5/plaintext/")
    return [
        os.path.join(plaintext_dir, f"{author_id}.TXT")
        for author_id in MAP_PHI5_AUTHOR_ID_TO_NAME
    ]


def assemble_phi5_works_filepaths() -> list[str]:
    """Build the expected filepath for every individual PHI5 work.

    For each author code in ``MAP_PHI5_AUTHOR_ID_TO_WORKS_AND_NAME`` and each
    entry of that author's ``"works"`` value, a path named
    ``<author_code>.TXT-<work>.txt`` is joined onto the directory returned by
    ``make_cltk_path("lat/text/phi5/individual_works/")``. Output order
    follows the map's iteration order, then each author's works order.

    The paths are only assembled; nothing here checks that the files exist.

    Returns:
        A list of filepaths (presumably absolute, as resolved by
        ``make_cltk_path`` -- confirm against that helper).
    """
    works_dir: str = make_cltk_path("lat/text/phi5/individual_works/")
    all_filepaths: list[str] = []
    for author_code, author_data in MAP_PHI5_AUTHOR_ID_TO_WORKS_AND_NAME.items():
        # The ".TXT" in the middle of the name is intentional: it mirrors the
        # original filename format byte-for-byte.
        all_filepaths.extend(
            os.path.join(works_dir, f"{author_code}.TXT-{work}.txt")
            for work in author_data["works"]
        )
    return all_filepaths
Source code for cltk.corpora.lat.phi.file_utils

The Classical Language Toolkit

Navigation

Related Topics