Source code for cltk.corpora.lat.phi.file_utils

"""Higher-level (i.e., user-friendly) functions for quickly reading
PHI5 data after it has been processed by ``TLGU()``.
"""

import os
from typing import Optional, Union

import regex
from regex import Pattern

from cltk.corpora.lat.phi.phi5_index import (
    MAP_PHI5_AUTHOR_ID_TO_NAME,
    MAP_PHI5_AUTHOR_ID_TO_WORKS_AND_NAME,
)
from cltk.utils.file_operations import make_cltk_path


def phi5_plaintext_cleanup(
    text: str, rm_punctuation: bool = False, rm_periods: bool = False
) -> str:
    """Remove and substitute post-processing for Latin PHI5 text.

    The cleanup runs in up to three passes:

    1. Delete markup and junk: end-of-line hyphenation (a hyphen
       directly followed by a line break), digits, a fixed set of
       editorial symbols and quotation marks, and anything enclosed
       in ``{}`` or ``()`` on a single line.
    2. If ``rm_punctuation`` is true, delete common punctuation marks
       and the combining acute accent (U+0301).
    3. Collapse every run of whitespace, line breaks included, into
       a single space. The result is not stripped, so one leading or
       trailing space may remain.

    Args:
        text: Text to clean.
        rm_punctuation: Also delete ``, ; : " ' ? - ! * [ ] { }`` and
            the combining acute accent.
        rm_periods: Also delete ``.``. This flag is only consulted
            when ``rm_punctuation`` is true.

    Returns:
        The cleaned text.

    >>> phi5_plaintext_cleanup("arma, virumque cano.", rm_punctuation=True)
    'arma virumque cano.'

    TODO: Surely more junk to pull out. Please submit bugs!
    """
    # Note: rming all characters between {} and (). ``.`` does not match
    # a line break here, so only single-line spans are caught.
    remove_comp: Pattern[str] = regex.compile(
        r"-\n|«|»|\<|\>|\.\.\.|‘|’|_|{.+?}|\(.+?\)|\(|\)|“|#|%|⚔|&|=|/|\\|〚|†|『|⚖|–|˘|⚕|☾|◌|◄|►|⌐|⌊|⌋|≈|∷|≈|∞|”|[0-9]"
    )
    text = remove_comp.sub("", text)

    if rm_punctuation:
        # U+0301 is the combining acute accent made by TLGU.
        unwanted: str = ",;:\"'?-!*[]{}" + "\u0301"
        if rm_periods:
            unwanted += "."
        # One C-level deletion pass. The result is applied unconditionally,
        # so a text that cleans down to the empty string is returned as ""
        # rather than falling back to the uncleaned text.
        text = text.translate(str.maketrans("", "", unwanted))

    # Line breaks are whitespace, so a single pass both replaces them with
    # spaces and squeezes repeated whitespace.
    comp_space: Pattern[str] = regex.compile(r"\s+")
    return comp_space.sub(" ", text)
def assemble_phi5_author_filepaths() -> list[str]:
    """Build the filepath of every PHI5 author plaintext file.

    One path is produced per author ID in ``MAP_PHI5_AUTHOR_ID_TO_NAME``
    by joining ``<author_id>.TXT`` onto the directory returned by
    ``make_cltk_path("lat/text/phi5/plaintext/")``.

    Returns:
        The filepaths, in the iteration order of the index. Nothing is
        checked against the filesystem.
    """
    base_dir: str = make_cltk_path("lat/text/phi5/plaintext/")
    author_paths: list[str] = []
    for author_id in MAP_PHI5_AUTHOR_ID_TO_NAME:
        author_paths.append(os.path.join(base_dir, author_id + ".TXT"))
    return author_paths
def assemble_phi5_works_filepaths() -> list[str]:
    """Build the filepath of every individual PHI5 work file.

    For each author in ``MAP_PHI5_AUTHOR_ID_TO_WORKS_AND_NAME`` and each
    entry under that author's ``"works"`` key, one path of the form
    ``<author_id>.TXT-<work>.txt`` is joined onto the directory returned
    by ``make_cltk_path("lat/text/phi5/individual_works/")``.

    Returns:
        The filepaths, grouped by author in the iteration order of the
        index. Nothing is checked against the filesystem.
    """
    works_dir: str = make_cltk_path("lat/text/phi5/individual_works/")
    collected: list[str] = []
    for author_id, author_entry in MAP_PHI5_AUTHOR_ID_TO_WORKS_AND_NAME.items():
        collected.extend(
            os.path.join(
                works_dir, "".join((author_id, ".TXT", "-", work_id, ".txt"))
            )
            for work_id in author_entry["works"]
        )
    return collected