Source code for cltk.corpora.grc.tlg.tlgu

"""Wrapper for `tlgu` command line utility.

Original software at: ``http://tlgu.carmen.gr/``.

TLGU software written by Dimitri Marinakis and available at
`<http://tlgu.carmen.gr/>`_ under GPLv2 license.

TODO: the arguments to ``convert_corpus()`` need some rationalization, and
``divide_works()`` should be incorporated into it.

"""

__author__ = [
    "Kyle P. Johnson <kyle@kyle-p-johnson.com>",
    "Stephen Margheim <stephen.margheim@gmail.com>",
]
__license__ = "MIT License. See LICENSE."

import os
import subprocess
from typing import Optional

from cltk.core.cltk_logger import logger
from cltk.core.exceptions import CLTKException
from cltk.data.fetch import FetchCorpus
from cltk.utils.file_operations import make_cltk_path
from cltk.utils.utils import query_yes_no

# Descriptive option names mapped to the corresponding ``tlgu`` command-line
# switches. These TLGU args are not currently in use.
ARGS: dict[str, str] = dict(
    # break markers
    book_breaks="-b",
    page_breaks="-p",
    # Latin text
    lat_text="-r",
    # citation levels
    level_1="-v",
    level_2="-w",
    level_3="-x",
    level_4="-y",
    level_5="-z",
    line_tab="-B",
    higher_levels="-X",
    lower_levels="-Y",
    no_spaces="-N",  # rm_newlines
    # diagnostics
    citation_debug="-C",
    code_debug="-S",
    verbose="-V",
    # one output file per work
    split_works="-W",
)


class TLGU:
    """Check, install, and call TLGU.

    Instantiating the class first makes sure the TLGU sources are present in
    the CLTK data directory (downloading them if needed) and then makes sure
    a ``tlgu`` executable is on the ``PATH`` (building it with
    ``make install`` if needed).
    """

    def __init__(self, interactive: bool = True) -> None:
        """Check whether tlgu is installed, if not, import and install.

        :param interactive: If True, ask the user for confirmation before
            downloading or installing; if False, proceed without asking.
        """
        self.interactive: bool = interactive
        self._check_and_download_tlgu_source()
        self._check_install()

    def _check_and_download_tlgu_source(self) -> None:
        """Check if tlgu downloaded, if not download it.

        The presence of ``tlgu.h`` in the CLTK software directory is used as
        the marker that the sources have already been fetched.

        :raises CLTKException: If the user declines the download.
        """
        path: str = make_cltk_path("grc/software/grc_software_tlgu/tlgu.h")
        if os.path.isfile(path):
            return

        dl_msg: str = "This part of the CLTK depends upon TLGU, software written by Dimitri Marinakis `<http://tlgu.carmen.gr/>`_."
        print(dl_msg)
        repo_url: str = "https://github.com/cltk/grc_software_tlgu.git"
        dl_dir: str = os.path.split(path)[0]
        dl_question: str = (
            f"Do you want to download TLGU from '{repo_url}' to '{dl_dir}'?"
        )
        # A non-interactive session always downloads.
        do_download: bool = (
            query_yes_no(question=dl_question) if self.interactive else True
        )
        if not do_download:
            raise CLTKException("TLGU software required for this class to work.")
        fetch_corpus: FetchCorpus = FetchCorpus(language="grc")
        fetch_corpus.import_corpus(corpus_name="grc_software_tlgu")

    def _check_install(self) -> None:
        """Check if tlgu installed, if not install it.

        Installation is attempted with ``make install`` and, if that returns
        a non-zero status, once more with ``sudo make install``.

        :raises CLTKException: If GCC cannot be found, if the user declines
            the installation, or if both installation attempts fail.
        """
        try:
            subprocess.check_output(["which", "tlgu"])
        except subprocess.SubprocessError as sub_err:
            print("TLGU not installed.")
            logger.info("TLGU not installed: %s", sub_err)
            logger.info("Installing TLGU.")
        else:
            # ``tlgu`` is already on the PATH; nothing to do.
            return

        # ``check_output`` raises on a non-zero exit status instead of
        # returning an empty value, so a missing compiler shows up here as an
        # exception.
        try:
            subprocess.check_output(["which", "gcc"])
        except subprocess.SubprocessError as gcc_err:
            gcc_msg: str = "GCC seems not to be installed."
            logger.error(gcc_msg)
            raise CLTKException(gcc_msg) from gcc_err

        tlgu_path: str = make_cltk_path("grc/software/grc_software_tlgu")
        if self.interactive:
            if not query_yes_no(question="Do you want to install TLGU?"):
                raise CLTKException(
                    "TLGU installation required for this class to work."
                )
        else:
            print("Non-interactive installation. Continuing ...")

        # NOTE(review): the path is interpolated into a shell command without
        # quoting; a CLTK data directory containing spaces would break this.
        command: str = "cd {0} && make install".format(tlgu_path)
        print(f"Going to run command: ``{command}``")
        try:
            p_out: int = subprocess.call(command, shell=True)
        except subprocess.SubprocessError as sub_err:
            print(
                "Error executing installation. Going to check output of ``subprocess.call()`` ..."
            )
            raise CLTKException(sub_err) from sub_err

        if p_out == 0:
            msg: str = "TLGU installed."
            print(msg)
            logger.info(msg)
            return

        msg = "TLGU install without sudo failed. Going to try again with sudo (usually required for Linux) ..."
        print(msg)
        logger.error(msg)
        command = "cd {0} && sudo make install".format(tlgu_path)
        if self.interactive:
            if not query_yes_no(question="Do you want to install TLGU? with sudo?"):
                raise CLTKException(
                    "TLGU installation required for this class to work."
                )
        else:
            print("Going to run command:", command)
        p_out = subprocess.call(command, shell=True)

        if p_out == 0:
            msg = "TLGU installed."
            print(msg)
            logger.info(msg)
        else:
            msg = "TLGU install with sudo failed."
            print(msg)
            logger.error(msg)
            raise CLTKException("TLGU installation required for this class to work.")

    @staticmethod
    def _build_tlgu_flags(
        markup: Optional[str] = None,
        rm_newlines: bool = False,
        divide_works: bool = False,
        lat: bool = False,
        extra_args: Optional[list[str]] = None,
    ) -> list[str]:
        """Translate the ``convert()`` options into ``tlgu`` switches.

        :return: One ``-x`` style switch per requested option, de-duplicated
            and in a stable order (built-in options first, then
            ``extra_args`` in the order given).
        """
        options: list[str] = []
        if markup == "full":
            options.extend(["v", "w", "x", "y", "z"])
        if rm_newlines:
            options.append("N")
        if divide_works:
            options.append("W")
        if lat:
            options.append("r")
        if extra_args is not None:
            try:
                options.extend(list(extra_args))
            except Exception as exc:
                logger.error("Argument 'extra_args' must be a list: %s.", exc)
                raise
        # ``dict.fromkeys`` removes duplicates while keeping first-seen order,
        # so the resulting command line is deterministic.
        return [f"-{option}" for option in dict.fromkeys(options)]

    @staticmethod
    def convert(
        input_path: Optional[str] = None,
        output_path: Optional[str] = None,
        markup: Optional[str] = None,
        rm_newlines: bool = False,
        divide_works: bool = False,
        lat: bool = False,
        extra_args: Optional[list[str]] = None,
    ) -> None:
        """Do conversion.

        The ``tlgu`` executable is invoked directly (no shell), so paths
        containing spaces or shell metacharacters are passed through intact.

        :param input_path: TLG filepath to convert.
        :param output_path: filepath of new converted text.
        :param markup: Specificity of inline markup. Default None removes all
            numerical markup; 'full' gives most detailed, with reference
            numbers included before each text line.
        :param rm_newlines: No spaces; removes line ends and hyphens before an
            ID code; hyphens and spaces before page and column ends are
            retained.
        :param divide_works: Each work (book) is output as a separate file in
            the form output_file-xxx.txt; if an output file is not specified,
            this option has no effect.
        :param lat: Primarily Latin text (PHI). Some TLG texts, notably
            doccan1.txt and doccan2.txt are mostly roman texts lacking
            explicit language change codes. Setting this option will force a
            change to Latin text after each citation block is encountered.
        :param extra_args: Any other tlgu args to be passed, in list form and
            without dashes, e.g.: ['p', 'b', 'B'].
        :raises TypeError: If ``input_path`` or ``output_path`` is None.
        :raises AssertionError: If ``input_path`` is not an existing file.
        """
        if input_path is None or output_path is None:
            raise TypeError("Both 'input_path' and 'output_path' must be given.")

        # setup file paths
        input_path = os.path.expanduser(input_path)
        output_path = os.path.expanduser(output_path)

        # Raised explicitly (rather than via ``assert``) so the check is not
        # stripped under ``python -O``; the exception type is kept for
        # backward compatibility.
        if not os.path.isfile(input_path):
            raise AssertionError("File {0} does not exist.".format(input_path))

        tlgu_flags: list[str] = TLGU._build_tlgu_flags(
            markup=markup,
            rm_newlines=rm_newlines,
            divide_works=divide_works,
            lat=lat,
            extra_args=extra_args,
        )

        # make tlgu call
        tlgu_call: list[str] = ["tlgu", *tlgu_flags, input_path, output_path]
        logger.info(" ".join(tlgu_call))
        try:
            p_out: int = subprocess.call(tlgu_call)
        except Exception as exc:
            logger.error("Failed to convert %s to %s: %s", input_path, output_path, exc)
            raise
        if p_out != 0:
            logger.error("Failed to convert %s to %s.", input_path, output_path)

    def convert_corpus(
        self, corpus: str, markup: Optional[str] = None, lat: Optional[bool] = None
    ) -> None:
        """Look for imported TLG or PHI files and convert them all to
        ``~/cltk_data/grc/text/tlg/<plaintext>``.

        Files that fail to convert are logged and skipped; the remaining
        files are still processed.

        TODO: Add rm_newlines, divide_works, and extra_args

        :param corpus: One of 'tlg', 'phi5', or 'phi7'.
        :param markup: Passed on to ``convert()`` and also used as the name of
            the output sub-directory ('plaintext' when None).
        :param lat: Only consulted for 'phi7': when True the corpus is
            converted as Latin and written below ``lat``. 'phi5' is always
            treated as Latin and 'tlg' always as Greek.
        :raises AssertionError: If ``corpus`` is not a supported name.
        """
        # Raised explicitly so the check survives ``python -O``; the
        # exception type is kept for backward compatibility.
        if corpus not in ("tlg", "phi5", "phi7"):
            raise AssertionError("Corpus must be 'tlg', 'phi5', or 'phi7'")

        orig_path: str = os.path.join(make_cltk_path("originals"), corpus)
        use_latin: bool = corpus == "phi5" or (corpus == "phi7" and lat is True)
        language_dir: str = "lat" if use_latin else "grc"
        target_path: str = os.path.join(
            make_cltk_path(), language_dir, "text", corpus
        )

        try:
            corpus_files: list[str] = os.listdir(orig_path)
        except Exception as exception:
            logger.error("Failed to find TLG files: %s", exception)
            raise

        # make a list of files to be converted
        txts: list[str] = [x for x in corpus_files if x.endswith("TXT")]
        if not txts:
            return

        markup_dir: str = "plaintext" if markup is None else str(markup)
        target_txt_dir: str = os.path.join(target_path, markup_dir)
        os.makedirs(target_txt_dir, exist_ok=True)

        # loop through list and convert one at a time
        for txt in txts:
            orig_txt_path: str = os.path.join(orig_path, txt)
            target_txt_path: str = os.path.join(target_txt_dir, txt)
            try:
                self.convert(
                    orig_txt_path,
                    target_txt_path,
                    markup=markup,
                    rm_newlines=False,
                    divide_works=False,
                    lat=use_latin,
                    extra_args=None,
                )
            except Exception as exception:
                # Best effort: one bad file must not abort the whole corpus.
                logger.error(
                    "Failed to convert file '%s' to '%s': %s",
                    orig_txt_path,
                    target_txt_path,
                    exception,
                )

    def divide_works(self, corpus: str) -> None:
        """Use the work-breaking option.

        Every ``.TXT`` file of the corpus whose name starts with the corpus
        prefix is converted with ``divide_works=True`` into the corpus'
        ``individual_works`` directory. Failures are logged per file and do
        not stop the loop.

        TODO: Maybe incorporate this into ``convert_corpus()``
        TODO: Write test for this

        :param corpus: 'tlg' or 'phi5'.
        :raises CLTKException: If ``corpus`` is 'phi7' or any other value.
        """
        if corpus == "tlg":
            orig_dir = make_cltk_path("originals/tlg")
            works_dir = make_cltk_path("grc/text/tlg/individual_works")
            file_prefix = "TLG"
            lat = False
        elif corpus == "phi5":
            orig_dir = make_cltk_path("originals/phi5")
            works_dir = make_cltk_path("lat/text/phi5/individual_works")
            file_prefix = "LAT"
            lat = True  # this is for the optional TLGU argument to convert()
        elif corpus == "phi7":
            raise CLTKException("``phi7`` cannot be divided into individual works.")
        else:
            raise CLTKException(f"Invalid corpus '{corpus}'. This should never happen.")

        os.makedirs(works_dir, exist_ok=True)

        texts: list[str] = [
            name
            for name in os.listdir(orig_dir)
            if name.endswith(".TXT") and name.startswith(file_prefix)
        ]

        for file in texts:
            orig_file_path: str = os.path.join(orig_dir, file)
            new_file_path: str = os.path.join(works_dir, file)
            try:
                self.convert(orig_file_path, new_file_path, divide_works=True, lat=lat)
                logger.info("Writing files at %s to %s.", orig_file_path, works_dir)
            except Exception as err:
                logger.error("Failed to convert files: %s.", err)
# assemble_tlg_author_filepaths