Source code for cltk.corpora.grc.tei

"""Work with TEI XML files."""

import glob
import os

# ``bs4`` is an optional dependency: record whether it could be imported so
# that functions needing it can fail with a helpful message at call time
# instead of breaking the import of this module.
try:
    bs4_installed = True
    from bs4 import BeautifulSoup
except ImportError:
    bs4_installed = False

# ``MyCapytain`` (and ``lxml``) are optional dependencies: record whether they
# could be imported so that functions needing them can report the problem at
# call time instead of breaking the import of this module.
try:
    mycapitains_installed = True
    from lxml.etree import tostring
    from MyCapytain.common.constants import XPATH_NAMESPACES, Mimetypes
    from MyCapytain.resources.texts.local.capitains.cts import CapitainsCtsText
except ImportError:
    mycapitains_installed = False

from cltk.core.cltk_logger import logger
from cltk.utils.file_operations import make_cltk_path

def onekgreek_tei_xml_to_text():
    """Convert the First 1K Years of Greek TEI XML files to plaintext.

    Globs ``grc/text/grc_text_first1kgreek/data/*/*/*.xml`` (as resolved by
    ``make_cltk_path``), skips any path containing ``__cts__``, parses each
    remaining file with BeautifulSoup using the ``lxml`` parser, and writes
    the text of its ``<body>`` to
    ``grc/text/grc_text_first1kgreek_plaintext/<name>.txt``.

    Raises:
        ImportError: If ``bs4`` could not be imported.
        FileNotFoundError: If the glob matches no XML files.
    """
    if not bs4_installed:
        logger.error("Install `bs4` and `lxml` to parse these TEI files.")
        raise ImportError
    xml_dir = make_cltk_path("grc/text/grc_text_first1kgreek/data/*/*/*.xml")
    xml_paths = glob.glob(xml_dir)
    if not xml_paths:
        logger.error(
            "1K Greek corpus not installed. Use ``FetchCorpus`` to get `First1KGreek`."
        )
        raise FileNotFoundError
    xml_paths = [path for path in xml_paths if "__cts__" not in path]

    # Output directory for the plaintext versions.
    new_dir = make_cltk_path("grc/text/grc_text_first1kgreek_plaintext/")
    os.makedirs(new_dir, exist_ok=True)

    for xml_path in xml_paths:
        # ``splitext`` drops exactly the extension. ``str.rstrip(".xml")``
        # would instead strip any trailing run of the characters ".", "x",
        # "m" and "l", mangling stems that happen to end in those letters.
        stem, _ = os.path.splitext(os.path.basename(xml_path))
        txt_name = stem + ".txt"
        # Explicit UTF-8 so Greek text does not depend on the platform's
        # default locale encoding.
        with open(xml_path, encoding="utf-8") as xml_file:
            soup = BeautifulSoup(xml_file, "lxml")
        text = soup.body.get_text()
        new_plaintext_path = os.path.join(new_dir, txt_name)
        with open(new_plaintext_path, "w", encoding="utf-8") as txt_file:
            txt_file.write(text)
def onekgreek_tei_xml_to_text_capitains():
    """Use MyCapytain to convert the First 1K Years of Greek TEI to plaintext.

    Globs ``grc/text/grc_text_first1kgreek/data/*/*/*.xml`` (as resolved by
    ``make_cltk_path``), skips any path containing ``__cts__``, loads each
    remaining file as a ``CapitainsCtsText``, exports every passage at the
    deepest citation level as plaintext (excluding ``tei:note`` elements),
    and writes the concatenated result to
    ``grc/text/grc_text_first1kgreek_plaintext/<name>.txt``.

    Raises:
        ImportError: If ``MyCapytain``/``lxml`` could not be imported.
        FileNotFoundError: If the glob matches no XML files.
    """
    # Fail early with a clear message, mirroring the bs4-based converter,
    # rather than hitting a NameError on ``CapitainsCtsText`` further down.
    if not mycapitains_installed:
        logger.error("Install `MyCapytain` and `lxml` to parse these TEI files.")
        raise ImportError
    xml_dir = make_cltk_path("grc/text/grc_text_first1kgreek/data/*/*/*.xml")
    xml_paths = glob.glob(xml_dir)
    if not xml_paths:
        logger.error(
            "1K Greek corpus not installed. Use ``FetchCorpus`` to get `First1KGreek`."
        )
        raise FileNotFoundError
    xml_paths = [path for path in xml_paths if "__cts__" not in path]

    # Output directory for the plaintext versions.
    new_dir = make_cltk_path("grc/text/grc_text_first1kgreek_plaintext/")
    os.makedirs(new_dir, exist_ok=True)

    for xml_path in xml_paths:
        # ``splitext`` drops exactly the extension. ``str.rstrip(".xml")``
        # would instead strip any trailing run of the characters ".", "x",
        # "m" and "l", mangling stems that happen to end in those letters.
        stem, _ = os.path.splitext(os.path.basename(xml_path))
        txt_name = stem + ".txt"
        # Explicit UTF-8 so Greek text does not depend on the platform's
        # default locale encoding.
        with open(xml_path, encoding="utf-8") as xml_file:
            text = CapitainsCtsText(resource=xml_file)
            # One reference per passage at the deepest citation level.
            passages = [
                text.getTextualNode(subreference=ref, simple=True).export(
                    Mimetypes.PLAINTEXT, exclude=["tei:note"]
                )
                for ref in text.getReffs(level=len(text.citation))
            ]
        plain_text = "".join(passages)
        new_plaintext_path = os.path.join(new_dir, txt_name)
        with open(new_plaintext_path, "w", encoding="utf-8") as txt_file:
            txt_file.write(plain_text)