Source code for cltk.languages.pipelines

"""Default processing pipelines for languages. The purpose of
these dataclasses is to represent:

1. the types of NLP processes that the CLTK can do
2. the order in which processes are to be executed
3. the downstream features that a particular implemented process requires
"""

from dataclasses import dataclass, field
from typing import Type

from cltk.alphabet.processes import GreekNormalizeProcess, LatinNormalizeProcess
from cltk.core.data_types import Language, Pipeline, Process
from cltk.dependency.processes import (
    ChineseStanzaProcess,
    CopticStanzaProcess,
    GothicStanzaProcess,
    GreekStanzaProcess,
    LatinSpacyProcess,
    LatinStanzaProcess,
    OCSStanzaProcess,
    OldFrenchStanzaProcess,
)
from cltk.embeddings.processes import (
    ArabicEmbeddingsProcess,
    AramaicEmbeddingsProcess,
    GothicEmbeddingsProcess,
    GreekEmbeddingsProcess,
    LatinEmbeddingsProcess,
    MiddleEnglishEmbeddingsProcess,
    OldEnglishEmbeddingsProcess,
    PaliEmbeddingsProcess,
    SanskritEmbeddingsProcess,
)
from cltk.languages.utils import get_lang
from cltk.lemmatize.processes import (
    GreekLemmatizationProcess,
    LatinLemmatizationProcess,
    OldEnglishLemmatizationProcess,
    OldFrenchLemmatizationProcess,
)
from cltk.lexicon.processes import LatinLexiconProcess, OldNorseLexiconProcess
from cltk.ner.processes import (  # GreekNERProcess,; LatinNERProcess,; OldEnglishNERProcess,
    OldFrenchNERProcess,
)
from cltk.stops.processes import StopsProcess
from cltk.tokenizers.processes import (
    AkkadianTokenizationProcess,
    ArabicTokenizationProcess,
    GreekTokenizationProcess,
    LatinTokenizationProcess,
    MiddleEnglishTokenizationProcess,
    MiddleFrenchTokenizationProcess,
    MiddleHighGermanTokenizationProcess,
    MultilingualTokenizationProcess,
    OldFrenchTokenizationProcess,
    OldNorseTokenizationProcess,
)


@dataclass
class AkkadianPipeline(Pipeline):
    """Standard processing ``Pipeline`` for the Akkadian language.

    Processes, in execution order: ``AkkadianTokenizationProcess``,
    ``StopsProcess``.

    >>> from cltk.languages.pipelines import AkkadianPipeline
    >>> a_pipeline = AkkadianPipeline()
    >>> a_pipeline.description
    'Pipeline for the Akkadian language.'
    >>> a_pipeline.language
    Language(name='Akkadian', glottolog_id='akka1240', latitude=33.1, longitude=44.1, family_id='afro1255', parent_id='east2678', level='language', iso_639_3_code='akk', type='a', dates=[])
    >>> a_pipeline.language.name
    'Akkadian'
    >>> a_pipeline.processes[0]
    <class 'cltk.tokenizers.processes.AkkadianTokenizationProcess'>
    """

    description: str = "Pipeline for the Akkadian language."
    # Looked up once, when this class body is evaluated.
    language: Language = get_lang("akk")
    # The factory builds a fresh list for every instance.
    processes: list[Type[Process]] = field(
        default_factory=lambda: [
            AkkadianTokenizationProcess,
            StopsProcess,
        ]
    )
@dataclass
class ArabicPipeline(Pipeline):
    """Standard processing ``Pipeline`` for the Arabic language.

    Processes, in execution order: ``ArabicTokenizationProcess``,
    ``ArabicEmbeddingsProcess``, ``StopsProcess``.

    >>> from cltk.languages.pipelines import ArabicPipeline
    >>> a_pipeline = ArabicPipeline()
    >>> a_pipeline.description
    'Pipeline for the Arabic language'
    >>> a_pipeline.language
    Language(name='Standard Arabic', glottolog_id='stan1318', latitude=27.9625, longitude=43.8525, family_id='afro1255', parent_id='arab1395', level='language', iso_639_3_code='arb', type='', dates=[])
    >>> a_pipeline.language.name
    'Standard Arabic'
    >>> a_pipeline.processes[0]
    <class 'cltk.tokenizers.processes.ArabicTokenizationProcess'>
    """

    description: str = "Pipeline for the Arabic language"
    # Looked up once, when this class body is evaluated.
    language: Language = get_lang("arb")
    # The factory builds a fresh list for every instance.
    processes: list[Type[Process]] = field(
        default_factory=lambda: [
            ArabicTokenizationProcess,
            ArabicEmbeddingsProcess,
            StopsProcess,
        ]
    )
@dataclass
class AramaicPipeline(Pipeline):
    """Standard processing ``Pipeline`` for the Aramaic language.

    Processes, in execution order: ``ArabicTokenizationProcess``,
    ``AramaicEmbeddingsProcess``.

    TODO: Confirm with a specialist which encodings should be expected.
    TODO: Replace ``ArabicTokenizationProcess`` with a multilingual
    tokenizer or an Aramaic-specific one.

    >>> from cltk.languages.pipelines import AramaicPipeline
    >>> a_pipeline = AramaicPipeline()
    >>> a_pipeline.description
    'Pipeline for the Aramaic language'
    >>> a_pipeline.language
    Language(name='Official Aramaic (700-300 BCE)', glottolog_id='', latitude=0.0, longitude=0.0, family_id='', parent_id='', level='', iso_639_3_code='arc', type='a', dates=[])
    >>> a_pipeline.language.name
    'Official Aramaic (700-300 BCE)'
    >>> a_pipeline.processes[0]
    <class 'cltk.tokenizers.processes.ArabicTokenizationProcess'>
    """

    description: str = "Pipeline for the Aramaic language"
    # Looked up once, when this class body is evaluated.
    language: Language = get_lang("arc")
    # The factory builds a fresh list for every instance.
    processes: list[Type[Process]] = field(
        default_factory=lambda: [
            # Open question: the Arabic tokenizer is used for Aramaic. Is this OK?
            ArabicTokenizationProcess,
            AramaicEmbeddingsProcess,
        ]
    )
@dataclass
class ChinesePipeline(Pipeline):
    """Standard processing ``Pipeline`` for Classical Chinese.

    The only process in the default list is ``ChineseStanzaProcess``.

    >>> from cltk.languages.pipelines import ChinesePipeline
    >>> a_pipeline = ChinesePipeline()
    >>> a_pipeline.description
    'Pipeline for the Classical Chinese language'
    >>> a_pipeline.language
    Language(name='Literary Chinese', glottolog_id='lite1248', latitude=0.0, longitude=0.0, family_id='sino1245', parent_id='clas1255', level='language', iso_639_3_code='lzh', type='h', dates=[])
    >>> a_pipeline.language.name
    'Literary Chinese'
    >>> a_pipeline.processes[0]
    <class 'cltk.dependency.processes.ChineseStanzaProcess'>
    """

    description: str = "Pipeline for the Classical Chinese language"
    # Looked up once, when this class body is evaluated.
    language: Language = get_lang("lzh")
    # The factory builds a fresh list for every instance.
    processes: list[Type[Process]] = field(
        default_factory=lambda: [
            ChineseStanzaProcess,
        ]
    )
@dataclass
class CopticPipeline(Pipeline):
    """Standard processing ``Pipeline`` for the Coptic language.

    Processes, in execution order: ``CopticStanzaProcess``,
    ``StopsProcess``.

    >>> from cltk.languages.pipelines import CopticPipeline
    >>> a_pipeline = CopticPipeline()
    >>> a_pipeline.description
    'Pipeline for the Coptic language'
    >>> a_pipeline.language
    Language(name='Coptic', glottolog_id='copt1239', latitude=29.472, longitude=31.2053, family_id='afro1255', parent_id='egyp1245', level='language', iso_639_3_code='cop', type='', dates=[])
    >>> a_pipeline.language.name
    'Coptic'
    >>> a_pipeline.processes[0]
    <class 'cltk.dependency.processes.CopticStanzaProcess'>
    """

    description: str = "Pipeline for the Coptic language"
    # Looked up once, when this class body is evaluated.
    language: Language = get_lang("cop")
    # The factory builds a fresh list for every instance.
    processes: list[Type[Process]] = field(
        default_factory=lambda: [
            CopticStanzaProcess,
            StopsProcess,
        ]
    )
@dataclass
class GothicPipeline(Pipeline):
    """Standard processing ``Pipeline`` for the Gothic language.

    Processes, in execution order: ``GothicStanzaProcess``,
    ``GothicEmbeddingsProcess``.

    >>> from cltk.languages.pipelines import GothicPipeline
    >>> a_pipeline = GothicPipeline()
    >>> a_pipeline.description
    'Pipeline for the Gothic language'
    >>> a_pipeline.language
    Language(name='Gothic', glottolog_id='goth1244', latitude=46.9304, longitude=29.9786, family_id='indo1319', parent_id='east2805', level='language', iso_639_3_code='got', type='a', dates=[])
    >>> a_pipeline.language.name
    'Gothic'
    >>> a_pipeline.processes[0]
    <class 'cltk.dependency.processes.GothicStanzaProcess'>
    >>> a_pipeline.processes[1]
    <class 'cltk.embeddings.processes.GothicEmbeddingsProcess'>
    """

    description: str = "Pipeline for the Gothic language"
    # Looked up once, when this class body is evaluated.
    language: Language = get_lang("got")
    # The factory builds a fresh list for every instance.
    processes: list[Type[Process]] = field(
        default_factory=lambda: [
            GothicStanzaProcess,
            GothicEmbeddingsProcess,
        ]
    )
@dataclass
class GreekPipeline(Pipeline):
    """Standard processing ``Pipeline`` for Ancient Greek.

    Processes, in execution order: ``GreekNormalizeProcess``,
    ``GreekStanzaProcess``, ``GreekEmbeddingsProcess``, ``StopsProcess``.

    >>> from cltk.languages.pipelines import GreekPipeline
    >>> a_pipeline = GreekPipeline()
    >>> a_pipeline.description
    'Pipeline for the Greek language'
    >>> a_pipeline.language
    Language(name='Ancient Greek', glottolog_id='anci1242', latitude=39.8155, longitude=21.9129, family_id='indo1319', parent_id='east2798', level='language', iso_639_3_code='grc', type='h', dates=[])
    >>> a_pipeline.language.name
    'Ancient Greek'
    >>> a_pipeline.processes[0]
    <class 'cltk.alphabet.processes.GreekNormalizeProcess'>
    """

    description: str = "Pipeline for the Greek language"
    # Looked up once, when this class body is evaluated.
    language: Language = get_lang("grc")
    # The factory builds a fresh list for every instance.
    processes: list[Type[Process]] = field(
        default_factory=lambda: [
            # Disabled: GreekTokenizationProcess
            GreekNormalizeProcess,
            GreekStanzaProcess,
            GreekEmbeddingsProcess,
            StopsProcess,
            # Disabled: GreekNERProcess
        ]
    )
@dataclass
class HindiPipeline(Pipeline):
    """Standard processing ``Pipeline`` for the Hindi language.

    Processes, in execution order: ``MultilingualTokenizationProcess``,
    ``StopsProcess``.

    >>> from cltk.languages.pipelines import HindiPipeline
    >>> a_pipeline = HindiPipeline()
    >>> a_pipeline.description
    'Pipeline for the Hindi language.'
    >>> a_pipeline.language
    Language(name='Hindi', glottolog_id='hind1269', latitude=25.0, longitude=77.0, family_id='indo1319', parent_id='hind1270', level='language', iso_639_3_code='hin', type='', dates=[])
    >>> a_pipeline.language.name
    'Hindi'
    >>> a_pipeline.processes[1]
    <class 'cltk.stops.processes.StopsProcess'>
    """

    description: str = "Pipeline for the Hindi language."
    # Looked up once, when this class body is evaluated.
    language: Language = get_lang("hin")
    # The factory builds a fresh list for every instance.
    processes: list[Type[Process]] = field(
        default_factory=lambda: [
            MultilingualTokenizationProcess,
            StopsProcess,
        ]
    )
@dataclass
class LatinPipeline(Pipeline):
    """Standard processing ``Pipeline`` for the Latin language.

    Processes, in execution order: ``LatinNormalizeProcess``,
    ``LatinSpacyProcess``, ``LatinEmbeddingsProcess``, ``StopsProcess``,
    ``LatinLexiconProcess``.

    TODO: Add stopword annotation for all relevant pipelines.

    >>> from cltk.languages.pipelines import LatinPipeline
    >>> a_pipeline = LatinPipeline()
    >>> a_pipeline.description
    'Pipeline for the Latin language'
    >>> a_pipeline.language
    Language(name='Latin', glottolog_id='lati1261', latitude=41.9026, longitude=12.4502, family_id='indo1319', parent_id='impe1234', level='language', iso_639_3_code='lat', type='a', dates=[])
    >>> a_pipeline.language.name
    'Latin'
    >>> a_pipeline.processes[0]
    <class 'cltk.alphabet.processes.LatinNormalizeProcess'>
    """

    description: str = "Pipeline for the Latin language"
    # Looked up once, when this class body is evaluated.
    language: Language = get_lang("lat")
    # The factory builds a fresh list for every instance.
    processes: list[Type[Process]] = field(
        default_factory=lambda: [
            LatinNormalizeProcess,
            # Disabled: LatinTokenizationProcess
            # Disabled: LatinStanzaProcess
            LatinSpacyProcess,
            LatinEmbeddingsProcess,
            # Open question: is this necessary since Spacy has .is_stop?
            StopsProcess,
            # Disabled: LatinNERProcess
            LatinLexiconProcess,
        ]
    )
@dataclass
class MiddleHighGermanPipeline(Pipeline):
    """Default ``Pipeline`` for Middle High German.

    Processes, in execution order:
    ``MiddleHighGermanTokenizationProcess``, ``StopsProcess``.

    >>> from cltk.languages.pipelines import MiddleHighGermanPipeline
    >>> a_pipeline = MiddleHighGermanPipeline()
    >>> a_pipeline.description
    'Pipeline for the Middle High German language.'
    >>> a_pipeline.language
    Language(name='Middle High German', glottolog_id='midd1343', latitude=0.0, longitude=0.0, family_id='indo1319', parent_id='midd1349', level='language', iso_639_3_code='gmh', type='h', dates=[])
    >>> a_pipeline.language.name
    'Middle High German'
    >>> a_pipeline.processes[0]
    <class 'cltk.tokenizers.processes.MiddleHighGermanTokenizationProcess'>
    """

    description: str = "Pipeline for the Middle High German language."
    # Looked up once, when this class body is evaluated.
    language: Language = get_lang("gmh")
    # The factory builds a fresh list for every instance.
    processes: list[Type[Process]] = field(
        default_factory=lambda: [MiddleHighGermanTokenizationProcess, StopsProcess]
    )
@dataclass
class MiddleEnglishPipeline(Pipeline):
    """Default ``Pipeline`` for Middle English.

    Processes, in execution order: ``MiddleEnglishTokenizationProcess``,
    ``StopsProcess``, ``MiddleEnglishEmbeddingsProcess``.

    TODO: Figure out whether the dedicated tokenizer is good enough or
    necessary; we have stanza for Old English, which might be able to
    tokenize fine.

    >>> from cltk.languages.pipelines import MiddleEnglishPipeline
    >>> a_pipeline = MiddleEnglishPipeline()
    >>> a_pipeline.description
    'Pipeline for the Middle English language'
    >>> a_pipeline.language
    Language(name='Middle English', glottolog_id='midd1317', latitude=0.0, longitude=0.0, family_id='indo1319', parent_id='merc1242', level='language', iso_639_3_code='enm', type='h', dates=[])
    >>> a_pipeline.language.name
    'Middle English'
    >>> a_pipeline.processes[0]
    <class 'cltk.tokenizers.processes.MiddleEnglishTokenizationProcess'>
    >>> from cltk import NLP
    >>> middle_english_nlp = NLP(language="enm", suppress_banner=True)
    >>> from cltk.languages.example_texts import get_example_text
    >>> doc = middle_english_nlp.analyze(get_example_text("enm"))
    >>> doc[2].embedding.shape
    (50,)
    """

    description: str = "Pipeline for the Middle English language"
    # Looked up once, when this class body is evaluated.
    language: Language = get_lang("enm")
    # The factory builds a fresh list for every instance.
    processes: list[Type[Process]] = field(
        default_factory=lambda: [
            MiddleEnglishTokenizationProcess,
            StopsProcess,
            MiddleEnglishEmbeddingsProcess,
        ]
    )
@dataclass
class MiddleFrenchPipeline(Pipeline):
    """Default ``Pipeline`` for Middle French.

    The only process in the default list is
    ``MiddleFrenchTokenizationProcess``.

    TODO: Figure out whether the dedicated tokenizer is good enough or
    necessary; we have stanza for Old French, which might be able to
    tokenize fine.

    >>> from cltk.languages.pipelines import MiddleFrenchPipeline
    >>> a_pipeline = MiddleFrenchPipeline()
    >>> a_pipeline.description
    'Pipeline for the Middle French language'
    >>> a_pipeline.language
    Language(name='Middle French', glottolog_id='midd1316', latitude=0.0, longitude=0.0, family_id='indo1319', parent_id='stan1290', level='dialect', iso_639_3_code='frm', type='h', dates=[])
    >>> a_pipeline.language.name
    'Middle French'
    >>> a_pipeline.processes[0]
    <class 'cltk.tokenizers.processes.MiddleFrenchTokenizationProcess'>
    """

    description: str = "Pipeline for the Middle French language"
    # Looked up once, when this class body is evaluated.
    language: Language = get_lang("frm")
    # The factory builds a fresh list for every instance.
    processes: list[Type[Process]] = field(
        default_factory=lambda: [MiddleFrenchTokenizationProcess]
    )
@dataclass
class OCSPipeline(Pipeline):
    """Standard processing ``Pipeline`` for Old Church Slavonic.

    The only process in the default list is ``OCSStanzaProcess``.

    >>> from cltk.languages.pipelines import OCSPipeline
    >>> a_pipeline = OCSPipeline()
    >>> a_pipeline.description
    'Pipeline for the Old Church Slavonic language'
    >>> a_pipeline.language
    Language(name='Church Slavic', glottolog_id='chur1257', latitude=43.7171, longitude=22.8442, family_id='indo1319', parent_id='east2269', level='language', iso_639_3_code='chu', type='a', dates=[])
    >>> a_pipeline.language.name
    'Church Slavic'
    >>> a_pipeline.processes[0]
    <class 'cltk.dependency.processes.OCSStanzaProcess'>
    """

    description: str = "Pipeline for the Old Church Slavonic language"
    # Looked up once, when this class body is evaluated.
    language: Language = get_lang("chu")
    # The factory builds a fresh list for every instance.
    processes: list[Type[Process]] = field(
        default_factory=lambda: [
            OCSStanzaProcess,
        ]
    )
@dataclass
class OldEnglishPipeline(Pipeline):
    """Standard processing ``Pipeline`` for the Old English language.

    Processes, in execution order: ``MultilingualTokenizationProcess``,
    ``OldEnglishLemmatizationProcess``, ``OldEnglishEmbeddingsProcess``,
    ``StopsProcess``.

    >>> from cltk.languages.pipelines import OldEnglishPipeline
    >>> a_pipeline = OldEnglishPipeline()
    >>> a_pipeline.description
    'Pipeline for the Old English language'
    >>> a_pipeline.language
    Language(name='Old English (ca. 450-1100)', glottolog_id='olde1238', latitude=51.06, longitude=-1.31, family_id='indo1319', parent_id='angl1265', level='language', iso_639_3_code='ang', type='h', dates=[])
    >>> a_pipeline.language.name
    'Old English (ca. 450-1100)'
    >>> a_pipeline.processes[0]
    <class 'cltk.tokenizers.processes.MultilingualTokenizationProcess'>
    """

    description: str = "Pipeline for the Old English language"
    # Looked up once, when this class body is evaluated.
    language: Language = get_lang("ang")
    # The factory builds a fresh list for every instance.
    processes: list[Type[Process]] = field(
        default_factory=lambda: [
            MultilingualTokenizationProcess,
            OldEnglishLemmatizationProcess,
            OldEnglishEmbeddingsProcess,
            StopsProcess,
            # Disabled: OldEnglishNERProcess
        ]
    )
@dataclass
class OldFrenchPipeline(Pipeline):
    """Standard processing ``Pipeline`` for the Old French language.

    Processes, in execution order: ``OldFrenchStanzaProcess``,
    ``StopsProcess``, ``OldFrenchNERProcess``.

    >>> from cltk.languages.pipelines import OldFrenchPipeline
    >>> a_pipeline = OldFrenchPipeline()
    >>> a_pipeline.description
    'Pipeline for the Old French language'
    >>> a_pipeline.language
    Language(name='Old French (842-ca. 1400)', glottolog_id='oldf1239', latitude=0.0, longitude=0.0, family_id='indo1319', parent_id='oila1234', level='language', iso_639_3_code='fro', type='h', dates=[])
    >>> a_pipeline.language.name
    'Old French (842-ca. 1400)'
    >>> a_pipeline.processes[0]
    <class 'cltk.dependency.processes.OldFrenchStanzaProcess'>
    """

    description: str = "Pipeline for the Old French language"
    # Looked up once, when this class body is evaluated.
    language: Language = get_lang("fro")
    # The factory builds a fresh list for every instance.
    processes: list[Type[Process]] = field(
        default_factory=lambda: [
            # Disabled: OldFrenchTokenizationProcess
            OldFrenchStanzaProcess,
            StopsProcess,
            OldFrenchNERProcess,
        ]
    )
# TODO: Add Old Marathi ("omr")
@dataclass
class OldNorsePipeline(Pipeline):
    """Standard processing ``Pipeline`` for the Old Norse language.

    Processes, in execution order: ``OldNorseTokenizationProcess``,
    ``StopsProcess``, ``OldNorseLexiconProcess``.

    >>> from cltk.languages.pipelines import OldNorsePipeline
    >>> a_pipeline = OldNorsePipeline()
    >>> a_pipeline.description
    'Pipeline for the Old Norse language'
    >>> a_pipeline.language
    Language(name='Old Norse', glottolog_id='oldn1244', latitude=63.42, longitude=10.38, family_id='indo1319', parent_id='west2805', level='language', iso_639_3_code='non', type='h', dates=[])
    >>> a_pipeline.language.name
    'Old Norse'
    >>> a_pipeline.processes[0]
    <class 'cltk.tokenizers.processes.OldNorseTokenizationProcess'>
    """

    description: str = "Pipeline for the Old Norse language"
    # Looked up once, when this class body is evaluated.
    language: Language = get_lang("non")
    # The factory builds a fresh list for every instance.
    processes: list[Type[Process]] = field(
        default_factory=lambda: [
            OldNorseTokenizationProcess,
            StopsProcess,
            OldNorseLexiconProcess,
        ]
    )
@dataclass
class PaliPipeline(Pipeline):
    """Standard processing ``Pipeline`` for the Pali language.

    Processes, in execution order: ``MultilingualTokenizationProcess``,
    ``PaliEmbeddingsProcess``.

    TODO: Make a better tokenizer for Pali.

    >>> from cltk.languages.pipelines import PaliPipeline
    >>> a_pipeline = PaliPipeline()
    >>> a_pipeline.description
    'Pipeline for the Pali language'
    >>> a_pipeline.language
    Language(name='Pali', glottolog_id='pali1273', latitude=24.5271, longitude=82.251, family_id='indo1319', parent_id='biha1245', level='language', iso_639_3_code='pli', type='a', dates=[])
    >>> a_pipeline.language.name
    'Pali'
    >>> a_pipeline.processes[0]
    <class 'cltk.tokenizers.processes.MultilingualTokenizationProcess'>
    """

    description: str = "Pipeline for the Pali language"
    # Looked up once, when this class body is evaluated.
    language: Language = get_lang("pli")
    # The factory builds a fresh list for every instance.
    processes: list[Type[Process]] = field(
        default_factory=lambda: [
            MultilingualTokenizationProcess,
            PaliEmbeddingsProcess,
        ]
    )
@dataclass
class PanjabiPipeline(Pipeline):
    """Default ``Pipeline`` for Panjabi.

    Processes, in execution order: ``MultilingualTokenizationProcess``,
    ``StopsProcess``.

    >>> from cltk.languages.pipelines import PanjabiPipeline
    >>> a_pipeline = PanjabiPipeline()
    >>> a_pipeline.description
    'Pipeline for the Panjabi language.'
    >>> a_pipeline.language
    Language(name='Eastern Panjabi', glottolog_id='panj125', latitude=30.0368, longitude=75.6702, family_id='indo1319', parent_id='east2727', level='language', iso_639_3_code='pan', type='', dates=[])
    >>> a_pipeline.language.name
    'Eastern Panjabi'
    >>> a_pipeline.processes[1]
    <class 'cltk.stops.processes.StopsProcess'>
    """

    description: str = "Pipeline for the Panjabi language."
    # Looked up once, when this class body is evaluated.
    # NOTE(review): the doctest shows glottolog_id='panj125', one character
    # shorter than every other populated ID in this module -- confirm
    # against the language data.
    language: Language = get_lang("pan")
    # The factory builds a fresh list for every instance.
    processes: list[Type[Process]] = field(
        default_factory=lambda: [MultilingualTokenizationProcess, StopsProcess]
    )
@dataclass
class SanskritPipeline(Pipeline):
    """Standard processing ``Pipeline`` for the Sanskrit language.

    Processes, in execution order: ``MultilingualTokenizationProcess``,
    ``SanskritEmbeddingsProcess``, ``StopsProcess``.

    TODO: Make a better tokenizer for Sanskrit.

    >>> from cltk.languages.pipelines import SanskritPipeline
    >>> a_pipeline = SanskritPipeline()
    >>> a_pipeline.description
    'Pipeline for the Sanskrit language.'
    >>> a_pipeline.language
    Language(name='Sanskrit', glottolog_id='sans1269', latitude=20.0, longitude=77.0, family_id='indo1319', parent_id='indo1321', level='language', iso_639_3_code='san', type='a', dates=[])
    >>> a_pipeline.language.name
    'Sanskrit'
    >>> a_pipeline.processes[1]
    <class 'cltk.embeddings.processes.SanskritEmbeddingsProcess'>
    """

    description: str = "Pipeline for the Sanskrit language."
    # Looked up once, when this class body is evaluated.
    language: Language = get_lang("san")
    # The factory builds a fresh list for every instance.
    processes: list[Type[Process]] = field(
        default_factory=lambda: [
            MultilingualTokenizationProcess,
            SanskritEmbeddingsProcess,
            StopsProcess,
        ]
    )