Source code for cltk.phonology.transcription_processes

"""This module provides phonological/phonetic transcription processes for several languages.

**PhonologicalTranscriptionProcess** is the parent class of all the language-specific
transcription processes defined in this module.
"""


from copy import copy
from dataclasses import dataclass

from boltons.cacheutils import cachedproperty

from cltk.core.data_types import Doc, Process
from cltk.phonology.ang.phonology import OldEnglishTranscription
from cltk.phonology.gmh.phonology import MiddleHighGermanTranscription

# from cltk.phonology.akk import AkkadianPhonologicalTranscriber
# from cltk.phonology.arb import ArabicPhonologicalTranscriber
from cltk.phonology.got.phonology import GothicTranscription
from cltk.phonology.grc.phonology import GreekTranscription
from cltk.phonology.lat.phonology import LatinTranscription
from cltk.phonology.non.old_swedish.phonology import OldSwedishTranscription
from cltk.phonology.non.orthophonology import OldNorsePhonologicalTranscriber

__author__ = ["Clément Besnier <clem@clementbesnier.fr>"]


@dataclass
class PhonologicalTranscriptionProcess(Process):
    """General phonological transcription `Process`.

    Subclasses supply the transcriber through an ``algorithm`` attribute; it
    is called with a single lower-cased word string.
    """

    def run(self, input_doc: Doc) -> Doc:
        """Attach a phonetic transcription to every word of a document.

        Args:
            input_doc: Document whose ``words`` are to be transcribed.

        Returns:
            A shallow copy of ``input_doc`` in which each word's
            ``phonetic_transcription`` holds the transcriber's output for the
            lower-cased ``word.string``.
        """
        transcriber = self.algorithm

        # NOTE(review): ``copy`` is shallow, so the word objects are presumably
        # shared with ``input_doc`` and get updated there as well -- confirm
        # that this is intended.
        output_doc = copy(input_doc)
        for word in output_doc.words:
            word.phonetic_transcription = transcriber(word.string.lower())
        return output_doc


# class AkkadianPhonologicalTranscriberProcess(PhonologicalTranscriptionProcess):
#     """
#     >>> from cltk.core.data_types import Process, Pipeline
#     >>> from cltk.tokenizers.processes import AkkadianTokenizationProcess
#     >>> from cltk.languages.utils import get_lang
#     >>> from cltk.languages.example_texts import get_example_text
#     >>> from cltk.nlp import NLP
#     >>> pipe = Pipeline(description="A custom Akkadian pipeline", \
#     processes=[AkkadianTokenizationProcess, AkkadianPhonologicalTranscriberProcess], \
#     language=get_lang("akk"))
#     >>> nlp = NLP(language='akk', custom_pipeline=pipe)
#     >>> nlp(get_example_text("akk")).phonetic_transcription
#
#     """
#
#     description = "The default Akkadian transcription process"
#
#     @cachedproperty
#     def algorithm(self):
#         return AkkadianPhonologicalTranscriber()


# class ArabicPhonologicalTranscriberProcess(PhonologicalTranscriptionProcess):
#     """
#     >>> from cltk.core.data_types import Process, Pipeline
#     >>> from cltk.tokenizers.processes import ArabicTokenizationProcess
#     >>> from cltk.languages.utils import get_lang
#     >>> from cltk.languages.example_texts import get_example_text
#     >>> from cltk.nlp import NLP
#     >>> pipe = Pipeline(description="A custom Arabic pipeline", \
#     processes=[ArabicTokenizationProcess, ArabicPhonologicalTranscriberProcess], \
#     language=get_lang("arb"))
#     >>> nlp = NLP(language='arb', custom_pipeline=pipe)
#     >>> text = get_example_text("arb")
#     >>> [word.phonetic_transcription for word in nlp(text)]
#
#     """
#
#     description = "The default Arabic transcription process"
#
#     @cachedproperty
#     def algorithm(self):
#         return ArabicPhonologicalTranscriber()


class GothicPhonologicalTranscriberProcess(PhonologicalTranscriptionProcess):
    """Phonological transcription `Process` for Gothic.

    >>> from cltk.core.data_types import Process, Pipeline
    >>> from cltk.tokenizers.processes import OldNorseTokenizationProcess
    >>> from cltk.text.processes import DefaultPunctuationRemovalProcess
    >>> from cltk.languages.utils import get_lang
    >>> from cltk.languages.example_texts import get_example_text
    >>> from cltk.nlp import NLP
    >>> pipe = Pipeline(description="A custom Gothic pipeline", \
    processes=[OldNorseTokenizationProcess, DefaultPunctuationRemovalProcess, \
    GothicPhonologicalTranscriberProcess], language=get_lang("got"))
    >>> nlp = NLP(language='got', custom_pipeline=pipe, suppress_banner=True)
    >>> text = get_example_text("got")
    >>> cltk_doc = nlp(text)
    >>> [word.phonetic_transcription for word in cltk_doc.words[:5]]
    ['swa', 'liuhtjɛ', 'liuhaθ', 'jzwar', 'jn']
    """

    # Human-readable label for this process.
    description = "The default Gothic transcription process"

    @cachedproperty
    def algorithm(self) -> GothicTranscription:
        """Return the Gothic transcriber.

        ``cachedproperty`` builds the ``GothicTranscription`` object on first
        access and reuses it for this instance afterwards.
        """
        return GothicTranscription()


class GreekPhonologicalTranscriberProcess(PhonologicalTranscriptionProcess):
    """Phonological transcription `Process` for Ancient Greek.

    >>> from cltk.core.data_types import Process, Pipeline
    >>> from cltk.tokenizers.processes import GreekTokenizationProcess
    >>> from cltk.text.processes import DefaultPunctuationRemovalProcess
    >>> from cltk.languages.utils import get_lang
    >>> from cltk.languages.example_texts import get_example_text
    >>> from cltk.nlp import NLP
    >>> pipe = Pipeline(description="A custom Greek pipeline", \
    processes=[GreekTokenizationProcess, DefaultPunctuationRemovalProcess,\
    GreekPhonologicalTranscriberProcess], language=get_lang("grc"))
    >>> nlp = NLP(language='grc', custom_pipeline=pipe, suppress_banner=True)
    >>> text = get_example_text("grc")
    >>> cltk_doc = nlp(text)
    >>> [word.phonetic_transcription for word in cltk_doc.words[:5]]
    ['hó.ti', 'men', 'hy.mệːs', 'ɔ̂ː', 'ɑ́n.dres']
    """

    # Human-readable label for this process.
    description = "The default Greek transcription process"

    @cachedproperty
    def algorithm(self) -> GreekTranscription:
        """Return the Ancient Greek transcriber.

        ``cachedproperty`` builds the ``GreekTranscription`` object on first
        access and reuses it for this instance afterwards.
        """
        return GreekTranscription()


class LatinPhonologicalTranscriberProcess(PhonologicalTranscriptionProcess):
    """Phonological transcription `Process` for Latin.

    >>> from cltk.core.data_types import Process, Pipeline
    >>> from cltk.tokenizers.processes import LatinTokenizationProcess
    >>> from cltk.text.processes import DefaultPunctuationRemovalProcess
    >>> from cltk.languages.utils import get_lang
    >>> from cltk.languages.example_texts import get_example_text
    >>> from cltk import NLP
    >>> a_pipeline = Pipeline(description="A custom Latin pipeline", processes=[LatinTokenizationProcess, DefaultPunctuationRemovalProcess, LatinPhonologicalTranscriberProcess], language=get_lang("lat"))
    >>> nlp = NLP(language="lat", custom_pipeline=a_pipeline, suppress_banner=True)
    >>> text = get_example_text("lat")
    >>> cltk_doc = nlp.analyze(text)
    >>> [word.phonetic_transcription for word in cltk_doc.words][:5]
    ['[gaɫlɪ̣ja]', '[ɛst̪]', '[ɔmn̪ɪs]', '[d̪ɪwɪsa]', '[ɪn̪]']
    """

    # Human-readable label for this process.
    description = "The default Latin transcription process"

    @cachedproperty
    def algorithm(self) -> LatinTranscription:
        """Return the Latin transcriber.

        ``cachedproperty`` builds the ``LatinTranscription`` object on first
        access and reuses it for this instance afterwards.
        """
        return LatinTranscription()


class MiddleHighGermanPhonologicalTranscriberProcess(PhonologicalTranscriptionProcess):
    """Phonological transcription `Process` for Middle High German.

    >>> from cltk.core.data_types import Process, Pipeline
    >>> from cltk.tokenizers.processes import MiddleHighGermanTokenizationProcess
    >>> from cltk.text.processes import DefaultPunctuationRemovalProcess
    >>> from cltk.languages.utils import get_lang
    >>> from cltk.languages.example_texts import get_example_text
    >>> from cltk.nlp import NLP
    >>> pipe = Pipeline(description="A custom Middle High German pipeline", \
    processes=[MiddleHighGermanTokenizationProcess, DefaultPunctuationRemovalProcess, \
    MiddleHighGermanPhonologicalTranscriberProcess], language=get_lang("gmh"))
    >>> nlp = NLP(language='gmh', custom_pipeline=pipe, suppress_banner=True)
    >>> text = get_example_text("gmh")
    >>> cltk_doc = nlp(text)
    >>> [word.phonetic_transcription for word in cltk_doc.words[:5]]
    ['ʊns', 'ɪst', 'ɪn', 'alten', 'mɛren']
    """

    # Human-readable label for this process.
    description = "The default Middle High German transcription process"

    @cachedproperty
    def algorithm(self) -> MiddleHighGermanTranscription:
        """Return the Middle High German transcriber.

        ``cachedproperty`` builds the ``MiddleHighGermanTranscription`` object
        on first access and reuses it for this instance afterwards.
        """
        return MiddleHighGermanTranscription()


class OldEnglishPhonologicalTranscriberProcess(PhonologicalTranscriptionProcess):
    """Phonological transcription `Process` for Old English.

    >>> from cltk.core.data_types import Process, Pipeline
    >>> from cltk.tokenizers.processes import MiddleEnglishTokenizationProcess
    >>> from cltk.text.processes import DefaultPunctuationRemovalProcess
    >>> from cltk.languages.utils import get_lang
    >>> from cltk.languages.example_texts import get_example_text
    >>> from cltk.nlp import NLP
    >>> pipe = Pipeline(description="A custom Old English pipeline", \
    processes=[MiddleEnglishTokenizationProcess, DefaultPunctuationRemovalProcess, \
    OldEnglishPhonologicalTranscriberProcess], language=get_lang("ang"))
    >>> nlp = NLP(language='ang', custom_pipeline=pipe, suppress_banner=True)
    >>> text = get_example_text("ang")
    >>> cltk_doc = nlp(text)
    >>> [word.phonetic_transcription for word in cltk_doc.words[:5]]
    ['ʍæt', 'we', 'gɑrˠdenɑ', 'in', 'gæːɑrˠdɑgum']
    """

    # Human-readable label for this process.
    description = "The default Old English transcription process"

    @cachedproperty
    def algorithm(self) -> OldEnglishTranscription:
        """Return the Old English transcriber.

        ``cachedproperty`` builds the ``OldEnglishTranscription`` object on
        first access and reuses it for this instance afterwards.
        """
        return OldEnglishTranscription()


class OldNorsePhonologicalTranscriberProcess(PhonologicalTranscriptionProcess):
    """Phonological transcription `Process` for Old Norse.

    >>> from cltk.core.data_types import Process, Pipeline
    >>> from cltk.tokenizers.processes import OldNorseTokenizationProcess
    >>> from cltk.text.processes import DefaultPunctuationRemovalProcess
    >>> from cltk.languages.utils import get_lang
    >>> from cltk.languages.example_texts import get_example_text
    >>> from cltk.nlp import NLP
    >>> pipe = Pipeline(description="A custom Old Norse pipeline", \
    processes=[OldNorseTokenizationProcess, DefaultPunctuationRemovalProcess, \
    OldNorsePhonologicalTranscriberProcess], language=get_lang("non"))
    >>> nlp = NLP(language='non', custom_pipeline=pipe, suppress_banner=True)
    >>> text = get_example_text("non")
    >>> cltk_doc = nlp(text)
    >>> [word.phonetic_transcription for word in cltk_doc.words[:5]]
    ['gylvi', 'kɔnunɣr', 'reːð', 'θar', 'lœndum']
    """

    # Human-readable label for this process. It previously read "poetry
    # process", which did not match what this class does.
    description = "The default Old Norse transcription process"

    @cachedproperty
    def algorithm(self) -> OldNorsePhonologicalTranscriber:
        """Return the Old Norse transcriber.

        ``cachedproperty`` builds the ``OldNorsePhonologicalTranscriber``
        object on first access and reuses it for this instance afterwards.
        """
        return OldNorsePhonologicalTranscriber()


class OldSwedishPhonologicalTranscriberProcess(PhonologicalTranscriptionProcess):
    """Phonological transcription `Process` for Old Swedish.

    >>> from cltk.core.data_types import Process, Pipeline
    >>> from cltk.tokenizers.processes import OldNorseTokenizationProcess
    >>> from cltk.text.processes import DefaultPunctuationRemovalProcess
    >>> from cltk.languages.utils import get_lang
    >>> from cltk.languages.example_texts import get_example_text
    >>> from cltk.nlp import NLP
    >>> pipe = Pipeline(description="A custom Old Swedish pipeline", \
    processes=[OldNorseTokenizationProcess, DefaultPunctuationRemovalProcess, \
    OldSwedishPhonologicalTranscriberProcess], language=get_lang("non"))
    >>> nlp = NLP(language='non', custom_pipeline=pipe, suppress_banner=True)
    >>> text = "Far man kunu oc dör han för en hun far barn. oc sigher hun oc hænnæ frændær."
    >>> cltk_doc = nlp(text)
    >>> [word.phonetic_transcription for word in cltk_doc.words[:5]]
    ['far', 'man', 'kunu', 'ok', 'dør']
    """

    # Human-readable label for this process.
    description = "The default Old Swedish transcription process"

    @cachedproperty
    def algorithm(self) -> OldSwedishTranscription:
        """Return the Old Swedish transcriber.

        ``cachedproperty`` builds the ``OldSwedishTranscription`` object on
        first access and reuses it for this instance afterwards.
        """
        return OldSwedishTranscription()
Source code for cltk.phonology.transcription_processes

The Classical Language Toolkit

Navigation

Related Topics