Source code for cltk.phonology.lat.transcription

"""Convert a word from Latin orthography into its hypothesized
pronunciation in the International Phonetic Alphabet (IPA).

https://raw.githubusercontent.com/j-duff/cltk/ipa/
cltk/phonology/lat/transcription.py
"""
import re
import unicodedata

from nltk.tokenize import wordpunct_tokenize

from cltk.core.cltk_logger import logger
from cltk.prosody.lat import macronizer as m

try:
    # James Tauber's greek_accentuation package
    from greek_accentuation import characters as chars
except ImportError as import_error:
    message = (
        'Missing "greek_accentuation" package. Install with '
        "`pip install greek-accentuation`."
    )
    logger.error(message)
    logger.error(import_error)
    raise

__author__ = ["Jack Duff <jmunroeduff@gmail.com>"]
__license__ = "MIT License. See LICENSE."

# Dictionaries of phonological reconstructions for use in transcribing.
# Allen, W. Sidney. 1965. Vox Latina.

LATIN = {
    "Classical": {
        "Allen": {
            "correspondence": {
                "p": "p",
                "t": "t̪",
                "c": "k",
                "k": "k",
                "qu": "kʷ",
                "b": "b",
                "d": "d̪",
                "g": "g",
                "gu": "gʷ",
                "ph": "pʰ",
                "th": "t̪ʰ",
                "ch": "kʰ",
                "n": "n̪",
                "m": "m",
                "r": "r",
                "rh": "r",  # Voiceless r was spelled but not pronounced.
                "l": "l",
                "f": "f",
                "s": "s",
                "h": "h",
                "j": "j",
                "v": "w",
                "x": "ks",
                "z": "z",
                "ī": "iː",
                "ū": "uː",
                "i": "ɪ",
                "u": "ʊ",
                "e": "ɛ",
                "o": "ɔ",
                "ē": "eː",
                "ō": "oː",
                "a": "a",
                "ā": "aː",
                "y": "y",
                "ȳ": "y:",
                "ae": "aj",
                "au": "aw",
                "oe": "oj",
                "eu": "ew",
                "ei": "ej",
            },
            "diphthongs": [  # and digraphs
                "qu",
                "gu",
                "ph",
                "th",
                "ch",
                "rh",
                "ae",
                "au",
                "oe",
                "eu",
                "ei",
            ],
            "punctuation": [
                ".",
                ",",
                ";",
                ":",
                "-",
                "–",
                "?",
                "!",
                "(",
                ")",
                "'",
                '"',
                "[",
                "]",
            ],
            "alternations": [
                "j_maker",  # word initial and intervocalic i is assumed j
                "w_maker",  # word initial and intervocalic u is assumed w
                "wj_block",  # prevents accidental sequence wj
                "uj_diph_maker",  # after w and j have been created, recognizes
                # <ui> = [uj]
                "b_devoice",  # b devoices before /t/, /s/
                "g_n_nasality_assimilation",  # only before n
                "n_place_assimilation",  # should also do labial, and
                # labio-dental before f.
                "final_m_drop",  # m drops and lengthens + nasalizes preceding
                # vowel word-finally
                "ns_nf_lengthening",  # vowels lengthen before ns or nf
                "l_darken",  # l darkens to ɫ in coda
                "j_z_doubling",  # intervocalic j and z > jj and zz
                "long_vowel_catcher",  # corrects accidental instances of ɪː
                # and similar.
                "e_i_closer_before_vowel",  # ɛ to ɛ̣, ɪ to ɪ̣ before another vowel
                "intervocalic_j",  # j glide between vowels
            ],
        }
    }
}

# Unhandled exceptions: preposition "ad" becomes [at̪] not [ad̪] before s and t
# subf > suff, subm > summ, subg > sugg, subc > succ, subr > rr
# j exceptions like ad*j*ectivum and con*j*unx

# All IPA characters used sorted by natural classes.
# WILL NEED ADDITIONS AS MORE RECONSTRUCTIONS USED

IPA = {
    "voiced": [  # [+voice]
        "b",
        "d̪",
        "g",
        "gʷ",
        "m",
        "n̪",
        "ŋ",
        "ɱ" "l",
        "ɫ",
        "r",
        "z",
    ],
    "labial": ["b", "p", "pʰ", "m"],  # [+labial, -labiodental]
    "labiodental": ["f", "ɱ"],  # [+labial, +labiodental]
    "coronal": ["d̪", "t̪", "t̪ʰ", "n̪", "s", "z", "r", "l", "ɫ"],  # [+coronal]
    "velar": ["g", "k", "kʰ", "kʷ", "gʷ", "ŋ"],  # [+velar]
    "nasal": ["m", "ɱ", "n", "ŋ"],  # [+consonantal, +nasal]
    "approximant": ["l", "ɫ", "r", "j", "w"],  # [+approximant]
    "continuant": ["h", "f", "s", "z", "l", "ɫ", "r"],  # [+continuant, +consonantal]
    "vowel": [  # [-consonantal -approximant]
        "a",
        "aː",
        "ɛ",
        "ɛ̣",
        "eː",
        "ɪ",
        "ɪ̣",
        "iː",
        "ɔ",
        "oː",
        "ʊ",
        "u",
        "uː",
        "y",
        "yː",
        "ãː",
        "ẽː",
        "ĩː",
        "õː",
        "ũː",
    ],
    "high": [  # [-consonantal, +high]
        "ɪ",
        "ɪ̣",
        "iː",
        "ʊ",
        "u",
        "uː",
        "y",
        "yː",
        "ɪ̃",
        "ɪ̣̃",
        "ĩː",
        "ʊ̃",
        "ũ",
        "ũː",
        "ỹ",
        "ỹː",
    ],
    "mid": [  # [-consonantal, -high, -low]
        "ɛ",
        "ɛ̣",
        "eː",
        "ɔ",
        "oː",
        "ɛ̃",
        "ɛ̣̃",
        "ẽː",
        "ɔ̃",
        "õː",
    ],
    "low": ["a", "aː", "ã", "ãː"],  # [-consonantal, +low]
    "front": [  # [-consonantal, +front]
        "ɪ",
        "ɪ̣",
        "iː",
        "y",
        "yː",
        "ɛ",
        "ɛ̣",
        "eː",
        "ɪ̃",
        "ɪ̣̃",
        "ĩː",
        "ỹ",
        "ỹː",
        "ɛ̃",
        "ɛ̣̃",
        "ẽː",
    ],
    "central": ["a", "aː", "ã", "ãː"],  # [-consonantal, -front, -back]
    "back": [  # [-consonantal, +back]
        "ʊ",
        "u",
        "uː",
        "ɔ",
        "oː",
        "ʊ̃",
        "ũ",
        "ũː",
        "ɔ̃",
        "õː",
    ],
    "boundary": ["#"],
}


[docs]class Phone:
    """A phonological unit to be manipulated and represented as an IPA string."""

    # Has a bundle of feature values that help classify it so that it can
    # trigger contextual pronunciation changes.

    def __init__(self, ipa_ch: str):
        """
        Analyzes features of phonetic signs
        :param ipa_ch: phonetic sign from IPA
        """

        # eventually exported to output string
        self.ipa = unicodedata.normalize("NFC", ipa_ch)
        # will be assigned once in Word, as the pre-context of this phone
        self.left = ""
        # .... as the post-context of this phone
        self.right = ""

        # bundle of features, stored as booleans:
        self.vce = self.ipa in IPA["voiced"]
        self.lab = self.ipa in IPA["labial"]
        self.lbd = self.ipa in IPA["labiodental"]
        self.cor = self.ipa in IPA["coronal"]
        self.vel = self.ipa in IPA["velar"]
        self.nas = self.ipa in IPA["nasal"]
        self.app = self.ipa in IPA["approximant"]
        self.cont = self.ipa in IPA["continuant"]
        self.vow = self.ipa in IPA["vowel"]
        self.hi = self.ipa in IPA["high"]
        self.mid = self.ipa in IPA["mid"]
        self.lo = self.ipa in IPA["low"]
        self.fr = self.ipa in IPA["front"]
        self.ctr = self.ipa in IPA["central"]
        self.bk = self.ipa in IPA["back"]
        self.bound = self.ipa in IPA["boundary"]

    def __repr__(self):
        return self.ipa


[docs]class Word:
    """Max. phonological unit, contains phones and triggers alternations."""

    # An ordered collection of Phones, which are bundles of
    # features/IPA strings.

    def __init__(self, ipa_str: str, root: dict):
        """

        :param ipa_str:
        :param root:
        """
        self.string = unicodedata.normalize("NFC", ipa_str)
        # Appropriate directory in the reconstruction dictionary
        self.root = root
        # list of contextual pronunciation alternations
        self.alts = self.root["alternations"]
        # Turns string of IPA characters into list of Phones
        self.phones = [Phone(c) for c in re.findall(r".[̪̣̃ʷʰ]*ː?", self.string)]

        self.syllables = []

[docs]    def _refresh(self):
        """
        Assigns left and right contexts for every phone
        """
        for n in range(len(self.phones)):
            p = self.phones[n]
            if n != 0:
                p.left = self.phones[n - 1]
            else:
                p.left = Phone("#")
            if n != len(self.phones) - 1:
                p.right = self.phones[n + 1]
            else:
                p.right = Phone("#")

[docs]    def _j_maker(self):
        """
        Assume word-initial or intervocalic i to be j
        """
        out_phones = self.phones
        target = Phone("j")
        for n in range(len(self.phones)):
            p = self.phones[n]
            if p.ipa == "ɪ" and (
                (p.left.bound and p.right.vow) or (p.left.vow and p.right.vow)
            ):
                out_phones[n] = target
        self.phones = out_phones
        self._refresh()

[docs]    def _w_maker(self):
        """
        Assume word-initial or intervocalic u to be w
        """
        out_phones = self.phones
        target = Phone("w")
        for n in range(len(self.phones)):
            p = self.phones[n]
            if ((p.ipa == "ʊ") or (p.ipa == "u")) and (
                (p.left.bound and (p.right.vow or p.right.ipa == "j"))
                or (p.left.vow and p.right.vow)
            ):
                out_phones[n] = target
        self.phones = out_phones
        self._refresh()

[docs]    def _wj_block(self):
        """
        Addendum to correct possible 'wj' sequences
        """
        out_phones = self.phones
        target = Phone("ɪ")
        for n in range(len(self.phones)):
            p = self.phones[n]
            if p.left.ipa == "w" and p.ipa == "j":
                out_phones[n] = target
        self.phones = out_phones
        self._refresh()

[docs]    def _uj_diph_maker(self):
        """
        Find accidental "ʊɪ" instances and treat as diphthong [uj].
        """
        out_phones = self.phones
        for n in range(len(self.phones)):
            p = self.phones[n]
            if p.left.ipa == "ʊ" and p.ipa == "ɪ":
                out_phones[n - 1] = Phone("u")
                out_phones[n] = Phone("j")
        self.phones = out_phones
        self._refresh()

[docs]    def _b_devoice(self):
        """
        Pronounce b as p when followed by s or t.
        """
        out_phones = self.phones
        target = Phone("p")
        for n in range(len(self.phones)):
            p = self.phones[n]
            if p.ipa == "b" and (p.right.ipa == "s" or p.right.ipa == "t̪"):
                out_phones[n] = target
        self.phones = out_phones
        self._refresh()

[docs]    def _final_m_drop(self):
        """
        Final m nasalizes and lengthens nucleus and drops.
        """
        out_phones = self.phones
        for n in range(len(self.phones)):
            p = self.phones[n]
            if p.left.vow and p.ipa == "m" and p.right.bound:
                out_phones[n - 1] = Phone(p.left.ipa + "̃ː")
                del out_phones[n]
        self.phones = out_phones
        self._refresh()

[docs]    def _n_place_assimilation(self):
        """
        Pronounce n as ŋ when followed by velar.
        """
        out_phones = self.phones
        target = Phone("ŋ")
        for n in range(len(self.phones)):
            p = self.phones[n]
            if p.ipa == "n̪" and p.right.vel:
                out_phones[n] = target
        self.phones = out_phones
        self._refresh()

[docs]    def _g_n_nasality_assimilation(self):
        """
        Pronounce g as ŋ when followed by n.
        """

        out_phones = self.phones
        target = Phone("ŋ")
        for n in range(len(self.phones)):
            p = self.phones[n]
            if p.ipa == "g" and p.right.ipa == "n̪":
                out_phones[n] = target
        self.phones = out_phones
        self._refresh()

[docs]    def _ns_nf_lengthening(self):
        """
        Lengthen vowel before ns or nf.
        """
        out_phones = self.phones
        for n in range(len(self.phones)):
            p = self.phones[n]
            if (
                p.left.vow
                and "ː" not in p.left.ipa
                and p.ipa == "n̪"
                and (p.right.ipa == "s" or p.right.ipa == "f")
            ):
                out_phones[n - 1] = Phone(p.left.ipa + "ː")
        self.phones = out_phones
        self._refresh()

[docs]    def _l_darken(self):
        """
        Pronounce l as ɫ in coda.
        """
        out_phones = self.phones
        target = Phone("ɫ")
        for n in range(len(self.phones)):
            p = self.phones[n]
            if p.ipa == "l" and ((not p.right.vow) or p.right.bound):
                out_phones[n] = target
        self.phones = out_phones
        self._refresh()

[docs]    def _j_z_doubling(self):
        """
        Double j and z between vowels.
        """
        out_phones = self.phones
        dupl = []
        for n in range(len(self.phones)):
            p = self.phones[n]
            if p.right.vow and (p.ipa == "j" or p.ipa == "z") and p.left.vow:
                dupl.append((True, n - len(self.phones), p.ipa))
            else:
                dupl.append((False, n - len(self.phones), None))
        for t in sorted(dupl, key=lambda tup: tup[1]):
            if t[0]:
                out_phones.insert(t[1], Phone(t[2]))
        self.phones = out_phones
        self._refresh()

[docs]    def _long_vowel_catcher(self):
        """
        Replace ɪː with iː, ʊː with uː, and ɛː with eː.
        """
        out_phones = self.phones
        target_dict = {
            "ɪː": "iː",
            "ʊː": "uː",
            "ɛː": "eː",
            "ɪ̃ː": "ĩː",
            "ʊ̃ː": "ũː",
            "ɛ̃ː": "ẽː",
        }
        for n in range(len(self.phones)):
            p = self.phones[n]
            if p.ipa in target_dict.keys():
                out_phones[n] = Phone(target_dict[p.ipa])
        self.phones = out_phones
        self._refresh()

[docs]    def _e_i_closer_before_vowel(self):
        """
        e and i become closer (̣) when followed by a vowel.
        """
        out_phones = self.phones
        for n in range(len(self.phones)):
            p = self.phones[n]
            if (p.ipa == "ɛ" or p.ipa == "ɪ") and p.right.vow:
                out_phones[n] = Phone(p.ipa + "̣")
        self.phones = out_phones
        self._refresh()

[docs]    def _intervocalic_j(self):
        """
        epenthesize j between vowels
        """
        out_phones = self.phones
        target = Phone("j")
        j = []
        for n in range(len(self.phones)):
            p = self.phones[n]
            if p.left.vow and p.vow:
                j.append((True, n - len(self.phones)))
            else:
                j.append((False, n - len(self.phones)))
        for t in sorted(j, key=lambda tup: tup[1]):
            if t[0]:
                out_phones.insert(t[1], target)
        self.phones = out_phones
        self._refresh()

    # list of all possible alternations
    ALTERNATIONS = [
        ("j_maker", _j_maker),
        ("w_maker", _w_maker),
        ("wj_block", _wj_block),
        ("uj_diph_maker", _uj_diph_maker),
        ("b_devoice", _b_devoice),
        ("final_m_drop", _final_m_drop),
        ("n_place_assimilation", _n_place_assimilation),
        ("g_n_nasality_assimilation", _g_n_nasality_assimilation),
        ("ns_nf_lengthening", _ns_nf_lengthening),
        ("l_darken", _l_darken),
        ("j_z_doubling", _j_z_doubling),
        ("long_vowel_catcher", _long_vowel_catcher),
        ("e_i_closer_before_vowel", _e_i_closer_before_vowel),
        ("intervocalic_j", _intervocalic_j),
    ]

[docs]    def _alternate(self):
        """
        After setting left and right contexts for every phone...
        """
        self._refresh()
        # runs all alternations
        for a in Word.ALTERNATIONS:
            if a[0] in self.alts:
                a[1](self)

[docs]    def syllabify(self) -> list[list[Phone]]:
        """
        Takes Word input and returns a list of syllables
        as (onset, nucleus, coda) tuples
        where onset, nucleus, and coda are all lists of Phones.
        :return: list of syllables
        """
        nuclei = []
        for n in range(len(self.phones)):
            p = self.phones[n]
            if p.vow:
                nuclei.append(n)
        # initialize syllables with a tuple for the first syllable
        # where onset is everything before the first nucleus
        # and coda remains unknown.
        syllables = [[self.phones[0 : nuclei[0]], [self.phones[nuclei[0]]], []]]
        # continue for every nucleus, assuming that everything between
        # the previous nucleus and it is the onset.
        for x in range(len(nuclei) - 1):
            i = nuclei[x + 1]
            onset = self.phones[nuclei[x] + 1 : i]
            nucleus = [self.phones[i]]
            syllables.append([onset, nucleus, []])
        # assume that everything after the final nucleus is final coda.
        syllables[-1][2] = self.phones[nuclei[-1] + 1 :]
        # now go through and check onset viability
        for x in range(len(syllables) - 1):
            onset = syllables[x + 1][0]
            nucleus = syllables[x + 1][1]
            coda = syllables[x + 1][2]
            # trim all onsets greater than the maximum 2 phones
            # removing extra phones from the left
            # and appending them to the previous coda
            if len(onset) > 2:
                trim = onset[:-2]
                del onset[:-2]
                syllables[x][2] = trim
            # once onset is 2 phones...
            if len(onset) == 2:
                # stop + liquid is the only viable sequence and passes
                if (
                    (not onset[0].cont)
                    and (not onset[0].app)
                    and (onset[1].nas or onset[1].app)
                ):
                    break
                # otherwise, onset must be right Phone only
                # the left phone is appended to the previous coda
                else:
                    trim = onset[0]
                    del onset[0]
                    syllables[x][2] += [trim]
        self.syllables = syllables
        return syllables

[docs]    def _print_ipa(self, syllabify, accentuate):
        """
        Depending on the syllabify and accentuate parameters
        Prints an appropriately marked up version of the transcription

        :param syllabify:
        :param accentuate:
        :return:
        """

        out = ""
        if syllabify:
            syllables = self.syllabify()
            # the ultima is the final syllable
            ultima = syllables[-1]
            # identify which syllable has stress and store index as accent
            if accentuate:
                # one syllable words have ultimate stress
                if len(syllables) == 1:
                    accent = -1
                # two syllable words have penultimate stress
                elif len(syllables) == 2:
                    accent = -2
                else:
                    # penult is second to last syllable
                    penult = syllables[-2]
                    # if penult is diphthong (long), penultimate stress
                    if len(penult[1]) > 1:
                        accent = -2
                    # if penult is long vowel, penultimate stress
                    elif "ː" in penult[1][0].ipa:
                        accent = -2
                    # if penult has coda (closed/long by position),
                    # penultimate stress
                    elif len(penult[2]) > 0:
                        accent = -2
                    # otherwise (penult is short) antepenultimate stress
                    else:
                        accent = -3
                # loop over syllables by index
                for x in range(len(syllables)):
                    s = syllables[x]
                    # if index matches accent index set above
                    if x - len(syllables) == accent:
                        # precede that syllable with
                        # IPA stress punctuation: '
                        out += "'"
                    # then, print IPA by syllable segment as usual
                    for n in s:
                        for p in n:
                            out += p.ipa
                    # seperate all syllables with IPA syllable punctuation: .
                    if s != ultima:
                        out += "."
            # if no accentuation flag, proceed with syllabified printing
            else:
                for s in syllables:
                    for n in s:
                        for p in n:
                            out += p.ipa
                    # seperate all syllables with IPA syllable punctuation: .
                    if s != ultima:
                        out += "."
        # if no syllabification flag, proceed with
        # unsyllabified IPA printing
        else:
            for p in self.phones:
                out += p.ipa
        return out


[docs]class Transcriber:
    """Uses a reconstruction to transcribe a orthographic string into IPA."""

    def __init__(self, dialect: str, reconstruction: str):
        """

        :param dialect: Latin dialect
        :param reconstruction: reconstruction method
        """
        self.lect = dialect
        self.recon = reconstruction
        self.root = LATIN[self.lect][self.recon]
        self.table = self.root["correspondence"]
        self.diphs = self.root["diphthongs"]
        self.punc = self.root["punctuation"]
        self.macronizer = m.Macronizer("tag_ngram_123_backoff")

[docs]    def _parse_diacritics(self, ch: str) -> str:
        """

        EG: input with base a -> a/LENGTH/DIAERESIS/

        :param ch: character
        :return: a string with separated and organized diacritics for easier access later.
        """

        out = chars.base(ch).lower()  # Initialize out as base of character.

        length = chars.length(ch)
        dia = chars.diaeresis(ch)

        out += "/"  # Create 1st boundary

        # If any length, place between 1st and 2nd boundary
        if length:
            out += length

        out += "/"  # Create 2nd boundary

        if dia:  # If any diaeresis,
            out += dia  # place between second and final boundary

        out += "/"  # Create final boundary

        return out

[docs]    def _prep_text(self, text: str):
        """
        Performs preparatory tasks grouping and reordering characters
        in order to make transcription formulaic.

        :param text:
        :return:
        """
        string_in = "".join([self._parse_diacritics(ch) for ch in text])

        # searches for diphthongs and treats them as one phone
        for d in self.diphs:
            d1 = d[0]
            d2 = d[1]
            pattern = r"(" + d1 + r")\/\/\/(" + d2 + r")(\/\/\/)"
            string_in = re.sub(pattern, r"\1\2\3", string_in)

        tup_out = re.findall(r"(..?)\/([̄̆]*)\/(¨?)\/", string_in)

        return tup_out

[docs]    def transcribe(
        self,
        text,
        macronize=True,
        syllabify=True,
        accentuate=True,
        with_squared_brackets=True,
    ):
        """
        >>> allen_transcriber = Transcriber("Classical", "Allen")
        >>> example = allen_transcriber.transcribe("Quo usque tandem, O Catilina, " + "abutere nostra patientia?")
        >>> example
        "['kʷoː 'ʊs.kʷɛ 't̪an̪.d̪ẽː 'oː ka.t̪ɪ.'liː.n̪aː a.buː.'t̪eː.rɛ 'n̪ɔs.t̪raː pa.t̪ɪ̣.'jɛn̪.t̪ɪ̣.ja]"

        :param text: text to transcribe
        :param macronize: if True, macronize result
        :param syllabify: if True, syllabify result
        :param accentuate: if True, accentuate result
        :param with_squared_brackets: if True, put squared brackets around transcription
        :return: transcribed text
        """
        # if macronize, will first use the tagger to macronize input
        # otherwise, input will be the raw input string
        if macronize:
            text = self.macronizer.macronize_text(text)
        # input is word-tokenized, stripped of non-diacritic punctuation,
        # and diphthongs and diacritics are handled
        inp = [
            self._prep_text(w) for w in wordpunct_tokenize(text) if w not in self.punc
        ]
        words = []
        for w in inp:
            out = ""
            for c in w:
                if "̄" in c[1]:
                    macron_added = c[0] + "̄"
                    ipa = self.table.get(macron_added, macron_added)
                else:
                    ipa = self.table.get(c[0], c[0])
                out += ipa
            transcription = Word(out, self.root)
            transcription._alternate()
            words.append(transcription)
        # Encloses output in brackets, proper notation for surface form.
        result = " ".join([w._print_ipa(syllabify, accentuate) for w in words])
        if with_squared_brackets:
            result = "[" + result + "]"
        return result
Source code for cltk.phonology.lat.transcription

The Classical Language Toolkit

Navigation

Related Topics