# Source code for cltk.phonology.lat.syllabifier

"""Split Latin words into a list of syllables, based on a set of Latin
language syllable specifications and the original work of Father Matthew
Spencer in C# and Javascript. Original documentation from Fr. Spencer
is preserved where applicable.
"""

import re

__author__ = ["Luke Hollis <lukehollis@gmail.com>"]
__license__ = "MIT License. See LICENSE."

# Language specification table consulted by the helper predicates and by
# ``syllabify``. Every value is a plain list (or dict of lists) of lowercase
# strings; lookups elsewhere in this module are simple ``in`` membership tests.
#
# nota bene: ui is only a diphthong in the exceptional
# cases below (according to Wheelock's Latin)
LATIN = {
    # Two-letter vowel pairs that stay together in one syllable
    # (looked up by ``_is_diphthong`` as ``char_1 + char_2``).
    "diphthongs": ["ae", "au", "ei", "eu", "oe"],
    # Whole words whose syllabification is hard-coded; ``syllabify`` checks
    # this mapping (exact, case-sensitive match) before applying any rules.
    "exceptions": {
        "huius": ["hui", "us"],
        "cuius": ["cui", "us"],
        "huic": ["huic"],
        "cui": ["cui"],
        "hui": ["hui"],
    },
    # y is treated as a vowel; not native to Latin but useful
    # for words borrowed from Greek
    # Only lowercase forms are listed, so uppercase letters are classified
    # as consonants by ``_is_consonant`` / ``_is_vowel``.
    "vowels": [
        "a",
        "e",
        "i",
        "o",
        "u",
        "á",
        "é",
        "í",
        "ó",
        "ú",
        "ā",
        "ē",
        "ī",
        "ō",
        "ū",
        "æ",
        "œ",
        "ǽ",  # no accented œ in unicode?
        "y",
    ],
    # Consonants that, when followed by a liquid (l, r), are kept together
    # with it rather than split across syllables (see ``syllabify``).
    "mute_consonants_and_f": ["b", "c", "d", "g", "p", "t", "f"],
    "liquid_consonants": ["l", "r"],
    # General prefix inventory. NOTE(review): no function visible in this
    # module reads this key; ``syllabify`` only uses
    # "single_syllable_prefixes" below.
    "prefixes": [
        "a",
        "ab",
        "abs",
        "ad",
        "ac",
        "amb",
        "ambi",
        "ante",
        "circum",
        "co",
        "con",
        "com",
        "contra",
        "counter",
        "de",
        "dis",
        "di",
        "dif",
        "e",
        "ex",
        "ef",
        "extra",
        "extro",
        "in",
        "en",
        "infra",
        "inter",
        "intro",
        "juxta",
        "ne",
        "non",
        "ob",
        "per",
        "post",
        "prae",
        "pre",
        "preter",
        "pro",
        "quasi",
        "re",
        "red",
        "retro",
        "se",
        "sed",
        "sin",
        "sine",
        "sub",
        "subter",
        "super",
        "sur",
        "supra",
        "trans",
        "tra",
        "tran",
        "ultra",
        "outr",
    ],
    # Prefixes that ``syllabify`` strips from the front of a word and emits
    # as a syllable of their own before processing the remainder.
    "single_syllable_prefixes": ["in", "ex", "ob"],
}


def _is_consonant(char: str) -> bool:
    """Return True if ``char`` is NOT in the language's list of vowels.

    Anything absent from ``LATIN["vowels"]`` counts as a consonant, which
    includes uppercase letters, digits, punctuation and the empty string.
    """
    return char not in LATIN["vowels"]


def _is_vowel(char: str) -> bool:
    """Return True if ``char`` is in the language's list of vowels.

    The lookup is an exact match against ``LATIN["vowels"]``; that list holds
    lowercase forms only, so uppercase vowels return False.
    """
    return char in LATIN["vowels"]


def _is_diphthong(char_1: str, char_2: str) -> bool:
    """Return True if two sequential characters compose a diphthong.

    The characters are concatenated in the given order and looked up in
    ``LATIN["diphthongs"]``, so argument order matters ("ae" is listed,
    "ea" is not).
    """
    return char_1 + char_2 in LATIN["diphthongs"]


def _is_mute_consonant_or_f(char: str) -> bool:
    """Return True if ``char`` is in the ``mute_consonants_and_f`` list.

    The list holds the mute consonants plus "f"; ``syllabify`` uses this
    together with ``_is_liquid_consonant`` to detect mute + liquid pairs.
    """
    return char in LATIN["mute_consonants_and_f"]


def _is_liquid_consonant(char: str) -> bool:
    """Return True if ``char`` is in the ``liquid_consonants`` list (l, r)."""
    return char in LATIN["liquid_consonants"]


def syllabify(word: str) -> list[str]:
    """Split a Latin word into a list of syllables.

    The word is matched exactly as given (no case folding or stripping):

    1. If the word is a key of ``LATIN["exceptions"]``, a copy of the
       hard-coded syllable list is returned.
    2. Otherwise, if the word starts with one of
       ``LATIN["single_syllable_prefixes"]``, that prefix becomes the first
       syllable and is removed from the word.
    3. The remainder is scanned character by character; a syllable is closed
       whenever the vowel/consonant rules below say so, and always at the
       last character.

    :param word: the word to split; expected to be lowercase, since the
        vowel table only lists lowercase letters.
    :return: a new list of syllable strings (empty for an empty word). The
        caller may mutate it freely; module-level tables are never modified.

    >>> syllabify('sidere')
    ['si', 'de', 're']
    >>> syllabify('huius')
    ['hui', 'us']
    """
    # Return a copy so callers cannot corrupt the shared exception table.
    exceptions = LATIN["exceptions"]
    if word in exceptions:
        return list(exceptions[word])

    syllables: list[str] = []

    # Try longer prefixes first. sorted() builds a new list instead of
    # reordering the module-level one in place on every call.
    for prefix in sorted(LATIN["single_syllable_prefixes"], key=len, reverse=True):
        if word.startswith(prefix):
            syllables.append(prefix)
            # Slice rather than regex-substitute: no escaping concerns.
            word = word[len(prefix):]
            break

    word_len = len(word)
    syllable = ""

    for i, char in enumerate(word):
        syllable += char

        # The last character always closes the current syllable.
        if i == word_len - 1:
            syllables.append(syllable)
            syllable = ""
            continue

        next_char = word[i + 1]
        # "" stands for "no such character"; it is neither a vowel nor "q",
        # so every test below treats it as absent.
        prev_char = word[i - 1] if i > 0 else ""
        has_two_more = i < word_len - 2
        after_next = word[i + 2] if has_two_more else ""

        char_is_vowel = _is_vowel(char)

        # 'i' is a special case for a vowel. When i is at the beginning of
        # the word (Iesu) or between vowels (alleluia), it is treated as a
        # consonant (y). Note: what about compounds like 'adiungere'?
        if (
            char == "i"
            and _is_vowel(next_char)
            and (i == 0 or _is_vowel(prev_char))
        ):
            char_is_vowel = False

        if char_is_vowel:
            follows_q = prev_char == "q"

            # Two adjacent vowels that do not form a diphthong are split,
            # except for the "u" of "qu" (unless that "u" is itself followed
            # by another "u").
            vowel_hiatus = (
                _is_vowel(next_char)
                and not _is_diphthong(char, next_char)
                and not (follows_q and char == "u" and next_char != "u")
            )

            # The character two positions ahead is a vowel, so the next
            # character opens the following syllable. A "u" right after "q"
            # never closes a syllable this way; any other vowel must not be
            # the first half of a diphthong.
            if char == "u":
                breaks_before_next = not follows_q and _is_vowel(after_next)
            else:
                breaks_before_next = _is_vowel(
                    after_next
                ) and not _is_diphthong(char, next_char)

            # A mute consonant (or f) followed by a liquid stays together
            # and starts the next syllable.
            mute_then_liquid = _is_mute_consonant_or_f(
                next_char
            ) and _is_liquid_consonant(after_next)

            syllable_complete = vowel_hiatus or (
                has_two_more and (breaks_before_next or mute_then_liquid)
            )

        else:
            # A consonant closes the syllable only when all of these hold:
            syllable_complete = (
                # the next character is also a consonant, and is not the
                # last character of the word,
                not _is_vowel(next_char)
                and has_two_more
                # the pair is not mute consonant (or f) + liquid,
                and not (
                    _is_mute_consonant_or_f(char)
                    and _is_liquid_consonant(next_char)
                )
                # the pair is not c/p/t + h, unless a vowel precedes it,
                and not (
                    char in ("c", "p", "t")
                    and next_char == "h"
                    and not _is_vowel(prev_char)
                )
                # and the consonant is not the only letter in the syllable.
                and len(syllable) != 1
            )

        if syllable_complete:
            syllables.append(syllable)
            syllable = ""

    return syllables
# Source code for cltk.phonology.lat.syllabifier
#
# The Classical Language Toolkit
#
# Navigation
#
# Related Topics