Source code for cltk.phonology.non.utils

"""To define sounds, phonetic rules for phonetic transcription.
"""

import re
from enum import Enum, auto

from cltk.core.cltk_logger import logger

__author__ = ["Clément Besnier <clem@clementbesnier.fr>"]


class AutoName(Enum):
    """Enum base class whose ``auto()`` members take their own name as value.

    Subclasses declare members with ``auto()``; the hook below makes the
    value of each member equal to the member's name.
    """

    def _generate_next_value_(name, start, count, last_values):
        """Return the value ``auto()`` assigns to the member called *name*.

        The parameters follow the signature documented for this hook in the
        ``enum`` module; only *name* is used.

        :param name: name of the member being defined
        :param start: unused
        :param count: unused
        :param last_values: unused
        :return: *name*, unchanged
        """
        return name


# Definition of consonants
class Manner(AutoName):
    """Manner of a consonant, as accepted by ``AbstractConsonant(manner=...)``.

    Members are declared with ``auto()``; the resulting values are decided
    by the ``AutoName`` base class.
    """

    nasal = auto()
    stop = auto()
    lateral = auto()
    fricative = auto()
    trill = auto()
    spirant = auto()
    affricate = auto()
    approximant = auto()


class Place(AutoName):
    """Place of a consonant, as accepted by ``AbstractConsonant(place=...)``.

    Members are declared with ``auto()``; the resulting values are decided
    by the ``AutoName`` base class.
    """

    bilabial = auto()
    labio_dental = auto()
    dental = auto()
    alveolar = auto()
    post_alveolar = auto()
    retroflex = auto()
    palatal = auto()
    velar = auto()
    uvular = auto()
    glottal = auto()


class AbstractConsonant:
    """
    Partial description of a consonant, used with AbstractPosition to define
    an environment of a sound.

    Every feature is optional; ``None`` means the feature is left unspecified.

    :param place: Place or None
    :param manner: Manner or None
    :param voiced: bool or None
    :param ipar: IPA notation of the sound; stored without any validation
    :param geminate: bool or None
    :raises ValueError: if place or manner is neither None nor of its enum type
    :raises TypeError: if voiced or geminate is neither None nor a bool
    """

    def __init__(self, place=None, manner=None, voiced=None, ipar=None, geminate=None):
        if place is not None and not isinstance(place, Place):
            logger.error("Incorrect argument")
            # An invalid place used to be logged only, which left the
            # instance without a ``place`` attribute and made it fail later
            # with an AttributeError. It is now rejected like ``manner``.
            raise ValueError("place must be a Place member or None")
        self.place = place

        if manner is not None and not isinstance(manner, Manner):
            logger.error("Incorrect argument")
            raise ValueError
        self.manner = manner

        if voiced is not None and not isinstance(voiced, bool):
            logger.error("Incorrect argument")
            raise TypeError
        self.voiced = voiced

        if geminate is not None and not isinstance(geminate, bool):
            logger.error("Incorrect argument")
            raise TypeError
        self.geminate = geminate

        self.ipar = ipar

    def __str__(self):
        return self.ipar


class Consonant(AbstractConsonant):
    """A fully specified `consonant <https://en.wikipedia.org/wiki/Consonant>`_.

    A consonant is described by its place (where in the vocal tract the
    obstruction occurs and which speech organs are involved), its manner
    (how air escapes from the vocal tract when the sound is made), whether
    it is voiced and its length (whether it is geminate). ``ipar`` is its
    notation in the
    `IPA <https://en.wikipedia.org/wiki/International_Phonetic_Alphabet>`_.

    Unlike AbstractConsonant, none of the five arguments may be None.
    """

    def __init__(self, place, manner, voiced, ipar, geminate):
        # Same checks, in the same order, as one assert per argument.
        for mandatory in (place, manner, voiced, ipar, geminate):
            assert mandatory is not None
        AbstractConsonant.__init__(self, place, manner, voiced, ipar, geminate)

    def match(self, abstract_consonant: AbstractConsonant) -> bool:
        """
        A real consonant matches an abstract consonant if and only if every
        feature required by the abstract consonant (i.e. not None) is also a
        feature of the real consonant. ``ipar`` is not compared.

        :param abstract_consonant: AbstractConsonant, or None (always matches)
        :return: bool; False for any other kind of argument
        """
        if abstract_consonant is None:
            return True
        if not isinstance(abstract_consonant, AbstractConsonant):
            return False
        required_and_actual = (
            (abstract_consonant.place, self.place),
            (abstract_consonant.manner, self.manner),
            (abstract_consonant.voiced, self.voiced),
            (abstract_consonant.geminate, self.geminate),
        )
        return all(
            required is None or required == actual
            for required, actual in required_and_actual
        )

    def match_list(self, abstract_consonant_list):
        """
        Tell whether this consonant matches at least one item of a list.

        :param abstract_consonant_list: list of AbstractConsonant; items of
            any other type are ignored
        :return: True for an empty list or when at least one item matches,
            False otherwise, including when the argument is not a plain list
        """
        if type(abstract_consonant_list) != list:
            return False
        if not abstract_consonant_list:
            return True
        return any(
            self.match(candidate)
            for candidate in abstract_consonant_list
            if isinstance(candidate, AbstractConsonant)
        )

    def lengthen(self):
        """
        :return: a new geminate Consonant; the length mark is appended to
            ``ipar`` only when this consonant is not already geminate
        """
        lengthened_ipar = self.ipar if self.geminate else self.ipar + "ː"
        return Consonant(self.place, self.manner, self.voiced, lengthened_ipar, True)

    def to_abstract(self):
        """
        :return: an AbstractConsonant carrying the same five features
        """
        features = (self.place, self.manner, self.voiced, self.ipar, self.geminate)
        return AbstractConsonant(*features)

    def __add__(self, other):
        # The result keeps the features of the left operand, is never
        # geminate, and its notation is the concatenation of both notations.
        combined_ipar = self.ipar + other.ipar
        return Consonant(self.place, self.manner, self.voiced, combined_ipar, False)

    def __str__(self):
        return self.ipar

    __repr__ = __str__

    def is_equal(self, other_consonnant):
        """
        Compare place, manner, voicing and gemination; ``ipar`` is ignored.

        >>> v_consonant = Consonant(Place.labio_dental, Manner.fricative, True, "v", False)
        >>> f_consonant = Consonant(Place.labio_dental, Manner.fricative, False, "f", False)
        >>> v_consonant.is_equal(f_consonant)
        False

        :param other_consonnant: object exposing the same four attributes
        :return: bool
        """
        mine = (self.place, self.manner, self.voiced, self.geminate)
        theirs = (
            other_consonnant.place,
            other_consonnant.manner,
            other_consonnant.voiced,
            other_consonnant.geminate,
        )
        return all(own == foreign for own, foreign in zip(mine, theirs))


# Vowels
class Height(AutoName):
    """Height of a vowel, as accepted by ``AbstractVowel(height=...)``.

    Members are declared with ``auto()``; the resulting values are decided
    by the ``AutoName`` base class.
    """

    open = auto()
    near_open = auto()
    open_mid = auto()
    mid = auto()
    close_mid = auto()
    near_close = auto()
    close = auto()


class Backness(AutoName):
    """Backness of a vowel, as accepted by ``AbstractVowel(backness=...)``.

    Members are declared with ``auto()``; the resulting values are decided
    by the ``AutoName`` base class.
    """

    front = auto()
    central = auto()
    back = auto()


class Length(AutoName):
    """Length of a vowel, as accepted by ``AbstractVowel(length=...)``.

    Members are declared with ``auto()``; the resulting values are decided
    by the ``AutoName`` base class.
    """

    short = auto()
    long = auto()
    overlong = auto()


class AbstractVowel:
    """
    Partial description of a vowel, used with AbstractPosition to define an
    environment of a sound.

    Every feature is optional; ``None`` means the feature is left unspecified.

    :param height: Height or None
    :param backness: Backness or None
    :param rounded: bool or None
    :param length: Length or None
    :param ipar: IPA notation of the sound; stored without any validation
    :raises ValueError: if height, backness or length has the wrong type
    :raises TypeError: if rounded is neither None nor a bool
    """

    def __init__(
        self, height=None, backness=None, rounded=None, length=None, ipar=None
    ):
        # Each feature is validated, then stored, in declaration order.
        if height is not None and not isinstance(height, Height):
            logger.error("Incorrect argument")
            raise ValueError
        self.height = height

        if backness is not None and not isinstance(backness, Backness):
            logger.error("Incorrect argument")
            raise ValueError
        self.backness = backness

        if rounded is not None and type(rounded) != bool:
            logger.error("Incorrect argument")
            raise TypeError
        self.rounded = rounded

        if length is not None and not isinstance(length, Length):
            logger.error("Incorrect argument")
            raise ValueError
        self.length = length

        self.ipar = ipar

    def __str__(self):
        return self.ipar


class Vowel(AbstractVowel):
    """
    A fully specified vowel (https://en.wikipedia.org/wiki/Vowel).

    Unlike AbstractVowel, none of the five arguments may be None.
    """

    def __init__(self, height, backness, rounded, length, ipar):
        assert height is not None
        assert backness is not None
        assert rounded is not None
        assert length is not None
        assert ipar is not None
        AbstractVowel.__init__(self, height, backness, rounded, length, ipar)

    def lengthen(self):
        """
        Return a lengthened copy of this vowel.

        A short vowel becomes long and gets the length mark appended to its
        ``ipar``. A vowel that is not short is returned with its length and
        ``ipar`` unchanged; it used to come back as ``Length.short`` while
        keeping its long notation.

        :return: a new lengthened Vowel
        """
        if self.length == Length.short:
            length = Length.long
            ipar = self.ipar + "ː"
        else:
            length = self.length
            ipar = self.ipar
        return Vowel(self.height, self.backness, self.rounded, length, ipar)

    def match(self, abstract_vowel):
        """
        A real vowel matches an abstract vowel if and only if every feature
        required by the abstract vowel (i.e. not None) is also a feature of
        the real vowel. ``ipar`` is not compared.

        :param abstract_vowel: AbstractVowel, or None (always matches)
        :return: bool; False for any other kind of argument
        """
        if abstract_vowel is None:
            return True
        if not isinstance(abstract_vowel, AbstractVowel):
            return False
        required_and_actual = (
            (abstract_vowel.height, self.height),
            (abstract_vowel.backness, self.backness),
            (abstract_vowel.rounded, self.rounded),
            (abstract_vowel.length, self.length),
        )
        return all(
            required is None or required == actual
            for required, actual in required_and_actual
        )

    def match_list(self, abstract_vowel_list):
        """
        Tell whether this vowel matches at least one item of a list.

        :param abstract_vowel_list: list of AbstractVowel; items of any other
            type are ignored
        :return: True for an empty list or when at least one item matches,
            False otherwise, including when the argument is not a plain list
        """
        # Exact type check kept on purpose: anything that is not a plain
        # list (None included) never matches.
        if type(abstract_vowel_list) != list:
            return False
        if not abstract_vowel_list:
            return True
        return any(
            self.match(candidate)
            for candidate in abstract_vowel_list
            if isinstance(candidate, AbstractVowel)
        )

    def to_abstract(self):
        """
        :return: an AbstractVowel carrying the same five features
        """
        return AbstractVowel(
            self.height, self.backness, self.rounded, self.length, self.ipar
        )

    # def overlengthen(self):
    #     self.length = "overlong"

    def i_umlaut(self):
        """Placeholder: does nothing and returns None."""
        pass

    def u_umlaut(self):
        """Placeholder: does nothing and returns None."""
        pass

    def __str__(self):
        return self.ipar

    __repr__ = __str__

    def is_equal(self, other_sound):
        """
        Compare height, backness, roundedness and length; ``ipar`` is ignored.

        :param other_sound: object exposing the same four attributes
        :return: bool
        """
        return (
            self.height == other_sound.height
            and self.backness == other_sound.backness
            and self.rounded == other_sound.rounded
            and self.length == other_sound.length
        )

    def __add__(self, other):
        # The result keeps the features of the left operand; its notation is
        # the concatenation of both notations.
        return Vowel(
            self.height,
            self.backness,
            self.rounded,
            self.length,
            self.ipar + other.ipar,
        )


class Rank(AutoName):
    """Rank of a sound in a word, as required by AbstractPosition and Position.

    Members are declared with ``auto()``; the resulting values are decided
    by the ``AutoName`` base class.
    """

    first = auto()
    inner = auto()
    last = auto()


class AbstractPosition:
    """
    This is a position (at the beginning, inside or at the end) that a rule
    can be applied at, together with a sound or a set of sounds before and a
    sound or a set of sounds after.

    :param position: Rank
    :param before: sounds allowed before; not type-checked here
    :param after: sounds allowed after; not type-checked here
    """

    def __init__(self, position, before, after):
        assert isinstance(position, Rank)

        self.position = position
        self.before = before
        self.after = after

    def __eq__(self, other):
        assert isinstance(other, AbstractPosition)
        return (
            self.position == other.position
            and self.before == other.before
            and self.after == other.after
        )

    def same_place(self, other):
        """
        :param other: AbstractPosition
        :return: True if both positions have the same rank
        """
        assert isinstance(other, AbstractPosition)
        return self.position == other.position

    @staticmethod
    def _merge_context(first, second):
        """Combine one side (before or after) of two positions.

        If either side is None the other one is returned as is; otherwise a
        new list holding the items of both is returned.
        """
        if first is None:
            return second
        if second is None:
            return first
        return [*first, *second]

    def __add__(self, other):
        """
        Merge two positions of the same rank.

        ``before`` and ``after`` are merged by the same rule. The ``before``
        side used to test ``other.before`` for truthiness, so a non-empty
        ``other.before`` was discarded whenever ``self.before`` was None.

        :param other: AbstractPosition with the same rank
        :return: a new AbstractPosition
        """
        assert self.position == other.position
        before = self._merge_context(self.before, other.before)
        after = self._merge_context(self.after, other.after)
        return AbstractPosition(self.position, before, after)


class Position:
    """
    Observed position of a sound in a word: its rank (at the beginning,
    inside or at the end), the sound before it and the sound after it.

    :param position: Rank
    :param before: Consonant, Vowel or None
    :param after: Consonant, Vowel or None
    """

    def __init__(self, position, before, after):
        assert isinstance(position, Rank)
        self.position = position
        assert before is None or isinstance(before, (Consonant, Vowel))
        self.before = before
        assert after is None or isinstance(after, (Consonant, Vowel))
        self.after = after

    def real_sound_match_abstract_sound(self, abstract_pos: AbstractPosition) -> bool:
        """
        Tell whether this observed position fits an abstract position.

        The ranks must be equal. In addition, each neighbouring sound that is
        present (not None) must be accepted by ``match_list`` against the
        corresponding side of the abstract position.

        :param abstract_pos: AbstractPosition
        :return: bool
        """
        assert isinstance(abstract_pos, AbstractPosition)
        same_rank = self.position == abstract_pos.position
        if not same_rank:
            return False
        # A missing neighbour puts no constraint on the match.
        if self.before is not None and not self.before.match_list(abstract_pos.before):
            return False
        if self.after is not None and not self.after.match_list(abstract_pos.after):
            return False
        return True


class Rule:
    """
    A Rule transforms one sound into another according to its direct
    environment (the sound before and the sound after).
    """

    def __init__(self, position, temp_sound, estimated_sound):
        """
        :param position: AbstractPosition
        :param temp_sound: Vowel or Consonant
        :param estimated_sound: Vowel or Consonant
        """
        assert isinstance(position, AbstractPosition)
        self.position = position
        assert isinstance(temp_sound, (Vowel, Consonant))
        self.temp_sound = temp_sound
        assert isinstance(estimated_sound, (Vowel, Consonant))
        self.estimated_sound = estimated_sound

    def can_apply(self, current_position: Position) -> bool:
        """
        A Rule can be applied if and only if the observed position matches
        the environment of the Rule.

        :param current_position: Position
        :return: bool
        """
        environment = self.position
        return current_position.real_sound_match_abstract_sound(environment)

    def ipa_to_regular_expression(self, phonology):
        """
        Build the pattern equivalent to this rule.

        The left context is ``^`` for a first-rank rule, empty when the rule
        has no ``before`` constraint, otherwise a lookbehind on a character
        class made of the ``ipar`` of every phoneme of *phonology* accepted
        by ``before``. The right context is built the same way with ``$``,
        ``after`` and a lookahead.

        :param phonology: list of Vowel or Consonant instances
        :return: pattern which can be the first argument of re.sub
        """
        environment = self.position

        if environment.position == Rank.first:
            left_context = r"^"
        elif environment.before is None:
            left_context = r""
        else:
            allowed_before = "".join(
                phoneme.ipar
                for phoneme in phonology
                if phoneme.match_list(environment.before)
            )
            left_context = r"(?<=[" + allowed_before + r"])"

        if environment.position == Rank.last:
            right_context = r"$"
        elif environment.after is None:
            right_context = r""
        else:
            allowed_after = "".join(
                phoneme.ipar
                for phoneme in phonology
                if phoneme.match_list(environment.after)
            )
            right_context = r"(?=[" + allowed_after + "])"

        return left_context + self.temp_sound.ipar + right_context

    @staticmethod
    def from_regular_expression(re_rule, estimated_sound, ipa_class):
        """
        Build a Rule from a pattern of the shape produced by
        ``ipa_to_regular_expression``.

        :param re_rule: pattern (first argument of re.sub)
        :param estimated_sound: an IPA character (second argument of re.sub)
        :param ipa_class: dict whose keys are IPA characters and values are Vowel or Consonant instances
        :return: corresponding Rule instance
        :raises ValueError: if the core sound cannot be found in re_rule
        """
        assert len(re_rule) > 0
        # A leading "^" wins over a trailing "$".
        if re_rule.startswith("^"):
            place = Rank.first
        elif re_rule.endswith("$"):
            place = Rank.last
        else:
            place = Rank.inner

        # Content of the lookbehind class, the rewritten sound itself, and
        # content of the lookahead class.
        before_search = re.search(r"(?<=\(\?\<\=\[)\w*", re_rule)
        core_search = re.search(
            r"(?<=\))\w(?=\(\?\=)|(?<=\^)\w(?=\(\?\=)|(?<=\))\w(?=\$)", re_rule
        )
        after_search = re.search(r"(?<=\(\?\=\[)\w*", re_rule)

        def abstract_sounds(found):
            # One abstract sound per character captured inside the class.
            if found is None:
                return None
            return [ipa_class[ipar].to_abstract() for ipar in found.group(0)]

        before = abstract_sounds(before_search)
        if core_search is None:
            logger.error("No core")
            raise ValueError
        core = ipa_class[core_search.group(0)]
        after = abstract_sounds(after_search)

        abstract_position = AbstractPosition(place, before, after)
        return Rule(abstract_position, core, ipa_class[estimated_sound])

    def __add__(self, other):
        # Only rules of the same rank that rewrite the same sound into the
        # same sound can be merged; their environments are added together.
        assert isinstance(other, Rule)
        assert self.position.same_place(other.position)
        assert self.temp_sound.ipar == other.temp_sound.ipar
        assert self.estimated_sound.ipar == other.estimated_sound.ipar
        merged_position = self.position + other.position
        return Rule(merged_position, self.temp_sound, self.estimated_sound)


class Transcriber:
    """
    There are two steps to transcribe words:
        - firstly, a greedy approximation of the pronunciation of word
        - then, use of rules to refine the pronunciation of the resulting list of phonemes
    """

    def __init__(
        self,
        diphthongs_ipa: dict,
        diphthongs_ipa_class: dict,
        ipa_class: dict,
        rules: list,
    ):
        """

        :param diphthongs_ipa: dict whose keys are written diphthongs and values the IPA transcription of them
        :param diphthongs_ipa_class: dict whose keys are written diphthongs and values are Vowel instances
        :param ipa_class: dict whose keys are written characters and values are Vowel or Consonant instances
        :param rules: list of Rule instances
        """
        self.diphthongs_ipa = diphthongs_ipa
        self.diphthongs_ipa_class = diphthongs_ipa_class
        self.ipa_class = ipa_class
        self.rules = rules

    def word_to_phonetic_representation(self, word, with_squared_brackets=True):
        """
        Transcribe a single word.

        :param word: normalized word
        :param with_squared_brackets: if True, wrap the result in ``[`` and ``]``
        :return: str
        """
        phonemes = self.text_to_phonemes(word)
        phonetic_representation = self.phonemes_to_phonetic_representation(phonemes)
        if with_squared_brackets:
            return f"[{phonetic_representation}]"
        return phonetic_representation

    def text_to_phonetic_representation(
        self, sentence: str, with_squared_brackets=True
    ) -> str:
        """
        Transcribe a sentence word by word.

        The sentence is lower-cased and stripped of punctuation, then split
        on whitespace; the transcribed words are joined with single spaces.

        :param sentence: text to transcribe
        :param with_squared_brackets: if True, wrap the whole result in ``[`` and ``]``
        :return: str
        """
        cleaned = re.sub(r"[.\";,:\[\]()!&?‘]", "", sentence.lower())
        # split() without argument yields no empty token, so repeated or
        # surrounding spaces (and an empty sentence) no longer end up as an
        # empty word that crashed the transcription.
        transliterated = [
            self.word_to_phonetic_representation(word, False)
            for word in cleaned.split()
        ]
        joined = " ".join(transliterated)
        if with_squared_brackets:
            return "[" + joined + "]"
        return joined

    def text_to_phonemes(self, word: str) -> list:
        """
        Give a greedy approximation of the pronunciation of word.

        The word is read from left to right. Two characters forming a key of
        ``diphthongs_ipa`` give the matching ``diphthongs_ipa_class`` entry;
        otherwise two identical characters give the lengthened sound of that
        character; otherwise a single character gives its ``ipa_class`` entry.

        :param word: word to read; an empty word gives an empty list
        :return: list of sounds
        """
        phonemes = []
        index = 0
        while index < len(word):
            pair = word[index : index + 2]
            # Only a full pair can be a diphthong or a doubled character;
            # the last character of the word is always read on its own.
            if len(pair) == 2 and pair in self.diphthongs_ipa:
                phonemes.append(self.diphthongs_ipa_class[pair])
                index += 2
            elif len(pair) == 2 and pair[0] == pair[1]:
                phonemes.append(self.ipa_class[pair[0]].lengthen())
                index += 2
            else:
                phonemes.append(self.ipa_class[word[index]])
                index += 1
        return phonemes

    def phonemes_to_phonetic_representation(self, phonemes: list) -> str:
        """
        Use the rules to refine the pronunciation of a list of phonemes.

        For each phoneme, the first rule whose ``temp_sound`` has the same
        ``ipar`` and which can be applied at the phoneme's position provides
        the output; without such a rule the phoneme's own ``ipar`` is used.
        Rules are not consulted for a single phoneme.

        :param phonemes: list(Vowel or Consonant)
        :return: str; empty for an empty list
        """
        if not phonemes:
            return ""
        if len(phonemes) == 1:
            return phonemes[0].ipar

        last_index = len(phonemes) - 1
        phonetic_representation = []
        for i, phoneme in enumerate(phonemes):
            if i == 0:
                # The sound after the first phoneme is the second one; the
                # first phoneme itself used to be passed here.
                current_pos = Position(Rank.first, None, phonemes[1])
            elif i < last_index:
                current_pos = Position(Rank.inner, phonemes[i - 1], phonemes[i + 1])
            else:
                current_pos = Position(Rank.last, phonemes[i - 1], None)

            transcription = phoneme.ipar
            for rule in self.rules:
                if rule.temp_sound.ipar == phoneme.ipar and rule.can_apply(
                    current_pos
                ):
                    transcription = rule.estimated_sound.ipar
                    break
            phonetic_representation.append(transcription)
        return "".join(phonetic_representation)


class BasePhonologicalRule:
    """Pair of a condition and an action on phonemes.

    :param condition: stored as is; this class never calls it
    :param action: callable applied to a phoneme by ``perform_action``
    """

    def __init__(self, condition, action):
        self.condition, self.action = condition, action

    def perform_action(self, phonemes, pos):
        """
        :param phonemes: indexable collection of phonemes
        :param pos: index of the phoneme to transform
        :return: result of the action on ``phonemes[pos]``
        """
        target = phonemes[pos]
        return self.action(target)

    def __call__(self, phonemes, pos):
        # Calling the rule delegates to perform_action, so that subclasses
        # overriding perform_action change the result of the call as well.
        return self.perform_action(phonemes, pos)


class PhonologicalRule(BasePhonologicalRule):
    """Rule whose condition looks at a phoneme and at both of its neighbours."""

    def check_environment(self, phonemes, pos):
        """
        :param phonemes: list of phonemes
        :param pos: index of the phoneme under consideration
        :return: result of ``condition(before, current, after)``, where a
            neighbour is None when *pos* is at the matching end of the list
        """
        previous_sound = None
        if pos > 0:
            previous_sound = phonemes[pos - 1]

        next_sound = None
        if pos < len(phonemes) - 1:
            next_sound = phonemes[pos + 1]

        return self.condition(previous_sound, phonemes[pos], next_sound)


class WordInitialPhonologicalRule(BasePhonologicalRule):
    """Rule that only concerns the first phoneme of a word."""

    def check_environment(self, phonemes, pos):
        """
        :param phonemes: list of phonemes
        :param pos: index of the phoneme under consideration
        :return: False unless *pos* is 0 and there are at least two phonemes;
            otherwise the result of ``condition(first, second)``
        """
        if pos != 0 or len(phonemes) <= 1:
            return False
        return self.condition(phonemes[0], phonemes[1])

    def perform_action(self, phonemes, _):
        # The position argument is ignored: the action always gets the
        # first phoneme.
        first_sound = phonemes[0]
        return self.action(first_sound)


class WordFinalPhonologicalRule(BasePhonologicalRule):
    """Rule that only concerns the last phoneme of a word."""

    def check_environment(self, phonemes, pos):
        """
        :param phonemes: list of phonemes
        :param pos: index of the phoneme under consideration
        :return: False unless *pos* is the last index and there are at least
            two phonemes; otherwise the result of
            ``condition(second_to_last, last)``
        """
        last = len(phonemes) - 1
        if pos != last or len(phonemes) <= 1:
            return False
        return self.condition(phonemes[last - 1], phonemes[last])

    def perform_action(self, phonemes, _):
        """
        Apply the action to the last phoneme; the position argument is ignored.

        The rule is only applicable at the last position, so the phoneme to
        transform is the last one. The action used to receive the
        second-to-last phoneme (index ``len(phonemes) - 2``).

        :param phonemes: list of phonemes
        :return: result of the action on the last phoneme
        """
        return self.action(phonemes[len(phonemes) - 1])


class IPATranscriber:
    """Transcribe text letter by letter, then rewrite phonemes with rules.

    :param digraphs: stored as is; not used by the methods of this class
    :param dipthongs: stored as is; not used by the methods of this class
    :param alphabet: dict mapping each letter to its phoneme
    :param rules: list of rules offering ``check_environment(phonemes, pos)``
        and callable as ``rule(phonemes, pos)``
    """

    def __init__(self, digraphs: dict, dipthongs: dict, alphabet: dict, rules: list):
        self.digraphs = digraphs
        self.dipthongs = dipthongs
        self.alphabet = alphabet
        self.rules = rules

    @staticmethod
    def tokenize(text):
        """
        Lower-case *text*, remove punctuation and split it on single spaces.

        :param text: str
        :return: list of str; consecutive spaces produce empty strings
        """
        without_punctuation = re.sub(r"[.\";,:\[\]()!&?‘]", "", text.lower())
        return without_punctuation.split(" ")

    def transcribe_word(self, word):
        """
        :param word: str whose characters are all keys of ``alphabet``
        :return: list of phonemes
        """
        phonemes = []
        for letter in word:
            phonemes.append(self.alphabet[letter])

        # Rules are tried in order at every position; each applicable rule
        # replaces the phoneme in place, so later rules and later positions
        # see the updated list.
        for position in range(len(phonemes)):
            for rule in self.rules:
                if not rule.check_environment(phonemes, position):
                    continue
                phonemes[position] = rule(phonemes, position)
        return phonemes

    def transcribe(self, text):
        """
        :param text: str
        :return: one list of phonemes per token of *text*
        """
        words = self.tokenize(text)
        return [self.transcribe_word(word) for word in words]
Source code for cltk.phonology.non.utils

The Classical Language Toolkit

Navigation

Related Topics