# Source code for cltk.phonology.orthophonology

"""
A module for representing the orthophonology of a language:
the mapping from orthographic representations to IPA symbols.

Pre-modern languages are characterized by their non-standardized writing rules.
Writers attempt to follow rules that fit morphology (words of same family tend to have close spelling)
and phonology (words of similar pronunciations are written the same way).
As languages evolve, their phonology changes faster than their writing rules.
This module aims to unify writing rules with phonological rules by borrowing
the representation of sound changes used by historical linguistics.

Based on many ideas in cltk.phonology.non.utils by Clément Besnier <clem@clementbesnier.fr>.
"""

import re
from copy import copy
from enum import auto
from typing import Union

from cltk.utils.utils import CLTKEnum

# The list of features and their values are from the IPA charts.
# Features for non-pulmonic consonants (e.g. clicks, implosives) are not yet provided.


__author__ = [
    "John Stewart <johnstewart@aya.yale.edu>",
    "Clément Besnier <clem@clementbesnier.fr>",
]


# ------------------- Phonological Features -------------------


class PhonologicalFeature(CLTKEnum):
    """
    Base class of all phonological feature enumerations.

    A bare feature value can be used wherever a phoneme is expected in the
    rule-building mini-language: every operator below first promotes the
    value to a one-feature phoneme via make_phoneme() and then defers to
    the corresponding operator of that phoneme.
    """

    def matches(self, other):
        """
        Promote this value to a phoneme and test it against other.
        """
        promoted = make_phoneme(self)
        return promoted.matches(other)

    def __le__(self, other):
        promoted = make_phoneme(self)
        return promoted <= other

    def __ge__(self, other):
        promoted = make_phoneme(self)
        return promoted >= other

    def __rshift__(self, other):
        promoted = make_phoneme(self)
        return promoted >> other

    def __sub__(self, other):
        promoted = make_phoneme(self)
        return promoted - other

    def __floordiv__(self, other):
        promoted = make_phoneme(self)
        return promoted // other
class Consonantal(PhonologicalFeature):
    """
    Binary feature separating consonants from vowels.

    Consonant sets it to pos, Vowel sets it to neg, and
    AbstractPhoneme.is_vowel() tests for neg.
    """

    neg = auto()
    pos = auto()
class Voiced(PhonologicalFeature):
    """
    Binary voicing feature, supplied as the `voiced` argument of Consonant.
    """

    neg = auto()
    pos = auto()
class Aspirated(PhonologicalFeature):
    """
    Binary aspiration feature; Consonant defaults it to neg.
    """

    neg = auto()
    pos = auto()
class Geminate(PhonologicalFeature):
    """
    Binary gemination feature; Consonant defaults it to neg and
    Consonant.geminate() produces the pos counterpart.
    """

    neg = auto()
    pos = auto()
class Roundedness(PhonologicalFeature):
    """
    Binary rounding feature, supplied as the `rounded` argument of Vowel.
    """

    neg = auto()
    pos = auto()
class Length(PhonologicalFeature):
    """
    Vowel length. Vowel.lengthen() steps short -> long -> overlong.
    """

    short = auto()
    long = auto()
    overlong = auto()
# order for Height, Backness, and Manner is important # the feature values must be ordered by *increasing sonority*
class Height(PhonologicalFeature):
    """
    Vowel height.

    Member order is significant: values are declared by increasing
    sonority, and Vowel.is_more_sonorous() compares them with `>`.
    """

    close = auto()
    near_close = auto()
    close_mid = auto()
    mid = auto()
    open_mid = auto()
    near_open = auto()
    open = auto()
class Backness(PhonologicalFeature):
    """
    Vowel backness.

    Member order is significant: values are declared by increasing
    sonority, and Vowel.is_more_sonorous() uses it to break Height ties.
    """

    front = auto()
    central = auto()
    back = auto()
class Manner(PhonologicalFeature):
    """
    Manner of articulation of a consonant.

    Member order is significant: values are declared by increasing
    sonority, and Consonant.is_more_sonorous() compares them with `>`.
    """

    stop = auto()
    fricative = auto()
    affricate = auto()
    nasal = auto()
    lateral = auto()
    trill = auto()
    spirant = auto()
    approximant = auto()
class Place(PhonologicalFeature):
    """
    Place of articulation, supplied as the `place` argument of Consonant.
    """

    bilabial = auto()
    labio_dental = auto()
    dental = auto()
    alveolar = auto()
    post_alveolar = auto()
    retroflex = auto()
    palatal = auto()
    velar = auto()
    uvular = auto()
    glottal = auto()
# ------------------- Phonemes -------------------
class AbstractPhoneme:
    """
    An abstract phoneme is just a bundle of phonological features.

    `features` maps a PhonologicalFeature subclass to one of its values;
    `ipa` is an optional IPA symbol that plays no part in equality.

    Instances define __eq__ without __hash__ and are therefore unhashable.
    """

    def __init__(self, features=None, ipa=None):
        """
        :param features: dict mapping feature classes to feature values
            (None means an empty bundle). The dict is stored as given.
        :param ipa: optional IPA symbol for the phoneme.
        :raises TypeError: if a key is not a PhonologicalFeature subclass,
            or a value is not a member of its key.
        """
        features = {} if features is None else features

        # A dict cannot hold duplicate keys, so only the pairing of each
        # value with its feature class needs to be verified.
        for feature_name, feature_value in features.items():
            if not issubclass(feature_name, PhonologicalFeature):
                raise TypeError(str(feature_name) + " is not a phonological feature")
            if type(feature_value) is not feature_name:
                raise TypeError(str(feature_value) + " is not a " + str(feature_name))

        self.features = features
        self.ipa = ipa

    def __copy__(self):
        """
        Shallow copy that owns its feature bundle.

        The default copy.copy() would share the `features` dict between the
        original and the copy, so setting a feature on the "copy" (as merge()
        and the lengthen/geminate/voice helpers do) would silently rewrite
        the original phoneme too. All other attributes, including positional
        flags added by PositionedPhoneme, are carried over unchanged.
        """
        clone = self.__class__.__new__(self.__class__)
        clone.__dict__.update(self.__dict__)
        clone.features = dict(self.features)
        return clone

    def is_vowel(self):
        """
        True when the phoneme is specified as non-consonantal.
        """
        return self[Consonantal] == Consonantal.neg

    def merge(self, other):
        """
        Returns a *copy* of this phoneme, with the features of other merged
        into this feature bundle. This phoneme itself is left untouched.

        Other can be a list of phonemes, in which case the list is returned
        (for technical reasons).
        Other may also be a single feature value or a list of feature values.
        """
        # special case for list of phonemes
        if (
            isinstance(other, list)
            and len(other) > 0
            and isinstance(other[0], AbstractPhoneme)
        ):
            return other

        phoneme = copy(self)

        if isinstance(other, AbstractPhoneme):
            feature_values = other.features.values()
        elif type(other) not in (list, tuple):
            # exact type test on purpose: list subclasses are treated as one value
            feature_values = [other]
        else:
            feature_values = other

        for f in feature_values:
            if type(f) is list:
                for inner_f in f:
                    phoneme[type(inner_f)] = inner_f
            elif isinstance(f, AbstractPhoneme):
                phoneme = phoneme << f
            else:
                phoneme[type(f)] = f

        # a fully specified phoneme also contributes its IPA symbol
        if isinstance(other, AbstractPhoneme) and other.ipa is not None:
            phoneme.ipa = other.ipa
        return phoneme

    def is_equal(self, other):
        """
        Phonemes are equal if they share the same features.
        Note that the IPA symbol is *not* taken into account.
        """
        return other is not None and self.features == other.features

    def matches(self, other):
        """
        This phoneme matches other if other contains all the features of this
        phoneme, i.e. if this phoneme has an improper subset of other's.
        If other is a disjunctive list, then a match is sought for any of
        the list.
        If other is a feature value or list of feature values, it is promoted
        to a phoneme first.
        """
        if other is None:
            return False
        if isinstance(other, PhonemeDisjunction):
            return any(self <= phoneme for phoneme in other)
        if isinstance(other, (list, PhonologicalFeature)):
            other = make_phoneme(other)
        return other.features.items() >= self.features.items()

    def __getitem__(self, feature_name):
        """
        Use dict-type syntax for accessing the values of features.
        Returns None when the feature is unspecified.
        """
        if not issubclass(feature_name, PhonologicalFeature):
            raise TypeError(str(feature_name) + " is not a phonological feature")
        return self.features.get(feature_name, None)

    def __setitem__(self, feature_name, feature_value):
        """
        Use dict-type syntax to set the value of features.
        """
        if not issubclass(feature_name, PhonologicalFeature):
            raise TypeError(str(feature_name) + " is not a phonological feature")
        if type(feature_value) is not feature_name:
            raise TypeError(str(feature_value) + " is not a " + str(feature_name))
        self.features[feature_name] = feature_value
        return self

    def __str__(self):
        return ("IPA:{0} ".format(self.ipa) if self.ipa is not None else "") + " ".join(
            [str(v) for v in self.features.values()]
        )

    __repr__ = __str__

    def __eq__(self, other):
        return self.is_equal(other)

    def __le__(self, other):
        return self.matches(other)

    def __ge__(self, other):
        if type(other) is list:
            other = make_phoneme(other)
        return other.matches(self)

    def __lt__(self, other):
        return other.is_more_sonorous(self)

    def __gt__(self, other):
        return self.is_more_sonorous(other)

    def __rshift__(self, other):
        """
        Creates a phonological rule, merging other with self when applied to
        phonemes matching self.
        """
        return PhonologicalRule(
            condition=lambda _, target, __: self <= target,
            action=lambda target: target << other,
        )

    def __lshift__(self, other):
        return self.merge(other)

    def __sub__(self, other):
        """
        Creates environment functions: boolean functions of the position
        before and after the target.
        """
        if not isinstance(other, (AbstractPhoneme, PhonemeDisjunction)):
            other = make_phoneme(other)

        env_start = PositionedPhoneme(self, env_start=True)
        env_end = PositionedPhoneme(other, env_end=True)
        return lambda before, _, after: env_start <= before and env_end <= after

    def __floordiv__(self, other):
        """
        Creates disjunctive lists of phonemes.
        """
        return PhonemeDisjunction(self, other)
def make_phoneme(*feature_values) -> AbstractPhoneme:
    """
    Builds an abstract phoneme from the feature specifications passed as
    positional arguments, by merging them into an empty feature bundle.
    """
    blank = AbstractPhoneme({})
    return blank << feature_values
def PositionedPhoneme(
    phoneme,
    word_initial=False,
    word_final=False,
    syllable_initial=False,
    syllable_final=False,
    env_start=False,
    env_end=False,
):
    """
    A decorator for phonemes, used in applying rules over words.

    Copies the input phoneme and attaches six boolean attributes to the
    copy: whether it sits at a word boundary, at a syllable boundary, and
    which side of an environment it occupies. The input is not modified.
    """
    positional_flags = {
        "word_initial": word_initial,
        "word_final": word_final,
        "syllable_initial": syllable_initial,
        "syllable_final": syllable_final,
        "env_start": env_start,
        "env_end": env_end,
    }
    positioned = copy(phoneme)
    for flag_name, flag_value in positional_flags.items():
        setattr(positioned, flag_name, flag_value)
    return positioned
class AlwaysMatchingPseudoPhoneme(AbstractPhoneme):
    """
    A wildcard pseudo-phoneme: it has no features, carries the IPA
    placeholder "*", and reports a match against anything.
    """

    def __init__(self):
        super().__init__(ipa="*")

    def matches(self, other: AbstractPhoneme) -> bool:
        """
        Always succeeds, whatever other is.
        """
        return True
class WordBoundaryPseudoPhoneme(AbstractPhoneme):
    """
    A pseudo-phoneme, written "#", that only matches at the start or end
    of a word, i.e. where the neighbouring phoneme is absent.
    """

    def __init__(self):
        super().__init__(ipa="#")

    def is_equal(self, other) -> bool:
        """
        Equality is identity: only the very same object is equal.
        """
        return other is self

    def matches(self, other) -> bool:
        """
        Succeeds only when there is no neighbouring phoneme (other is None).
        """
        return other is None
class SyllableBoundaryPseudoPhoneme(AbstractPhoneme):
    """
    A pseudo-phoneme, written "$", that matches at word boundaries and
    matches positioned phonemes that are at syllable boundaries.
    """

    def __init__(self):
        super().__init__(ipa="$")

    def matches(self, other) -> bool:
        """
        A missing neighbour (word edge) always matches. Otherwise the
        result depends on which side of the environment this instance
        stands: on the left it needs a syllable-final neighbour, on the
        right a syllable-initial one. Absent flags count as False.
        """
        if other is None:
            return True

        closes_syllable_before = getattr(self, "env_start", False) and getattr(
            other, "syllable_final", False
        )
        opens_syllable_after = getattr(self, "env_end", False) and getattr(
            other, "syllable_initial", False
        )
        return bool(closes_syllable_before or opens_syllable_after)
# Module-level instances of the three pseudo-phonemes.
ANY = AlwaysMatchingPseudoPhoneme()  # "*": matches every phoneme
W = WordBoundaryPseudoPhoneme()  # "#": matches only where no neighbour exists
S = SyllableBoundaryPseudoPhoneme()  # "$": matches word and syllable boundaries
class PhonemeDisjunction(list):
    """
    A list of phonemes, with special properties for disjunctive ("or")
    matching.
    """

    def __init__(self, *phonemes):
        """
        :param phonemes: phonemes, feature values, or lists of feature
            values; anything that is not already a phoneme is promoted
            with make_phoneme().
        :raises TypeError: if any argument is of another type.
        """
        # Start from an empty list; the members are added after validation.
        super().__init__()

        if any(
            not isinstance(p, (AbstractPhoneme, PhonologicalFeature, list))
            for p in phonemes
        ):
            raise TypeError(phonemes)

        self.extend(
            p if isinstance(p, AbstractPhoneme) else make_phoneme(p) for p in phonemes
        )

    def __floordiv__(self, other):
        """
        Adds other to this list of phonemes.
        If other is a feature or list of features it is promoted to a phoneme.

        Note that the disjunction is extended *in place* and returned.
        """
        if isinstance(other, (PhonologicalFeature, list)):
            other = make_phoneme(other)

        if not isinstance(other, AbstractPhoneme):
            raise TypeError(other)

        self.append(other)
        return self

    def __rshift__(self, other):
        """
        Creates a phonological rule that fires when any member of the list
        matches the target.
        """
        return PhonologicalRule(
            condition=lambda _, target, __: any(phoneme <= target for phoneme in self),
            action=lambda target: target << other,
        )

    def matches(self, other) -> bool:
        """
        A disjunctive list matches a phoneme if any of its members matches
        the phoneme.
        If other is also a disjunctive list, any match between this list and
        the other returns true.
        """
        if other is None:
            return False
        if isinstance(other, PhonemeDisjunction):
            return any(phoneme.matches(other) for phoneme in self)
        if isinstance(other, (list, PhonologicalFeature)):
            other = make_phoneme(other)
        return any(phoneme <= other for phoneme in self)

    def __sub__(self, other):
        """
        Creates a boolean environmental function whose before is this list
        of phonemes.
        """
        if not isinstance(other, (AbstractPhoneme, PhonemeDisjunction)):
            other = make_phoneme(other)

        env_start = [PositionedPhoneme(phoneme, env_start=True) for phoneme in self]
        env_end = PositionedPhoneme(other, env_end=True)
        return (
            lambda before, _, after: any(phoneme <= before for phoneme in env_start)
            and env_end <= after
        )

    def __le__(self, other):
        return False if other is None else self.matches(other)

    def __ge__(self, other):
        return False if other is None else other.matches(self)
class Consonant(AbstractPhoneme):
    """
    Based on cltk.phonology.utils by @clemsciences.
    A consonant is a phoneme that is specified for the features listed in
    the IPA chart for consonants: Place, Manner, Voicing.
    These may be read directly off the IPA chart, which also gives the IPA
    symbol.

    The Consonantal feature is set to positive, and the aspirated and
    geminate features default to negative.
    See http://www.ipachart.com/
    """

    def __init__(
        self, place, manner, voiced, ipa, geminate=Geminate.neg, aspirated=Aspirated.neg
    ):
        """
        :param place: a Place value
        :param manner: a Manner value
        :param voiced: a Voiced value
        :param ipa: the IPA symbol of the consonant
        :param geminate: a Geminate value (default neg)
        :param aspirated: an Aspirated value (default neg)
        """
        assert place is not None
        assert manner is not None
        assert voiced is not None
        assert geminate is not None
        assert ipa is not None

        AbstractPhoneme.__init__(
            self,
            {
                Consonantal: Consonantal.pos,
                Place: place,
                Manner: manner,
                Voiced: voiced,
                Aspirated: aspirated,
                Geminate: geminate,
            },
            ipa,
        )

    def is_more_sonorous(self, other) -> bool:
        """
        compare this phoneme to another for sonority.
        Used for SSP considerations.
        A consonant is only ever more sonorous than another consonant, and
        then only when its Manner ranks higher.
        """
        return bool(isinstance(other, Consonant) and self[Manner] > other[Manner])

    def merge(self, other):
        """
        Merging a vowel into a consonant simply yields the vowel; anything
        else is merged feature by feature.
        """
        if isinstance(other, Vowel):
            return other
        return AbstractPhoneme.merge(self, other)

    def geminate(self):
        """
        Returns a new Consonant with its Geminate pos, and "ː" appended to
        its IPA symbol. This consonant is left unchanged.
        """
        consonant = copy(self)
        # copy() is shallow: give the new consonant its own feature bundle,
        # otherwise setting Geminate below would also geminate `self`.
        consonant.features = dict(self.features)
        if consonant[Geminate] == Geminate.neg:
            consonant[Geminate] = Geminate.pos
            consonant.ipa += "ː"
        return consonant
class Vowel(AbstractPhoneme):
    """
    The representation of a vowel by its features, as given in the IPA
    chart for vowels.
    See http://www.ipachart.com/
    """

    def __init__(self, height, backness, rounded, length, ipa):
        """
        :param height: a Height value
        :param backness: a Backness value
        :param rounded: a Roundedness value
        :param length: a Length value
        :param ipa: the IPA symbol of the vowel
        """
        assert height is not None
        assert backness is not None
        assert rounded is not None
        assert length is not None
        assert ipa is not None

        AbstractPhoneme.__init__(
            self,
            {
                Consonantal: Consonantal.neg,
                Height: height,
                Backness: backness,
                Roundedness: rounded,
                Length: length,
            },
            ipa,
        )

    def __add__(self, other):
        """
        Summed vowels produce diphthongs, returning a copy of the first vowel
        and the concatenation of the IPA symbols.
        A hack because the features of the second vowel are lost.
        """
        diphthong = copy(self)
        # own feature bundle, so later edits to either object stay local
        diphthong.features = dict(self.features)
        diphthong.ipa += other.ipa
        return diphthong

    def lengthen(self):
        """
        Returns a new Vowel with its Length lengthened, and "ː" appended to
        its IPA symbol. This vowel is left unchanged.
        """
        vowel = copy(self)
        # copy() is shallow: without a private feature bundle, setting
        # Length below would lengthen `self` as well.
        vowel.features = dict(self.features)

        if vowel[Length] == Length.short:
            vowel[Length] = Length.long
        elif vowel[Length] == Length.long:
            vowel[Length] = Length.overlong

        vowel.ipa += "ː"
        return vowel

    def is_more_sonorous(self, other) -> bool:
        """
        compare this phoneme to another for sonority.
        Used for SSP considerations.
        A vowel outranks every consonant; between vowels Height decides,
        with Backness breaking ties.
        """
        if isinstance(other, Consonant):
            return True
        if self[Height] > other[Height]:
            return True
        if self[Height] == other[Height]:
            return self[Backness] > other[Backness]
        return False

    def merge(self, other):
        """
        Merging a consonant into a vowel simply yields the consonant;
        anything else is merged feature by feature.
        """
        if isinstance(other, Consonant):
            return other
        return AbstractPhoneme.merge(self, other)
# ------------------- Phonological Rule Templates -------------------
class BasePhonologicalRule:
    """
    Base class for conditional phonological rules.

    A phonological rule relates an item (a phoneme) to its environment in
    order to define a transformation. It consists of two callables:

    * The condition characterizes the phonological environment of a
      phoneme in terms of the phoneme before it (if any) and after it
      (if any). It takes three arguments -- before, target, after -- and
      returns a boolean telling whether the rule should fire.

    * The action defines a transformation of the target phoneme, e.g. its
      vocalization. It takes only the target and returns the replacement
      phoneme OR a *list* of phonemes.
    """

    def __init__(self, condition, action):
        self.action = action
        self.condition = condition

    def perform_action(self, phonemes, pos):
        """
        Applies the action to the phoneme found at index pos.
        """
        target = phonemes[pos]
        return self.action(target)

    def __call__(self, phonemes, pos):
        """
        Calling the rule is shorthand for perform_action().
        """
        return self.perform_action(phonemes, pos)

    def __or__(self, other_condition):
        """
        Narrows this rule *in place*: afterwards it fires only if both the
        previous condition and other_condition hold. Returns the rule.
        """
        earlier_condition = self.condition

        def combined(before, target, after):
            return earlier_condition(before, target, after) and other_condition(
                before, target, after
            )

        self.condition = combined
        return self
class PhonologicalRule(BasePhonologicalRule):
    """
    The most general phonological rule can apply anywhere in the word.
    The before and after phonemes handed to the condition may therefore be
    None, namely at the first and last position respectively.
    """

    def check_environment(self, phonemes, pos):
        """
        Evaluates the rule's condition for the phoneme at index pos.
        A position past the end of the sequence never satisfies the rule.
        """
        last = len(phonemes) - 1
        if pos > last:
            return False

        before = phonemes[pos - 1] if pos != 0 else None
        after = phonemes[pos + 1] if pos != last else None
        return self.condition(before, phonemes[pos], after)
# ------------------- The ortho-phonology of a language -------------------#
class PhonemeNotFound(Exception):
    """
    Raised when a phoneme cannot be located in the sound inventory.
    The offending phoneme is kept in `unfound_phoneme`.
    """

    def __init__(self, phoneme):
        super().__init__(phoneme)
        self.unfound_phoneme = phoneme
class LetterNotFound(Exception):
    """
    Raised when a letter cannot be located in the alphabet.
    The offending letter is kept in `unfound_letter`.
    """

    def __init__(self, letter):
        super().__init__(letter)
        self.unfound_letter = letter
# A mapping of IPA symbols to English orthographic approximations.
# Thousands of problems here.
ipa_to_pde = {
    "m": "m",
    "n": "n",
    "n̥": "ng",
    "ŋ": "ng",
    "p": "p",
    "b": "b",
    "t": "t",
    "d": "d",
    "k": "k",
    "g": "g",
    "t͡ʃ": "ch",
    "d͡ʒ": "ge",
    "f": "f",
    "v": "v",
    "θ": "th",
    "ð": "th",
    "s": "s",
    "z": "z",
    "ʃ": "sh",
    "ç": "ch",
    "x": "ch",  # tough one -- like in Scottish loch
    "y": "y",
    "h": "h",
    "l": "l",
    "l̥": "l",
    "j": "y",
    "w": "w",
    "r": "r",
    "r̥": "r",
    "i": "i",
    "i:": "ee",
    "y:": "y",
    "u": "u",
    "u:": "oo",
    "e": "e",
    "e:": "ee",
    "ø": "e",
    "ø:": "ee",
    "o": "o",
    "o:": "oo",
    "æ": "a",
    "æ:": "aa",
    "ɑ": "o",
    "ɑ:": "oo",
    "æɑ": "ao",
    "æ:ɑ": "ao",
    "eo": "eo",
    "e:o": "eeo",
    "iu": "iu",
    "i:u": "iiu",
}

# Vowel.lengthen() and Consonant.geminate() mark length with the IPA length
# sign "ː" (U+02D0), whereas the keys above are spelled with an ASCII colon.
# Register every long symbol under both spellings so that lengthened
# phonemes can be looked up.
ipa_to_pde.update(
    {
        symbol.replace(":", "ː"): spelling
        for symbol, spelling in list(ipa_to_pde.items())
        if ":" in symbol
    }
)

# this is just the barest of beginnings!
# Each entry is a (regular expression, replacement) pair applied in order:
#   * word-initial "hw" is respelled "wh";
#   * word-final "oo" + one character becomes "o" + that character + "e".
#     The word end is "$" or a following space; the previous "^" could never
#     match after consumed characters, so the last word was never rewritten.
pde_phonotactics = [(r"(^|(?<= ))hw", "wh"), (r"oo(.)($|(?= ))", "o\\1e")]
class Orthophonology:
    """
    The ortho-phonology of a language is described by:
    * The inventory of all the phonemes of the language.
    * A mapping of orthographic symbols to phonemes.
    * mappings of orthographic symbols pairs to:
        * diphthongs
        * phonemes (i.e. digraphs)
    * phonological rules for the contextual transformation of phonological
      representations.

    The class is very clearly aimed at alphabetic orthographies.
    Its usefulness for e.g. pictographic orthographies is questionable.
    """

    def __init__(
        self,
        sound_inventory,
        alphabet,
        diphthongs,
        digraphs,
        to_modern=(ipa_to_pde, pde_phonotactics),
    ):
        """
        :param sound_inventory: iterable of all phonemes of the language
        :param alphabet: dict mapping a letter to its phoneme
        :param diphthongs: dict mapping a letter pair to a phoneme or a list
            of phonemes
        :param digraphs: dict mapping a letter pair to a phoneme or a list of
            phonemes; on a clash with diphthongs the digraph wins
        :param to_modern: pair (IPA-to-spelling dict, list of
            (regexp, replacement) fixes) used by transcribe_to_modern()
        """
        self.sound_inventory = sound_inventory
        self.alphabet = alphabet
        self.diphthongs = diphthongs
        self.digraphs = digraphs
        # single lookup table for all two-letter sequences
        self.di = {**self.diphthongs, **self.digraphs}
        self.rules = []
        self.to_modern = to_modern

    def add_rule(self, rule):
        """
        Adds a rule to the orthophonology.
        The *order* in which rules are added is critical, since the first
        rule that matches fires.
        """
        self.rules.append(rule)

    # these are not static because language-specific subclasses probably need
    # access to the sound inventory

    def is_syllable_initial(self, phonemes, pos) -> bool:
        """
        Whether the phoneme at pos opens a syllable. The last phoneme of a
        word is never reported as syllable-initial.
        """
        if pos == len(phonemes) - 1:
            return False

        # start of word is always syllable-initial, otherwise use SSP
        return pos == 0 or (
            phonemes[pos - 1].is_more_sonorous(phonemes[pos])
            and not phonemes[pos].is_more_sonorous(phonemes[pos + 1])
        )

    def is_syllable_final(self, phonemes, pos) -> bool:
        """
        Whether the phoneme at pos closes a syllable.
        """
        # end of word is always syllable-final, otherwise use SSP
        return pos == len(phonemes) - 1 or self.is_syllable_initial(phonemes, pos + 1)

    @staticmethod
    def _tokenize(text):
        """
        Lower-cases the text, strips punctuation and splits on single spaces.
        """
        text = text.lower()
        text = re.sub(r"[.\";,:\[\]()!&?‘]", "", text)
        return text.split(" ")

    def _position_phonemes(self, phonemes):
        """
        Mark syllable boundaries, and, in future, other
        positional/suprasegmental features?

        Every element of the list is replaced, in place, by a positioned
        copy; the same list is returned.
        """
        # index loop on purpose: the list is rewritten while neighbours are read
        for i in range(len(phonemes)):
            phonemes[i] = PositionedPhoneme(phonemes[i])
            phonemes[i].syllable_initial = self.is_syllable_initial(phonemes, i)
            phonemes[i].syllable_final = self.is_syllable_final(phonemes, i)
        return phonemes

    def _find_sound(self, phoneme):
        """
        Returns the first inventory phoneme with the same features.

        :raises PhonemeNotFound: if the inventory holds no such phoneme.
        """
        for sound in self.sound_inventory:
            if sound.is_equal(phoneme):
                return sound
        raise PhonemeNotFound(phoneme)

    def transcribe_word(self, word):
        """
        The heart of the transcription process.
        Similar to the system in cltk.phonology.utils, the algorithm:
        1) Applies digraphs and diphthongs to the text of the word.
        2) Carries out a naive ("greedy", per @clemsciences) substitution of
           letters to phonemes, according to the alphabet.
        3) Applies the conditions of the rules to the environment of each
           phoneme in turn. The first rule matched fires. There is no restart
           and later rules are not tested. Also, if a rule returns multiple
           phonemes, these are never re-tested by the rule set.

        :raises LetterNotFound: for a letter missing from the alphabet.
        :raises PhonemeNotFound: if a rule yields a phoneme that is not in
            the sound inventory.
        """
        phonemes = []
        i = 0
        while i < len(word):
            # check for digraphs and diphthongs
            if i < len(word) - 1 and word[i : i + 2] in self.di:
                replacement = self.di[word[i : i + 2]]
                if not isinstance(replacement, list):
                    replacement = [replacement]
                phonemes.extend(replacement)
                i += 2
            else:
                phonemes.append(self[word[i]])
                i += 1

        # apply phonological rules. Note: no restart!
        i = 0
        while i < len(phonemes):
            # The sequence only changes when a rule fires (which ends the scan
            # of this position), so positioning once per position is enough.
            if self.rules:
                phonemes = self._position_phonemes(phonemes)

            for rule in self.rules:
                if rule.check_environment(phonemes, i):
                    replacement = rule(phonemes, i)
                    if not isinstance(replacement, list):
                        replacement = [replacement]
                    new_phonemes = [self._find_sound(p) for p in replacement]
                    phonemes[i : i + 1] = new_phonemes
                    # skip over what was just inserted
                    i += len(replacement) - 1
                    break
            i += 1

        return phonemes

    def transcribe(self, text: str, as_phonemes=False) -> Union[str, list]:
        """
        Transcribes a text, which is first tokenized for words, then each
        word is transcribed.
        If as_phonemes is true, returns a list of list of phoneme objects,
        else returns a string concatenation of the IPA symbols of the
        phonemes.
        """
        phoneme_words = [self.transcribe_word(word) for word in self._tokenize(text)]
        if as_phonemes:
            return phoneme_words

        words = ["".join([phoneme.ipa for phoneme in word]) for word in phoneme_words]
        return " ".join(words)

    def transcribe_to_modern(self, text: str) -> str:
        """
        A very first attempt at transcribing from IPA to some modern
        orthography.
        The method is intended to provide the student with clues to the
        pronunciation of old orthographies.

        :raises KeyError: if a phoneme's IPA symbol has no entry in the
            to_modern mapping.
        """
        # first transcribe letter by letter
        spelling_of = self.to_modern[0]
        phoneme_words = self.transcribe(text, as_phonemes=True)
        words = [
            "".join([spelling_of[phoneme.ipa] for phoneme in word])
            for word in phoneme_words
        ]
        modern_text = " ".join(words)

        # then apply phonotactic fixes
        for regexp, replacement in self.to_modern[1]:
            modern_text = re.sub(regexp, replacement, modern_text)

        return modern_text

    def voice(self, consonant: Consonant) -> Consonant:
        """
        Voices a consonant, by searching the sound inventory for a consonant
        having the same features as the argument, but +voice.
        The argument itself is not modified.
        """
        voiced_consonant = copy(consonant)
        # copy() is shallow; detach the feature bundle before editing it
        voiced_consonant.features = dict(consonant.features)
        voiced_consonant[Voiced] = Voiced.pos
        return self._find_sound(voiced_consonant)

    def aspirate(self, consonant: Consonant) -> Consonant:
        """
        Aspirates a consonant, by searching the sound inventory for a
        consonant having the same features as the argument, but +aspirated.
        The argument itself is not modified.
        """
        aspirated_consonant = copy(consonant)
        # copy() is shallow; detach the feature bundle before editing it
        aspirated_consonant.features = dict(consonant.features)
        aspirated_consonant[Aspirated] = Aspirated.pos
        return self._find_sound(aspirated_consonant)

    def geminate(self, consonant: Consonant) -> Consonant:
        """
        Geminates a consonant, by searching the sound inventory for a
        consonant having the same features as the argument, but +geminate.
        The argument itself is not modified.

        :param consonant: the consonant to geminate
        :return: the matching geminate consonant of the inventory
        """
        geminate_consonant = copy(consonant)
        # copy() is shallow; detach the feature bundle before editing it
        geminate_consonant.features = dict(consonant.features)
        geminate_consonant[Geminate] = Geminate.pos
        return self._find_sound(geminate_consonant)

    @staticmethod
    def lengthen(vowel) -> Vowel:
        """
        Returns a lengthened copy of the vowel argument.
        """
        return vowel.lengthen()

    def __call__(self, text, as_phonemes=False) -> Union[str, list]:
        """
        syntactic sugar for calling the transcribe method
        """
        return self.transcribe(text, as_phonemes)

    def __getitem__(self, letter):
        """
        Returns the phoneme associated with a letter.

        :raises LetterNotFound: if the letter is not in the alphabet (or is
            mapped to None).
        """
        phoneme = self.alphabet.get(letter, None)
        if phoneme is None:
            raise LetterNotFound(letter)
        return phoneme

    def __lshift__(self, rule):
        """
        Syntactic sugar for adding a rule
        """
        self.add_rule(rule)