# Source code for cltk.phonology.ang.transcription

"""This module attempts to reconstruct the approximate phonology of Old English.
"""

import logging
import re
import unicodedata

# Module authorship metadata (name and contact address of each author).
__author__ = [
    "John Stewart <johnstewart@aya.yale.edu>",
    "Clément Besnier <clem@clementbesnier.fr>",
]


# Multi-character (digraph) substitutions. Transcriber.transcribe applies
# these, in insertion order, BEFORE the single-character IPA table so that
# the digraphs are consumed as units.
# NOTE(review): "ea" and "ēa" both map to the same long diphthong "æːɑ" —
# confirm the short diphthong was not meant to be "æɑ".
# NOTE(review): "hl" maps to a plain "l" whereas "hn"/"hr" carry a
# voiceless diacritic — confirm whether "l̥" was intended.
IPA_rules = {
    "ea": "æːɑ",
    "ēa": "æːɑ",
    "nk": "ŋk",
    "ng": "ŋg",
    "hw": "ʍ",
    "hl": "l",
    "hn": "n̥",
    "hr": "r̥",
    "sċ": "ʃ",
}

# Single-character grapheme -> IPA table. Transcriber.transcribe applies it
# with sequential str.replace calls after IPA_rules. Keys are precomposed
# (single code point) characters.
# NOTE(review): "ƿ" (wynn) maps to itself rather than to "w" — confirm.
IPA = {
    "a": "ɑ",
    "æ": "æ",
    "b": "b",
    "c": "k",
    "ċ": "tʃ",
    "d": "d",
    "ð": "ð",
    "e": "e",
    "f": "f",
    "g": "g",
    "ġ": "j",
    "h": "h",
    "i": "i",
    "l": "l",
    "m": "m",
    "n": "n",
    "o": "o",
    "p": "p",
    "r": "r",
    "s": "s",
    "t": "t",
    "u": "u",
    "w": "w",
    "ƿ": "ƿ",
    "x": "x",
    "y": "y",
    "þ": "θ",
    "ǣ": "æː",
    "ā": "ɑː",
    "ē": "eː",
    "ī": "iː",
    "ū": "uː",
    "ō": "oː",
    "ȳ": "yː",
}

# Replacements for Old English letters that have no ASCII form; used by
# Word.ascii_encoding after diacritics have been stripped.
Normalize = {
    "Æ": "Ae",
    "Ƿ": "W",
    "Þ": "Th",
    "Ð": "D",
    "æ": "ae",
    "ƿ": "w",
    "þ": "th",
    "ð": "d",
}

# Runic character -> Latin transliteration. Both names, L_Transliteration and
# Transliteration, are bound to the SAME dict object, so mutating one
# mutates the other. Several runes share one Latin value (e.g. two for "g",
# two for "k"), so the mapping is not invertible.
# Not referenced elsewhere in the code visible here; presumably consumed by
# a transliteration helper — verify against callers.
L_Transliteration = Transliteration = {
    "ᚪ": "a",
    "ᚫ": "æ",
    "ᛒ": "b",
    "ᚳ": "c",
    "ᛞ": "d",
    "ᛖ": "e",
    "ᛠ": "ea",
    "ᚠ": "f",
    "ᚷ": "g",
    "ᚸ": "g",
    "ᚻ": "h",
    "ᛁ": "i",
    "ᛇ": "i",
    "ᛄ": "j",
    "ᛣ": "k",
    "ᛤ": "k",
    "ᛚ": "l",
    "ᛗ": "m",
    "ᚾ": "n",
    "ᛝ": "n",
    "ᚩ": "o",
    "ᛟ": "oe",
    "ᛈ": "p",
    "ᚱ": "r",
    "ᛋ": "s",
    "ᛏ": "t",
    "ᚦ": "th",
    "ᚢ": "u",
    "ᚹ": "w",
    "ᛉ": "x",
    "ᚣ": "y",
}

# Latin -> runic character mapping (the reverse direction of
# L_Transliteration). The two-letter keys are listed before the single
# letters; presumably so an insertion-order, replace-based consumer handles
# digraphs first — the consumer is not visible here, confirm before
# reordering.
R_Transliteration = {
    "ae": "ᚫ",
    "ea": "ᛠ",
    "eo": "ᛇ",
    "oe": "ᛟ",
    "ia": "ᛡ",
    "io": "ᛡ",
    "a": "ᚪ",
    "æ": "ᚫ",
    "b": "ᛒ",
    "c": "ᚳ",
    "d": "ᛞ",
    "e": "ᛖ",
    "f": "ᚠ",
    "g": "ᚷ",
    "h": "ᚻ",
    "i": "ᛁ",
    "j": "ᛄ",
    "k": "ᛣ",
    "l": "ᛚ",
    "m": "ᛗ",
    "n": "ᚾ",
    "ŋ": "ᛝ",
    "o": "ᚩ",
    "œ": "ᛟ",
    "p": "ᛈ",
    "þ": "ᚦ",
    "ð": "ᚦ",
    "ƿ": "ᚹ",
    "q": "ᛤ",
    "r": "ᚱ",
    "s": "ᛋ",
    "t": "ᛏ",
    "u": "ᚢ",
    "v": "ᚹ",
    "w": "ᚹ",
    "x": "ᛉ",
    "y": "ᚣ",
    "z": "ᛉ",
}


# Module-level logger named after this module. A NullHandler is attached so
# the library itself installs no output handler; the application decides
# whether and where records are emitted.
LOG = logging.getLogger(__name__)
LOG.addHandler(logging.NullHandler())


class Transcriber:
    """Rule-based transcriber from Old English orthography to approximate IPA."""

    def __init__(self):
        """Create a transcriber; it holds no state."""
        pass

    @staticmethod
    def transcribe(text: str, punctuation=True, with_squared_brackets=True) -> str:
        """
        Transcribe Old English text into an approximate IPA rendering.

        :param text: The text to be transcribed
        :param punctuation: Retain punctuation; if False, the characters
            ``. " ; , : [ ] ( ) ! & ? ‘`` are removed before transcription
        :param with_squared_brackets: if True, add square brackets around
            the transcription
        :return: the transcription, with hyphens removed

        The input is lower-cased and normalised to Unicode NFC so that
        letters typed as a base letter plus a combining mark (e.g. ``e`` +
        combining macron) match the precomposed keys of the rule tables.

        Context-dependent rules are applied first, then the substitutions
        defined in IPA_rules (digraphs) and IPA (single letters).

        The following exceptions are considered:

        - Geminates are pronounced as long consonants/vowels
          (a doubled letter becomes the letter followed by ``ː``)

        - [v ð z] are allophones of the fricatives /f θ s/ between vowels
        - [ŋ] is an allophone of /n/ occurring before /k/ and /ɡ/
        - /r/ is velarized when geminated or before a consonant; /l/ is
          velarized before a consonant (geminated ``ll`` is rendered as a
          long ``lː``)
        - [ç, x] replace /h/ when preceded by front and back vowels
          respectively

        Note: the allophone [ɣ] of /g/ after a vowel or liquid is not
        produced by this implementation.

        Examples:

        >>> Transcriber().transcribe('Fæder ūre þū þe eeart on heofonum,', punctuation = True)
        '[fæder uːre θuː θe eːɑrˠt on heovonum,]'

        >>> Transcriber().transcribe('Hwæt! wē Gār-Dena in ġēar-dagum', punctuation = False)
        '[ʍæt weː gɑːrdenɑ in jæːɑrdɑgum]'
        """

        if not punctuation:
            text = re.sub(r"[\.\";\,\:\[\]\(\)!&?‘]", "", text)

        # Compose combining sequences: every table key and regex class below
        # uses precomposed code points, so decomposed input would otherwise
        # slip through untranscribed.
        text = unicodedata.normalize("NFC", text.lower())

        # "rr" must be handled before the generic geminate rule, which would
        # otherwise turn it into a long "rː".
        text = re.sub(r"rr", "rˠ", text)
        text = re.sub(r"(\w)\1", r"\1ː", text)

        # /h/ after a front vowel -> ç, after a back vowel -> x.
        text = re.sub(r"(?<=[iīæǣeē])h", "ç", text)
        text = re.sub(r"(?<=[aāoōuū])h", "x", text)

        # Velarize r and l before a consonant.
        text = re.sub(r"r(?=[bcdðfgġhlmnprstwƿxþ])", "rˠ", text)
        text = re.sub(r"l(?=[bcdðfgġhlmnprstwƿxþ])", "ɫ", text)

        # Voice f, þ, s between vowels.
        text = re.sub(r"(?<=[aæeiouyǣāēīūōȳ])f(?=[aæeiouyǣāēīūōȳ])", "v", text)
        text = re.sub(r"(?<=[aæeiouyǣāēīūōȳ])þ(?=[aæeiouyǣāēīūōȳ])", "ð", text)
        text = re.sub(r"(?<=[aæeiouyǣāēīūōȳ])s(?=[aæeiouyǣāēīūōȳ])", "z", text)

        # Digraphs first, then single letters; both in table order.
        for grapheme, ipa in IPA_rules.items():
            text = text.replace(grapheme, ipa)

        for grapheme, ipa in IPA.items():
            text = text.replace(grapheme, ipa)

        text = text.replace("-", "")
        if with_squared_brackets:
            return "[" + text + "]"
        return text


class Word:
    """
    Transcription of Old English words
    """

    def __init__(self, w: str):
        """
        :param w: the word (or text) to operate on; stored unchanged
        """
        self.word = w

    def remove_diacritics(self) -> str:
        """
        :return: str: the input string stripped of its diacritics

        The word is decomposed (Unicode NFKD) and only the characters whose
        Unicode name starts with ``LATIN`` are kept. Combining marks are
        therefore dropped, and so is every other non-Latin character
        (spaces, digits, hyphens, punctuation, control characters).

        Examples:

        >>> Word('ġelǣd').remove_diacritics()
        'gelæd'

        """

        decomposed = unicodedata.normalize("NFKD", self.word)
        # name(c, "") avoids the ValueError that name(c) raises for
        # characters without a Unicode name (e.g. tab or newline).
        return "".join(
            c for c in decomposed if unicodedata.name(c, "").startswith("LATIN")
        )

    def ascii_encoding(self) -> str:
        """
        :return: str: Returns the ASCII-encoded string

        Thorn (Þ, þ) and Ash (Æ, æ) are substituted by the digraphs
        'th' and 'ae' respectively. Wynn (Ƿ, ƿ) and Eth (Ð, ð) are replaced
        by 'w' and 'd'. Diacritics are removed first.

        Examples:

        >>> Word('ġelǣd').ascii_encoding()
        'gelaed'

        >>> Word('ƿeorðunga').ascii_encoding()
        'weordunga'

        """

        w = self.remove_diacritics()

        for letter, replacement in Normalize.items():
            w = w.replace(letter, replacement)

        return w
# Source code for cltk.phonology.ang.transcription
#
# The Classical Language Toolkit
#
# Navigation
#
# Related Topics