Source code for cltk.prosody.lat.string_utils

"""Utility class for processing scansion and text."""

import re
import sys
import unicodedata

__author__ = ["Todd Cook <todd.g.cook@gmail.com>"]
__license__ = "MIT License"

"""Helper methods for processing scansion"""
# Precompiled, unanchored pattern: the letter "q" followed by the letter "u",
# each in either upper or lower case.
qu_matcher = re.compile("[qQ][uU]")


def remove_punctuation_dict() -> dict[int, None]:
    """
    Build a ``str.translate`` table that deletes punctuation.

    Every code point in ``range(sys.maxunicode)`` whose Unicode general
    category begins with "P" is mapped to ``None``, so translating a string
    with the table drops those characters without putting anything in their
    place.

    :return: dict mapping each punctuation code point to None

    >>> "I'm ok!".translate(remove_punctuation_dict())
    'Im ok'
    """
    punctuation_codepoints = (
        codepoint
        for codepoint in range(sys.maxunicode)
        if unicodedata.category(chr(codepoint)).startswith("P")
    )
    # dict.fromkeys fills in None for every key, which is what translate
    # interprets as "delete this character".
    return dict.fromkeys(punctuation_codepoints)
def punctuation_for_spaces_dict() -> dict[int, str]:
    """
    Build a ``str.translate`` table that turns punctuation into spaces.

    Each punctuation mark is replaced by exactly one space, so the translated
    string has the same length as the input. This keeps stress patterns
    aligned with the original vowel positions of the verse during scansion.

    :return: dict mapping each punctuation code point to a single space

    >>> "I'm ok!".translate(punctuation_for_spaces_dict())
    'I m ok '
    """
    return {
        codepoint: " "
        for codepoint in range(sys.maxunicode)
        if unicodedata.category(chr(codepoint)).startswith("P")
    }
def differences(scansion: str, candidate: str) -> list[int]:
    """
    Given two strings, return a list of index positions where the contents differ.

    Blank spaces are removed from both strings before comparing, so the
    returned indices refer to positions in the space-free strings.

    If ``candidate`` (without spaces) is shorter than ``scansion``, the
    positions of ``scansion`` that have no counterpart are reported as
    differences instead of raising an IndexError. Extra trailing characters
    in ``candidate`` are ignored.

    :param scansion: the reference string
    :param candidate: the string compared against the reference
    :return: ascending list of differing index positions

    >>> differences("abc", "abz")
    [2]
    >>> differences("- U -", "-")
    [1, 2]
    """
    before = scansion.replace(" ", "")
    after = candidate.replace(" ", "")
    diffs = [
        idx
        for idx, (expected, actual) in enumerate(zip(before, after))
        if expected != actual
    ]
    # Anything left over in the reference has nothing to match against;
    # the range is empty whenever the candidate is at least as long.
    diffs.extend(range(len(after), len(before)))
    return diffs
def mark_list(line: str) -> list[int]:
    """
    Locate every character of a string that is not a blank space.

    :param line: the string to inspect
    :return: ascending index positions of the non-space characters

    >>> mark_list(" a b c")
    [1, 3, 5]
    """
    return [position for position, char in enumerate(line) if char != " "]
def space_list(line: str) -> list[int]:
    """
    Locate every blank space in a string.

    Only the space character itself counts; tabs and other whitespace do not.

    :param line: the string to inspect
    :return: ascending index positions of the blank spaces

    >>> space_list("  ab c ")
    [0, 1, 4, 6]
    """
    return [position for position, char in enumerate(line) if char == " "]
def flatten(list_of_lists):
    """
    Concatenate the items of several iterables into one flat list.

    Only one level of nesting is removed.

    :param list_of_lists: an iterable whose elements are themselves iterable
    :return: a new list holding every inner item, in order

    >>> flatten([[1, 2, 3], [4, 5, 6]])
    [1, 2, 3, 4, 5, 6]
    """
    flattened = []
    for sublist in list_of_lists:
        flattened.extend(sublist)
    return flattened
def to_syllables_with_trailing_spaces(line: str, syllables: list[str]) -> list[str]:
    """
    Attach the blank spaces of a line to the syllables that precede them.

    Each syllable is located in ``line`` (searching onward from where the
    previous one ended) and receives every blank space that immediately
    follows it. Spaces found before the first syllable are prepended to that
    syllable. A syllable that starts with "w"/"W" and does not occur in the
    line is looked up with its w's replaced by u's.

    :param line: the line of text, made of syllables and blank spaces
    :param syllables: the syllables of the line, in order
    :return: the syllables with their surrounding spaces attached
    :raises ValueError: if a syllable cannot be found in the line

    >>> to_syllables_with_trailing_spaces(' arma virumque cano ',
    ... ['ar', 'ma', 'vi', 'rum', 'que', 'ca', 'no' ])
    [' ar', 'ma ', 'vi', 'rum', 'que ', 'ca', 'no ']
    """
    total = len(line)
    spaced = []
    cursor = 0
    for position, syl in enumerate(syllables):
        if syl not in line and re.match("w", syl, flags=re.IGNORECASE):
            syl = syl.replace("w", "u").replace("W", "U")
        found_at = line.index(syl, cursor)
        cursor = found_at + len(syl)
        if position == 0 and found_at > 0:
            # line starts with punctuation that was substituted with spaces
            syl = " " * found_at + syl
        if cursor >= total:
            # the syllable reaches the end of the line; nothing can follow it
            spaced.append(syl)
            return spaced
        if line[cursor] == " ":
            # measure the run of spaces that follows the syllable
            run_end = cursor
            while run_end < total and line[run_end] == " ":
                run_end += 1
            syl += " " * (run_end - cursor)
            if run_end == total:
                # the spaces run to the end of the line
                spaced.append(syl)
                return spaced
            # the next search resumes from the last space of the run
            cursor = run_end - 1
        spaced.append(syl)
    return spaced
def join_syllables_spaces(syllables: list[str], spaces: list[int]) -> str:
    """
    Join syllables into one string and insert blank spaces at given positions.

    The positions are applied one after another in the order given, so each
    index refers to the string as it stands after the earlier insertions.

    :param syllables: the syllables to concatenate
    :param spaces: index positions at which a space is inserted
    :return: the joined string with the spaces inserted

    >>> join_syllables_spaces(["won", "to", "tree", "dun"], [3, 6, 11])
    'won to tree dun'
    """
    characters = [*"".join(syllables)]
    for position in spaces:
        characters.insert(position, " ")
    return "".join(characters)
def starts_with_qu(word) -> bool:
    """
    Determine whether or not a word starts with the letters Q and U.

    The check is case-insensitive and looks only at the first two letters;
    a "qu" occurring later in the word does not count.

    :param word: the word to test
    :return: True if the word begins with "qu" in any letter case

    >>> starts_with_qu("qui")
    True
    >>> starts_with_qu("Quirites")
    True
    >>> starts_with_qu("aqua")
    False
    """
    return word[:2].lower() == "qu"
def stress_positions(stress: str, scansion: str) -> list[int]:
    """
    Find where a stress mark occurs in a scansion line.

    Blank spaces are removed from the scansion first, so the returned
    indices count only the scansion marks themselves.

    :param stress: the single character marking a stress
    :param scansion: the scansion line, possibly padded with spaces
    :return: ascending index positions of the stress mark

    >>> stress_positions("-", " - U U - UU - U U")
    [0, 3, 6]
    """
    compact = scansion.replace(" ", "")
    return [position for position, mark in enumerate(compact) if mark == stress]
def merge_elisions(elided: list[str]) -> str:
    """
    Merge several variants of a line in which elided letters were blanked out.

    The first variant is the starting point; every position that holds a
    blank space in any of the variants becomes a blank space in the result.

    :param elided: variants of the same line, with elisions replaced by spaces
    :return: the first variant with all the elisions applied

    >>> merge_elisions(["abcde fgh", "abc   fgh", "abcde f h"])
    'abc   f h'
    """
    merged = list(elided[0])
    for variant in elided:
        blanks = (position for position, char in enumerate(variant) if char == " ")
        for position in blanks:
            merged[position] = " "
    return "".join(merged)
def move_consonant_right(letters: list[str], positions: list[int]) -> list[str]:
    """
    Push the strings at the given positions onto their right-hand neighbors.

    For every position, the string found there is prepended to the following
    element and its own slot is left as an empty string. The list is changed
    in place and the same list object is returned.

    :param letters: list of letters (or already merged letter groups)
    :param positions: index positions of the consonants to move
    :return: the modified ``letters`` list

    >>> move_consonant_right(list("abbra"), [ 2, 3])
    ['a', 'b', '', '', 'bra']
    """
    for index in positions:
        # targets are stored left to right: the neighbor first, then the blank
        letters[index + 1], letters[index] = letters[index] + letters[index + 1], ""
    return letters
[docs]def move_consonant_left(letters: list[str], positions: list[int]) -> list[str]: """ Given a list of letters, and a list of consonant positions, move the consonant positions to the left, merging strings as necessary. :param letters: :param positions: :return: >>> move_consonant_left(['a', 'b', '', '', 'bra'], [1]) ['ab', '', '', '', 'bra'] """ for pos in positions: letters[pos - 1] = letters[pos - 1] + letters[pos] letters[pos] = "" return letters
def merge_next(letters: list[str], positions: list[int]) -> list[str]:
    """
    Merge the element at each given position with the element that follows it.

    The following element is appended to the one at the position and its own
    slot is left as an empty string. The list is changed in place and the
    same list object is returned.

    :param letters: list of letters (or already merged letter groups)
    :param positions: index positions whose next neighbor is absorbed
    :return: the modified ``letters`` list

    >>> merge_next(['a', 'b', 'o', 'v', 'o' ], [0, 2])
    ['ab', '', 'ov', '', 'o']
    >>> # Positions are processed in turn on the same list, so a slot that
    >>> # was already emptied simply takes over its neighbor:
    >>> merge_next(['a', 'b', 'o', 'v', 'o' ], [0, 2, 3])
    ['ab', '', 'ov', 'o', '']
    """
    for index in positions:
        following = index + 1
        letters[index] += letters[following]
        letters[following] = ""
    return letters
def remove_blanks(letters: list[str]):
    """
    Drop the empty strings from a list of letters.

    Strings consisting of blank spaces are kept; only ``""`` is removed.

    :param letters: list of letters, possibly containing empty strings
    :return: a new list without the empty strings

    >>> remove_blanks(['a', '', 'b', '', 'c'])
    ['a', 'b', 'c']
    """
    return [letter for letter in letters if letter != ""]
def split_on(word: str, section: str) -> tuple[str, str]:
    """
    Split a word in two right after the first occurrence of a section.

    The first part runs from the start of the word through the end of the
    section; the second part is whatever follows.

    :param word: the string to split
    :param section: the substring that ends the first part
    :return: a tuple of (text up to and including section, remainder)
    :raises ValueError: if section does not occur in word

    >>> split_on('hamrye', 'ham')
    ('ham', 'rye')
    """
    cut = word.index(section) + len(section)
    return word[:cut], word[cut:]
def remove_blank_spaces(syllables: list[str]) -> list[str]:
    """
    Drop single blank spaces and empty strings from a list of syllables.

    Only the exact values ``" "`` and ``""`` are removed; longer runs of
    spaces are kept.

    :param syllables: list of syllables, possibly containing blanks
    :return: a new list without the blank entries

    >>> remove_blank_spaces(['', 'a', ' ', 'b', ' ', 'c', ''])
    ['a', 'b', 'c']
    """
    return [syl for syl in syllables if syl != " " and syl != ""]
def overwrite(
    char_list: list[str], regexp: str, quality: str, offset: int = 0
) -> list[str]:
    """
    Overwrite one character for every match of a regular expression.

    The characters are joined into a string and searched with ``regexp``.
    For each match, the element located ``offset`` positions after the start
    of the match is replaced by ``quality``. The list is changed in place and
    the same list object is returned.

    :param char_list: a list of characters and spaces
    :param regexp: the regular expression to look for
    :param quality: the replacement character
    :param offset: distance from the match start to the overwritten element
    :return: the modified ``char_list``

    >>> overwrite(list("multe igne"), "e [aeiou]", " ")
    ['m', 'u', 'l', 't', ' ', ' ', 'i', 'g', 'n', 'e']
    """
    # Matching runs against a snapshot of the text, so the replacements made
    # below do not influence later matches.
    snapshot = "".join(char_list)
    for found in re.finditer(regexp, snapshot):
        char_list[found.start() + offset] = quality
    return char_list
def overwrite_dipthong(char_list: list[str], regexp: str, quality: str) -> list[str]:
    """
    Overwrite two characters for every match of a regular expression.

    The characters are joined into a string and searched with ``regexp``.
    For each match, the element at the start of the match and the one right
    after it are both replaced by ``quality``. The list is changed in place
    and the same list object is returned.

    :param char_list: a list of characters
    :param regexp: a matching regular expression
    :param quality: a quality or character to write over the diphthong
    :return: the modified ``char_list``

    >>> overwrite_dipthong(list("multae aguae"), "ae [aeou]", " ")
    ['m', 'u', 'l', 't', ' ', ' ', ' ', 'a', 'g', 'u', 'a', 'e']
    """
    # Matching runs against a snapshot of the text, so the replacements made
    # below do not influence later matches.
    snapshot = "".join(char_list)
    for found in re.finditer(regexp, snapshot):
        first = found.start()
        for position in (first, first + 1):
            char_list[position] = quality
    return char_list
def get_unstresses(stresses: list[int], count: int) -> list[int]:
    """
    Given a list of stressed positions, and count of possible positions,
    return a list of the unstressed positions.

    The result is always in ascending order. Stressed positions outside
    ``range(count)`` have no effect.

    :param stresses: a list of stressed positions
    :param count: the number of possible positions
    :return: a list of unstressed positions, in ascending order

    >>> get_unstresses([0, 3, 6, 9, 12, 15], 17)
    [1, 2, 4, 5, 7, 8, 10, 11, 13, 14, 16]
    >>> get_unstresses([0, 1, 2, 3, 4, 5, 6], 9)
    [7, 8]
    """
    stressed = set(stresses)
    # Walk the positions in order rather than listing a set difference:
    # sets are unordered, so their iteration order is not a reliable sort.
    return [position for position in range(count) if position not in stressed]