Spaces:

Flux9665
/

ArticulatoryFeatures

Sleeping

App Files Files Community

Flux9665 committed on Sep 23, 2024

Commit

606f0bc

verified ·

1 Parent(s): 46f4c3c

Upload 7 files

Browse files

Files changed (7) hide show

ArticulatoryTextFrontend.py +1073 -0
articulatory_features.py +966 -0
iso_lookup.json +0 -0
iso_to_fullname.json +0 -0
requirements.txt +12 -0
run_feature_visualization.py +55 -0
run_grapheme_to_feature_demo.py +33 -0

ArticulatoryTextFrontend.py ADDED Viewed

	@@ -0,0 +1,1073 @@

+# -*- coding: utf-8 -*-
+import json
+import logging
+import re
+import torch
+from articulatory_features import generate_feature_table
+from articulatory_features import get_feature_to_index_lookup
+from articulatory_features import get_phone_to_id
+from dragonmapper.transcriptions import pinyin_to_ipa
+from phonemizer.backend import EspeakBackend
+from pypinyin import pinyin
+def load_json_from_path(path):  # redundant to the one in utils, but necessary to avoid circular imports
+    with open(path, "r", encoding="utf8") as f:
+        obj = json.loads(f.read())
+    return obj
+class ArticulatoryTextFrontend:
+    def __init__(self,
+                 language,
+                 add_silence_to_end=True,  # empirically, I found that most TTS systems benefit from having a pause token at the end
+                 use_word_boundaries=True,  # in multilingual scenarios, this can help. Only really works on languages that use whitespace
+                 use_explicit_end_of_utterance_token=True,  # this can help with autoregressive models
+                 device="cpu"):
+        """
+        Mostly preparing ID lookups
+        """
+        # this locks the device, so it has to happen here and not at the top
+        from transphone.g2p import read_g2p
+        self.language = language
+        self.use_explicit_eos = use_explicit_end_of_utterance_token
+        self.add_silence_to_end = add_silence_to_end
+        self.use_word_boundaries = use_word_boundaries
+        register_to_height = {
+            "˥": 5,
+            "˦": 4,
+            "˧": 3,
+            "˨": 2,
+            "˩": 1
+        }
+        self.rising_perms = list()
+        self.falling_perms = list()
+        self.peaking_perms = list()
+        self.dipping_perms = list()
+        for first_tone in ["˥", "˦", "˧", "˨", "˩"]:
+            for second_tone in ["˥", "˦", "˧", "˨", "˩"]:
+                if register_to_height[first_tone] > register_to_height[second_tone]:
+                    self.falling_perms.append(first_tone + second_tone)
+                else:
+                    self.rising_perms.append(first_tone + second_tone)
+                for third_tone in ["˥", "˦", "˧", "˨", "˩"]:
+                    if register_to_height[first_tone] > register_to_height[second_tone] < register_to_height[third_tone]:
+                        self.dipping_perms.append(first_tone + second_tone + third_tone)
+                    elif register_to_height[first_tone] < register_to_height[second_tone] > register_to_height[third_tone]:
+                        self.peaking_perms.append(first_tone + second_tone + third_tone)
+        if language == "eng" or language == "en-us":
+            self.g2p_lang = "en-us"  # English as spoken in USA
+            self.expand_abbreviations = english_text_expansion
+            self.phonemizer = "espeak"
+        elif language == "deu":
+            self.g2p_lang = "de"  # German
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "ell":
+            self.g2p_lang = "el"  # Greek
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "spa":
+            self.g2p_lang = "es"  # Spanish
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "spa-lat":
+            self.g2p_lang = "es-419"  # Spanish
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "fin":
+            self.g2p_lang = "fi"  # Finnish
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "rus":
+            self.g2p_lang = "ru"  # Russian
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "hun":
+            self.g2p_lang = "hu"  # Hungarian
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "nld":
+            self.g2p_lang = "nl"  # Dutch
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "fra":
+            self.g2p_lang = "fr-fr"  # French
+            self.expand_abbreviations = remove_french_spacing
+            self.phonemizer = "espeak"
+        elif language == "fr-be":
+            self.g2p_lang = "fr-be"  # French
+            self.expand_abbreviations = remove_french_spacing
+            self.phonemizer = "espeak"
+        elif language == "fr-sw":
+            self.g2p_lang = "fr-ch"  # French
+            self.expand_abbreviations = remove_french_spacing
+            self.phonemizer = "espeak"
+        elif language == "ita":
+            self.g2p_lang = "it"  # Italian
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "por":
+            self.g2p_lang = "pt"  # Portuguese
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "pt-br":
+            self.g2p_lang = "pt-br"  # Portuguese
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "pol":
+            self.g2p_lang = "pl"  # Polish
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "cmn":
+            self.g2p_lang = "cmn"  # Mandarin
+            self.expand_abbreviations = convert_kanji_to_pinyin_mandarin
+            self.phonemizer = "dragonmapper"
+        elif language == "vie":
+            self.g2p_lang = "vi"  # Northern Vietnamese
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "vi-ctr":
+            self.g2p_lang = "vi-vn-x-central"  # Central Vietnamese
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "vi-so":
+            self.g2p_lang = "vi-vn-x-south"  # Southern Vietnamese
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "ukr":
+            self.g2p_lang = "uk"  # Ukrainian
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "pes":
+            self.g2p_lang = "fa"  # Western Farsi
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "afr":
+            self.g2p_lang = "af"  # Afrikaans
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "aln":
+            self.g2p_lang = "sq"  # Albanian
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "amh":
+            self.g2p_lang = "am"  # Amharic
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "arb":
+            self.g2p_lang = "ar"  # Arabic
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "arg":
+            self.g2p_lang = "an"  # Aragonese
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "hye":
+            self.g2p_lang = "hy"  # East Armenian
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "hyw":
+            self.g2p_lang = "hyw"  # West Armenian
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "azj":
+            self.g2p_lang = "az"  # Azerbaijani
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "bak":
+            self.g2p_lang = "ba"  # Bashkir
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "eus":
+            self.g2p_lang = "eu"  # Basque
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "bel":
+            self.g2p_lang = "be"  # Belarusian
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "ben":
+            self.g2p_lang = "bn"  # Bengali
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "bpy":
+            self.g2p_lang = "bpy"  # Bishnupriya Manipuri
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "bos":
+            self.g2p_lang = "bs"  # Bosnian
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "bul":
+            self.g2p_lang = "bg"  # Bulgarian
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "mya":
+            self.g2p_lang = "my"  # Burmese
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "chr":
+            self.g2p_lang = "chr"  # Cherokee
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "yue":
+            self.g2p_lang = "yue"  # Chinese	Cantonese
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "hak":
+            self.g2p_lang = "hak"  # Chinese	Hakka
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "haw":
+            self.g2p_lang = "haw"  # Hawaiian
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "hrv":
+            self.g2p_lang = "hr"  # Croatian
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "ces":
+            self.g2p_lang = "cs"  # Czech
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "dan":
+            self.g2p_lang = "da"  # Danish
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "ekk":
+            self.g2p_lang = "et"  # Estonian
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "gle":
+            self.g2p_lang = "ga"  # Gaelic	Irish
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "gla":
+            self.g2p_lang = "gd"  # Gaelic	Scottish
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "en-sc":
+            self.g2p_lang = "en-gb-scotland"
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "kat":
+            self.g2p_lang = "ka"  # Georgian
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "kal":
+            self.g2p_lang = "kl"  # Greenlandic
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "guj":
+            self.g2p_lang = "gu"  # Gujarati
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "heb":
+            self.g2p_lang = "he"  # Hebrew
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "hin":
+            self.g2p_lang = "hi"  # Hindi
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "isl":
+            self.g2p_lang = "is"  # Icelandic
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "ind":
+            self.g2p_lang = "id"  # Indonesian
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "jpn":
+            import pykakasi
+            self.kakasi = pykakasi.Kakasi()  # this is not a satisfactory solution, but it is the best one I could come up with so far.
+            self.expand_abbreviations = lambda x: " ".join([chunk["hepburn"] for chunk in self.kakasi.convert(x)])
+            self.g2p_lang = language
+            self.phonemizer = "transphone"
+            self.transphone = read_g2p(device=device)
+        elif language == "kan":
+            self.g2p_lang = "kn"  # Kannada
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "knn":
+            self.g2p_lang = "kok"  # Konkani
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "kor":
+            self.g2p_lang = "ko"  # Korean
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "ckb":
+            self.g2p_lang = "ku"  # Kurdish
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "kaz":
+            self.g2p_lang = "kk"  # Kazakh
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "kir":
+            self.g2p_lang = "ky"  # Kyrgyz
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "lat":
+            self.g2p_lang = "la"  # Latin
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "ltz":
+            self.g2p_lang = "lb"  # Luxembourgish
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "lvs":
+            self.g2p_lang = "lv"  # Latvian
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "lit":
+            self.g2p_lang = "lt"  # Lithuanian
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "mri":
+            self.g2p_lang = "mi"  # Māori
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "mkd":
+            self.g2p_lang = "mk"  # Macedonian
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "zlm":
+            self.g2p_lang = "ms"  # Malay
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "mal":
+            self.g2p_lang = "ml"  # Malayalam
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "mlt":
+            self.g2p_lang = "mt"  # Maltese
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "mar":
+            self.g2p_lang = "mr"  # Marathi
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "nci":
+            self.g2p_lang = "nci"  # Nahuatl
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "npi":
+            self.g2p_lang = "ne"  # Nepali
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "nob":
+            self.g2p_lang = "nb"  # Norwegian Bokmål
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "nog":
+            self.g2p_lang = "nog"  # Nogai
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "ory":
+            self.g2p_lang = "or"  # Oriya
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "gaz":
+            self.g2p_lang = "om"  # Oromo
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "pap":
+            self.g2p_lang = "pap"  # Papiamento
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "pan":
+            self.g2p_lang = "pa"  # Punjabi
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "ron":
+            self.g2p_lang = "ro"  # Romanian
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "lav":
+            self.g2p_lang = "ru-lv"  # Russian	Latvia
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "srp":
+            self.g2p_lang = "sr"  # Serbian
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "tsn":
+            self.g2p_lang = "tn"  # Setswana
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "snd":
+            self.g2p_lang = "sd"  # Sindhi
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "slk":
+            self.g2p_lang = "sk"  # Slovak
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "slv":
+            self.g2p_lang = "sl"  # Slovenian
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "smj":
+            self.g2p_lang = "smj"  # Lule Saami
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "swh":
+            self.g2p_lang = "sw"  # Swahili
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "swe":
+            self.g2p_lang = "sv"  # Swedish
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "tam":
+            self.g2p_lang = "ta"  # Tamil
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "tha":
+            self.g2p_lang = "th"  # Thai
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "tuk":
+            self.g2p_lang = "tk"  # Turkmen
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "tat":
+            self.g2p_lang = "tt"  # Tatar
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "tel":
+            self.g2p_lang = "te"  # Telugu
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "tur":
+            self.g2p_lang = "tr"  # Turkish
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "uig":
+            self.g2p_lang = "ug"  # Uyghur
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "urd":
+            self.g2p_lang = "ur"  # Urdu
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "uzn":
+            self.g2p_lang = "uz"  # Uzbek
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        elif language == "cym":
+            self.g2p_lang = "cy"  # Welsh
+            self.expand_abbreviations = lambda x: x
+            self.phonemizer = "espeak"
+        else:
+            # blanket solution for the rest
+            print("Using Transphone. A specialized phonemizer might work better.")
+            self.g2p_lang = language
+            self.phonemizer = "transphone"
+            self.expand_abbreviations = lambda x: x
+            self.transphone = read_g2p(device=device)
+        # remember to also update get_language_id() below when adding something here, as well as the get_example_sentence function
+        if self.phonemizer == "espeak":
+            try:
+                self.phonemizer_backend = EspeakBackend(language=self.g2p_lang,
+                                                        punctuation_marks=';:,.!?¡¿—…()"«»“”~/。【】、‥،؟“”؛',
+                                                        preserve_punctuation=True,
+                                                        language_switch='remove-flags',
+                                                        with_stress=True,
+                                                        logger=logging.getLogger(__file__))
+            except RuntimeError:
+                print("Error in loading espeak! \n"
+                      "Maybe espeak is not installed on your system? \n"
+                      "Falling back to transphone.")
+                from transphone.g2p import read_g2p
+                self.g2p_lang = self.language
+                self.phonemizer = "transphone"
+                self.expand_abbreviations = lambda x: x
+                self.transphone = read_g2p()
+        self.phone_to_vector = generate_feature_table()
+        self.phone_to_id = get_phone_to_id()
+        self.id_to_phone = {v: k for k, v in self.phone_to_id.items()}
+        self.text_vector_to_phone_cache = dict()
+    @staticmethod
+    def get_example_sentence(lang):
+        if lang == "eng":
+            return "This is a complex sentence, it even has a pause!"
+        elif lang == "deu":
+            return "Dies ist ein komplexer Satz, er hat sogar eine Pause!"
+        elif lang == "ell":
+            return "Αυτή είναι μια σύνθετη πρόταση, έχει ακόμη και παύση!"
+        elif lang == "spa":
+            return "Esta es una oración compleja, ¡incluso tiene una pausa!"
+        elif lang == "fin":
+            return "Tämä on monimutkainen lause, sillä on jopa tauko!"
+        elif lang == "rus":
+            return "Это сложное предложение, в нем даже есть пауза!"
+        elif lang == "hun":
+            return "Ez egy összetett mondat, még szünet is van benne!"
+        elif lang == "nld":
+            return "Dit is een complexe zin, er zit zelfs een pauze in!"
+        elif lang == "fra":
+            return "C'est une phrase complexe, elle a même une pause !"
+        elif lang == "por":
+            return "Esta é uma frase complexa, tem até uma pausa!"
+        elif lang == "pol":
+            return "To jest zdanie złożone, ma nawet pauzę!"
+        elif lang == "ita":
+            return "Questa è una frase complessa, ha anche una pausa!"
+        elif lang == "cmn":
+            return "这是一个复杂的句子，它甚至包含一个停顿。"
+        elif lang == "vie":
+            return "Đây là một câu phức tạp, nó thậm chí còn chứa một khoảng dừng."
+        else:
+            print(f"No example sentence specified for the language: {lang}\n "
+                  f"Please specify an example sentence in the get_example_sentence function in Preprocessing/TextFrontend to track your progress.")
+            return None
+    def string_to_tensor(self, text, view=False, device="cpu", handle_missing=True, input_phonemes=False):
+        """
+        Fixes unicode errors, expands some abbreviations,
+        turns graphemes into phonemes and then vectorizes
+        the sequence as articulatory features
+        """
+        if input_phonemes:
+            phones = text
+        else:
+            phones = self.get_phone_string(text=text, include_eos_symbol=True, for_feature_extraction=True)
+        phones = phones.replace("ɚ", "��").replace("ᵻ", "ɨ")
+        if view:
+            print("Phonemes: \n{}\n".format(phones))
+        phones_vector = list()
+        # turn into numeric vectors
+        stressed_flag = False
+        for char in phones:
+            # affects following phoneme -----------------
+            if char.strip() == '\u02C8':
+                # primary stress
+                stressed_flag = True
+            # affects previous phoneme -----------------
+            elif char.strip() == '\u02D0':
+                # lengthened
+                phones_vector[-1][get_feature_to_index_lookup()["lengthened"]] = 1
+            elif char.strip() == '\u02D1':
+                # half length
+                phones_vector[-1][get_feature_to_index_lookup()["half-length"]] = 1
+            elif char.strip() == '\u0306':
+                # shortened
+                phones_vector[-1][get_feature_to_index_lookup()["shortened"]] = 1
+            elif char.strip() == '̃' and phones_vector[-1][get_feature_to_index_lookup()["nasal"]] != 1:
+                # nasalized (vowel)
+                phones_vector[-1][get_feature_to_index_lookup()["nasal"]] = 2
+            elif char.strip() == "̧" != phones_vector[-1][get_feature_to_index_lookup()["palatal"]] != 1:
+                # palatalized
+                phones_vector[-1][get_feature_to_index_lookup()["palatal"]] = 2
+            elif char.strip() == "ʷ" and phones_vector[-1][get_feature_to_index_lookup()["labial-velar"]] != 1:
+                # labialized
+                phones_vector[-1][get_feature_to_index_lookup()["labial-velar"]] = 2
+            elif char.strip() == "ʰ" and phones_vector[-1][get_feature_to_index_lookup()["aspirated"]] != 1:
+                # aspirated
+                phones_vector[-1][get_feature_to_index_lookup()["aspirated"]] = 2
+            elif char.strip() == "ˠ" and phones_vector[-1][get_feature_to_index_lookup()["velar"]] != 1:
+                # velarized
+                phones_vector[-1][get_feature_to_index_lookup()["velar"]] = 2
+            elif char.strip() == "ˁ" and phones_vector[-1][get_feature_to_index_lookup()["pharyngal"]] != 1:
+                # pharyngealized
+                phones_vector[-1][get_feature_to_index_lookup()["pharyngal"]] = 2
+            elif char.strip() == "ˀ" and phones_vector[-1][get_feature_to_index_lookup()["glottal"]] != 1:
+                # glottalized
+                phones_vector[-1][get_feature_to_index_lookup()["glottal"]] = 2
+            elif char.strip() == "ʼ" and phones_vector[-1][get_feature_to_index_lookup()["ejective"]] != 1:
+                # ejective
+                phones_vector[-1][get_feature_to_index_lookup()["ejective"]] = 2
+            elif char.strip() == "̹" and phones_vector[-1][get_feature_to_index_lookup()["rounded"]] != 1:
+                # rounding
+                phones_vector[-1][get_feature_to_index_lookup()["rounded"]] = 2
+            elif char.strip() == "̞" and phones_vector[-1][get_feature_to_index_lookup()["open"]] != 1:
+                # open
+                phones_vector[-1][get_feature_to_index_lookup()["open"]] = 2
+            elif char.strip() == "̪" and phones_vector[-1][get_feature_to_index_lookup()["dental"]] != 1:
+                # dental
+                phones_vector[-1][get_feature_to_index_lookup()["dental"]] = 2
+            elif char.strip() == "̬" and phones_vector[-1][get_feature_to_index_lookup()["voiced"]] != 1:
+                # voiced
+                phones_vector[-1][get_feature_to_index_lookup()["voiced"]] = 2
+            elif char.strip() == "̝" and phones_vector[-1][get_feature_to_index_lookup()["close"]] != 1:
+                # closed
+                phones_vector[-1][get_feature_to_index_lookup()["close"]] = 2
+            elif char.strip() == "̰" and phones_vector[-1][get_feature_to_index_lookup()["glottal"]] != 1 and phones_vector[-1][get_feature_to_index_lookup()["epiglottal"]] != 1:
+                # laryngalization
+                phones_vector[-1][get_feature_to_index_lookup()["glottal"]] = 2
+                phones_vector[-1][get_feature_to_index_lookup()["epiglottal"]] = 2
+            elif char.strip() == "̈" and phones_vector[-1][get_feature_to_index_lookup()["central"]] != 1:
+                # centralization
+                phones_vector[-1][get_feature_to_index_lookup()["central"]] = 2
+            elif char.strip() == "̜" and phones_vector[-1][get_feature_to_index_lookup()["unrounded"]] != 1:
+                # unrounded
+                phones_vector[-1][get_feature_to_index_lookup()["unrounded"]] = 2
+            elif char.strip() == "̥" and phones_vector[-1][get_feature_to_index_lookup()["unvoiced"]] != 1:
+                # voiceless
+                phones_vector[-1][get_feature_to_index_lookup()["unvoiced"]] = 2
+            elif char.strip() == "˥":
+                # very high tone
+                phones_vector[-1][get_feature_to_index_lookup()["very-high-tone"]] = 1
+            elif char.strip() == "˦":
+                # high tone
+                phones_vector[-1][get_feature_to_index_lookup()["high-tone"]] = 1
+            elif char.strip() == "˧":
+                # mid tone
+                phones_vector[-1][get_feature_to_index_lookup()["mid-tone"]] = 1
+            elif char.strip() == "˨":
+                # low tone
+                phones_vector[-1][get_feature_to_index_lookup()["low-tone"]] = 1
+            elif char.strip() == "˩":
+                # very low tone
+                phones_vector[-1][get_feature_to_index_lookup()["very-low-tone"]] = 1
+            elif char.strip() == "⭧":
+                # rising tone
+                phones_vector[-1][get_feature_to_index_lookup()["rising-tone"]] = 1
+            elif char.strip() == "⭨":
+                # falling tone
+                phones_vector[-1][get_feature_to_index_lookup()["falling-tone"]] = 1
+            elif char.strip() == "⮁":
+                # peaking tone
+                phones_vector[-1][get_feature_to_index_lookup()["peaking-tone"]] = 1
+            elif char.strip() == "⮃":
+                # dipping tone
+                phones_vector[-1][get_feature_to_index_lookup()["dipping-tone"]] = 1
+            else:
+                if handle_missing:
+                    try:
+                        phones_vector.append(self.phone_to_vector[char].copy())
+                    except KeyError:
+                        print("unknown phoneme: {}".format(char))
+                else:
+                    phones_vector.append(self.phone_to_vector[char].copy())  # leave error handling to elsewhere
+                # the following lines try to emulate whispering by removing all voiced features
+                # phones_vector[-1][get_feature_to_index_lookup()["voiced"]] = 0
+                # phones_vector[-1][get_feature_to_index_lookup()["unvoiced"]] = 1
+                # the following lines explore what would happen, if the system is told to produce sounds a human cannot
+                # for dim, _ in enumerate(phones_vector[-1]):
+                #     phones_vector[-1][dim] = 1
+                if stressed_flag:
+                    stressed_flag = False
+                    phones_vector[-1][get_feature_to_index_lookup()["stressed"]] = 1
+        return torch.Tensor(phones_vector, device=device)
+    def get_phone_string(self, text, include_eos_symbol=True, for_feature_extraction=False, for_plot_labels=False):
+        if text == "":
+            return ""
+        # expand abbreviations
+        utt = self.expand_abbreviations(text)
+        # convert the graphemes to phonemes here
+        if self.phonemizer == "espeak":
+            try:
+                phones = self.phonemizer_backend.phonemize([utt], strip=True)[0]  # To use a different phonemizer, this is the only line that needs to be exchanged
+            except:
+                print(f"There was an error with espeak. \nFalling back to transphone.\nSentence: {utt} \nLanguage {self.g2p_lang}")
+                from transphone.g2p import read_g2p
+                self.g2p_lang = self.language
+                self.phonemizer = "transphone"
+                self.expand_abbreviations = lambda x: x
+                self.transphone = read_g2p()
+                return self.get_phone_string(text, include_eos_symbol, for_feature_extraction, for_plot_labels)
+        elif self.phonemizer == "transphone":
+            replacements = [
+                # punctuation in languages with non-latin script
+                ("。", "~"),
+                ("，", "~"),
+                ("【", '~'),
+                ("】", '~'),
+                ("、", "~"),
+                ("‥", "~"),
+                ("؟", "~"),
+                ("،", "~"),
+                ("“", '~'),
+                ("”", '~'),
+                ("؛", "~"),
+                ("《", '~'),
+                ("》", '~'),
+                ("？", "~"),
+                ("！", "~"),
+                (" ：", "~"),
+                (" ；", "~"),
+                ("－", "~"),
+                ("·", " "),
+                ("`", ""),
+                # symbols that indicate a pause or silence
+                ('"', "~"),
+                (" - ", "~ "),
+                ("- ", "~ "),
+                ("-", ""),
+                ("…", "~"),
+                (":", "~"),
+                (";", "~"),
+                (",", "~")  # make sure this remains the final one when adding new ones
+            ]
+            for replacement in replacements:
+                utt = utt.replace(replacement[0], replacement[1])
+            utt = re.sub("~+", "~", utt)
+            utt = re.sub(r"\s+", " ", utt)
+            utt = re.sub(r"\.+", ".", utt)
+            chunk_list = list()
+            for chunk in utt.split("~"):
+                # unfortunately the transphone tokenizer is not suited for any languages besides English it seems
+                # this is not much better, but maybe a little.
+                word_list = list()
+                for word_by_whitespace in chunk.split():
+                    word_list.append(self.transphone.inference(word_by_whitespace, self.g2p_lang))
+                chunk_list.append(" ".join(["".join(word) for word in word_list]))
+            phones = "~ ".join(chunk_list)
+        elif self.phonemizer == "dragonmapper":
+            phones = pinyin_to_ipa(utt)
+        # Unfortunately tonal languages don't agree on the tone, most tonal
+        # languages use different tones denoted by different numbering
+        # systems. At this point in the script, it is attempted to unify
+        # them all to the tones in the IPA standard.
+        if self.g2p_lang == "vi":
+            phones = phones.replace('1', "˧")
+            phones = phones.replace('2', "˨˩")
+            phones = phones.replace('ɜ', "˧˥")  # I'm fairly certain that this is a bug in espeak and ɜ is meant to be 3
+            phones = phones.replace('3', "˧˥")  # I'm fairly certain that this is a bug in espeak and ɜ is meant to be 3
+            phones = phones.replace('4', "˦˧˥")
+            phones = phones.replace('5', "˧˩˧")
+            phones = phones.replace('6', "˧˩˨ʔ")  # very weird tone, because the tone introduces another phoneme
+            phones = phones.replace('7', "˧")
+        elif self.g2p_lang == "yue":
+            phones = phones.replace('1', "˥")
+            phones = phones.replace('2', "˧˥")
+            phones = phones.replace('3', "˧")
+            phones = phones.replace('4', "˧˩")
+            phones = phones.replace('5', "˩˧")
+            phones = phones.replace('6', "˨")
+        # more of this handling for more tonal languages can be added here, simply make an elif statement and check for the language.
+        return self.postprocess_phoneme_string(phones, for_feature_extraction, include_eos_symbol, for_plot_labels)
+    def postprocess_phoneme_string(self, phoneme_string, for_feature_extraction, include_eos_symbol, for_plot_labels):
+        """
+        Takes as input a phoneme string and processes it to work best with the way we represent phonemes as featurevectors
+        """
+        replacements = [
+            # punctuation in languages with non-latin script
+            ("。", "."),
+            ("，", ","),
+            ("【", '"'),
+            ("】", '"'),
+            ("、", ","),
+            ("‥", "…"),
+            ("؟", "?"),
+            ("،", ","),
+            ("“", '"'),
+            ("”", '"'),
+            ("؛", ","),
+            ("《", '"'),
+            ("》", '"'),
+            ("？", "?"),
+            ("！", "!"),
+            (" ：", ":"),
+            (" ；", ";"),
+            ("－", "-"),
+            ("·", " "),
+            # latin script punctuation
+            ("/", " "),
+            ("—", ""),
+            ("(", "~"),
+            (")", "~"),
+            ("...", "…"),
+            ("\n", ", "),
+            ("\t", " "),
+            ("¡", ""),
+            ("¿", ""),
+            ("«", '"'),
+            ("»", '"'),
+            # unifying some phoneme representations
+            ("N", "ŋ"),  # somehow transphone doesn't transform this to IPA
+            ("ɫ", "l"),  # alveolopalatal
+            ("ɚ", "ə"),
+            ("g", "ɡ"),
+            ("ε", "e"),
+            ("ʦ", "ts"),
+            ("ˤ", "ˁ"),
+            ('ᵻ', 'ɨ'),
+            ("ɧ", "ç"),  # velopalatal
+            ("ɥ", "j"),  # labiopalatal
+            ("ɬ", "s"),  # lateral
+            ("ɮ", "z"),  # lateral
+            ('ɺ', 'ɾ'),  # lateral
+            ('ʲ', 'j'),  # decomposed palatalization
+            ('\u02CC', ""),  # secondary stress
+            ('\u030B', "˥"),
+            ('\u0301', "˦"),
+            ('\u0304', "˧"),
+            ('\u0300', "˨"),
+            ('\u030F', "˩"),
+            ('\u0302', "⭨"),
+            ('\u030C', "⭧"),
+            ("꜖", "˩"),
+            ("꜕", "˨"),
+            ("꜔", "˧"),
+            ("꜓", "˦"),
+            ("꜒", "˥"),
+            # symbols that indicate a pause or silence
+            ('"', "~"),
+            (" - ", "~ "),
+            ("- ", "~ "),
+            ("-", ""),
+            ("…", "."),
+            (":", "~"),
+            (";", "~"),
+            (",", "~")  # make sure this remains the final one when adding new ones
+        ]
+        unsupported_ipa_characters = {'̙', '̯', '̤', '̩', '̠', '̟', 'ꜜ', '̽', '|', '•', '↘',
+                                      '‖', '‿', 'ᷝ', 'ᷠ', '̚', '↗', 'ꜛ', '̻', '̘', '͡', '̺'}
+        #  https://en.wikipedia.org/wiki/IPA_number
+        for char in unsupported_ipa_characters:
+            replacements.append((char, ""))
+        if not for_feature_extraction:
+            # in case we want to plot etc., we only need the segmental units, so we remove everything else.
+            replacements = replacements + [
+                ('\u02C8', ""),  # primary stress
+                ('\u02D0', ""),  # lengthened
+                ('\u02D1', ""),  # half-length
+                ('\u0306', ""),  # shortened
+                ("˥", ""),  # very high tone
+                ("˦", ""),  # high tone
+                ("˧", ""),  # mid tone
+                ("˨", ""),  # low tone
+                ("˩", ""),  # very low tone
+                ('\u030C', ""),  # rising tone
+                ('\u0302', ""),  # falling tone
+                ('⭧', ""),  # rising
+                ('⭨', ""),  # falling
+                ('⮃', ""),  # dipping
+                ('⮁', ""),  # peaking
+                ('̃', ""),  # nasalizing
+                ("̧", ""),  # palatalized
+                ("ʷ", ""),  # labialized
+                ("ʰ", ""),  # aspirated
+                ("ˠ", ""),  # velarized
+                ("ˁ", ""),  # pharyngealized
+                ("ˀ", ""),  # glottalized
+                ("ʼ", ""),  # ejective
+                ("̹", ""),  # rounding
+                ("̞", ""),  # open
+                ("̪", ""),  # dental
+                ("̬", ""),  # voiced
+                ("̝", ""),  # closed
+                ("̰", ""),  # laryngalization
+                ("̈", ""),  # centralization
+                ("̜", ""),  # unrounded
+                ("̥", ""),  # voiceless
+            ]
+        for replacement in replacements:
+            phoneme_string = phoneme_string.replace(replacement[0], replacement[1])
+        phones = re.sub("~+", "~", phoneme_string)
+        phones = re.sub(r"\s+", " ", phones)
+        phones = re.sub(r"\.+", ".", phones)
+        phones = phones.lstrip("~").rstrip("~")
+        # peaking tones
+        for peaking_perm in self.peaking_perms:
+            phones = phones.replace(peaking_perm, "⮁".join(peaking_perm))
+        # dipping tones
+        for dipping_perm in self.dipping_perms:
+            phones = phones.replace(dipping_perm, "⮃".join(dipping_perm))
+        # rising tones
+        for rising_perm in self.rising_perms:
+            phones = phones.replace(rising_perm, "⭧".join(rising_perm))
+        # falling tones
+        for falling_perm in self.falling_perms:
+            phones = phones.replace(falling_perm, "⭨".join(falling_perm))
+        if self.add_silence_to_end:
+            phones += "~"  # adding a silence in the end during inference produces more natural sounding prosody
+        if include_eos_symbol:
+            phones += "#"
+        if not self.use_word_boundaries:
+            phones = phones.replace(" ", "")
+        if for_plot_labels:
+            phones = phones.replace(" ", "|")
+        phones = "~" + phones
+        phones = re.sub("~+", "~", phones)
+        return phones
+    def text_vectors_to_id_sequence(self, text_vector):
+        tokens = list()
+        for vector in text_vector:
+            if vector[get_feature_to_index_lookup()["word-boundary"]] == 0:
+                # we don't include word boundaries when performing alignment, since they are not always present in audio.
+                features = vector.cpu().numpy().tolist()
+                immutable_vector = tuple(features)
+                if immutable_vector in self.text_vector_to_phone_cache:
+                    tokens.append(self.phone_to_id[self.text_vector_to_phone_cache[immutable_vector]])
+                    continue
+                features = features[13:]
+                # the first 12 dimensions are for modifiers, so we ignore those when trying to find the phoneme in the ID lookup
+                for index in range(len(features)):
+                    if features[index] == 2:
+                        # we remove all features that stem from a modifier, so we can map back to the unmodified sound
+                        features[index] = 0
+                for phone in self.phone_to_vector:
+                    if features == self.phone_to_vector[phone][13:]:
+                        tokens.append(self.phone_to_id[phone])
+                        self.text_vector_to_phone_cache[immutable_vector] = phone
+                        # this is terribly inefficient, but it's fine, since we're building a cache over time that makes this instant
+                        break
+        return tokens
+def english_text_expansion(text):
+    """
+    Apply as small part of the tacotron style text cleaning pipeline, suitable for e.g. LJSpeech.
+    See https://github.com/keithito/tacotron/
+    Careful: Only apply to english datasets. Different languages need different cleaners.
+    """
+    _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in
+                      [('Mrs.', 'misess'), ('Mr.', 'mister'), ('Dr.', 'doctor'), ('St.', 'saint'), ('Co.', 'company'), ('Jr.', 'junior'), ('Maj.', 'major'),
+                       ('Gen.', 'general'), ('Drs.', 'doctors'), ('Rev.', 'reverend'), ('Lt.', 'lieutenant'), ('Hon.', 'honorable'), ('Sgt.', 'sergeant'),
+                       ('Capt.', 'captain'), ('Esq.', 'esquire'), ('Ltd.', 'limited'), ('Col.', 'colonel'), ('Ft.', 'fort')]]
+    for regex, replacement in _abbreviations:
+        text = re.sub(regex, replacement, text)
+    return text
+def remove_french_spacing(text):
+    text = text.replace(" »", '"').replace("« ", '"')
+    for punc in ["!", ";", ":", ".", ",", "?", "-"]:
+        text = text.replace(f" {punc}", punc)
+    return text
+def convert_kanji_to_pinyin_mandarin(text):
+    return " ".join([x[0] for x in pinyin(text)])
+def get_language_id(language, iso_lookup_path="./iso_lookup.json"):
+    iso_codes_to_ids = load_json_from_path(iso_lookup_path)[-1]
+    if language not in iso_codes_to_ids:
+        print("Please specify the language as ISO 639-3 code (https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes)")
+        return None
+    return torch.LongTensor([iso_codes_to_ids[language]])

articulatory_features.py ADDED Viewed

	@@ -0,0 +1,966 @@

+# -*- coding: utf-8 -*-
+# partly derived from an open-source resource provided by Papercup Technologies Limited
+# Resource-Author: Marlene Staib
+# Modified by Florian Lux, 2021
+# Further modified by Florian Lux, 2022
+"""
+All phonemes in the IPA standard are supported.
+Zero-width characters are generally not supported, and
+neither are some other modifiers. Tone, stress and
+lengthening are represented with placeholder dimensions;
+however, they need to be set manually: this conversion
+from phonemes to features works on a character-by-character
+basis. In a few cases, the place of
+articulation is approximated because only one phoneme
+had such a combination, which does not warrant a new
+dimension.
+"""
+def generate_feature_lookup():
+    return {
+        '~': {'symbol_type': 'silence'},
+        '#': {'symbol_type': 'end of sentence'},
+        '?': {'symbol_type': 'questionmark'},
+        '!': {'symbol_type': 'exclamationmark'},
+        '.': {'symbol_type': 'fullstop'},
+        ' ': {'symbol_type': 'word-boundary'},
+        'ɜ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'central',
+            'vowel_openness'   : 'open-mid',
+            'vowel_roundedness': 'unrounded',
+        },
+        'ə': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'central',
+            'vowel_openness'   : 'mid',
+            'vowel_roundedness': 'unrounded',
+        },
+        'a': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'front',
+            'vowel_openness'   : 'open',
+            'vowel_roundedness': 'unrounded',
+        },
+        'ð': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'dental',
+            'consonant_manner': 'fricative'
+        },
+        'ɛ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'front',
+            'vowel_openness'   : 'open-mid',
+            'vowel_roundedness': 'unrounded',
+        },
+        'ɪ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'front_central',
+            'vowel_openness'   : 'close_close-mid',
+            'vowel_roundedness': 'unrounded',
+        },
+        'ŋ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'velar',
+            'consonant_manner': 'nasal'
+        },
+        'ɔ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'back',
+            'vowel_openness'   : 'open-mid',
+            'vowel_roundedness': 'rounded',
+        },
+        'ɒ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'back',
+            'vowel_openness'   : 'open',
+            'vowel_roundedness': 'rounded',
+        },
+        'ɾ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'alveolar',
+            'consonant_manner': 'flap'
+        },
+        'ʃ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'postalveolar',
+            'consonant_manner': 'fricative'
+        },
+        'θ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'dental',
+            'consonant_manner': 'fricative'
+        },
+        'ʊ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'central_back',
+            'vowel_openness'   : 'close_close-mid',
+            'vowel_roundedness': 'unrounded'
+        },
+        'ʌ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'back',
+            'vowel_openness'   : 'open-mid',
+            'vowel_roundedness': 'unrounded'
+        },
+        'ʒ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'postalveolar',
+            'consonant_manner': 'fricative'
+        },
+        'æ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'front',
+            'vowel_openness'   : 'open-mid_open',
+            'vowel_roundedness': 'unrounded'
+        },
+        'b': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'bilabial',
+            'consonant_manner': 'plosive'
+        },
+        'ʔ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'glottal',
+            'consonant_manner': 'plosive'
+        },
+        'd': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'alveolar',
+            'consonant_manner': 'plosive'
+        },
+        'e': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'front',
+            'vowel_openness'   : 'close-mid',
+            'vowel_roundedness': 'unrounded'
+        },
+        'f': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'labiodental',
+            'consonant_manner': 'fricative'
+        },
+        'ɡ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'velar',
+            'consonant_manner': 'plosive'
+        },
+        'h': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'glottal',
+            'consonant_manner': 'fricative'
+        },
+        'i': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'front',
+            'vowel_openness'   : 'close',
+            'vowel_roundedness': 'unrounded'
+        },
+        'j': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'palatal',
+            'consonant_manner': 'approximant'
+        },
+        'k': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'velar',
+            'consonant_manner': 'plosive'
+        },
+        'l': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'alveolar',
+            'consonant_manner': 'lateral-approximant'
+        },
+        'm': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'bilabial',
+            'consonant_manner': 'nasal'
+        },
+        'n': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'alveolar',
+            'consonant_manner': 'nasal'
+        },
+        'ɳ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'retroflex',
+            'consonant_manner': 'nasal'
+        },
+        'o': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'back',
+            'vowel_openness'   : 'close-mid',
+            'vowel_roundedness': 'rounded'
+        },
+        'p': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'bilabial',
+            'consonant_manner': 'plosive'
+        },
+        'ɹ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'alveolar',
+            'consonant_manner': 'approximant'
+        },
+        'r': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'alveolar',
+            'consonant_manner': 'trill'
+        },
+        's': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'alveolar',
+            'consonant_manner': 'fricative'
+        },
+        't': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'alveolar',
+            'consonant_manner': 'plosive'
+        },
+        'u': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'back',
+            'vowel_openness'   : 'close',
+            'vowel_roundedness': 'rounded',
+        },
+        'v': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'labiodental',
+            'consonant_manner': 'fricative'
+        },
+        'w': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'labial-velar',
+            'consonant_manner': 'approximant'
+        },
+        'x': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'velar',
+            'consonant_manner': 'fricative'
+        },
+        'z': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'alveolar',
+            'consonant_manner': 'fricative'
+        },
+        'ʀ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'uvular',
+            'consonant_manner': 'trill'
+        },
+        'ø': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'front',
+            'vowel_openness'   : 'close-mid',
+            'vowel_roundedness': 'rounded'
+        },
+        'ç': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'palatal',
+            'consonant_manner': 'fricative'
+        },
+        'ɐ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'central',
+            'vowel_openness'   : 'open',
+            'vowel_roundedness': 'unrounded'
+        },
+        'œ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'front',
+            'vowel_openness'   : 'open-mid',
+            'vowel_roundedness': 'rounded'
+        },
+        'y': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'front',
+            'vowel_openness'   : 'close',
+            'vowel_roundedness': 'rounded'
+        },
+        'ʏ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'front_central',
+            'vowel_openness'   : 'close_close-mid',
+            'vowel_roundedness': 'rounded'
+        },
+        'ɑ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'back',
+            'vowel_openness'   : 'open',
+            'vowel_roundedness': 'unrounded'
+        },
+        'c': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'palatal',
+            'consonant_manner': 'plosive'
+        },
+        'ɲ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'palatal',
+            'consonant_manner': 'nasal'
+        },
+        'ɣ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'velar',
+            'consonant_manner': 'fricative'
+        },
+        'ʎ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'palatal',
+            'consonant_manner': 'lateral-approximant'
+        },
+        'β': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'bilabial',
+            'consonant_manner': 'fricative'
+        },
+        'ʝ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'palatal',
+            'consonant_manner': 'fricative'
+        },
+        'ɟ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'palatal',
+            'consonant_manner': 'plosive'
+        },
+        'q': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'uvular',
+            'consonant_manner': 'plosive'
+        },
+        'ɕ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'alveolopalatal',
+            'consonant_manner': 'fricative'
+        },
+        'ɭ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'retroflex',
+            'consonant_manner': 'lateral-approximant'
+        },
+        'ɵ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'central',
+            'vowel_openness'   : 'close-mid',
+            'vowel_roundedness': 'rounded'
+        },
+        'ʑ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'alveolopalatal',
+            'consonant_manner': 'fricative'
+        },
+        'ʋ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'labiodental',
+            'consonant_manner': 'approximant'
+        },
+        'ʁ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'uvular',
+            'consonant_manner': 'fricative'
+        },
+        'ɨ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'central',
+            'vowel_openness'   : 'close',
+            'vowel_roundedness': 'unrounded'
+        },
+        'ʂ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'retroflex',
+            'consonant_manner': 'fricative'
+        },
+        'ɓ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'bilabial',
+            'consonant_manner': 'implosive'
+        },
+        'ʙ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'bilabial',
+            'consonant_manner': 'vibrant'
+        },
+        'ɗ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'dental',
+            'consonant_manner': 'implosive'
+        },
+        'ɖ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'retroflex',
+            'consonant_manner': 'plosive'
+        },
+        'χ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'uvular',
+            'consonant_manner': 'fricative'
+        },
+        'ʛ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'uvular',
+            'consonant_manner': 'implosive'
+        },
+        'ʟ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'velar',
+            'consonant_manner': 'lateral-approximant'
+        },
+        'ɽ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'retroflex',
+            'consonant_manner': 'flap'
+        },
+        'ɢ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'uvular',
+            'consonant_manner': 'plosive'
+        },
+        'ɠ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'velar',
+            'consonant_manner': 'implosive'
+        },
+        'ǂ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'alveolopalatal',
+            'consonant_manner': 'click'
+        },
+        'ɦ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'glottal',
+            'consonant_manner': 'fricative'
+        },
+        'ǁ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'alveolar',
+            'consonant_manner': 'click'
+        },
+        'ĩ': {  # identical description with i except nasal
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'front',
+            'vowel_openness'   : 'close',
+            'vowel_roundedness': 'unrounded',
+            'consonant_manner' : 'nasal'
+        },
+        'ʍ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'labial-velar',
+            'consonant_manner': 'fricative'
+        },
+        'ʕ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'pharyngal',
+            'consonant_manner': 'fricative'
+        },
+        'ɻ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'retroflex',
+            'consonant_manner': 'approximant'
+        },
+        'ʄ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'palatal',
+            'consonant_manner': 'implosive'
+        },
+        'ũ': {  # identical with u, but nasal
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'back',
+            'vowel_openness'   : 'close',
+            'vowel_roundedness': 'rounded',
+            'consonant_manner' : 'nasal'
+        },
+        'ɤ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'back',
+            'vowel_openness'   : 'close-mid',
+            'vowel_roundedness': 'unrounded',
+        },
+        'ɶ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'front',
+            'vowel_openness'   : 'open',
+            'vowel_roundedness': 'rounded',
+        },
+        'õ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'back',
+            'vowel_openness'   : 'close-mid',
+            'vowel_roundedness': 'rounded',
+            'consonant_manner' : 'nasal'
+        },
+        'ʡ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'epiglottal',
+            'consonant_manner': 'plosive'
+        },
+        'ʈ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'retroflex',
+            'consonant_manner': 'plosive'
+        },
+        'ʜ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'epiglottal',
+            'consonant_manner': 'fricative'
+        },
+        'ɱ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'labiodental',
+            'consonant_manner': 'nasal'
+        },
+        'ɯ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'back',
+            'vowel_openness'   : 'close',
+            'vowel_roundedness': 'unrounded'
+        },
+        'ǀ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'dental',
+            'consonant_manner': 'click'
+        },
+        'ɸ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'bilabial',
+            'consonant_manner': 'fricative'
+        },
+        'ʘ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'bilabial',
+            'consonant_manner': 'click'
+        },
+        'ʐ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'retroflex',
+            'consonant_manner': 'fricative'
+        },
+        'ɰ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'velar',
+            'consonant_manner': 'approximant'
+        },
+        'ɘ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'central',
+            'vowel_openness'   : 'close-mid',
+            'vowel_roundedness': 'unrounded'
+        },
+        'ħ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'pharyngal',
+            'consonant_manner': 'fricative'
+        },
+        'ɞ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'central',
+            'vowel_openness'   : 'open-mid',
+            'vowel_roundedness': 'rounded'
+        },
+        'ʉ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'central',
+            'vowel_openness'   : 'close',
+            'vowel_roundedness': 'rounded'
+        },
+        'ɴ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'uvular',
+            'consonant_manner': 'nasal'
+        },
+        'ʢ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'epiglottal',
+            'consonant_manner': 'fricative'
+        },
+        'ѵ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'labiodental',
+            'consonant_manner': 'flap'
+        },
+        'ǃ': {  # looks deceivingly like an exclamation mark, but it's a different unicode entry
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'postalveolar',
+            'consonant_manner': 'click'
+        },
+    }  # REMEMBER to also add the phonemes added here to the ID lookup below as the new highest ID
+def get_phone_to_id():
+    """
+    for the states of the ctc loss and dijkstra/mas in the aligner
+    cannot be extracted trivially from above because sets are unordered and the IDs need to be consistent
+    """
+    phone_to_id = dict()
+    for index, phone in enumerate("~#?!ǃ.ɜəaðɛɪŋɔɒɾʃθʊʌʒæbʔdefghijklmnɳopɡɹrstuvwxzʀøçɐœyʏɑcɲɣʎβʝɟqɕɭɵʑʋʁɨʂɓʙɗɖχʛʟɽɢɠǂɦǁĩʍʕɻʄũɤɶõʡʈʜɱɯǀɸʘʐɰɘħɞʉɴʢѵ"):
+        phone_to_id[phone] = index
+    # the following lines fix an issue with the aligner: While the different punctuation marks have
+    # different effects on their context, their realization in the signal is typically just silence.
+    # Since this is common for all of them, the CTC objective malfunctions for our purposes of
+    # alignment search. So it turned out that it's better to map all punctuation marks to silence.
+    phone_to_id["#"] = phone_to_id["~"]
+    phone_to_id["?"] = phone_to_id["~"]
+    phone_to_id["!"] = phone_to_id["~"]
+    phone_to_id["."] = phone_to_id["~"]
+    return phone_to_id
+def get_feature_to_index_lookup():
+    return {
+        # MODIFIER
+        # -- stress: modified by the previous symbol
+        "stressed"           : 0,
+        # -- tone: modified by the following symbol
+        "very-high-tone"     : 1,
+        "high-tone"          : 2,
+        "mid-tone"           : 3,
+        "low-tone"           : 4,
+        "very-low-tone"      : 5,
+        "rising-tone"        : 6,
+        "falling-tone"       : 7,
+        "peaking-tone"       : 8,
+        "dipping-tone"       : 9,
+        # -- lengthening: modified by the following symbol
+        "lengthened"         : 10,
+        "half-length"        : 11,
+        "shortened"          : 12,
+        # CATEGORIES
+        "consonant"          : 13,
+        "vowel"              : 14,
+        "phoneme"            : 15,
+        # NON-SPEECH-MARKERS
+        "silence"            : 16,
+        "end of sentence"    : 17,
+        "questionmark"       : 18,
+        "exclamationmark"    : 19,
+        "fullstop"           : 20,
+        "word-boundary"      : 21,
+        # PLACE
+        "dental"             : 22,
+        "postalveolar"       : 23,
+        "velar"              : 24,
+        "palatal"            : 25,
+        "glottal"            : 26,
+        "uvular"             : 27,
+        "labiodental"        : 28,
+        "labial-velar"       : 29,
+        "alveolar"           : 30,
+        "bilabial"           : 31,
+        "alveolopalatal"     : 32,
+        "retroflex"          : 33,
+        "pharyngal"          : 34,
+        "epiglottal"         : 35,
+        # TONGUE POSITION
+        "central"            : 36,
+        "back"               : 37,
+        "front_central"      : 38,
+        "front"              : 39,
+        "central_back"       : 40,
+        # MOUTH OPENNESS
+        "mid"                : 41,
+        "close-mid"          : 42,
+        "close"              : 43,
+        "open-mid"           : 44,
+        "close_close-mid"    : 45,
+        "open-mid_open"      : 46,
+        "open"               : 47,
+        # MOUTH SHAPE
+        "rounded"            : 48,
+        "unrounded"          : 49,
+        # MANNER
+        "plosive"            : 50,
+        "nasal"              : 51,
+        "approximant"        : 52,
+        "trill"              : 53,
+        "flap"               : 54,
+        "fricative"          : 55,
+        "lateral-approximant": 56,
+        "implosive"          : 57,
+        "vibrant"            : 58,
+        "click"              : 59,
+        "ejective"           : 60,
+        # TYPE
+        "aspirated"          : 61,
+        "unvoiced"           : 62,
+        "voiced"             : 63,
+    }
+def generate_feature_table():
+    ipa_to_phonemefeats = generate_feature_lookup()
+    feat_types = set()
+    for ipa in ipa_to_phonemefeats:
+        if len(ipa) == 1:
+            [feat_types.add(feat) for feat in ipa_to_phonemefeats[ipa].keys()]
+    feat_to_val_set = dict()
+    for feat in feat_types:
+        feat_to_val_set[feat] = set()
+    for ipa in ipa_to_phonemefeats:
+        if len(ipa) == 1:
+            for feat in ipa_to_phonemefeats[ipa]:
+                feat_to_val_set[feat].add(ipa_to_phonemefeats[ipa][feat])
+    # print(feat_to_val_set)
+    value_list = set()
+    for val_set in [feat_to_val_set[feat] for feat in feat_to_val_set]:
+        for value in val_set:
+            value_list.add(value)
+    # print("{")
+    # for index, value in enumerate(list(value_list)):
+    #     print('"{}":{},'.format(value,index))
+    # print("}")
+    value_to_index = get_feature_to_index_lookup()
+    phone_to_vector = dict()
+    for ipa in ipa_to_phonemefeats:
+        if len(ipa) == 1:
+            phone_to_vector[ipa] = [0] * (15 + sum([len(values) for values in [feat_to_val_set[feat] for feat in feat_to_val_set]]))
+            # 15 features come from modifiers, not from lexical sounds, so we have to add them to the ones we encounter naturally in the lexical sounds
+            for feat in ipa_to_phonemefeats[ipa]:
+                if ipa_to_phonemefeats[ipa][feat] in value_to_index:
+                    phone_to_vector[ipa][value_to_index[ipa_to_phonemefeats[ipa][feat]]] = 1
+            if phone_to_vector[ipa][value_to_index["phoneme"]] != 1:
+                # it's not a phoneme, so we give it the silence marker, regardless of what it is.
+                phone_to_vector[ipa][value_to_index["silence"]] = 1
+    for feat in feat_to_val_set:
+        for value in feat_to_val_set[feat]:
+            if value not in value_to_index:
+                print(f"Unknown feature value in featureset! {value}")
+    # print(f"{sum([len(values) for values in [feat_to_val_set[feat] for feat in feat_to_val_set]])} should be 49")
+    return phone_to_vector
+if __name__ == '__main__':
+    print(generate_feature_table())

iso_lookup.json ADDED Viewed

The diff for this file is too large to render. See raw diff

iso_to_fullname.json ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+torch~=2.1.0
+epitran==1.24
+numpy~=1.23.4
+pypinyin~=0.47.1
+matplotlib~=3.7.0
+phonemizer~=3.2.1
+dragonmapper~=0.2.6
+transphone==1.5.3
+pykakasi~=2.2.1
+jamo~=0.4.1
+g2pk~=0.9.4
+gradio~=4.26.0

run_feature_visualization.py ADDED Viewed

	@@ -0,0 +1,55 @@

+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+from ArticulatoryTextFrontend import ArticulatoryTextFrontend
+def visualize_one_hot_encoded_sequence(tensor, sentence, col_labels, cmap='BuGn'):
+    """
+    Visualize a 2D one-hot encoded tensor as a heatmap.
+    """
+    tensor = torch.clamp(tensor, min=0, max=1).transpose(0, 1).cpu().numpy()
+    if tensor.ndim != 2:
+        raise ValueError("Input tensor must be a 2D array")
+    # Check the size of labels matches the tensor dimensions
+    row_labels = ["stressed", "very-high-tone", "high-tone", "mid-tone", "low-tone", "very-low-tone", "rising-tone", "falling-tone", "peaking-tone", "dipping-tone", "lengthened", "half-length", "shortened", "consonant", "vowel", "phoneme", "silence", "end of sentence", "questionmark", "exclamationmark", "fullstop", "word-boundary", "dental", "postalveolar",
+                  "velar", "palatal", "glottal", "uvular", "labiodental", "labial-velar", "alveolar", "bilabial", "alveolopalatal", "retroflex", "pharyngal", "epiglottal", "central", "back", "front_central", "front", "central_back", "mid", "close-mid", "close", "open-mid", "close_close-mid", "open-mid_open", "open", "rounded", "unrounded", "plosive",
+                  "nasal", "approximant", "trill", "flap", "fricative", "lateral-approximant", "implosive", "vibrant", "click", "ejective", "aspirated", "unvoiced", "voiced"]
+    if row_labels and len(row_labels) != tensor.shape[0]:
+        raise ValueError("Number of row labels must match the number of rows in the tensor")
+    if col_labels and len(col_labels) != tensor.shape[1]:
+        raise ValueError("Number of column labels must match the number of columns in the tensor")
+    plt.figure(figsize=(10, 8))
+    # Create the heatmap
+    plt.imshow(tensor, cmap=cmap, aspect='auto')
+    # Add labels
+    if row_labels:
+        plt.yticks(np.arange(tensor.shape[0]), row_labels)
+    if col_labels:
+        plt.xticks(np.arange(tensor.shape[1]), col_labels, rotation=0)
+    plt.grid(False)
+    plt.xlabel('Phones')
+    plt.ylabel('Features')
+    # Display the heatmap
+    plt.title(f"»{sentence}«")
+    plt.tight_layout()
+    plt.show()
+if __name__ == '__main__':
+    sentence = "Rằng: Trong Thánh trạch dồi dào."
+    language = "vie"
+    tf = ArticulatoryTextFrontend(language=language)
+    features = tf.string_to_tensor(sentence)
+    phones = tf.get_phone_string(sentence)
+    visualize_one_hot_encoded_sequence(tensor=features, sentence=sentence, col_labels=phones)

run_grapheme_to_feature_demo.py ADDED Viewed

	@@ -0,0 +1,33 @@

+from ArticulatoryTextFrontend import ArticulatoryTextFrontend, get_language_id
+if __name__ == '__main__':
+    # demonstrating the language ID lookup
+    print(get_language_id("eng"))
+    print(get_language_id("deu"))
+    print(get_language_id("fra"))
+    # demonstrating the conversion from graphemes to features
+    print("\n\nEnglish Test")
+    tf = ArticulatoryTextFrontend(language="eng")
+    features = tf.string_to_tensor("This is a complex sentence, it even has a pause!", view=True)
+    print("\n\nChinese Test")
+    tf = ArticulatoryTextFrontend(language="cmn")
+    features = tf.string_to_tensor("这是一个复杂的句子，它甚至包含一个停顿。", view=True)
+    features = tf.string_to_tensor("李绅 《悯农》 锄禾日当午， 汗滴禾下土。 谁知盘中餐， 粒粒皆辛苦。", view=True)
+    features = tf.string_to_tensor("巴 拔 把 爸 吧", view=True)
+    print("\n\nVietnamese Test")
+    tf = ArticulatoryTextFrontend(language="vie")
+    features = tf.string_to_tensor("Xin chào thế giới, quả là một ngày tốt lành để học nói tiếng Việt!", view=True)
+    features = tf.string_to_tensor("ba bà bá bạ bả bã", view=True)
+    print("\n\nJapanese Test")
+    tf = ArticulatoryTextFrontend(language="jpn")
+    features = tf.string_to_tensor("医師会がなくても、近隣の病院なら紹介してくれると思います。", view=True)
+    print("\n\nZero-Shot Test")
+    tf = ArticulatoryTextFrontend(language="acr")
+    features = tf.string_to_tensor("I don't know this language, but this is just a placeholder text anyway.", view=True)