Text degradation

Introduction

Tables ex (expressions) and df (definitions) include, in column td, degraded versions of the texts stored in column tt. The rationale for degradation is described in the documentation on the database design.

Infrastructure

Degradation in PanLex is performed by a stored Perl function in the database, which makes use of Unicode character classes, properties, and codepoint ranges. Degradation performs various operations, including subjecting the text to NFKD normalization; removing whitespace and punctuation; making letters lowercase; removing most diacritics (except in abugidas); and merging certain spelling variants (for example, “ß” and “ss”; “ı” and “i”). The precise details of degradation vary by script. The text’s language variety is not considered.

02016-06 algorithm

The td function, as of August 02016, was:

use strict;
# Require typing.

use utf8;
# Make Perl interpret strings as UTF-8.

use Unicode::Normalize 'NFKD';
# Import the NFKD function.

my %td_map = (
# Identify a table of context-independent conversions of single characters
#  to replacement strings of one or more characters. It is declared as a
#  hash because td looks characters up with $td_map{...}. Specifically:

    # LATIN

    "ı" => "i",
    "ß" => "ss",
    # Convert DOTLESS I to I and SHARP S to SS.

    # DEVANAGARI

    "\x{0900}" => "\x{0902}",
    "\x{0901}" => "\x{0902}",
    # Convert CHANDRABINDU or INVERTED CHANDRABINDU to ANUSVARA.

    "\x{0904}" => "\x{0905}",
    "\x{0906}" => "\x{0905}",
    "\x{0972}" => "\x{0905}",
    "\x{0908}" => "\x{0907}",
    "\x{090A}" => "\x{0909}",
    "\x{0976}" => "\x{0909}",
    "\x{0977}" => "\x{0909}",
    "\x{0960}" => "\x{090B}",
    "\x{0961}" => "\x{090C}",
    "\x{090D}" => "\x{090F}",
    "\x{090E}" => "\x{090F}",
    "\x{0910}" => "\x{090F}",
    "\x{0911}" => "\x{0913}",
    "\x{0912}" => "\x{0913}",
    "\x{0914}" => "\x{0913}",
    "\x{0973}" => "\x{0913}",
    "\x{0974}" => "\x{0913}",
    "\x{0975}" => "\x{0913}",
    # Convert a long or short vowel to the corresponding basic vowel.

    "\x{0958}" => "\x{0915}",
    "\x{0959}" => "\x{0916}",
    "\x{095A}" => "\x{0917}",
    "\x{097B}" => "\x{0917}",
    "\x{095B}" => "\x{091C}",
    "\x{0979}" => "\x{091C}",
    "\x{097C}" => "\x{091C}",
    "\x{095C}" => "\x{0921}",
    "\x{097E}" => "\x{0921}",
    "\x{095F}" => "\x{092F}",
    "\x{097A}" => "\x{092F}",
    "\x{095D}" => "\x{0922}",
    "\x{0929}" => "\x{0928}",
    "\x{095E}" => "\x{092B}",
    "\x{097F}" => "\x{092C}",
    "\x{0931}" => "\x{0930}",
    "\x{0934}" => "\x{0933}",
    # Convert a consonant with a diacritical mark to the corresponding
    # consonant without one.

    "\x{0936}" => "\x{0938}",
    "\x{0937}" => "\x{0938}",
    # Convert SHA or SSA to SA.

    "\x{0940}" => "\x{093F}",
    "\x{0942}" => "\x{0941}",
    "\x{0956}" => "\x{0941}",
    "\x{0957}" => "\x{0941}",
    "\x{0944}" => "\x{0943}",
    "\x{0945}" => "\x{0947}",
    "\x{0946}" => "\x{0947}",
    "\x{0948}" => "\x{0947}",
    "\x{0955}" => "\x{0947}",
    "\x{0949}" => "\x{094B}",
    "\x{094A}" => "\x{094B}",
    "\x{094C}" => "\x{094B}",
    "\x{094F}" => "\x{094B}",
    "\x{0963}" => "\x{0962}",
    # Convert a dependent short or long vowel to the corresponding
    # dependent basic vowel.

    # BENGALI

    "\x{0981}" => "\x{0982}",
    # Convert CHANDRABINDU to ANUSVARA.

    "\x{0986}" => "\x{0985}",
    "\x{0988}" => "\x{0987}",
    "\x{098A}" => "\x{0989}",
    "\x{09E0}" => "\x{098B}",
    "\x{09E1}" => "\x{098C}",
    "\x{0990}" => "\x{098F}",
    "\x{0994}" => "\x{0993}",
    # Convert a long or short vowel to the corresponding basic vowel.

    "\x{09DC}" => "\x{09A1}",
    "\x{09DD}" => "\x{09A2}",
    "\x{09CE}" => "\x{09A4}",
    "\x{09DF}" => "\x{09AF}",
    "\x{09F0}" => "\x{09B0}",
    "\x{09F1}" => "\x{09AC}",
    # Convert a consonant with a diacritical mark to the corresponding
    # consonant without one.

    "\x{09B6}" => "\x{09B8}",
    "\x{09B7}" => "\x{09B8}",
    # Convert SHA or SSA to SA.

    "\x{09C0}" => "\x{09BF}",
    "\x{09C2}" => "\x{09C1}",
    "\x{09C4}" => "\x{09C3}",
    "\x{09C8}" => "\x{09C7}",
    "\x{09D7}" => "\x{09CB}",
    "\x{09E3}" => "\x{09E2}",
    # Convert a dependent short or long vowel to the corresponding
    # dependent basic vowel.

    # GURMUKHI

    "\x{0A00}" => "\x{0A02}",
    "\x{0A01}" => "\x{0A02}",
    "\x{0A70}" => "\x{0A02}",
    # Convert ADAK BINDI or TIPPI to BINDI.

    "\x{0A06}" => "\x{0A05}",
    "\x{0A08}" => "\x{0A07}",
    "\x{0A0A}" => "\x{0A09}",
    "\x{0A10}" => "\x{0A0F}",
    "\x{0A14}" => "\x{0A13}",
    # Convert a long or short vowel to the
    # corresponding basic vowel.

    "\x{0A59}" => "\x{0A16}",
    "\x{0A5A}" => "\x{0A17}",
    "\x{0A5B}" => "\x{0A1C}",
    "\x{0A5E}" => "\x{0A2B}",
    "\x{0A33}" => "\x{0A32}",
    # Convert a consonant with a diacritical mark to the corresponding
    # consonant without one.

    "\x{0A36}" => "\x{0A38}",
    # Convert SHA to SA.

    "\x{0A40}" => "\x{0A3F}",
    "\x{0A42}" => "\x{0A41}",
    "\x{0A48}" => "\x{0A47}",
    "\x{0A4C}" => "\x{0A4B}",
    # Convert a dependent short or long vowel to the corresponding
    # dependent basic vowel.

    # GUJARATI

    "\x{0A86}" => "\x{0A85}",
    "\x{0A88}" => "\x{0A87}",
    "\x{0A8A}" => "\x{0A89}",
    "\x{0AE0}" => "\x{0A8B}",
    "\x{0AE1}" => "\x{0A8C}",
    "\x{0A8D}" => "\x{0A8F}",
    "\x{0A90}" => "\x{0A8F}",
    "\x{0A91}" => "\x{0A93}",
    "\x{0A94}" => "\x{0A93}",
    # Convert a long vowel to the corresponding basic vowel.

    "\x{0AB1}" => "\x{0AB0}",
    "\x{0ADE}" => "\x{0AB3}",
    # Convert a deprecated consonant to the corresponding standard one.

    "\x{0AB6}" => "\x{0AB8}",
    "\x{0AB7}" => "\x{0AB8}",
    # Convert SHA or SSA to SA.

    "\x{0AC0}" => "\x{0ABF}",
    "\x{0AC2}" => "\x{0AC1}",
    "\x{0AC4}" => "\x{0AC3}",
    "\x{0AC5}" => "\x{0AC7}",
    "\x{0AC8}" => "\x{0AC7}",
    "\x{0AC9}" => "\x{0ACB}",
    "\x{0ACC}" => "\x{0ACB}",
    "\x{0AE3}" => "\x{0AE2}",
    # Convert a dependent long vowel to the corresponding dependent basic
    #  vowel.

    # ORIYA

    "\x{0B01}" => "\x{0B02}",
    # Convert CHANDRABINDU to ANUSVARA.

    "\x{0B06}" => "\x{0B05}",
    "\x{0B08}" => "\x{0B07}",
    "\x{0B0A}" => "\x{0B09}",
    "\x{0B60}" => "\x{0B0B}",
    "\x{0B61}" => "\x{0B0C}",
    "\x{0B10}" => "\x{0B0F}",
    "\x{0B14}" => "\x{0B13}",
    # Convert a long or short vowel to the corresponding basic vowel.

    "\x{0B5C}" => "\x{0B21}",
    "\x{0B5D}" => "\x{0B22}",
    "\x{0B35}" => "\x{0B2C}",
    "\x{0B71}" => "\x{0B2C}",
    "\x{0B5F}" => "\x{0B2F}",
    # Convert a consonant with a diacritical mark to the corresponding
    # consonant without one.

    "\x{0B36}" => "\x{0B38}",
    "\x{0B37}" => "\x{0B38}",
    # Convert SHA or SSA to SA.

    "\x{0B40}" => "\x{0B3F}",
    "\x{0B42}" => "\x{0B41}",
    "\x{0B44}" => "\x{0B43}",
    "\x{0B48}" => "\x{0B47}",
    "\x{0B57}" => "\x{0B4B}",
    "\x{0B63}" => "\x{0B62}",
    # Convert a dependent short or long vowel to the corresponding
    # dependent basic vowel.

    # TAMIL

    "\x{0B86}" => "\x{0B85}",
    "\x{0B88}" => "\x{0B87}",
    "\x{0B8A}" => "\x{0B89}",
    "\x{0B8F}" => "\x{0B8E}",
    "\x{0B93}" => "\x{0B92}",
    # Convert a long vowel to the corresponding basic vowel.

    "\x{0BB9}" => "\x{0B95}",
    "\x{0B9A}" => "\x{0B9C}",
    "\x{0BB6}" => "\x{0B9C}",
    "\x{0BB7}" => "\x{0B9C}",
    "\x{0BB8}" => "\x{0B9C}",
    # Convert a Grantha letter to the corresponding Tamil one.

    "\x{0BC0}" => "\x{0BBF}",
    "\x{0BC2}" => "\x{0BC1}",
    "\x{0BC7}" => "\x{0BC6}",
    "\x{0BCB}" => "\x{0BCA}",
    # Convert a dependent long vowel to the corresponding dependent basic
    # vowel.

    # TELUGU

    "\x{0C01}" => "\x{0C02}",
    # Convert CHANDRABINDU to ANUSVARA.

    "\x{0C06}" => "\x{0C05}",
    "\x{0C08}" => "\x{0C07}",
    "\x{0C0A}" => "\x{0C09}",
    "\x{0C60}" => "\x{0C0B}",
    "\x{0C61}" => "\x{0C0C}",
    "\x{0C0F}" => "\x{0C0E}",
    "\x{0C13}" => "\x{0C12}",
    # Convert a long vowel to the corresponding basic vowel.

    "\x{0C16}" => "\x{0C15}",
    "\x{0C18}" => "\x{0C17}",
    "\x{0C1B}" => "\x{0C1A}",
    "\x{0C58}" => "\x{0C1A}",
    "\x{0C1D}" => "\x{0C1C}",
    "\x{0C59}" => "\x{0C1C}",
    "\x{0C20}" => "\x{0C1F}",
    "\x{0C22}" => "\x{0C21}",
    "\x{0C25}" => "\x{0C24}",
    "\x{0C27}" => "\x{0C26}",
    "\x{0C2B}" => "\x{0C2A}",
    "\x{0C2D}" => "\x{0C2C}",
    "\x{0C31}" => "\x{0C30}",
    # Convert an aspirated or deprecated consonant to the corresponding
    # basic or standard one.

    "\x{0C36}" => "\x{0C38}",
    "\x{0C37}" => "\x{0C38}",
    # Convert SHA or SSA to SA.

    "\x{0C40}" => "\x{0C3F}",
    "\x{0C42}" => "\x{0C41}",
    "\x{0C44}" => "\x{0C43}",
    "\x{0C47}" => "\x{0C46}",
    "\x{0C4B}" => "\x{0C4A}",
    "\x{0C63}" => "\x{0C62}",
    # Convert a dependent long vowel to the corresponding dependent basic
    # vowel.

    # KANNADA

    "\x{0C86}" => "\x{0C85}",
    "\x{0C88}" => "\x{0C87}",
    "\x{0C8A}" => "\x{0C89}",
    "\x{0CE0}" => "\x{0C8B}",
    "\x{0CE1}" => "\x{0C8C}",
    "\x{0C8F}" => "\x{0C8E}",
    "\x{0C93}" => "\x{0C92}",
    # Convert a long vowel to the corresponding basic vowel.

    "\x{0C96}" => "\x{0C95}",
    "\x{0C98}" => "\x{0C97}",
    "\x{0C9B}" => "\x{0C9A}",
    "\x{0C9D}" => "\x{0C9C}",
    "\x{0CA0}" => "\x{0C9F}",
    "\x{0CA2}" => "\x{0CA1}",
    "\x{0CA5}" => "\x{0CA4}",
    "\x{0CA7}" => "\x{0CA6}",
    "\x{0CAB}" => "\x{0CAA}",
    "\x{0CAD}" => "\x{0CAC}",
    "\x{0CB1}" => "\x{0CB0}",
    "\x{0CDE}" => "\x{0CB3}",
    # Convert an aspirated or deprecated consonant to the corresponding
    # basic or standard one.

    "\x{0CB6}" => "\x{0CB8}",
    "\x{0CB7}" => "\x{0CB8}",
    # Convert SHA or SSA to SA.

    "\x{0CC0}" => "\x{0CBF}",
    "\x{0CC2}" => "\x{0CC1}",
    "\x{0CC4}" => "\x{0CC3}",
    "\x{0CC7}" => "\x{0CC6}",
    "\x{0CCB}" => "\x{0CCA}",
    "\x{0CE3}" => "\x{0CE2}",
    # Convert a dependent long vowel to the corresponding dependent basic
    # vowel.

    # MALAYALAM

    "\x{0D01}" => "\x{0D02}",
    # Convert CHANDRABINDU to ANUSVARA.

    "\x{0D06}" => "\x{0D05}",
    "\x{0D08}" => "\x{0D07}",
    "\x{0D0A}" => "\x{0D09}",
    "\x{0D60}" => "\x{0D0B}",
    "\x{0D61}" => "\x{0D0C}",
    "\x{0D0F}" => "\x{0D0E}",
    "\x{0D13}" => "\x{0D12}",
    # Convert a long vowel to the corresponding basic vowel.

    "\x{0D16}" => "\x{0D15}",
    "\x{0D18}" => "\x{0D17}",
    "\x{0D1B}" => "\x{0D1A}",
    "\x{0D58}" => "\x{0D1A}",
    "\x{0D1D}" => "\x{0D1C}",
    "\x{0D59}" => "\x{0D1C}",
    "\x{0D20}" => "\x{0D1F}",
    "\x{0D22}" => "\x{0D21}",
    "\x{0D25}" => "\x{0D24}",
    "\x{0D27}" => "\x{0D26}",
    "\x{0D2B}" => "\x{0D2A}",
    "\x{0D2D}" => "\x{0D2C}",
    "\x{0D31}" => "\x{0D30}",
    # Convert an aspirated or deprecated consonant to the corresponding
    # basic or standard one.

    "\x{0D36}" => "\x{0D38}",
    "\x{0D37}" => "\x{0D38}",
    # Convert SHA or SSA to SA.

    "\x{0D40}" => "\x{0D3F}",
    "\x{0D42}" => "\x{0D41}",
    "\x{0D44}" => "\x{0D43}",
    "\x{0D47}" => "\x{0D46}",
    "\x{0D4B}" => "\x{0D4A}",
    "\x{0D63}" => "\x{0D62}",
    # Convert a dependent long vowel to the corresponding dependent basic
    # vowel.

    # MYANMAR

    "\x{102B}" => "\x{102C}",
    # Convert VOWEL SIGN TALL AA to VOWEL SIGN AA.

    "\x{103F}" => "\x{101E}\x{103A}\x{101E}",
    # Convert GREAT SA to stacked form (with ASAT).

    "\x{1039}" => "\x{103A}",
    # Convert VIRAMA to ASAT.

    "\x{103B}" => "\x{103A}\x{101A}",
    "\x{103C}" => "\x{103A}\x{101B}",
    "\x{103D}" => "\x{103A}\x{101D}",
    "\x{103E}" => "\x{103A}\x{101E}",
    "\x{105E}" => "\x{103A}\x{1014}",
    "\x{105F}" => "\x{103A}\x{1019}",
    "\x{1060}" => "\x{103A}\x{101C}",
    "\x{1082}" => "\x{103A}\x{101D}",
    # Replace medial consonants with ASAT + full consonant.

    "\x{1022}" => "\x{1021}",
    "\x{1028}" => "\x{1027}",
    "\x{1033}" => "\x{102E}",
    "\x{1034}" => "\x{1031}\x{102C}",
    "\x{1035}" => "\x{1031}",
    "\x{105A}" => "\x{1004}",
    "\x{105B}" => "\x{1008}",
    "\x{1066}" => "\x{1015}\x{103A}\x{101D}",
    "\x{106E}" => "\x{100F}",
    "\x{106F}" => "\x{101A}\x{103A}\x{101D}",
    "\x{1070}" => "\x{1003}\x{103A}\x{101D}",
    "\x{1071}" => "\x{102D}",
    "\x{1073}" => "\x{1025}",
    "\x{1075}" => "\x{1000}",
    "\x{1076}" => "\x{1001}",
    "\x{1077}" => "\x{1002}",
    "\x{1078}" => "\x{1005}",
    "\x{107A}" => "\x{1009}",
    "\x{107B}" => "\x{1012}",
    "\x{107C}" => "\x{1014}",
    "\x{107D}" => "\x{1016}",
    "\x{107F}" => "\x{1017}",
    "\x{1081}" => "\x{101F}",
    "\x{1083}" => "\x{102C}",
    "\x{1084}" => "\x{1031}",
    "\x{1085}" => "\x{1031}",
    "\x{1086}" => "\x{101A}\x{103A}",
    "\x{AA60}" => "\x{1002}",
    "\x{AA61}" => "\x{1005}",
    "\x{AA62}" => "\x{1006}",
    "\x{AA63}" => "\x{1007}",
    "\x{AA64}" => "\x{1008}",
    "\x{AA65}" => "\x{1009}",
    "\x{AA66}" => "\x{100B}",
    "\x{AA67}" => "\x{100C}",
    "\x{AA68}" => "\x{100D}",
    "\x{AA69}" => "\x{100E}",
    "\x{AA6A}" => "\x{1013}",
    "\x{AA6B}" => "\x{1014}",
    "\x{AA6C}" => "\x{101E}",
    "\x{AA6D}" => "\x{101F}",
    "\x{AA73}" => "\x{101B}",
    "\x{AA7E}" => "\x{1006}",
    "\x{A9E0}" => "\x{1003}",
    "\x{A9E1}" => "\x{1006}",
    "\x{A9E2}" => "\x{1008}",
    "\x{A9E3}" => "\x{100F}",
    "\x{A9E4}" => "\x{1018}",
    "\x{AA70}" => "\x{A9E6}",
    "\x{A9E7}" => "\x{1009}",
    "\x{A9E9}" => "\x{1002}",
    "\x{A9EA}" => "\x{1003}",
    "\x{A9EB}" => "\x{1007}",
    "\x{A9EC}" => "\x{1008}",
    "\x{A9ED}" => "\x{100D}",
    "\x{A9EE}" => "\x{100E}",
    "\x{A9FA}" => "\x{1020}",
    "\x{A9FB}" => "\x{1012}",
    "\x{A9FD}" => "\x{1017}",
    "\x{A9FE}" => "\x{1018}",
    # Replace non-Myanmar (Mon, Shan, etc.) letters with their Myanmar
    # equivalents.

);

my $td_basic =
# Identify the extra characters that td treats as basic, i.e. that are
#  exempt from its deletion of characters that are not lower-case letters,
#  other letters, or decimal digits. td interpolates this string into a
#  bracketed character class, so every range below must be written with an
#  ASCII hyphen-minus; any other dash would be taken as a literal character
#  and the range would silently shrink to its two endpoints.
    "\x{0902}\x{093F}\x{0941}\x{0943}\x{0947}\x{094B}\x{0962}" # Devanagari
    . "\x{0982}\x{09BF}\x{09C1}\x{09C3}\x{09C7}\x{09CB}\x{09E2}" # Bengali
    . "\x{0A02}\x{0A3F}\x{0A41}\x{0A47}\x{0A4B}\x{0A71}" # Gurmukhi
    . "\x{0A82}\x{0ABF}\x{0AC1}\x{0AC3}\x{0AC7}\x{0ACB}\x{0AE2}" # Gujarati
    . "\x{0B02}\x{0B3F}\x{0B41}\x{0B43}\x{0B47}\x{0B4B}\x{0B62}" # Oriya
    . "\x{0BBF}\x{0BC1}\x{0BC6}\x{0BC8}\x{0BCA}\x{0BCC}\x{0BD7}" # Tamil
    . "\x{0C02}\x{0C3F}\x{0C41}\x{0C43}\x{0C46}\x{0C48}\x{0C4A}\x{0C4C}\x{0C62}" # Telugu
    . "\x{0C82}\x{0CBF}\x{0CC1}\x{0CC3}\x{0CC6}\x{0CC8}\x{0CCA}\x{0CCC}\x{0CE2}" # Kannada
    . "\x{0D02}\x{0D3F}\x{0D41}\x{0D43}\x{0D46}\x{0D48}\x{0D4A}\x{0D4C}\x{0D62}" # Malayalam
    . "\x{0D80}-\x{109F}" # Sinhala, Thai, Lao, Tibetan, Myanmar
    . "\x{1700}-\x{17FF}" # Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer
    . "\x{1900}-\x{19DF}" # Limbu, Tai Le, Tai Lue
    . "\x{1A00}-\x{1AAF}" # Buginese, Tai Tham
    . "\x{1B00}-\x{1CFF}" # Balinese, Sundanese, Batak, Lepcha, Ol Chiki, Sundanese supplement, Vedic extensions
    . "\x{3031}\x{3032}\x{3099}\x{309A}\x{30FC}" # Japanese
    . "\x{A800}-\x{A82F}" # Syloti Nagri
    . "\x{A880}-\x{A95F}" # Saurashtra, Devanagari extensions, Kayah Li, Rejang
    . "\x{A980}-\x{AAFF}" # Javanese, Myanmar extended, Cham, Tai Viet, Meetei Mayek extensions
    . "\x{ABC0}-\x{ABFF}" # Meetei Mayek
    . "\x{11000}-\x{11AFF}" # various Brahmic scripts
    . "\x{16B00}-\x{16B8F}" # Pahawh Hmong
    . "\x{16F00}-\x{16F9F}" # Miao
;

sub td {
# PanLex expression text degradation. Original version by Jonathan Pool.
# Enhanced version with Indic script support, August 02013, by Yadav Gowda.
# Further enhanced by Jonathan Pool, David Kamholz, and Ben Yang.
#
# Argument: the text to be degraded. Returns: its degradation.
# Uses the file-level %td_map (per-character replacements) and $td_basic
# (extra characters exempt from the deletion of non-basic characters).

    my ($text) = @_;
    # Work on a copy of the specified text.

    $text =~ s{([\x{AC00}-\x{D7AF}])}{
        my $jamos = NFKD($1);
        $jamos =~ s|^\x{110B}||;
        $jamos;
    }ge;
    # Replace each Hangul syllable with its jamos, removing null initials.

    $text = lc NFKD($text);
    # Convert the text to its compatibility decomposition (Normalization
    # Form KD) and then make it lower-case.

    my @indic_rules = (
    # Identify the context-dependent Indic substitutions as pairs of a
    # pattern and its fixed replacement, to be applied in the order listed.

        # DEVANAGARI

        [ qr/\x{094D}\x{0930}\x{093F}/, "\x{0943}" ],
        # VIRAMA + RA + VOWEL SIGN I becomes VOCALIC R.

        [ qr/[\x{0919}\x{091E}\x{0923}\x{0928}\x{0929}\x{092E}]
            (?![\x{093E}-\x{094C}\x{094E}\x{094F}\x{0955}-\x{0957}\x{0962}\x{0963}])
            /x, "\x{0902}" ],
        # A nasal not followed by a vowel becomes ANUSVARA.

        # BENGALI

        [ qr/\x{09CD}\x{09B0}[\x{09BF}\x{09C1}]/, "\x{09C3}" ],
        # VIRAMA + RA + VOWEL SIGN I or U becomes VOCALIC R.

        [ qr/[\x{0999}\x{099E}\x{09A3}\x{09A8}\x{09A9}\x{09AE}]
            (?![\x{09BE}-\x{09CC}\x{09D6}\x{09D7}\x{09E2}\x{09E3}])
            /x, "\x{0982}" ],
        # A nasal not followed by a vowel becomes ANUSVARA.

        [ qr/\x{09C7}[\x{09BE}\x{09D7}]/, "\x{09CB}" ],
        # VOWEL SIGN E + VOWEL SIGN AA or AU LENGTH MARK becomes
        # VOWEL SIGN O.

        # GURMUKHI

        [ qr/[\x{0A19}\x{0A1E}\x{0A23}\x{0A28}\x{0A29}\x{0A2E}]
            (?![\x{0A3e}-\x{0A4c}])
            /x, "\x{0A02}" ],
        # A nasal not followed by a vowel becomes BINDI.

        # GUJARATI

        [ qr/\x{0ACD}\x{0AB0}[\x{0ABF}\x{0AC1}]/, "\x{0AC3}" ],
        # VIRAMA + RA + VOWEL SIGN I or U becomes VOCALIC R.

        [ qr/[\x{0A99}\x{0A9E}\x{0AA3}\x{0AA8}\x{0AAE}]
            (?![\x{0ABE}-\x{0ACC}\x{0AE2}\x{0AE3}])
            /x, "\x{0A82}" ],
        # A nasal not followed by a vowel becomes ANUSVARA.

        # ORIYA

        [ qr/\x{0B4D}\x{0B30}[\x{0B3F}\x{0B41}]/, "\x{0B43}" ],
        # VIRAMA + RA + VOWEL SIGN I or U becomes VOCALIC R.

        [ qr/[\x{0B19}\x{0B1E}\x{0B23}\x{0B28}\x{0B29}\x{0B2E}]
            (?![\x{0B3E}-\x{0B4C}\x{0B56}\x{0B57}\x{0B62}\x{0B63}])
            /x, "\x{0B02}" ],
        # A nasal not followed by a vowel becomes ANUSVARA.

        [ qr/\x{0B47}[\x{0B3E}\x{0B57}]/, "\x{0B4B}" ],
        # VOWEL SIGN E + VOWEL SIGN AA or AU LENGTH MARK becomes
        # VOWEL SIGN O.

        # TELUGU

        [ qr/\x{0C4D}\x{0C30}[\x{0C3F}\x{0C41}]/, "\x{0C43}" ],
        # VIRAMA + RA + VOWEL SIGN I or U becomes VOCALIC R.

        [ qr/[\x{0C19}\x{0C1E}\x{0C23}\x{0C28}\x{0C2E}]
            (?![\x{0C3e}-\x{0C4c}\x{0C62}\x{0C63}])
            /x, "\x{0C02}" ],
        # A nasal not followed by a vowel becomes ANUSVARA.

        # KANNADA

        [ qr/\x{0CCD}\x{0CB0}[\x{0CBF}\x{0CC1}]/, "\x{0CC3}" ],
        # VIRAMA + RA + VOWEL SIGN I or U becomes VOCALIC R.

        [ qr/[\x{0C99}\x{0C9E}\x{0CA3}\x{0CA8}\x{0CAE}]
            (?![\x{0CBE}-\x{0CCC}\x{0CE2}\x{0CE3}])
            /x, "\x{0C82}" ],
        # A nasal not followed by a vowel becomes ANUSVARA.

        # MALAYALAM

        [ qr/\x{0D4D}\x{0D30}[\x{0D3F}\x{0D41}]/, "\x{0D43}" ],
        # VIRAMA + RA + VOWEL SIGN I or U becomes VOCALIC R.

        [ qr/[\x{0D19}\x{0D1E}\x{0D23}\x{0D28}\x{0D2E}]
            (?![\x{0D3E}-\x{0D4C}\x{0D62}\x{0D63}])
            /x, "\x{0D02}" ],
        # A nasal not followed by a vowel becomes ANUSVARA.

    );

    for my $rule (@indic_rules) {
        my ($pattern, $replacement) = @$rule;
        $text =~ s/$pattern/$replacement/g;
    }
    # Replace all instances of each pattern with its replacement.

    # JAPANESE

    $text =~ s{([\p{Ll}\p{Lo}])[^\p{Ll}\p{Lo}]*[\x{3005}\x{303B}\x{309D}\x{30FD}]}{$1$1}g;
    # Replace all Han and Kana iteration marks with copies of the previous
    # full character.

    $text =~ s{\x{3033}\x{3035}}{\x{3031}}g;
    $text =~ s{\x{3034}\x{3035}}{\x{3032}}g;
    # Replace split vertical Kana repeat marks with a single vertical Kana
    # repeat mark.

    my $mapped = '';
    for my $char (split //, $text) {
        $mapped .= exists $td_map{$char} ? $td_map{$char} : $char;
    }
    $text = $mapped;
    # Make the context-independent replacements, character by character.

    $text =~ s{([\x{0982}\x{0A02}\x{0A82}\x{0B02}\x{0C02}\x{0C82}\x{0D02}\x{0902}\x{103A}])\1+}{$1}g;
    # Collapse all sequences of 2 or more ANUSVARA, BINDI, or ASAT.

    $text =~ s/[^${td_basic}\p{Ll}\p{Lo}\p{Nd}]|[\p{C}\p{P}\p{S}\p{Z}]//g;
    # Delete all non-basic characters and all control, punctuation,
    # symbol, and separator characters.

    return $text;
    # Return the degradation.

}

Leave a Reply