diff --git a/.gitignore b/.gitignore index 9491a2f..f59c3cd 100644 --- a/.gitignore +++ b/.gitignore @@ -360,4 +360,5 @@ MigrationBackup/ .ionide/ # Fody - auto-generated XML schema -FodyWeavers.xsd \ No newline at end of file +FodyWeavers.xsd +repomix-output.xml diff --git a/TTSTextNormalization.Tests/Core/TextNormalizationPipelineTests.cs b/TTSTextNormalization.Tests/Core/TextNormalizationPipelineTests.cs index d544f48..64c4813 100644 --- a/TTSTextNormalization.Tests/Core/TextNormalizationPipelineTests.cs +++ b/TTSTextNormalization.Tests/Core/TextNormalizationPipelineTests.cs @@ -107,7 +107,7 @@ public void Normalize_RuleOrder_NumberBeforeWhitespace() [TestMethod] [DataRow( " ‘Test’ 1st.. soooo cool ✨!! LOL Cost: $12.50 USD??? ", - "'Test' first. soo cool sparkles! laughing out loud Cost: twelve dollars fifty cents USD?", + "'Test' first. soo cool sparkles! laughing out loud Cost: twelve US dollars fifty cents?", DisplayName = "All Rules Integration Test 1 - Corrected" )] [DataRow( @@ -122,7 +122,7 @@ public void Normalize_RuleOrder_NumberBeforeWhitespace() )] [DataRow( " OMG!!! The price is £50.00??? LOL... IDK. 1st prize! ", - "oh my god! The price is fifty pounds? laughing out loud. I don't know. first prize!", + "oh my god! The price is fifty British pounds? laughing out loud. I don't know. 
first prize!", DisplayName = "All Rules Integration Test 4 - Mixed Punctuation & Abbr - Corrected" )] [DataRow( diff --git a/TTSTextNormalization.Tests/Rules/AbbreviationNormalizationRuleTests.cs b/TTSTextNormalization.Tests/Rules/AbbreviationNormalizationRuleTests.cs index dd854e7..21a67a2 100644 --- a/TTSTextNormalization.Tests/Rules/AbbreviationNormalizationRuleTests.cs +++ b/TTSTextNormalization.Tests/Rules/AbbreviationNormalizationRuleTests.cs @@ -67,8 +67,8 @@ public void Apply_MultipleAbbreviations_ReplacesAll(string input, string expecte [DataRow("lollipop", "lollipop", DisplayName = "Substring 'lol'")] [DataRow("scrolling", "scrolling", DisplayName = "Substring 'lol' (reverse)")] [DataRow("theory", "theory", DisplayName = "Substring 'ty'")] - [DataRow("imo-test", "imo-test", DisplayName = "Abbreviation as prefix - Corrected Expectation")] // Lookaround fixed - [DataRow("test-imo", "test-imo", DisplayName = "Abbreviation as suffix - Corrected Expectation")] // Lookaround fixed + [DataRow("imo-test", "imo-test", DisplayName = "Abbreviation as prefix")] + [DataRow("test-imo", "test-imo", DisplayName = "Abbreviation as suffix")] public void Apply_AbbreviationAsSubstringOrAttached_DoesNotReplace(string input, string expected) { // Act diff --git a/TTSTextNormalization.Tests/Rules/CurrencyNormalizationRuleTests.cs b/TTSTextNormalization.Tests/Rules/CurrencyNormalizationRuleTests.cs index 78befe4..da77401 100644 --- a/TTSTextNormalization.Tests/Rules/CurrencyNormalizationRuleTests.cs +++ b/TTSTextNormalization.Tests/Rules/CurrencyNormalizationRuleTests.cs @@ -23,27 +23,39 @@ public void Apply_NoCurrency_ReturnsInput(string input, string expected) // NOTE: Expectations updated for default Humanizer output (includes "and") [TestMethod] // Symbol First - [DataRow("$1", " one dollar ", DisplayName = "USD Simple ($)")] - [DataRow("$1.00", " one dollar ", DisplayName = "USD Simple zero cents ($)")] - [DataRow("$1.50", " one dollar fifty cents ", DisplayName = "USD with 
Cents ($)")] // No "and" for cents usually - [DataRow("$1,234.56", " one thousand two hundred and thirty-four dollars fifty-six cents ", DisplayName = "USD Large with Cents ($)")] - [DataRow("£10", " ten pounds ", DisplayName = "GBP Simple (£)")] - [DataRow("£0.50", " zero pounds fifty pence ", DisplayName = "GBP Only Pence (£)")] + [DataRow("$1", " one US dollar ", DisplayName = "USD Simple ($)")] + [DataRow("$1.00", " one US dollar ", DisplayName = "USD Simple zero cents ($)")] + [DataRow("$1.50", " one US dollar fifty cents ", DisplayName = "USD with Cents ($)")] // No "and" for cents usually + [DataRow("$1,234.56", " one thousand two hundred and thirty-four US dollars fifty-six cents ", DisplayName = "USD Large with Cents ($)")] + [DataRow("£10", " ten British pounds ", DisplayName = "GBP Simple (£)")] + [DataRow("£0.50", " zero British pounds fifty pence ", DisplayName = "GBP Only Pence (£)")] [DataRow("€100", " one hundred euros ", DisplayName = "EUR Simple (€)")] [DataRow("€1.25", " one euro twenty-five cents ", DisplayName = "EUR With Cents (€)")] - [DataRow("¥500", " five hundred yen ", DisplayName = "JPY Simple (¥)")] + [DataRow("¥500", " five hundred Japanese yen ", DisplayName = "JPY Simple (¥)")] // Code Last - [DataRow("1 USD", " one dollar ", DisplayName = "USD Code Simple")] - [DataRow("1.00 USD", " one dollar ", DisplayName = "USD Code zero cents")] - [DataRow("1.50 USD", " one dollar fifty cents ", DisplayName = "USD Code with Cents")] - [DataRow("1,234.56 USD", " one thousand two hundred and thirty-four dollars fifty-six cents ", DisplayName = "USD Code Large")] - [DataRow("10 GBP", " ten pounds ", DisplayName = "GBP Code Simple")] // Uses "pound" from map - [DataRow("0.50 GBP", " zero pounds fifty pence ", DisplayName = "GBP Code Only Pence")] + [DataRow("1 USD", " one US dollar ", DisplayName = "USD Code Simple")] + [DataRow("1.00 USD", " one US dollar ", DisplayName = "USD Code zero cents")] + [DataRow("1.50 USD", " one US dollar fifty cents 
", DisplayName = "USD Code with Cents")] + [DataRow("1,234.56 USD", " one thousand two hundred and thirty-four US dollars fifty-six cents ", DisplayName = "USD Code Large")] + [DataRow("10 GBP", " ten British pounds ", DisplayName = "GBP Code Simple")] // Uses "British pound" from map + [DataRow("0.50 GBP", " zero British pounds fifty pence ", DisplayName = "GBP Code Only Pence")] [DataRow("100 EUR", " one hundred euros ", DisplayName = "EUR Code Simple")] [DataRow("1.25 EUR", " one euro twenty-five cents ", DisplayName = "EUR Code With Cents")] - [DataRow("500 JPY", " five hundred yen ", DisplayName = "JPY Code Simple")] // Uses "yen" from map + [DataRow("500 JPY", " five hundred Japanese yen ", DisplayName = "JPY Code Simple")] // Uses "Japanese yen" from map [DataRow("100 CAD", " one hundred Canadian dollars ", DisplayName = "CAD Code Example")] - [DataRow("10 BRL", " ten reais ", DisplayName = "BRL Code Example")] + [DataRow("10 BRL", " ten Brazilian reais ", DisplayName = "BRL Code Example")] + // Combined + [DataRow("$10 USD", " ten US dollars ", DisplayName = "USD Combined ($)")] + [DataRow("$10USD", " ten US dollars ", DisplayName = "USD Combined (without spaces)")] + [DataRow("$10MXN", " ten Mexican pesos ", DisplayName = "MXN Combined (without spaces)")] + [DataRow("$10 CAD", " ten Canadian dollars ", DisplayName = "CAD Combined ($)")] + [DataRow("£10 GBP", " ten British pounds ", DisplayName = "GBP Combined (£)")] + [DataRow("€100 EUR", " one hundred euros ", DisplayName = "EUR Combined (€)")] + [DataRow("¥500 JPY", " five hundred Japanese yen ", DisplayName = "JPY Combined (¥)")] + [DataRow("10 USD $", " ten US dollars $", DisplayName = "USD Combined with Trailing Symbol")] + [DataRow("10 GBP £", " ten British pounds £", DisplayName = "GBP Combined with Trailing Symbol")] + [DataRow("100 EUR €", " one hundred euros €", DisplayName = "EUR Combined with Trailing Symbol")] + [DataRow("500 JPY ¥", " five hundred Japanese yen ¥", DisplayName = "JPY Combined with Trailing 
Symbol")] public void Apply_KnownCurrencies_ReplacesWithSpokenForm(string input, string expected) { // Act @@ -54,9 +66,10 @@ public void Apply_KnownCurrencies_ReplacesWithSpokenForm(string input, string ex } [TestMethod] - [DataRow("Send $10 now", "Send ten dollars now", DisplayName = "Currency within sentence")] + [DataRow("Send $10 now", "Send ten US dollars now", DisplayName = "Currency within sentence")] [DataRow("It costs 50 EUR.", "It costs fifty euros .", DisplayName = "Currency at end of sentence")] - [DataRow("$5 and £10", " five dollars and ten pounds ", DisplayName = "Multiple different currencies")] + [DataRow("It costs 50 EUR now.", "It costs fifty euros now.", DisplayName = "Currency code within sentence")] + [DataRow("$5 and £10", " five US dollars and ten British pounds ", DisplayName = "Multiple different currencies")] public void Apply_CurrencyInContext_ReplacesCorrectly(string input, string expected) { // Act @@ -69,7 +82,6 @@ public void Apply_CurrencyInContext_ReplacesCorrectly(string input, string expec [TestMethod] [DataRow("10XYZ", "10XYZ", DisplayName = "Unknown Code XYZ")] [DataRow("¤10", "¤10", DisplayName = "Generic Currency Symbol")] - [DataRow("$10MXN", "$10MXN", DisplayName = "Symbol and Code")] public void Apply_UnknownOrAmbiguousCurrency_NoChange(string input, string expected) { // Act diff --git a/TTSTextNormalization.sln b/TTSTextNormalization.sln index dea0a32..9da5f40 100644 --- a/TTSTextNormalization.sln +++ b/TTSTextNormalization.sln @@ -9,13 +9,6 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "TTSTextNormalization.EmojiD EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "TTSTextNormalization", "TTSTextNormalization\TTSTextNormalization.csproj", "{1C2CA7DF-374E-FA47-469B-9751E035B2C8}" EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = ".github", ".github", "{02EA681E-C7D8-13C7-8484-4AC65E1B71E8}" -EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "workflows", "workflows", 
"{3DCF185E-C897-4519-AB56-F4B91991DB25}" - ProjectSection(SolutionItems) = preProject - dotnet-publish.yml = dotnet-publish.yml - EndProjectSection -EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -38,9 +31,6 @@ Global GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection - GlobalSection(NestedProjects) = preSolution - {3DCF185E-C897-4519-AB56-F4B91991DB25} = {02EA681E-C7D8-13C7-8484-4AC65E1B71E8} - EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {53950FEC-997F-4537-B0E2-40090BAA342B} EndGlobalSection diff --git a/TTSTextNormalization/DependencyInjection/TextNormalizationServiceCollectionExtensions.cs b/TTSTextNormalization/DependencyInjection/TextNormalizationServiceCollectionExtensions.cs index f6d7e7b..e224f2d 100644 --- a/TTSTextNormalization/DependencyInjection/TextNormalizationServiceCollectionExtensions.cs +++ b/TTSTextNormalization/DependencyInjection/TextNormalizationServiceCollectionExtensions.cs @@ -1,8 +1,8 @@ using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.DependencyInjection.Extensions; -using TTSTextNormalization.Rules; using TTSTextNormalization.Abstractions; using TTSTextNormalization.Core; +using TTSTextNormalization.Rules; namespace TTSTextNormalization.DependencyInjection; @@ -29,30 +29,61 @@ public static IServiceCollection AddTextNormalization( } // --- Built-in Rule Extensions for the Builder --- + + /// + /// Adds the to the text normalization pipeline. + /// Performs essential cleanup like normalizing line breaks and replacing fancy characters. Recommended Order: 10. + /// + /// The text normalization builder. + /// The builder instance for fluent chaining. 
public static ITextNormalizationBuilder AddBasicSanitizationRule(this ITextNormalizationBuilder builder) { ArgumentNullException.ThrowIfNull(builder); return builder.AddRule(ServiceLifetime.Singleton); } + /// + /// Adds the to the text normalization pipeline. + /// Replaces standard Unicode emojis with their textual descriptions. Recommended Order: 100. + /// + /// The text normalization builder. + /// The builder instance for fluent chaining. public static ITextNormalizationBuilder AddEmojiRule(this ITextNormalizationBuilder builder) { ArgumentNullException.ThrowIfNull(builder); return builder.AddRule(ServiceLifetime.Singleton); } + /// + /// Adds the to the text normalization pipeline. + /// Normalizes currency symbols and codes (e.g., "$10.50", "100 EUR") into spoken text. Recommended Order: 200. + /// + /// The text normalization builder. + /// The builder instance for fluent chaining. public static ITextNormalizationBuilder AddCurrencyRule(this ITextNormalizationBuilder builder) { ArgumentNullException.ThrowIfNull(builder); return builder.AddRule(ServiceLifetime.Singleton); } + /// + /// Adds the to the text normalization pipeline. + /// Expands common chat/gaming abbreviations (e.g., "lol", "gg"). Recommended Order: 300. + /// + /// The text normalization builder. + /// The builder instance for fluent chaining. public static ITextNormalizationBuilder AddAbbreviationNormalizationRule(this ITextNormalizationBuilder builder) { ArgumentNullException.ThrowIfNull(builder); return builder.AddRule(ServiceLifetime.Singleton); } + /// + /// Adds the to the text normalization pipeline. + /// Converts cardinals, ordinals, decimals, and version-like numbers into words. Recommended Order: 400. + /// + /// The text normalization builder. + /// The builder instance for fluent chaining. 
public static ITextNormalizationBuilder AddNumberNormalizationRule(this ITextNormalizationBuilder builder) { ArgumentNullException.ThrowIfNull(builder); @@ -79,6 +110,12 @@ public static ITextNormalizationBuilder AddLetterRepetitionRule(this ITextNormal return builder.AddRule(ServiceLifetime.Singleton); } + /// + /// Adds the to the text normalization pipeline. + /// Trims ends, collapses internal spaces, and adjusts spacing around punctuation. Recommended Order: 9000. + /// + /// The text normalization builder. + /// The builder instance for fluent chaining. public static ITextNormalizationBuilder AddWhitespaceNormalizationRule(this ITextNormalizationBuilder builder) { ArgumentNullException.ThrowIfNull(builder); diff --git a/TTSTextNormalization/Rules/AbbreviationNormalizationRule.cs b/TTSTextNormalization/Rules/AbbreviationNormalizationRule.cs index 173c6d0..6461efd 100644 --- a/TTSTextNormalization/Rules/AbbreviationNormalizationRule.cs +++ b/TTSTextNormalization/Rules/AbbreviationNormalizationRule.cs @@ -9,6 +9,7 @@ namespace TTSTextNormalization.Rules; /// public sealed partial class AbbreviationNormalizationRule : ITextNormalizationRule { + /// public int Order => 300; private const int RegexTimeoutMilliseconds = 150; // Slightly increased for larger pattern @@ -73,8 +74,10 @@ public sealed partial class AbbreviationNormalizationRule : ITextNormalizationRu { "gpu", "g p u" }, // Spell out }.ToFrozenDictionary(StringComparer.OrdinalIgnoreCase); + /// public AbbreviationNormalizationRule() { } + /// public string Apply(string inputText) { ArgumentNullException.ThrowIfNull(inputText); diff --git a/TTSTextNormalization/Rules/BasicSanitizationRule.cs b/TTSTextNormalization/Rules/BasicSanitizationRule.cs index 9c68a2f..e97ee17 100644 --- a/TTSTextNormalization/Rules/BasicSanitizationRule.cs +++ b/TTSTextNormalization/Rules/BasicSanitizationRule.cs @@ -31,6 +31,9 @@ public sealed partial class BasicSanitizationRule : ITextNormalizationRule { "–", "-" }, // En dash 
}.ToFrozenDictionary(StringComparer.Ordinal); + /// + /// Initializes a new instance of the class. + /// public BasicSanitizationRule() { } /// diff --git a/TTSTextNormalization/Rules/CurrencyNormalizationRule.cs b/TTSTextNormalization/Rules/CurrencyNormalizationRule.cs index d967475..e78f996 100644 --- a/TTSTextNormalization/Rules/CurrencyNormalizationRule.cs +++ b/TTSTextNormalization/Rules/CurrencyNormalizationRule.cs @@ -7,14 +7,21 @@ namespace TTSTextNormalization.Rules; +/// +/// Normalizes currency amounts based on symbols and ISO codes using a multi-pass approach. +/// Handles patterns like $10, 10 USD, $10 USD, £5.50, 100 EUR, €100 EUR. +/// Uses Humanizer for number-to-words conversion. +/// public sealed partial class CurrencyNormalizationRule : ITextNormalizationRule { + /// public int Order => 200; - private const int RegexTimeoutMilliseconds = 150; + private const int RegexTimeoutMilliseconds = 150; // Timeout per regex operation private static readonly TimeSpan RegexTimeout = TimeSpan.FromMilliseconds( RegexTimeoutMilliseconds ); + // Structure to hold TTS specific names for a currency private readonly record struct CurrencyTTSInfo( string Singular, string Plural, @@ -22,37 +29,163 @@ private readonly record struct CurrencyTTSInfo( string FractionPlural ); + // Maps ISO Code (e.g., "USD") to its spoken form info private static readonly FrozenDictionary IsoCodeToTTSInfoMap; + + // Maps Symbol (e.g., "$") or Code (e.g., "USD") to its most likely ISO Code private static readonly FrozenDictionary SymbolOrCodeToIsoCodeMap; - private static readonly Regex CombinedCurrencyRegex; + + // Regex definitions (will be populated in static constructor) + private static readonly Regex? SymbolNumberCodeRegexInstance; + private static readonly Regex? SymbolNumberRegexInstance; + private static readonly Regex? 
NumberCodeRegexInstance; + + // Flag indicating successful initialization private static readonly bool IsInitialized; + // Shared number pattern part used in regexes + private const string NumberPatternPart = + @"(?\d{1,3}(?:[,\s'.]\d{3})*|\d+)(?:[.,](?\d{1,2}))?"; + static CurrencyNormalizationRule() { try { - // Define the Manual ISO -> TTS Mapping + // --- TTS Map Population --- Dictionary ttsMapBuilder = new( StringComparer.OrdinalIgnoreCase ) { - // Add more common currencies... - { "USD", new("dollar", "dollars", "cent", "cents") }, + // === Africa === + { "DZD", new("Algerian dinar", "Algerian dinars", "santeem", "santeems") }, + { "BIF", new("Burundian franc", "Burundian francs", "centime", "centimes") }, + { "EGP", new("Egyptian pound", "Egyptian pounds", "piastre", "piastres") }, + { "ETB", new("Ethiopian birr", "Ethiopian birrs", "santim", "santim") }, + { "GHS", new("Ghanaian cedi", "Ghanaian cedis", "pesewa", "pesewas") }, + { "KES", new("Kenyan shilling", "Kenyan shillings", "cent", "cents") }, + { "MAD", new("Moroccan dirham", "Moroccan dirhams", "centime", "centimes") }, + { "MUR", new("Mauritian rupee", "Mauritian rupees", "cent", "cents") }, + { "NGN", new("Nigerian naira", "Nigerian naira", "kobo", "kobo") }, + { "TND", new("Tunisian dinar", "Tunisian dinars", "millime", "millimes") }, + { "TZS", new("Tanzanian shilling", "Tanzanian shillings", "cent", "cents") }, + { "UGX", new("Ugandan shilling", "Ugandan shillings", "cent", "cents") }, + { + "XOF", + new("West African CFA franc", "West African CFA francs", "centime", "centimes") + }, + { "ZAR", new("South African rand", "South African rand", "cent", "cents") }, + // === Asia === + { "AFN", new("Afghan afghani", "Afghan afghanis", "pul", "puls") }, + { "AMD", new("Armenian dram", "Armenian drams", "luma", "luma") }, + { "AZN", new("Azerbaijani manat", "Azerbaijani manats", "qəpik", "qəpiks") }, + { "BDT", new("Bangladeshi taka", "Bangladeshi taka", "poisha", "poisha") }, + { "BND", new("Brunei 
dollar", "Brunei dollars", "sen", "sen") }, + { "CNY", new("Chinese yuan", "Chinese yuan", "fen", "fen") }, + { "GEL", new("Georgian lari", "Georgian lari", "tetri", "tetri") }, + { "HKD", new("Hong Kong dollar", "Hong Kong dollars", "cent", "cents") }, + { "IDR", new("Indonesian rupiah", "Indonesian rupiahs", "sen", "sen") }, + { "INR", new("Indian rupee", "Indian rupees", "paisa", "paise") }, + { "IQD", new("Iraqi dinar", "Iraqi dinars", "fils", "fils") }, + { "JPY", new("Japanese yen", "Japanese yen", "sen", "sen") }, // Note: JPY fraction often ignored + { "KHR", new("Cambodian riel", "Cambodian riels", "sen", "sen") }, + { "KGS", new("Kyrgystani som", "Kyrgystani soms", "tyiyn", "tyiyns") }, + { "KRW", new("South Korean won", "South Korean won", "jeon", "jeon") }, + { "KZT", new("Kazakhstani tenge", "Kazakhstani tenge", "tiyn", "tiyn") }, + { "LAK", new("Lao kip", "Lao kips", "att", "att") }, + { "LKR", new("Sri Lankan rupee", "Sri Lankan rupees", "cent", "cents") }, + { "MNT", new("Mongolian tögrög", "Mongolian tögrögs", "möngö", "möngö") }, + { "MYR", new("Malaysian ringgit", "Malaysian ringgits", "sen", "sen") }, + { "NPR", new("Nepalese rupee", "Nepalese rupees", "paisa", "paise") }, + { "PHP", new("Philippine peso", "Philippine pesos", "sentimo", "sentimo") }, + { "PKR", new("Pakistani rupee", "Pakistani rupees", "paisa", "paisa") }, + { "RUB", new("Russian ruble", "Russian rubles", "kopek", "kopeks") }, + { "SGD", new("Singapore dollar", "Singapore dollars", "cent", "cents") }, + { "THB", new("Thai baht", "Thai baht", "satang", "satang") }, + { "TWD", new("new Taiwan dollar", "new Taiwan dollars", "cent", "cents") }, + { "UZS", new("Uzbekistani som", "Uzbekistani som", "tiyin", "tiyin") }, + { "VND", new("Vietnamese dong", "Vietnamese dong", "hao", "hao") }, + // === Europe === + { "ALL", new("Albanian lek", "Albanian lekë", "qindarkë", "qindarka") }, + { + "BAM", + new( + "Bosnia-Herzegovina convertible mark", + "Bosnia-Herzegovina convertible marks", + 
"fening", + "feninga" + ) + }, + { "BGN", new("Bulgarian lev", "Bulgarian leva", "stotinka", "stotinki") }, + { "BYN", new("Belarusian ruble", "Belarusian rubles", "kopek", "kopeks") }, + { "CHF", new("Swiss franc", "Swiss francs", "rappen", "rappen") }, + { "CZK", new("Czech koruna", "Czech koruny", "haler", "haleru") }, + { "DKK", new("Danish krone", "Danish kroner", "øre", "øre") }, + { "EUR", new("euro", "euros", "cent", "cents") }, + { "GBP", new("British pound", "British pounds", "penny", "pence") }, + { "HRK", new("Croatian kuna", "Croatian kunas", "lipa", "lipa") }, // Replaced by EUR, but kept for legacy + { "HUF", new("Hungarian forint", "Hungarian forints", "fillér", "fillér") }, + { "ISK", new("Icelandic krona", "Icelandic kronur", "eyrir", "aurar") }, // Often no fractions used + { "MDL", new("Moldovan leu", "Moldovan lei", "ban", "bani") }, + { "MKD", new("Macedonian denar", "Macedonian denari", "deni", "deni") }, + { "NOK", new("Norwegian krone", "Norwegian kroner", "øre", "øre") }, + { "PLN", new("Polish zloty", "Polish zlotys", "grosz", "groszy") }, + { "RON", new("Romanian leu", "Romanian lei", "ban", "bani") }, + { "RSD", new("Serbian dinar", "Serbian dinars", "para", "para") }, + { "SEK", new("Swedish krona", "Swedish kronor", "öre", "öre") }, + { "TRY", new("Turkish lira", "Turkish liras", "kurus", "kurus") }, + { "UAH", new("Ukrainian hryvnia", "Ukrainian hryvnias", "kopiyka", "kopiyky") }, + // === Middle East === + { "AED", new("UAE dirham", "UAE dirhams", "fils", "fils") }, + { "BHD", new("Bahraini dinar", "Bahraini dinars", "fils", "fils") }, + { "ILS", new("Israeli new shekel", "Israeli new shekels", "agora", "agorot") }, + { "JOD", new("Jordanian dinar", "Jordanian dinars", "piastre", "piastres") }, + { "KWD", new("Kuwaiti dinar", "Kuwaiti dinars", "fils", "fils") }, + { "LBP", new("Lebanese pound", "Lebanese pounds", "piastre", "piastres") }, + { "OMR", new("Omani rial", "Omani rials", "baisa", "baisa") }, + { "QAR", new("Qatari riyal", 
"Qatari riyals", "dirham", "dirhams") }, + { "SAR", new("Saudi riyal", "Saudi riyals", "halala", "halalas") }, + // === North America === { "CAD", new("Canadian dollar", "Canadian dollars", "cent", "cents") }, + { "CRC", new("Costa Rican colón", "Costa Rican colones", "céntimo", "céntimos") }, + { "DOP", new("Dominican peso", "Dominican pesos", "centavo", "centavos") }, + { "GTQ", new("Guatemalan quetzal", "Guatemalan quetzals", "centavo", "centavos") }, + { "HNL", new("Honduran lempira", "Honduran lempiras", "centavo", "centavos") }, + { "JMD", new("Jamaican dollar", "Jamaican dollars", "cent", "cents") }, + { "MXN", new("Mexican peso", "Mexican pesos", "centavo", "centavos") }, + { "NIO", new("Nicaraguan córdoba", "Nicaraguan córdobas", "centavo", "centavos") }, + { + "PAB", + new("Panamanian balboa", "Panamanian balboas", "centésimo", "centésimos") + }, + { "USD", new("US dollar", "US dollars", "cent", "cents") }, + // === Oceania === { "AUD", new("Australian dollar", "Australian dollars", "cent", "cents") }, - { "GBP", new("pound", "pounds", "penny", "pence") }, // Using "pound" for GBP - { "EUR", new("euro", "euros", "cent", "cents") }, - { "JPY", new("yen", "yen", "sen", "sen") }, - { "INR", new("rupee", "rupees", "paisa", "paise") }, - { "BRL", new("real", "reais", "centavo", "centavos") }, - { "CNY", new("yuan", "yuan", "fen", "fen") }, - { "RUB", new("ruble", "rubles", "kopek", "kopeks") }, + { "FJD", new("Fijian dollar", "Fijian dollars", "cent", "cents") }, + { "NZD", new("New Zealand dollar", "New Zealand dollars", "cent", "cents") }, + // === South America === + { "ARS", new("Argentine peso", "Argentine pesos", "centavo", "centavos") }, + { "BOB", new("Bolivian boliviano", "Bolivian bolivianos", "centavo", "centavos") }, + { "BRL", new("Brazilian real", "Brazilian reais", "centavo", "centavos") }, + { "CLP", new("Chilean peso", "Chilean pesos", "", "") }, // No standard fraction + { "COP", new("Colombian peso", "Colombian pesos", "centavo", "centavos") 
}, + { "PEN", new("Peruvian sol", "Peruvian soles", "céntimo", "céntimos") }, + { "PYG", new("Paraguayan guaraní", "Paraguayan guaraníes", "céntimo", "céntimos") }, + { "UYU", new("Uruguayan peso", "Uruguayan pesos", "centésimo", "centésimos") }, + { + "VES", + new( + "Venezuelan bolívar soberano", + "Venezuelan bolívares soberanos", + "céntimo", + "céntimos" + ) + }, }; IsoCodeToTTSInfoMap = ttsMapBuilder.ToFrozenDictionary( StringComparer.OrdinalIgnoreCase ); - // Build Symbol/Code -> ISO Code Mapping + // --- Symbol/Code -> ISO Code Mapping Population --- Dictionary symbolMapBuilder = new(StringComparer.OrdinalIgnoreCase); HashSet uniqueSymbols = new(StringComparer.OrdinalIgnoreCase); HashSet uniqueIsoCodes = new(StringComparer.OrdinalIgnoreCase); @@ -61,13 +194,12 @@ static CurrencyNormalizationRule() CultureInfo ci in CultureInfo.GetCultures( CultureTypes.SpecificCultures | CultureTypes.InstalledWin32Cultures ) - ) // Broader search + ) { - // Skip problematic cultures if ( ci.IsNeutralCulture || ci.LCID == CultureInfo.InvariantCulture.LCID - || ci.Name == "" /* Invariant */ + || ci.Name == "" || ci.Name.StartsWith("x-", StringComparison.Ordinal) ) { @@ -81,8 +213,8 @@ CultureInfo ci in CultureInfo.GetCultures( } catch (ArgumentException) { - continue; /* Cannot create RegionInfo */ - } + continue; + } // Cannot create RegionInfo string isoCode = region.ISOCurrencySymbol; string symbol = region.CurrencySymbol; @@ -90,131 +222,231 @@ CultureInfo ci in CultureInfo.GetCultures( // Only add if we have TTS info for this ISO code if (!string.IsNullOrEmpty(isoCode) && IsoCodeToTTSInfoMap.ContainsKey(isoCode)) { + // Add the ISO code itself to the map (e.g., "USD" -> "USD") if (symbolMapBuilder.TryAdd(isoCode, isoCode)) uniqueIsoCodes.Add(isoCode); - // FIX: Prioritize JPY for ¥ symbol if not already mapped - if (symbol == "¥" && !symbolMapBuilder.ContainsKey("¥")) - { - symbolMapBuilder.Add("¥", "JPY"); - uniqueSymbols.Add("¥"); - } - else if ( - 
!string.IsNullOrEmpty(symbol) - && symbol != "¥" - && !symbolMapBuilder.ContainsKey(symbol) - && !symbol.All(char.IsLetterOrDigit) - ) + + // Add the symbol if it's not empty, not just letters/digits, and not already mapped + // Special handling for Yen symbol '¥' to prioritize JPY + if (!string.IsNullOrEmpty(symbol) && !symbol.All(char.IsLetterOrDigit)) { - symbolMapBuilder.Add(symbol, isoCode); - uniqueSymbols.Add(symbol); + if (symbol == "¥") + { + if (symbolMapBuilder.TryAdd("¥", "JPY")) // Map '¥' only once, prioritize JPY + { + uniqueSymbols.Add("¥"); + } + } + else if (symbolMapBuilder.TryAdd(symbol, isoCode)) // Try add other symbols + { + uniqueSymbols.Add(symbol); + } } } } - - if (IsoCodeToTTSInfoMap.ContainsKey("JPY")) + // Ensure JPY mapping for ¥ exists if JPY TTS info is present + if (IsoCodeToTTSInfoMap.ContainsKey("JPY") && !symbolMapBuilder.ContainsKey("¥")) { symbolMapBuilder["¥"] = "JPY"; - uniqueSymbols.Add("¥"); // Ensure it's in the symbol list for regex + uniqueSymbols.Add("¥"); } SymbolOrCodeToIsoCodeMap = symbolMapBuilder.ToFrozenDictionary( StringComparer.OrdinalIgnoreCase ); - // Dynamically Generate the Regex - IOrderedEnumerable escapedSymbols = uniqueSymbols - .Select(Regex.Escape) - .OrderByDescending(s => s.Length); - IOrderedEnumerable escapedIsoCodes = uniqueIsoCodes - .Select(Regex.Escape) - .OrderByDescending(s => s.Length); - - string symbolPatternPart = string.Join("|", escapedSymbols); - string codePatternPart = string.Join("|", escapedIsoCodes); - - // Number pattern allowing flexible separators but requiring at least one digit - string numberPatternPart = - @"(?\d{1,3}(?:[,\s'.]\d{3})*|\d+)(?:[.,](?\d{1,2}))?"; - - string pattern1 = !string.IsNullOrEmpty(symbolPatternPart) - ? $@"(?{symbolPatternPart})\s?{numberPatternPart}(?![\p{{L}}\p{{N}}])" - : string.Empty; - string pattern2 = !string.IsNullOrEmpty(codePatternPart) - ? 
$@"(?{codePatternPart})(?![\p{{L}}\p{{N}}])" - : string.Empty; - - string combinedPattern = !string.IsNullOrEmpty(pattern1) && !string.IsNullOrEmpty(pattern2) - ? $"({pattern1})|({pattern2})" - : !string.IsNullOrEmpty(pattern1) ? pattern1 : pattern2; - - if (!string.IsNullOrEmpty(combinedPattern)) + // --- Generate Regex Patterns --- + string symbolPatternPart = string.Join( + "|", + uniqueSymbols.Select(Regex.Escape).OrderByDescending(s => s.Length) + ); + string codePatternPart = string.Join( + "|", + uniqueIsoCodes.Select(Regex.Escape).OrderByDescending(s => s.Length) + ); + + // Only initialize regexes if symbols/codes were found + if (!string.IsNullOrEmpty(symbolPatternPart) && !string.IsNullOrEmpty(codePatternPart)) { - CombinedCurrencyRegex = new Regex( - combinedPattern, - RegexOptions.Compiled | RegexOptions.IgnoreCase, - RegexTimeout - ); - IsInitialized = true; - Console.WriteLine($"INFO: Currency Regex Initialized: {CombinedCurrencyRegex}"); + // Pattern for Symbol + Number + Code (e.g., "$10 USD") + string patternSNC = + $@"(?{symbolPatternPart})\s?{NumberPatternPart}\s?(?{codePatternPart})(?![\p{{L}}\p{{N}}])"; + // Pattern for Symbol + Number (e.g., "$10") + string patternSN = + $@"(?{symbolPatternPart})\s?{NumberPatternPart}(?![\p{{L}}\p{{N}}])"; + // Pattern for Number + Code (e.g., "10 USD") + string patternNC = + $@"(?{codePatternPart})(?![\p{{L}}\p{{N}}])"; + + SymbolNumberCodeRegexInstance = BuildRegex(patternSNC); + SymbolNumberRegexInstance = BuildRegex(patternSN); + NumberCodeRegexInstance = BuildRegex(patternNC); + + IsInitialized = + SymbolNumberCodeRegexInstance != null + && SymbolNumberRegexInstance != null + && NumberCodeRegexInstance != null; + + if (!IsInitialized) + { + Console.Error.WriteLine( + "Warning: One or more currency regex patterns failed to initialize." 
+ ); + } } else { - CombinedCurrencyRegex = new Regex("(?!)", RegexOptions.Compiled); // Never matches + Console.Error.WriteLine( + "Warning: Could not generate currency regex patterns. No unique symbols or codes found/mapped." + ); IsInitialized = false; - Console.Error.WriteLine("Warning: No valid currency patterns generated."); } } catch (Exception ex) { Console.Error.WriteLine($"FATAL: Currency Rule static constructor failed: {ex}"); - CombinedCurrencyRegex = new Regex("(?!)", RegexOptions.Compiled); IsInitialized = false; + throw; // Re-throw fatal exceptions during static init + } + } + + /// + /// Helper to build Regex with options and timeout handling. + /// + private static Regex? BuildRegex(string pattern) + { + try + { + return new Regex( + pattern, + RegexOptions.Compiled | RegexOptions.IgnoreCase, + RegexTimeout + ); + } + catch (Exception ex) + { + Console.Error.WriteLine($"Error compiling regex pattern '{pattern}': {ex.Message}"); + return null; } } - public CurrencyNormalizationRule() { } + /// + /// Initializes a new instance of the class. + /// + public CurrencyNormalizationRule() { } // Instance constructor + /// public string Apply(string inputText) { ArgumentNullException.ThrowIfNull(inputText); + if (!IsInitialized || string.IsNullOrEmpty(inputText)) return inputText; string currentText = inputText; try { - currentText = CombinedCurrencyRegex.Replace(currentText, CurrencyMatchEvaluator); + // Apply replacements in order of specificity: S+N+C -> S+N -> N+C + // This ensures that "$10 USD" is matched by the first regex and not partially by the second. 
+ if (SymbolNumberCodeRegexInstance != null) + { + currentText = SymbolNumberCodeRegexInstance.Replace( + currentText, + CurrencyMatchEvaluator + ); + } + + if (SymbolNumberRegexInstance != null) + { + currentText = SymbolNumberRegexInstance.Replace( + currentText, + CurrencyMatchEvaluator + ); + } + + if (NumberCodeRegexInstance != null) + { + currentText = NumberCodeRegexInstance.Replace(currentText, CurrencyMatchEvaluator); + } } catch (RegexMatchTimeoutException ex) { - Console.Error.WriteLine($"Regex timeout during currency normalization: {ex.Message}"); + Console.Error.WriteLine( + $"Regex timeout during currency normalization pass: {ex.Message}" + ); + // Return text processed up to the point of timeout } - catch (Exception ex) + catch (Exception ex) // Catch other potential errors during replacement { Console.Error.WriteLine($"Error during currency normalization: {ex.Message}"); + // Optionally return original text or partially processed text + // return inputText; // Safer fallback } return currentText; } + /// + /// Shared evaluator for all currency regex matches. Determines ISO code and converts to spoken form. + /// private static string CurrencyMatchEvaluator(Match match) { - // Prioritize symbol group if it exists and matched (pattern 1 or 2 specific capture) - // This requires naming the outer groups in the combined pattern. Let's adjust: - // combinedPattern = $"(?{pattern1})|(?{pattern2})"; - // But for simplicity now, rely on the 'symbol' group captured by either. - string detectedSymbolOrCode = match.Groups["symbol"].Value; - string integerPartStr = match.Groups["integer"].Value; + string? isoCode = null; + string integerPartStr = match.Groups["integer"].Value; // Always expected string fractionPartStr = match.Groups["fraction"].Success ? match.Groups["fraction"].Value : string.Empty; - if (!SymbolOrCodeToIsoCodeMap.TryGetValue(detectedSymbolOrCode, out string? 
isoCode)) - return match.Value; - if (!IsoCodeToTTSInfoMap.TryGetValue(isoCode, out CurrencyTTSInfo currencyTTSInfo)) + // Determine ISO code based on captured groups in the specific match + // Check which groups are present to infer which regex pattern succeeded + if (match.Groups["symbol"].Success && match.Groups["code"].Success) + { + // S+N+C match (from SymbolNumberCodeRegexInstance): Prioritize the explicit code + string explicitCode = match.Groups["code"].Value; + // Verify the explicit code exists in our TTS map + if (IsoCodeToTTSInfoMap.ContainsKey(explicitCode)) + { + isoCode = explicitCode; + } + else + { + // Fallback to symbol's code if explicit code isn't recognized (less likely but possible) + SymbolOrCodeToIsoCodeMap.TryGetValue(match.Groups["symbol"].Value, out isoCode); + } + } + else if (match.Groups["symbol"].Success) + { + // S+N match (from SymbolNumberRegexInstance): Use symbol's code from the map + SymbolOrCodeToIsoCodeMap.TryGetValue(match.Groups["symbol"].Value, out isoCode); + } + else if (match.Groups["code"].Success) + { + // N+C match (from NumberCodeRegexInstance): Use the code directly if valid + string explicitCode = match.Groups["code"].Value; + // Check if the code is known in the symbol/code map AND has TTS info + if ( + SymbolOrCodeToIsoCodeMap.ContainsKey(explicitCode) + && IsoCodeToTTSInfoMap.ContainsKey(explicitCode) + ) + { + isoCode = explicitCode; + } + } + + // --- Proceed if a valid ISO code was found and is supported --- + if ( + isoCode == null + || !IsoCodeToTTSInfoMap.TryGetValue(isoCode, out CurrencyTTSInfo currencyTTSInfo) + ) + { + // Cannot determine or unsupported currency, return the original matched text return match.Value; + } + + // --- Parse Numbers --- + // Remove common separators like commas, spaces, apostrophes, periods (for thousands) + string integerForParsing = CleanIntegerRegex().Replace(integerPartStr, ""); - string integerForParsing = Regex.Replace(integerPartStr, "[,' .]", ""); // Remove common 
separators if ( !long.TryParse( integerForParsing, @@ -224,13 +456,15 @@ out long integerValue ) ) { - return match.Value; + return match.Value; // Integer parsing failed } int fractionValue = 0; if (!string.IsNullOrEmpty(fractionPartStr)) { - string paddedFraction = fractionPartStr.PadRight(2, '0'); + // Ensure fraction is treated as two digits (e.g., ".5" becomes 50) + string paddedFraction = + fractionPartStr.Length == 1 ? fractionPartStr + "0" : fractionPartStr; if ( !int.TryParse( paddedFraction, @@ -242,21 +476,26 @@ out fractionValue || fractionValue > 99 ) { - return match.Value; + return match.Value; // Invalid fraction format or value } } + // --- Convert to Words using Humanizer --- try { - string integerWords = integerValue.ToWords(); - string? fractionWords = fractionValue > 0 ? fractionValue.ToWords() : null; + // Use InvariantCulture for ToWords to get consistent English number words + string integerWords = integerValue.ToWords(CultureInfo.InvariantCulture); + string? fractionWords = + fractionValue > 0 ? fractionValue.ToWords(CultureInfo.InvariantCulture) : null; + // --- Build Spoken String --- StringBuilder builder = new(); builder.Append(integerWords); builder.Append(' '); builder.Append(integerValue == 1 ? 
currencyTTSInfo.Singular : currencyTTSInfo.Plural); - if (fractionWords != null && fractionValue > 0) // Ensure fraction > 0 + // Only add fraction part if it's greater than zero + if (fractionWords != null && fractionValue > 0) { builder.Append(' '); builder.Append(fractionWords); @@ -268,12 +507,19 @@ out fractionValue ); } + // Pad result with spaces for proper separation in the final text return $" {builder} "; } catch (Exception ex) { - Console.Error.WriteLine($"Humanizer failed for '{match.Value}': {ex.Message}"); - return match.Value; + // Log Humanizer errors + Console.Error.WriteLine( + $"Humanizer failed for '{match.Value}' (ISO: {isoCode}): {ex.Message}" + ); + return match.Value; // Return original on Humanizer error } } + + [GeneratedRegex("[,' .]", RegexOptions.Compiled)] + private static partial Regex CleanIntegerRegex(); } diff --git a/TTSTextNormalization/Rules/EmojiNormalizationRule.cs b/TTSTextNormalization/Rules/EmojiNormalizationRule.cs index b32c1ee..1c9e206 100644 --- a/TTSTextNormalization/Rules/EmojiNormalizationRule.cs +++ b/TTSTextNormalization/Rules/EmojiNormalizationRule.cs @@ -1,6 +1,6 @@ -using TTSTextNormalization.EmojiDataGenerated; -using System.Text.RegularExpressions; +using System.Text.RegularExpressions; using TTSTextNormalization.Abstractions; +using TTSTextNormalization.EmojiDataGenerated; namespace TTSTextNormalization.Rules; @@ -10,8 +10,10 @@ namespace TTSTextNormalization.Rules; /// public sealed class EmojiNormalizationRule : ITextNormalizationRule { + /// public int Order => 100; + /// public EmojiNormalizationRule() { } /// @@ -47,8 +49,10 @@ private static string EmojiMatchEvaluator(Match match) { // The Regex ensures we only match keys present in the map. if (EmojiData.EmojiToNameMap.TryGetValue(match.Value, out string? name)) + { // Pad with spaces for TTS separation. Use the 'name' from the JSON. return $" {name} "; + } else { // Should not happen if Regex and Map are generated correctly. 
diff --git a/TTSTextNormalization/Rules/ExcessivePunctuationRule.cs b/TTSTextNormalization/Rules/ExcessivePunctuationRule.cs index be8f953..f295419 100644 --- a/TTSTextNormalization/Rules/ExcessivePunctuationRule.cs +++ b/TTSTextNormalization/Rules/ExcessivePunctuationRule.cs @@ -8,11 +8,14 @@ namespace TTSTextNormalization.Rules; /// public sealed partial class ExcessivePunctuationRule : ITextNormalizationRule { + /// public int Order => 500; private const int RegexTimeoutMilliseconds = 100; + /// public ExcessivePunctuationRule() { } + /// public string Apply(string inputText) { ArgumentNullException.ThrowIfNull(inputText); diff --git a/TTSTextNormalization/Rules/LetterRepetitionRule.cs b/TTSTextNormalization/Rules/LetterRepetitionRule.cs index aed14b1..c1ba18c 100644 --- a/TTSTextNormalization/Rules/LetterRepetitionRule.cs +++ b/TTSTextNormalization/Rules/LetterRepetitionRule.cs @@ -8,11 +8,14 @@ namespace TTSTextNormalization.Rules; /// public sealed partial class LetterRepetitionRule : ITextNormalizationRule { + /// public int Order => 510; private const int RegexTimeoutMilliseconds = 150; // Might need slightly more time for complex strings + /// public LetterRepetitionRule() { } + /// public string Apply(string inputText) { ArgumentNullException.ThrowIfNull(inputText); diff --git a/TTSTextNormalization/Rules/NumberNormalizationRule.cs b/TTSTextNormalization/Rules/NumberNormalizationRule.cs index c128519..01e2f0c 100644 --- a/TTSTextNormalization/Rules/NumberNormalizationRule.cs +++ b/TTSTextNormalization/Rules/NumberNormalizationRule.cs @@ -6,16 +6,26 @@ namespace TTSTextNormalization.Rules; +/// +/// Normalizes standalone numbers, including cardinals, ordinals, decimals, and multi-dot sequences (like version numbers). +/// Uses Humanizer for cardinal and ordinal word conversion. 
+/// public sealed partial class NumberNormalizationRule : ITextNormalizationRule { + /// public int Order => 400; + private const int RegexTimeoutMilliseconds = 150; // Keep DigitWords for the new multi-dot logic private static readonly string[] DigitWords = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]; + /// + /// Initializes a new instance of the class. + /// public NumberNormalizationRule() { } + /// public string Apply(string inputText) { ArgumentNullException.ThrowIfNull(inputText); diff --git a/TTSTextNormalization/Rules/WhitespaceNormalizationRule.cs b/TTSTextNormalization/Rules/WhitespaceNormalizationRule.cs index 4ef3639..07dbb9b 100644 --- a/TTSTextNormalization/Rules/WhitespaceNormalizationRule.cs +++ b/TTSTextNormalization/Rules/WhitespaceNormalizationRule.cs @@ -9,11 +9,14 @@ namespace TTSTextNormalization.Rules; /// public sealed partial class WhitespaceNormalizationRule : ITextNormalizationRule { + /// public int Order => 9000; private const int RegexTimeoutMilliseconds = 100; // Timeout for each step + /// public WhitespaceNormalizationRule() { } + /// public string Apply(string inputText) { ArgumentNullException.ThrowIfNull(inputText); diff --git a/TTSTextNormalization/TTSTextNormalization.csproj b/TTSTextNormalization/TTSTextNormalization.csproj index d3cb932..3c825c2 100644 --- a/TTSTextNormalization/TTSTextNormalization.csproj +++ b/TTSTextNormalization/TTSTextNormalization.csproj @@ -35,6 +35,7 @@ true true + true v