diff --git a/TTSTextNormalization.Generator/EmojiDataGenerator.cs b/TTSTextNormalization.Generator/EmojiDataGenerator.cs index cea16c0..faab123 100644 --- a/TTSTextNormalization.Generator/EmojiDataGenerator.cs +++ b/TTSTextNormalization.Generator/EmojiDataGenerator.cs @@ -1,10 +1,10 @@ -using Microsoft.CodeAnalysis; -using Microsoft.CodeAnalysis.CSharp; -using Microsoft.CodeAnalysis.Text; -using System.Collections.Immutable; +using System.Collections.Immutable; using System.Text; using System.Text.Json; using System.Text.RegularExpressions; +using Microsoft.CodeAnalysis; +using Microsoft.CodeAnalysis.CSharp; +using Microsoft.CodeAnalysis.Text; namespace TTSTextNormalization.EmojiDataGenerator; @@ -13,100 +13,120 @@ public class EmojiGenerator : IIncrementalGenerator { public void Initialize(IncrementalGeneratorInitializationContext initContext) { - //if (!Debugger.IsAttached) - // Debugger.Launch(); + // Define the diagnostic descriptors + var MissingFileError = new DiagnosticDescriptor( + "SWTTSTN001", + "Missing Emoji Data File", + "The emoji data file 'data-by-emoji.json' is missing or could not be found.", + "FileAccess", + DiagnosticSeverity.Error, + true + ); + var InvalidFileError = new DiagnosticDescriptor( + "SWTTSTN002", + "Invalid Emoji Data File", + "The emoji data file 'data-by-emoji.json' is empty or invalid JSON.", + "FileContent", + DiagnosticSeverity.Error, + true + ); + var FormatError = new DiagnosticDescriptor( + "SWTTSTN003", + "Invalid Emoji Data Format", + "The emoji data file 'data-by-emoji.json' is not in the expected format (Dictionary).", + "FileFormat", + DiagnosticSeverity.Error, + true + ); + var RegexGenerationError = new DiagnosticDescriptor( + "SWTTSTN004", + "Regex Generation Error", + "Failed to generate the emoji matching Regex: {0}", + "Regex", + DiagnosticSeverity.Error, + true + ); - // get the additional text provider + // Get additional texts provider filtering for the specific file IncrementalValuesProvider additionalTexts = - initContext.AdditionalTextsProvider; - - // apply a 1-to-1 transform on each text, extracting the contents - IncrementalValuesProvider transformed = additionalTexts.Select( - static (text, _) => text?.GetText()?.ToString() ?? string.Empty - ); + initContext.AdditionalTextsProvider.Where(at => + Path.GetFileName(at.Path) + .Equals("data-by-emoji.json", StringComparison.OrdinalIgnoreCase) + ); - // collect the contents into a batch - IncrementalValueProvider> collected = transformed.Collect(); + // Combine with compilation to report errors correctly + IncrementalValueProvider<( + Compilation, + ImmutableArray + )> compilationAndTexts = initContext.CompilationProvider.Combine(additionalTexts.Collect()); - // take the file contents from the above batch and make some user visible syntax initContext.RegisterSourceOutput( - collected, - static (sourceProductionContext, textContents) => + compilationAndTexts, + (spc, source) => { - if (textContents.IsDefaultOrEmpty) + var (compilation, texts) = source; + + if (texts.IsDefaultOrEmpty) { - sourceProductionContext.ReportDiagnostic( - Diagnostic.Create( - new DiagnosticDescriptor( - "SWTTSTN001", - "Missing Emoji Data File", - "The emoji data file 'data-by-emoji.json' is missing.", - "FileNotFound", - DiagnosticSeverity.Error, - true - ), - Location.None - ) - ); + // File not found or not included as AdditionalFile + spc.ReportDiagnostic(Diagnostic.Create(MissingFileError, Location.None)); return; } - string? emojiFileContent = textContents.FirstOrDefault(); - if (string.IsNullOrEmpty(emojiFileContent)) + // Assuming only one file matches + AdditionalText emojiFile = texts[0]; + SourceText? fileSourceText = emojiFile.GetText(spc.CancellationToken); + + if (fileSourceText == null || fileSourceText.Length == 0) { - sourceProductionContext.ReportDiagnostic( - Diagnostic.Create( - new DiagnosticDescriptor( - "SWTTSTN002", - "Invalid Emoji Data File", - "The emoji data file 'data-by-emoji.json' is empty or invalid.", - "FileError", - DiagnosticSeverity.Error, - true - ), - Location.None - ) - ); + spc.ReportDiagnostic(Diagnostic.Create(InvalidFileError, Location.None)); return; } - JsonDocument jsonDocument = JsonDocument.Parse( - emojiFileContent!, - new JsonDocumentOptions - { - AllowTrailingCommas = true, - CommentHandling = JsonCommentHandling.Skip, - } - ); - - Dictionary? emojiData = jsonDocument.Deserialize>( - new JsonSerializerOptions { PropertyNameCaseInsensitive = true } - ); + string emojiFileContent = fileSourceText.ToString(); + Dictionary? emojiData = null; - if (emojiData == null) + try + { + JsonDocument jsonDocument = JsonDocument.Parse( + emojiFileContent, + new JsonDocumentOptions + { + AllowTrailingCommas = true, + CommentHandling = JsonCommentHandling.Skip, + } + ); + emojiData = jsonDocument.Deserialize>( + new JsonSerializerOptions { PropertyNameCaseInsensitive = true } + ); + } + catch (JsonException ex) { - sourceProductionContext.ReportDiagnostic( - Diagnostic.Create( - new DiagnosticDescriptor( - "SWTTSTN003", - "Invalid Emoji Data Format", - "The emoji data file 'data-by-emoji.json' is not in the expected format.", - "FileFormatError", - DiagnosticSeverity.Error, - true - ), - Location.None - ) + spc.ReportDiagnostic( + Diagnostic.Create(InvalidFileError, Location.None, ex.Message) ); return; } + catch (Exception ex) // Catch other potential deserialization errors + { + spc.ReportDiagnostic(Diagnostic.Create(FormatError, Location.None, ex.Message)); + return; + } + + if (emojiData == null || emojiData.Count == 0) + { + spc.ReportDiagnostic(Diagnostic.Create(FormatError, Location.None)); + return; + } StringBuilder sb = new(); sb.AppendLine( """ // - // Generated by EmojiDataGenerator + #pragma warning disable #nullable enable + // Generated by EmojiDataGenerator + using System; using System.Collections.Frozen; using System.Collections.Generic; @@ -126,49 +146,87 @@ static EmojiData() """ ); + // Build the dictionary initializer foreach (KeyValuePair keyValue in emojiData) { - string key = SymbolDisplay.FormatLiteral(keyValue.Key, true); - string value = keyValue.Value.Name.Replace("\"", "\\\""); - sb.AppendLine($" {{ {key}, \"{value}\" }},"); + // Basic validation for name + if (string.IsNullOrWhiteSpace(keyValue.Value?.Name)) + continue; + + string keyLiteral = SymbolDisplay.FormatLiteral(keyValue.Key, true); + string valueLiteral = SymbolDisplay.FormatLiteral(keyValue.Value!.Name!, true); // Ensure name is also correctly literalized + sb.AppendLine($" {{ {keyLiteral}, {valueLiteral} }},"); } sb.AppendLine( """ }; EmojiToNameMap = mapBuilder.ToFrozenDictionary(StringComparer.Ordinal); + """ ); + // Build the Regex pattern string pattern = string.Join( "|", - emojiData.Keys.OrderByDescending(k => k.Length).Select(Regex.Escape) + // Filter keys used in the map to ensure consistency + emojiData + .Where(kv => !string.IsNullOrWhiteSpace(kv.Value?.Name)) + .Select(kv => Regex.Escape(kv.Key)) + .OrderByDescending(k => k.Length) // Match longest first ); - sb.AppendLine($" const string pattern = @\"{pattern}\";"); + + if (string.IsNullOrEmpty(pattern)) + { + // Handle case where no valid emojis were processed + sb.AppendLine($" // No valid emoji data found to build Regex."); + sb.AppendLine( + $" EmojiMatchRegex = new Regex(\"(?!)\", RegexOptions.Compiled); // Regex that never matches" + ); + } + else + { + // Escape the pattern string itself for use in a C# string literal + string patternLiteral = SymbolDisplay.FormatLiteral(pattern, true); + sb.AppendLine($" const string pattern = {patternLiteral};"); + + const int timeoutMilliseconds = 200; // Define timeout + + sb.AppendLine($" try"); + sb.AppendLine(" {"); + sb.AppendLine( + $" EmojiMatchRegex = new Regex(pattern, RegexOptions.Compiled | RegexOptions.NonBacktracking, TimeSpan.FromMilliseconds({timeoutMilliseconds}));" + ); + sb.AppendLine(" }"); + sb.AppendLine($" catch(Exception ex)"); // Catch potential Regex creation errors + sb.AppendLine(" {"); + sb.AppendLine( + $" Console.Error.WriteLine($\"FATAL: Failed to compile Emoji Regex: {{ex.Message}}\");" + ); + sb.AppendLine( + $" EmojiMatchRegex = new Regex(\"(?!)\", RegexOptions.Compiled); // Fallback" + ); + sb.AppendLine(" }"); + } + sb.AppendLine( """ - EmojiMatchRegex = new Regex(pattern, RegexOptions.Compiled, TimeSpan.FromMilliseconds(200)); - } - } + } // End static constructor + } // End class EmojiData #nullable restore + #pragma warning restore """ ); - sourceProductionContext.AddSource( - "EmojiData.g.cs", - SourceText.From(sb.ToString(), Encoding.UTF8) - ); + spc.AddSource("EmojiData.g.cs", SourceText.From(sb.ToString(), Encoding.UTF8)); } ); } - private class EmojiEntry + // Simple class to hold necessary emoji data from JSON + private sealed class EmojiEntry { - public string Name { get; set; } = string.Empty; - public string Slug { get; set; } = string.Empty; - public string Group { get; set; } = string.Empty; - public string Emoji_version { get; set; } = string.Empty; - public string Unicode_version { get; set; } = string.Empty; - public bool Skin_tone_support { get; set; } + public string? Name { get; set; } + // Other properties like Slug, Group, etc. can be added if needed later } } diff --git a/TTSTextNormalization.sln b/TTSTextNormalization.sln index 9da5f40..ffe8bcb 100644 --- a/TTSTextNormalization.sln +++ b/TTSTextNormalization.sln @@ -9,6 +9,11 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "TTSTextNormalization.EmojiD EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "TTSTextNormalization", "TTSTextNormalization\TTSTextNormalization.csproj", "{1C2CA7DF-374E-FA47-469B-9751E035B2C8}" EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{8EC462FD-D22E-90A8-E5CE-7E832BA40C5D}" + ProjectSection(SolutionItems) = preProject + .github\workflows\dotnet-publish.yml = .github\workflows\dotnet-publish.yml + EndProjectSection +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU diff --git a/TTSTextNormalization/Rules/BasicSanitizationRule.cs b/TTSTextNormalization/Rules/BasicSanitizationRule.cs index e97ee17..6b4e6e4 100644 --- a/TTSTextNormalization/Rules/BasicSanitizationRule.cs +++ b/TTSTextNormalization/Rules/BasicSanitizationRule.cs @@ -121,7 +121,7 @@ private static bool MightContainFancyChars(string text) /// [GeneratedRegex( @"[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F-\u009F\u200B-\u200D\uFEFF]", - RegexOptions.Compiled | RegexOptions.CultureInvariant, // CultureInvariant is fine for code points + RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.NonBacktracking, // CultureInvariant is fine for code points matchTimeoutMilliseconds: RegexTimeoutMilliseconds)] private static partial Regex RemoveControlCharsRegex(); } \ No newline at end of file diff --git a/TTSTextNormalization/Rules/WhitespaceNormalizationRule.cs b/TTSTextNormalization/Rules/WhitespaceNormalizationRule.cs index 2415611..715d045 100644 --- a/TTSTextNormalization/Rules/WhitespaceNormalizationRule.cs +++ b/TTSTextNormalization/Rules/WhitespaceNormalizationRule.cs @@ -57,19 +57,25 @@ public string Apply(string inputText) return currentText; } - // Regex for Step 2: Collapse multiple whitespace - [GeneratedRegex(@"\s{2,}", RegexOptions.Compiled, RegexTimeoutMilliseconds)] + /// + /// Regex for Step 2: Collapse multiple whitespace. Uses NonBacktracking. + /// + [GeneratedRegex(@"\s{2,}", RegexOptions.Compiled | RegexOptions.NonBacktracking, RegexTimeoutMilliseconds)] private static partial Regex MultipleWhitespaceRegex(); - // Regex for Step 3: Remove space before punctuation - // \s+ : one or more whitespace chars - // ([.,!?;:]) : Captures one of the punctuation marks into group 1 - [GeneratedRegex(@"\s+([.,!?;:])", RegexOptions.Compiled, RegexTimeoutMilliseconds)] + /// + /// Regex for Step 3: Remove space before punctuation. Uses NonBacktracking. + /// \s+ : one or more whitespace chars + /// ([.,!?;:]) : Captures one of the punctuation marks into group 1 + /// + [GeneratedRegex(@"\s+([.,!?;:])", RegexOptions.Compiled | RegexOptions.NonBacktracking, RegexTimeoutMilliseconds)] private static partial Regex SpaceBeforePunctuationRegex(); - // Regex for Step 4: Ensure space after punctuation - // ([.,!?;:]) : Captures one of the punctuation marks into group 1 - // (?!\s|$) : Negative lookahead - asserts that the char is NOT followed by whitespace OR end of string + /// + /// Regex for Step 4: Ensure space after punctuation. Cannot use NonBacktracking due to lookahead. + /// ([.,!?;:]) : Captures one of the punctuation marks into group 1 + /// (?!\s|$) : Negative lookahead - asserts that the char is NOT followed by whitespace OR end of string + /// [GeneratedRegex(@"([.,!?;:])(?!\s|$)", RegexOptions.Compiled, RegexTimeoutMilliseconds)] private static partial Regex SpaceAfterPunctuationRegex(); } \ No newline at end of file