Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
242 changes: 150 additions & 92 deletions TTSTextNormalization.Generator/EmojiDataGenerator.cs
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.CSharp;
using Microsoft.CodeAnalysis.Text;
using System.Collections.Immutable;
using System.Collections.Immutable;
using System.Text;
using System.Text.Json;
using System.Text.RegularExpressions;
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.CSharp;
using Microsoft.CodeAnalysis.Text;

namespace TTSTextNormalization.EmojiDataGenerator;

Expand All @@ -13,100 +13,120 @@ public class EmojiGenerator : IIncrementalGenerator
{
public void Initialize(IncrementalGeneratorInitializationContext initContext)
{
// Diagnostic descriptors reported by the source-output callback below.
// SWTTSTN002 and SWTTSTN003 carry a {0} placeholder so the underlying cause
// (e.g. the JSON parser's message) reaches the user. Every Diagnostic.Create
// call for those two descriptors must therefore pass exactly one argument;
// without a placeholder the argument would be silently discarded.
var MissingFileError = new DiagnosticDescriptor(
    "SWTTSTN001",
    "Missing Emoji Data File",
    "The emoji data file 'data-by-emoji.json' is missing or could not be found.",
    "FileAccess",
    DiagnosticSeverity.Error,
    true
);
var InvalidFileError = new DiagnosticDescriptor(
    "SWTTSTN002",
    "Invalid Emoji Data File",
    "The emoji data file 'data-by-emoji.json' is empty or invalid JSON: {0}",
    "FileContent",
    DiagnosticSeverity.Error,
    true
);
var FormatError = new DiagnosticDescriptor(
    "SWTTSTN003",
    "Invalid Emoji Data Format",
    "The emoji data file 'data-by-emoji.json' is not in the expected format (Dictionary<string, EmojiEntry>): {0}",
    "FileFormat",
    DiagnosticSeverity.Error,
    true
);

// Keep only the additional file that holds the emoji data (matched by file name).
IncrementalValuesProvider<AdditionalText> additionalTexts =
    initContext.AdditionalTextsProvider.Where(static at =>
        Path.GetFileName(at.Path)
            .Equals("data-by-emoji.json", StringComparison.OrdinalIgnoreCase)
    );

// The callback reads nothing from the Compilation, and SourceProductionContext can
// report diagnostics on its own, so the pipeline is fed by the collected files only.
// Combining with CompilationProvider would make this output depend on every
// compilation change for no benefit.
initContext.RegisterSourceOutput(
    additionalTexts.Collect(),
    (spc, texts) =>
    {
        if (texts.IsDefaultOrEmpty)
        {
            // File not found or not included as AdditionalFile
            spc.ReportDiagnostic(Diagnostic.Create(MissingFileError, Location.None));
            return;
        }

        // Assuming only one file matches
        AdditionalText emojiFile = texts[0];
        SourceText? fileSourceText = emojiFile.GetText(spc.CancellationToken);

        if (fileSourceText is null || fileSourceText.Length == 0)
        {
            spc.ReportDiagnostic(
                Diagnostic.Create(
                    InvalidFileError,
                    Location.None,
                    "the file is empty or could not be read"
                )
            );
            return;
        }

        Dictionary<string, EmojiEntry>? emojiData;

        try
        {
            // JsonDocument rents pooled buffers; dispose it as soon as the
            // dictionary has been materialized.
            using JsonDocument jsonDocument = JsonDocument.Parse(
                fileSourceText.ToString(),
                new JsonDocumentOptions
                {
                    AllowTrailingCommas = true,
                    CommentHandling = JsonCommentHandling.Skip,
                }
            );
            emojiData = jsonDocument.Deserialize<Dictionary<string, EmojiEntry>>(
                new JsonSerializerOptions { PropertyNameCaseInsensitive = true }
            );
        }
        catch (JsonException ex)
        {
            spc.ReportDiagnostic(
                Diagnostic.Create(InvalidFileError, Location.None, ex.Message)
            );
            return;
        }
        catch (Exception ex) // Any other deserialization failure becomes a diagnostic instead of crashing the generator
        {
            spc.ReportDiagnostic(Diagnostic.Create(FormatError, Location.None, ex.Message));
            return;
        }

        if (emojiData is null || emojiData.Count == 0)
        {
            spc.ReportDiagnostic(
                Diagnostic.Create(FormatError, Location.None, "no emoji entries were found")
            );
            return;
        }

StringBuilder sb = new();
sb.AppendLine(
"""
// <auto-generated/>
// Generated by EmojiDataGenerator
#pragma warning disable
#nullable enable
// Generated by EmojiDataGenerator

using System;
using System.Collections.Frozen;
using System.Collections.Generic;
Expand All @@ -126,49 +146,87 @@ static EmojiData()
"""
);

// Collect the entries that have a usable name exactly once, so the dictionary
// initializer and the regex alternation are guaranteed to cover the same keys.
var namedEmojis = new List<KeyValuePair<string, string>>();
foreach (KeyValuePair<string, EmojiEntry> keyValue in emojiData)
{
    if (keyValue.Value?.Name is { } name && !string.IsNullOrWhiteSpace(name))
    {
        namedEmojis.Add(new KeyValuePair<string, string>(keyValue.Key, name));
    }
}

// Build the dictionary initializer. Both key and value go through
// SymbolDisplay.FormatLiteral so quotes, backslashes and control characters
// are emitted as valid C# string literals.
foreach (KeyValuePair<string, string> entry in namedEmojis)
{
    string keyLiteral = SymbolDisplay.FormatLiteral(entry.Key, true);
    string valueLiteral = SymbolDisplay.FormatLiteral(entry.Value, true);
    sb.AppendLine($" {{ {keyLiteral}, {valueLiteral} }},");
}

sb.AppendLine(
    """
    };
    EmojiToNameMap = mapBuilder.ToFrozenDictionary(StringComparer.Ordinal);

    """
);

// Build the Regex pattern: one escaped alternative per emoji, longest first.
string pattern = string.Join(
    "|",
    namedEmojis
        .Select(entry => Regex.Escape(entry.Key))
        .OrderByDescending(escaped => escaped.Length) // Match longest first
);

if (string.IsNullOrEmpty(pattern))
{
    // Handle case where no valid emojis were processed
    sb.AppendLine($" // No valid emoji data found to build Regex.");
    sb.AppendLine(
        $" EmojiMatchRegex = new Regex(\"(?!)\", RegexOptions.Compiled); // Regex that never matches"
    );
}
else
{
    // Escape the pattern string itself for use in a C# string literal
    string patternLiteral = SymbolDisplay.FormatLiteral(pattern, true);
    sb.AppendLine($" const string pattern = {patternLiteral};");

    const int timeoutMilliseconds = 200; // Match timeout baked into the generated Regex

    // The generated static constructor guards Regex construction and falls
    // back to a never-matching regex if construction throws.
    sb.AppendLine($" try");
    sb.AppendLine(" {");
    sb.AppendLine(
        $" EmojiMatchRegex = new Regex(pattern, RegexOptions.Compiled | RegexOptions.NonBacktracking, TimeSpan.FromMilliseconds({timeoutMilliseconds}));"
    );
    sb.AppendLine(" }");
    sb.AppendLine($" catch(Exception ex)"); // Catch potential Regex creation errors
    sb.AppendLine(" {");
    sb.AppendLine(
        $" Console.Error.WriteLine($\"FATAL: Failed to compile Emoji Regex: {{ex.Message}}\");"
    );
    sb.AppendLine(
        $" EmojiMatchRegex = new Regex(\"(?!)\", RegexOptions.Compiled); // Fallback"
    );
    sb.AppendLine(" }");
}

// Close the generated static constructor and class, then restore the
// nullable/warning state that the generated header changed.
sb.AppendLine(
    """
    } // End static constructor
    } // End class EmojiData
    #nullable restore
    #pragma warning restore
    """
);

// A hint name may be added only once per generator run.
spc.AddSource("EmojiData.g.cs", SourceText.From(sb.ToString(), Encoding.UTF8));
}
);
}

/// <summary>
/// Minimal shape of one value in 'data-by-emoji.json'. Only the name is read by the
/// generator; any other JSON properties of an entry are not mapped.
/// </summary>
private sealed class EmojiEntry
{
    /// <summary>
    /// Name of the emoji. Nullable because an entry may omit it; the generator
    /// skips entries whose name is null or whitespace.
    /// </summary>
    public string? Name { get; set; }

    // Other properties like Slug, Group, etc. can be added if needed later
}
}
5 changes: 5 additions & 0 deletions TTSTextNormalization.sln
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "TTSTextNormalization.EmojiD
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "TTSTextNormalization", "TTSTextNormalization\TTSTextNormalization.csproj", "{1C2CA7DF-374E-FA47-469B-9751E035B2C8}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{8EC462FD-D22E-90A8-E5CE-7E832BA40C5D}"
ProjectSection(SolutionItems) = preProject
.github\workflows\dotnet-publish.yml = .github\workflows\dotnet-publish.yml
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down
2 changes: 1 addition & 1 deletion TTSTextNormalization/Rules/BasicSanitizationRule.cs
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ private static bool MightContainFancyChars(string text)
/// </summary>
// NOTE(review): the regex source generator is documented as not emitting code for
// RegexOptions.NonBacktracking (it falls back to a Regex built at run time) — confirm
// that trade-off is intended here.
[GeneratedRegex(
    @"[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F-\u009F\u200B-\u200D\uFEFF]",
    RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.NonBacktracking, // CultureInvariant is fine for code points
    matchTimeoutMilliseconds: RegexTimeoutMilliseconds)]
private static partial Regex RemoveControlCharsRegex();
}
24 changes: 15 additions & 9 deletions TTSTextNormalization/Rules/WhitespaceNormalizationRule.cs
Original file line number Diff line number Diff line change
Expand Up @@ -57,19 +57,25 @@ public string Apply(string inputText)
return currentText;
}

/// <summary>
/// Regex for Step 2: Collapse multiple whitespace. Uses NonBacktracking.
/// <c>\s{2,}</c> matches a run of two or more whitespace characters.
/// </summary>
// NOTE(review): the regex source generator is documented as not emitting code for
// RegexOptions.NonBacktracking (it falls back to a Regex built at run time) — confirm
// that trade-off is intended here.
[GeneratedRegex(@"\s{2,}", RegexOptions.Compiled | RegexOptions.NonBacktracking, RegexTimeoutMilliseconds)]
private static partial Regex MultipleWhitespaceRegex();

/// <summary>
/// Regex for Step 3: Remove space before punctuation. Uses NonBacktracking.
/// <c>\s+</c> : one or more whitespace chars.
/// <c>([.,!?;:])</c> : captures one of the punctuation marks into group 1.
/// </summary>
// NOTE(review): the regex source generator is documented as not emitting code for
// RegexOptions.NonBacktracking (it falls back to a Regex built at run time) — confirm
// that trade-off is intended here.
[GeneratedRegex(@"\s+([.,!?;:])", RegexOptions.Compiled | RegexOptions.NonBacktracking, RegexTimeoutMilliseconds)]
private static partial Regex SpaceBeforePunctuationRegex();

/// <summary>
/// Regex for Step 4: Ensure space after punctuation. Cannot use NonBacktracking due to lookahead.
/// <c>([.,!?;:])</c> : captures one of the punctuation marks into group 1.
/// <c>(?!\s|$)</c> : negative lookahead - asserts that the char is NOT followed by whitespace OR end of string.
/// </summary>
[GeneratedRegex(@"([.,!?;:])(?!\s|$)", RegexOptions.Compiled, RegexTimeoutMilliseconds)]
private static partial Regex SpaceAfterPunctuationRegex();
}