From 92d868b6ab8d5833b799a41c5c9313d8c53d3a47 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 22 Jan 2026 13:04:52 -0500 Subject: [PATCH 1/3] Remove RTL markers from verse token data; add tests --- src/SIL.Machine/Corpora/UsfmTokenizer.cs | 7 +++- .../Corpora/UpdateUsfmParserHandlerTests.cs | 33 ++++++++++++++++++ .../Corpora/UsfmMemoryTextTests.cs | 34 +++++++++++++++++++ 3 files changed, 73 insertions(+), 1 deletion(-) diff --git a/src/SIL.Machine/Corpora/UsfmTokenizer.cs b/src/SIL.Machine/Corpora/UsfmTokenizer.cs index c03b836d..a4fc2830 100644 --- a/src/SIL.Machine/Corpora/UsfmTokenizer.cs +++ b/src/SIL.Machine/Corpora/UsfmTokenizer.cs @@ -175,7 +175,7 @@ public IReadOnlyList Tokenize(string usfm, bool preserveWhitespace = marker, null, null, - GetNextWord(usfm, ref index, preserveWhitespace) + SanitizeVerseData(GetNextWord(usfm, ref index, preserveWhitespace)) ) { LineNumber = lineNum, @@ -563,6 +563,11 @@ private static string GetNextWord(string usfm, ref int index, bool preserveWhite return data; } + private static string SanitizeVerseData(string verseData) + { + return verseData.Replace("‏", ""); + } + /// /// Converts all control characters, carriage returns and tabs into /// spaces, and then strips duplicate spaces. diff --git a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs index 77c9f7cc..a2501054 100644 --- a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs @@ -906,6 +906,39 @@ public void UpdateBlock_Verse_Range() ); } + [Test] + public void UpdateBlock_Verse_Range_RightToLeftMarker() + { + var rows = new List { new UpdateUsfmRow(ScrRef("MAT 1:1", "MAT 1:2", "MAT 1:3"), "Update 1-3") }; + string usfm = + @"\id MAT - Test +\c 1 +\v 1‏-3 verse 1 through 3 +"; + TestUsfmUpdateBlockHandler usfmUpdateBlockHandler = new TestUsfmUpdateBlockHandler(); + string updatedUsfm = UpdateUsfm( + rows, + usfm, + embedBehavior: UpdateUsfmMarkerBehavior.Preserve, + usfmUpdateBlockHandlers: [usfmUpdateBlockHandler] + ); + string expectedUsfm = + @"\id MAT - Test +\c 1 +\v 1-3 Update 1-3 +"; + Assert.That(updatedUsfm.Replace("\r\n", "\n"), Is.EqualTo(expectedUsfm)); + Assert.That(usfmUpdateBlockHandler.Blocks.Count, Is.EqualTo(1)); + + UsfmUpdateBlock usfmUpdateBlock = usfmUpdateBlockHandler.Blocks[0]; + AssertUpdateBlockEquals( + usfmUpdateBlock, + ["MAT 1:1", "MAT 1:2", "MAT 1:3"], + (UsfmUpdateBlockElementType.Text, "Update 1-3 ", false), + (UsfmUpdateBlockElementType.Text, "verse 1 through 3 ", true) + ); + } + [Test] public void UpdateBlock_Footnote_PreserveEmbeds() { diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs index d1321f7c..7fae89bf 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs @@ -425,6 +425,40 @@ public void GetRows_PrivateUseMarker() }); } + [Test] + public void GetRows_VerseRangeWithRightToLeftMarker() + { + TextRow[] rows = GetRows( + @"\id MAT - Test +\h +\mt +\c 1 +\v 1‏-2 Verse one and two. +" + ); + + Assert.Multiple(() => + { + Assert.That(rows, Has.Length.EqualTo(2)); + + Assert.That( + rows[0].Ref, + Is.EqualTo(ScriptureRef.Parse("MAT 1:1")), + string.Join(",", rows.ToList().Select(tr => tr.Ref.ToString())) + ); + Assert.That( + rows[0].Text, + Is.EqualTo("Verse one and two."), + string.Join(",", rows.ToList().Select(tr => tr.Text)) + ); + Assert.That( + rows[1].Ref, + Is.EqualTo(ScriptureRef.Parse("MAT 1:2")), + string.Join(",", rows.ToList().Select(tr => tr.Ref.ToString())) + ); + }); + } + private static TextRow[] GetRows(string usfm, bool includeMarkers = false, bool includeAllText = false) { UsfmMemoryText text = From dc46aaa84d052cd8c712121431814e2bdf4e47cd Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 22 Jan 2026 13:13:54 -0500 Subject: [PATCH 2/3] Use ignore line endings --- tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs index a2501054..0ec7ea77 100644 --- a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs @@ -927,7 +927,7 @@ public void UpdateBlock_Verse_Range_RightToLeftMarker() \c 1 \v 1-3 Update 1-3 "; - Assert.That(updatedUsfm.Replace("\r\n", "\n"), Is.EqualTo(expectedUsfm)); + Assert.That(updatedUsfm, Is.EqualTo(expectedUsfm).IgnoreLineEndings()); Assert.That(usfmUpdateBlockHandler.Blocks.Count, Is.EqualTo(1)); UsfmUpdateBlock usfmUpdateBlock = usfmUpdateBlockHandler.Blocks[0]; From 30d6cb7d318038ba462fe8df2cca162e6cd6a9d7 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Fri, 23 Jan 2026 12:36:41 -0500 Subject: [PATCH 3/3] Move verse token data sanitization to updater --- src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs | 10 ++++++++++ src/SIL.Machine/Corpora/UsfmTokenizer.cs | 7 +------ .../Corpora/UpdateUsfmParserHandlerTests.cs | 7 +------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs index 00c1f719..239978b6 100644 --- a/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs +++ b/src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs @@ -512,6 +512,11 @@ private void CollectUpdatableTokens(UsfmParserState state) while (_tokenIndex <= state.Index + state.SpecialTokenCount) { UsfmToken token = state.Tokens[_tokenIndex]; + if (token.Type == UsfmTokenType.Verse) + { + string sanitizedVerseData = SanitizeVerseData(token.Data); + token = new UsfmToken(token.Type, token.Marker, token.Text, token.EndMarker, sanitizedVerseData); + } if (CurrentTextType == ScriptureTextType.Embed) { _embedTokens.Add(token); @@ -746,6 +751,11 @@ private void UpdateVerseRows() } } + private static string SanitizeVerseData(string verseData) + { + return verseData.Replace("\u200F", ""); + } + private class RowInfo { public RowInfo(int rowIndex) diff --git a/src/SIL.Machine/Corpora/UsfmTokenizer.cs b/src/SIL.Machine/Corpora/UsfmTokenizer.cs index a4fc2830..c03b836d 100644 --- a/src/SIL.Machine/Corpora/UsfmTokenizer.cs +++ b/src/SIL.Machine/Corpora/UsfmTokenizer.cs @@ -175,7 +175,7 @@ public IReadOnlyList Tokenize(string usfm, bool preserveWhitespace = marker, null, null, - SanitizeVerseData(GetNextWord(usfm, ref index, preserveWhitespace)) + GetNextWord(usfm, ref index, preserveWhitespace) ) { LineNumber = lineNum, @@ -563,11 +563,6 @@ private static string GetNextWord(string usfm, ref int index, bool preserveWhite return data; } - private static string SanitizeVerseData(string verseData) - { - return verseData.Replace("‏", ""); - } - /// /// Converts all control characters, carriage returns and tabs into /// spaces, and then strips duplicate spaces. diff --git a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs index 0ec7ea77..ac7af2dc 100644 --- a/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UpdateUsfmParserHandlerTests.cs @@ -916,12 +916,7 @@ public void UpdateBlock_Verse_Range_RightToLeftMarker() \v 1‏-3 verse 1 through 3 "; TestUsfmUpdateBlockHandler usfmUpdateBlockHandler = new TestUsfmUpdateBlockHandler(); - string updatedUsfm = UpdateUsfm( - rows, - usfm, - embedBehavior: UpdateUsfmMarkerBehavior.Preserve, - usfmUpdateBlockHandlers: [usfmUpdateBlockHandler] - ); + string updatedUsfm = UpdateUsfm(rows, usfm, usfmUpdateBlockHandlers: [usfmUpdateBlockHandler]); string expectedUsfm = @"\id MAT - Test \c 1