From 4eb21c8605dc6c7c52314ba83aa94003c30c12c6 Mon Sep 17 00:00:00 2001
From: "Hung Q. Vo" <66151263+Hung304-WBLEM@users.noreply.github.com>
Date: Thu, 29 Sep 2022 05:35:45 +0700
Subject: [PATCH 1/2] Update spacy_component.py

---
 quickumls/spacy_component.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/quickumls/spacy_component.py b/quickumls/spacy_component.py
index f64982f..8a79446 100644
--- a/quickumls/spacy_component.py
+++ b/quickumls/spacy_component.py
@@ -62,6 +62,16 @@ def __call__(self, doc):
                 # add some custom metadata to the spans
                 span._.similarity = ngram_match_dict['similarity']
                 span._.semtypes = ngram_match_dict['semtypes']
-                doc.ents = list(doc.ents) + [span]
                 
-        return doc
\ No newline at end of file
+                # OLD: doc.ents = list(doc.ents) + [span]
+                # Collect every match in doc.spans["sc"] (the default SpanCategorizer key) rather than in doc.ents:
+                # span groups may hold overlapping or nested spans, whereas assigning overlapping spans to doc.ents raises an error.
+                doc.spans["sc"] = list(doc.spans["sc"]) + [span]
+
+        # After collecting all candidate spans, drop the overlapping ones before adding them to doc.ents.
+        # Overlaps are resolved with spacy.util.filter_spans, which prefers the longest span;
+        # among equally long overlapping spans, the one that starts first in the document is kept.
+        for span in spacy.util.filter_spans(doc.spans["sc"]):
+            doc.ents = list(doc.ents) + [span]
+
+        return doc

From d3571694ccf2b5f678f9e55283f31ed0549241f6 Mon Sep 17 00:00:00 2001
From: "Hung Q. Vo" <66151263+Hung304-WBLEM@users.noreply.github.com>
Date: Thu, 6 Oct 2022 06:50:41 +0700
Subject: [PATCH 2/2] Update spacy_component.py

---
 quickumls/spacy_component.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/quickumls/spacy_component.py b/quickumls/spacy_component.py
index 8a79446..6ecca24 100644
--- a/quickumls/spacy_component.py
+++ b/quickumls/spacy_component.py
@@ -44,6 +44,7 @@ def __call__(self, doc):
         matches = self.quickumls._match(doc, best_match=self.best_match, ignore_syntax=self.ignore_syntax)
         
         # Convert QuickUMLS match objects into Spans
+        doc.spans['sc'] = []
         for match in matches:
             # each match may match multiple ngrams
             for ngram_match_dict in match: