From 310d92b696a3fef0509b776eaff009b3040960a1 Mon Sep 17 00:00:00 2001
From: burgersmoke <burgersmoke@gmail.com>
Date: Thu, 6 Aug 2020 23:29:44 -0600
Subject: [PATCH 1/3] Implementation to allow QuickUMLS to be used as a
 component within a spacy pipeline.  This can be used as an entity matcher for
 UMLS concepts in other modular spacy pipelines.  This is already being used
 in an operational capacity for syndromic surveillance.

---
 quickumls/core.py            | 32 +++++++------
 quickumls/spacy_component.py | 90 ++++++++++++++++++++++++++++++++++++
 spacy_example_pipeline.py    | 29 ++++++++++++
 3 files changed, 138 insertions(+), 13 deletions(-)
 create mode 100644 quickumls/spacy_component.py
 create mode 100644 spacy_example_pipeline.py

diff --git a/quickumls/core.py b/quickumls/core.py
index fa9955b..e84bdf6 100644
--- a/quickumls/core.py
+++ b/quickumls/core.py
@@ -26,7 +26,8 @@ def __init__(
             overlapping_criteria='score', threshold=0.7, window=5,
             similarity_name='jaccard', min_match_length=3,
             accepted_semtypes=constants.ACCEPTED_SEMTYPES,
-            verbose=False, keep_uppercase=False):
+            verbose=False, keep_uppercase=False,
+            spacy_component = False):
         """Instantiate QuickUMLS object
 
             This is the main interface through which text can be processed.
@@ -146,18 +147,23 @@ def __init__(
 
         self.accepted_semtypes = accepted_semtypes
 
-        try:
-            self.nlp = spacy.load(spacy_lang)
-        except OSError:
-            msg = (
-                'Model for language "{}" is not downloaded. Please '
-                'run "python -m spacy download {}" before launching '
-                'QuickUMLS'
-            ).format(
-                self.language_flag,
-                constants.SPACY_LANGUAGE_MAP.get(self.language_flag, 'xx')
-            )
-            raise OSError(msg)
+        # if this is not being executed as a spacy component, then it must be standalone
+        if spacy_component:
+            # In this case, the pipeline is external to this current class
+            self.nlp = None
+        else:
+            try:
+                self.nlp = spacy.load(spacy_lang)
+            except OSError:
+                msg = (
+                    'Model for language "{}" is not downloaded. Please '
+                    'run "python -m spacy download {}" before launching '
+                    'QuickUMLS'
+                ).format(
+                    self.language_flag,
+                    constants.SPACY_LANGUAGE_MAP.get(self.language_flag, 'xx')
+                )
+                raise OSError(msg)
 
         self.ss_db = toolbox.SimstringDBReader(
             simstring_fp, similarity_name, threshold
diff --git a/quickumls/spacy_component.py b/quickumls/spacy_component.py
new file mode 100644
index 0000000..336f796
--- /dev/null
+++ b/quickumls/spacy_component.py
@@ -0,0 +1,90 @@
+import spacy
+from spacy.tokens import Span
+from spacy.strings import StringStore
+
+from .core import QuickUMLS
+from . import constants
+
+class SpacyQuickUMLS(object):
+    name = 'QuickUMLS matcher'
+    
+    def __init__(self, nlp, quickumls_path, 
+        # these are all params that were from match() in quickumls but since we want to construct this
+        # and then make its behavior consistent as a component, we'll set them here:
+        best_match = True, ignore_syntax = False, verbose = False,
+        # these below are the same as in quickumls.py (so let's pass them through as our wrapper)
+        overlapping_criteria='score', threshold=0.7, window=5,
+        similarity_name='jaccard', min_match_length=3,
+        accepted_semtypes=constants.ACCEPTED_SEMTYPES):
+        
+        self.quickumls = QuickUMLS(quickumls_path, 
+            overlapping_criteria=overlapping_criteria, threshold=threshold, window=window,
+            similarity_name=similarity_name, min_match_length=min_match_length,
+            accepted_semtypes=accepted_semtypes,
+            # By default, the QuickUMLS objects creates its own internal spacy pipeline but we do not need that in this case
+            spacy_component = True,
+            verbose = verbose)
+        
+        # save this off so that we can get vocab values of labels later
+        self.nlp = nlp
+        
+        self.best_match = best_match
+        self.ignore_syntax = ignore_syntax
+        self.verbose = verbose
+
+        # let's extend this with some proprties that we want
+        Span.set_extension('similarity', default = -1.0)
+        Span.set_extension('semtypes', default = -1.0)
+        
+        if self.verbose:
+            print('Accepted semtypes : [{0}]'.format(accepted_semtypes))
+        
+    def __call__(self, doc):
+        # much of this is a re-write of match() in quickumls.py
+        # however, the changes include:
+        # receiving an incoming doc (rather than parsing at calling time)
+        # transforming matches into Spans as per spaCy custom Entity code example
+        
+        ngrams = None
+        # pass in the incoming doc which has already been tokenized (ready for ngrams)
+        if self.ignore_syntax:
+            ngrams = self.quickumls._make_token_sequences(doc)
+        else:
+            ngrams = self.quickumls._make_ngrams(doc)
+            
+        # perform the matching
+        matches = self.quickumls._get_all_matches(ngrams)
+        
+        if self.verbose:
+            print('Total matches before best match: [{0}]'.format(len(matches)))
+        
+        if self.best_match:
+            matches = self.quickumls._select_terms(matches)
+            
+            if self.verbose:
+                print('Total matches after best match: [{0}]'.format(len(matches)))
+            
+        self.quickumls._print_verbose_status(doc, matches)
+        
+        # Here's another change: convert match objects into Spans
+        for match in matches:
+            # each match may match multiple ngrams
+            for ngram_match_dict in match:
+                start_char_idx = int(ngram_match_dict['start'])
+                end_char_idx = int(ngram_match_dict['end'])
+                
+                cui = ngram_match_dict['cui']
+                # add the string
+                self.nlp.vocab.strings.add(cui)
+                # pull out the value
+                cui_label_value = self.nlp.vocab.strings[cui]
+                
+                # char_span() created a Span from the character indices
+                # UMLS CUI should work well as the label here
+                span = doc.char_span(start_char_idx, end_char_idx, label = cui_label_value)
+                # add some custom metadata
+                span._.similarity = ngram_match_dict['similarity']
+                span._.semtypes = ngram_match_dict['semtypes']
+                doc.ents = list(doc.ents) + [span]
+                
+        return doc
\ No newline at end of file
diff --git a/spacy_example_pipeline.py b/spacy_example_pipeline.py
new file mode 100644
index 0000000..4d4ced1
--- /dev/null
+++ b/spacy_example_pipeline.py
@@ -0,0 +1,29 @@
+import spacy
+
+import quickumls
+from quickumls.spacy_component import SpacyQuickUMLS
+
+# setup a spacy pipeline which includes QuickUMLS as a component and nothing else
+nlp = spacy.load('en_core_web_sm')
+
+nlp.remove_pipe('tagger')
+nlp.remove_pipe('parser')
+
+# TODO -- change this from a hard coded path to a small chunk of UMLS that can be legally distributed
+quickumls_path = r'C:\temp_quickumls\SNOMED_RXNORM_CPT_lowercase'
+
+threshold = 0.8
+
+quickumls_component = SpacyQuickUMLS(nlp, quickumls_path, threshold = threshold)
+
+nlp.add_pipe(quickumls_component)
+
+doc = nlp('Pt c/o shortness of breath, chest pain, nausea, vomiting, diarrrhea')
+
+for ent in doc.ents:
+    print('Entity text : {}'.format(ent.text))
+    print('Label : {}'.format(ent.label_))
+    print('Similarity : {}'.format(ent._.similarity))
+    print('Semtypes : {}'.format(ent._.semtypes))
+    
+print('DONE with spacy/QuickUMLS demo')
\ No newline at end of file

From b06b8820c27bfdd1ef98c37e887de60f1a63e9fc Mon Sep 17 00:00:00 2001
From: burgersmoke <burgersmoke@gmail.com>
Date: Mon, 10 Aug 2020 22:17:54 -0600
Subject: [PATCH 2/3] Factoring QuickUMLS class slightly to minimize duplicated
 code between QuickUMLS and SpacyQuickUMLS.  Fixed keyword arguments to
 QuickUMLS when creating a spacy component.  Added documentation as well. 
 Removing previous standalone example Python file and instead adding an
 example of a QuickUMLS spacy pipeline to the README.

---
 README.md                    | 23 ++++++++++++
 quickumls/core.py            | 30 +++++++++++++--
 quickumls/spacy_component.py | 71 ++++++++++++------------------------
 spacy_example_pipeline.py    | 29 ---------------
 4 files changed, 73 insertions(+), 80 deletions(-)
 delete mode 100644 spacy_example_pipeline.py

diff --git a/README.md b/README.md
index 3e42585..8ed022e 100644
--- a/README.md
+++ b/README.md
@@ -52,6 +52,29 @@ Set `best_match` to `False` if you want to return overlapping candidates, `ignor
 
 If the matcher throws a warning during initialization, read [this page](https://github.com/Georgetown-IR-Lab/QuickUMLS/wiki/Migration-QuickUMLS-1.3-to-1.4) to learn why and how to stop it from doing so.
 
+## spaCy pipeline component
+
+QuickUMLS can be used for standalone processing but it can also be use as a component in a modular spaCy pipeline.  This follows traditional spaCy handling of concepts to be entity objects added to the Document object.  These entity objects contain the CUI, similarity score and Semantic Types in the spacy "underscore" object.
+
+Adding QuickUMLS as a component in a pipeline can be done as follows:
+
+```python
+from quickumls.spacy_component import SpacyQuickUMLS
+
+# common English pipeline
+nlp = spacy.load('en_core_web_sm')
+
+quickumls_component = SpacyQuickUMLS(nlp, 'PATH_TO_QUICKUMLS_DATA')
+nlp.add_pipe(quickumls_component)
+
+doc = nlp('Pt c/o shortness of breath, chest pain, nausea, vomiting, diarrrhea')
+
+for ent in doc.ents:
+    print('Entity text : {}'.format(ent.text))
+    print('Label (UMLS CUI) : {}'.format(ent.label_))
+    print('Similarity : {}'.format(ent._.similarity))
+    print('Semtypes : {}'.format(ent._.semtypes))
+```
 
 ## Server / Client Support
 
diff --git a/quickumls/core.py b/quickumls/core.py
index e84bdf6..7164d18 100644
--- a/quickumls/core.py
+++ b/quickumls/core.py
@@ -443,17 +443,39 @@ def match(self, text, best_match=True, ignore_syntax=False):
         """
 
         parsed = self.nlp(u'{}'.format(text))
+        
+        # forward the caller's options with the parsed doc; otherwise _match() silently uses its own defaults
+        matches = self._match(parsed, best_match=best_match, ignore_syntax=ignore_syntax)
 
+        return matches
+        
+    def _match(self, doc, best_match=True, ignore_syntax=False):
+        """Gathers ngram matches given a spaCy document object.
+
+        [extended_summary]
+
+        Args:
+            doc (Doc): spaCy Doc object to be used for extracting ngrams
+
+            best_match (bool, optional): Whether to return only the top match or all overlapping candidates. Defaults to True.
+            ignore_syntax (bool, optional): Whether to use the heuristics introduced in the paper (Soldaini and Goharian, 2016). TODO: clarify. Defaults to False
+
+        Returns:
+            List: List of all matches in the text
+            TODO: Describe format
+        """
+        
+        ngrams = None
         if ignore_syntax:
-            ngrams = self._make_token_sequences(parsed)
+            ngrams = self._make_token_sequences(doc)
         else:
-            ngrams = self._make_ngrams(parsed)
+            ngrams = self._make_ngrams(doc)
 
         matches = self._get_all_matches(ngrams)
 
         if best_match:
             matches = self._select_terms(matches)
 
-        self._print_verbose_status(parsed, matches)
-
+        self._print_verbose_status(doc, matches)
+        
         return matches
diff --git a/quickumls/spacy_component.py b/quickumls/spacy_component.py
index 336f796..f64982f 100644
--- a/quickumls/spacy_component.py
+++ b/quickumls/spacy_component.py
@@ -8,65 +8,42 @@
 class SpacyQuickUMLS(object):
     name = 'QuickUMLS matcher'
     
-    def __init__(self, nlp, quickumls_path, 
-        # these are all params that were from match() in quickumls but since we want to construct this
-        # and then make its behavior consistent as a component, we'll set them here:
-        best_match = True, ignore_syntax = False, verbose = False,
-        # these below are the same as in quickumls.py (so let's pass them through as our wrapper)
-        overlapping_criteria='score', threshold=0.7, window=5,
-        similarity_name='jaccard', min_match_length=3,
-        accepted_semtypes=constants.ACCEPTED_SEMTYPES):
+    def __init__(self, nlp, quickumls_fp, best_match=True, ignore_syntax=False, **kwargs):
+        """Instantiate SpacyQuickUMLS object
+
+            This creates a QuickUMLS spaCy component which can be used in modular pipelines.  
+            This module adds entity Spans to the document where the entity label is the UMLS CUI and the Span's "underscore" object is extended to contains "similarity" and "semtypes" for matched concepts.
+
+        Args:
+            nlp: Existing spaCy pipeline.  This is needed to update the vocabulary with UMLS CUI values
+            quickumls_fp (str): Path to QuickUMLS data
+            best_match (bool, optional): Whether to return only the top match or all overlapping candidates. Defaults to True.
+            ignore_syntax (bool, optional): Wether to use the heuristcs introduced in the paper (Soldaini and Goharian, 2016). TODO: clarify,. Defaults to False
+            **kwargs: QuickUMLS keyword arguments (see QuickUMLS in core.py)
+        """
         
-        self.quickumls = QuickUMLS(quickumls_path, 
-            overlapping_criteria=overlapping_criteria, threshold=threshold, window=window,
-            similarity_name=similarity_name, min_match_length=min_match_length,
-            accepted_semtypes=accepted_semtypes,
-            # By default, the QuickUMLS objects creates its own internal spacy pipeline but we do not need that in this case
+        self.quickumls = QuickUMLS(quickumls_fp, 
+            # By default, the QuickUMLS objects creates its own internal spacy pipeline but this is not needed
+            # when we're using it as a component in a pipeline
             spacy_component = True,
-            verbose = verbose)
+            **kwargs)
         
         # save this off so that we can get vocab values of labels later
         self.nlp = nlp
         
+        # keep these for matching
         self.best_match = best_match
         self.ignore_syntax = ignore_syntax
-        self.verbose = verbose
 
         # let's extend this with some proprties that we want
         Span.set_extension('similarity', default = -1.0)
         Span.set_extension('semtypes', default = -1.0)
         
-        if self.verbose:
-            print('Accepted semtypes : [{0}]'.format(accepted_semtypes))
-        
     def __call__(self, doc):
-        # much of this is a re-write of match() in quickumls.py
-        # however, the changes include:
-        # receiving an incoming doc (rather than parsing at calling time)
-        # transforming matches into Spans as per spaCy custom Entity code example
-        
-        ngrams = None
-        # pass in the incoming doc which has already been tokenized (ready for ngrams)
-        if self.ignore_syntax:
-            ngrams = self.quickumls._make_token_sequences(doc)
-        else:
-            ngrams = self.quickumls._make_ngrams(doc)
-            
-        # perform the matching
-        matches = self.quickumls._get_all_matches(ngrams)
-        
-        if self.verbose:
-            print('Total matches before best match: [{0}]'.format(len(matches)))
-        
-        if self.best_match:
-            matches = self.quickumls._select_terms(matches)
-            
-            if self.verbose:
-                print('Total matches after best match: [{0}]'.format(len(matches)))
-            
-        self.quickumls._print_verbose_status(doc, matches)
+        # pass in the document which has been parsed to this point in the pipeline for ngrams and matches
+        matches = self.quickumls._match(doc, best_match=self.best_match, ignore_syntax=self.ignore_syntax)
         
-        # Here's another change: convert match objects into Spans
+        # Convert QuickUMLS match objects into Spans
         for match in matches:
             # each match may match multiple ngrams
             for ngram_match_dict in match:
@@ -74,15 +51,15 @@ def __call__(self, doc):
                 end_char_idx = int(ngram_match_dict['end'])
                 
                 cui = ngram_match_dict['cui']
-                # add the string
+                # add the string to the spacy vocab
                 self.nlp.vocab.strings.add(cui)
                 # pull out the value
                 cui_label_value = self.nlp.vocab.strings[cui]
                 
-                # char_span() created a Span from the character indices
+                # char_span() creates a Span from these character indices
                 # UMLS CUI should work well as the label here
                 span = doc.char_span(start_char_idx, end_char_idx, label = cui_label_value)
-                # add some custom metadata
+                # add some custom metadata to the spans
                 span._.similarity = ngram_match_dict['similarity']
                 span._.semtypes = ngram_match_dict['semtypes']
                 doc.ents = list(doc.ents) + [span]
diff --git a/spacy_example_pipeline.py b/spacy_example_pipeline.py
deleted file mode 100644
index 4d4ced1..0000000
--- a/spacy_example_pipeline.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import spacy
-
-import quickumls
-from quickumls.spacy_component import SpacyQuickUMLS
-
-# setup a spacy pipeline which includes QuickUMLS as a component and nothing else
-nlp = spacy.load('en_core_web_sm')
-
-nlp.remove_pipe('tagger')
-nlp.remove_pipe('parser')
-
-# TODO -- change this from a hard coded path to a small chunk of UMLS that can be legally distributed
-quickumls_path = r'C:\temp_quickumls\SNOMED_RXNORM_CPT_lowercase'
-
-threshold = 0.8
-
-quickumls_component = SpacyQuickUMLS(nlp, quickumls_path, threshold = threshold)
-
-nlp.add_pipe(quickumls_component)
-
-doc = nlp('Pt c/o shortness of breath, chest pain, nausea, vomiting, diarrrhea')
-
-for ent in doc.ents:
-    print('Entity text : {}'.format(ent.text))
-    print('Label : {}'.format(ent.label_))
-    print('Similarity : {}'.format(ent._.similarity))
-    print('Semtypes : {}'.format(ent._.semtypes))
-    
-print('DONE with spacy/QuickUMLS demo')
\ No newline at end of file

From 0dda67b4cef810c694a8807bf4e93295157dd25e Mon Sep 17 00:00:00 2001
From: burgersmoke <burgersmoke@gmail.com>
Date: Thu, 3 Sep 2020 16:13:32 -0600
Subject: [PATCH 3/3] Fixing issue #60 to ensure that SpacyQuickUMLS cannot add
 entity spans which overlap on a token.  Also added some documentation to the
 class and README.

---
 README.md                    |  2 +-
 quickumls/spacy_component.py | 23 ++++++++++++++++++++++-
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 8ed022e..a0de114 100644
--- a/README.md
+++ b/README.md
@@ -54,7 +54,7 @@ If the matcher throws a warning during initialization, read [this page](https://
 
 ## spaCy pipeline component
 
-QuickUMLS can be used for standalone processing but it can also be use as a component in a modular spaCy pipeline.  This follows traditional spaCy handling of concepts to be entity objects added to the Document object.  These entity objects contain the CUI, similarity score and Semantic Types in the spacy "underscore" object.
+QuickUMLS can be used for standalone processing but it can also be used as a component in a modular spaCy pipeline.  This follows traditional spaCy handling of concepts to be entity objects added to the Document object.  These entity objects contain the CUI, similarity score and Semantic Types in the spacy "underscore" object.  Note that this implementation follows a [known spacy convention](https://github.com/explosion/spaCy/issues/3608) that entity Spans cannot overlap on a single token. To prevent token overlap, matches are ranked according to the `overlapping_criteria` supplied so that overlap of any tokens will be prioritized by this order.
 
 Adding QuickUMLS as a component in a pipeline can be done as follows:
 
diff --git a/quickumls/spacy_component.py b/quickumls/spacy_component.py
index f64982f..036aef3 100644
--- a/quickumls/spacy_component.py
+++ b/quickumls/spacy_component.py
@@ -13,12 +13,13 @@ def __init__(self, nlp, quickumls_fp, best_match=True, ignore_syntax=False, **kw
 
             This creates a QuickUMLS spaCy component which can be used in modular pipelines.  
             This module adds entity Spans to the document where the entity label is the UMLS CUI and the Span's "underscore" object is extended to contains "similarity" and "semtypes" for matched concepts.
+            Note that this implementation follows and enforces a known spacy convention that entity Spans cannot overlap on a single token.
 
         Args:
             nlp: Existing spaCy pipeline.  This is needed to update the vocabulary with UMLS CUI values
             quickumls_fp (str): Path to QuickUMLS data
             best_match (bool, optional): Whether to return only the top match or all overlapping candidates. Defaults to True.
-            ignore_syntax (bool, optional): Wether to use the heuristcs introduced in the paper (Soldaini and Goharian, 2016). TODO: clarify,. Defaults to False
+            ignore_syntax (bool, optional): Whether to use the heuristics introduced in the paper (Soldaini and Goharian, 2016). TODO: clarify. Defaults to False
             **kwargs: QuickUMLS keyword arguments (see QuickUMLS in core.py)
         """
         
@@ -43,6 +44,15 @@ def __call__(self, doc):
         # pass in the document which has been parsed to this point in the pipeline for ngrams and matches
         matches = self.quickumls._match(doc, best_match=self.best_match, ignore_syntax=self.ignore_syntax)
         
+        # NOTE: Spacy spans do not allow overlapping tokens, so we prevent the overlap here
+        # For more information, see: https://github.com/explosion/spaCy/issues/3608
+        tokens_in_ents_set = set()
+        
+        # let's track any other entities which may have been attached via upstream components
+        for ent in doc.ents:
+            for token_index in range(ent.start, ent.end):
+                tokens_in_ents_set.add(token_index)
+        
         # Convert QuickUMLS match objects into Spans
         for match in matches:
             # each match may match multiple ngrams
@@ -59,6 +69,17 @@ def __call__(self, doc):
                 # char_span() creates a Span from these character indices
                 # UMLS CUI should work well as the label here
                 span = doc.char_span(start_char_idx, end_char_idx, label = cui_label_value)
+                
+                # before we add this, let's make sure that this entity does not overlap any tokens added thus far
+                candidate_token_indexes = set(range(span.start, span.end))
+                
+                # check the intersection and skip this if there is any overlap
+                if len(tokens_in_ents_set.intersection(candidate_token_indexes)) > 0:
+                    continue
+                    
+                # track this to make sure we do not introduce overlap later
+                tokens_in_ents_set.update(candidate_token_indexes)
+                
                 # add some custom metadata to the spans
                 span._.similarity = ngram_match_dict['similarity']
                 span._.semtypes = ngram_match_dict['semtypes']