diff --git a/quickumls/__init__.py b/quickumls/__init__.py
index fcfd593..39056d1 100644
--- a/quickumls/__init__.py
+++ b/quickumls/__init__.py
@@ -1,3 +1,3 @@
-from .core import QuickUMLS
-from .client import get_quickumls_client
 from .about import *
+from .client import get_quickumls_client
+from .core import QuickUMLS
diff --git a/quickumls/about.py b/quickumls/about.py
index dc0ac78..51e2ae4 100644
--- a/quickumls/about.py
+++ b/quickumls/about.py
@@ -3,10 +3,10 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
 # https://github.com/explosion/spaCy/blob/master/spacy/about.py
 
-__title__ = 'quickumls'
-__version__ = '1.4.0r1'
-__author__ = 'Luca Soldaini'
-__email__ = 'luca@ir.cs.georgetown.edu'
-__license__ = 'MIT'
+__title__ = "quickumls"
+__version__ = "1.4.0r1"
+__author__ = "Luca Soldaini"
+__email__ = "luca@ir.cs.georgetown.edu"
+__license__ = "MIT"
 __uri__ = "https://github.com/Georgetown-IR-Lab/QuickUMLS"
-__copyright__ = '2014-2020, Georgetown University Information Retrieval Lab'
+__copyright__ = "2014-2020, Georgetown University Information Retrieval Lab"
diff --git a/quickumls/client.py b/quickumls/client.py
index b40d645..b133f03 100644
--- a/quickumls/client.py
+++ b/quickumls/client.py
@@ -1,8 +1,8 @@
-from .network import MinimalClient
 from .core import QuickUMLS
+from .network import MinimalClient
 
 
-def get_quickumls_client(host='localhost', port=4645):
-    '''Return a client for a QuickUMLS server running on host at port'''
+def get_quickumls_client(host="localhost", port=4645):
+    """Return a client for a QuickUMLS server running on host at port"""
     client = MinimalClient(QuickUMLS, host=host, port=port, buffersize=4096)
     return client
diff --git a/quickumls/constants.py b/quickumls/constants.py
index 02e66ba..4202b12 100644
--- a/quickumls/constants.py
+++ b/quickumls/constants.py
@@ -1,14 +1,28 @@
 HEADERS_MRCONSO = [
-    'cui', 'lat', 'ts', 'lui', 'stt', 'sui', 'ispref', 'aui', 'saui',
-    'scui', 'sdui', 'sab', 'tty', 'code', 'str', 'srl', 'suppress', 'cvf'
-]
-HEADERS_MRSTY = [
-    'cui', 'sty', 'hier' 'desc', 'sid', 'num'
+    "cui",
+    "lat",
+    "ts",
+    "lui",
+    "stt",
+    "sui",
+    "ispref",
+    "aui",
+    "saui",
+    "scui",
+    "sdui",
+    "sab",
+    "tty",
+    "code",
+    "str",
+    "srl",
+    "suppress",
+    "cvf",
 ]
+HEADERS_MRSTY = ["cui", "sty", "hier", "desc", "sid", "num"]
 
-NEGATIONS = {'none', 'non', 'neither', 'nor', 'no', 'not'}
+NEGATIONS = {"none", "non", "neither", "nor", "no", "not"}
 
-# The following is a list of all existing semtypes along with their name and some examples. 
+# The following is a list of all existing semtypes along with their name and some examples.
 # You can easily select the ones you need by commenting out the lines that are not relevant for your application.
 ACCEPTED_SEMTYPES = {
@@ -21,7 +35,7 @@
     # 'T190', # Anatomical Abnormality, ex.: Bronchial Fistula; Foot Deformities; Hyperostosis of skull
     # 'T017', # Anatomical Structure, ex.: Cadaver; Pharyngostome; Anatomic structures
     # 'T008', # Animal, ex.: Animals; Animals, Laboratory; Carnivore
-    'T195', # Antibiotic, ex.: Antibiotics; bactericide; Thienamycins
+    "T195",  # Antibiotic, ex.: Antibiotics; bactericide; Thienamycins
     # 'T194', # Archaeon, ex.: Thermoproteales; Haloferax volcanii; Methanospirillum
     # 'T007', # Bacterium, ex.: Acetobacter; Bacillus cereus; Cytophaga
     # 'T053', # Behavior, ex.: Homing Behavior; Sexuality; Habitat Selection
@@ -30,10 +44,10 @@
     # 'T091', # Biomedical Occupation or Discipline, ex.: Adolescent Medicine; Cellular Neurobiology; Dentistry
     # 'T122', # Biomedical or Dental Material, ex.: Acrylic Resins; Bone Cements; Dentifrices
     # 'T012', # Bird, ex.: Serinus; Ducks; Quail
-    'T029', # Body Location or Region, ex.: Forehead; Sublingual Region; Base of skull structure
-    'T023', # Body Part, Organ, or Organ Component, ex.: Aorta; Brain Stem; Structure of neck of femur
+    "T029",  # Body Location or Region, ex.: Forehead; Sublingual Region; Base of skull structure
+    "T023",  # Body Part, Organ, or Organ Component, ex.: Aorta; Brain Stem; Structure of neck of femur
     # 'T030', # Body Space or Junction, ex.: Knee joint; Greater sac of peritoneum; Synapses
-    'T031', # Body Substance, ex.: Amniotic Fluid; saliva; Smegma
+    "T031",  # Body Substance, ex.: Amniotic Fluid; saliva; Smegma
     # 'T022', # Body System, ex.: Endocrine system; Renin-angiotensin system; Reticuloendothelial System
     # 'T088', # Carbohydrate Sequence, ex.: Carbohydrate Sequence; Abnormal carbohydrate sequence
     # 'T025', # Cell, ex.: B-Lymphocytes; Dendritic Cells; Fibroblasts
@@ -44,14 +58,14 @@
     # 'T120', # Chemical Viewed Functionally, ex.: Aerosol Propellants; Detergents; Stabilizing Agents
     # 'T104', # Chemical Viewed Structurally, ex.: Ammonium Compounds; Cations; Sulfur Compounds
     # 'T185', # Classification, ex.: Anatomy (MeSH Category); Tumor Stage Classification; axis i
-    'T201', # Clinical Attribute, ex.: Bone Density; heart rate; Range of Motion, Articular
-    'T200', # Clinical Drug, ex.: Ranitidine 300 MG Oral Tablet [Zantac]; Aspirin 300 MG Delayed Release Oral
+    "T201",  # Clinical Attribute, ex.: Bone Density; heart rate; Range of Motion, Articular
+    "T200",  # Clinical Drug, ex.: Ranitidine 300 MG Oral Tablet [Zantac]; Aspirin 300 MG Delayed Release Oral
     # 'T077', # Conceptual Entity, ex.: Geographic Factors; Fractals; Secularism
     # 'T019', # Congenital Abnormality, ex.: Albinism; Cleft palate with cleft lip; Polydactyly of toes
     # 'T056', # Daily or Recreational Activity, ex.: Badminton; Dancing; Swimming
-    'T060', # Diagnostic Procedure, ex.: Biopsy; Heart Auscultation; Magnetic Resonance Imaging
-    'T047', # Disease or Syndrome, ex.: Diabetes Mellitus; Drug Allergy; Malabsorption Syndrome
-    'T203', # Drug Delivery Device, ex.: Nordette 21 Day Pack; {7 (Terazosin 1 MG Oral Tablet) / 7 (Terazosin 2 MG
+    "T060",  # Diagnostic Procedure, ex.: Biopsy; Heart Auscultation; Magnetic Resonance Imaging
+    "T047",  # Disease or Syndrome, ex.: Diabetes Mellitus; Drug Allergy; Malabsorption Syndrome
+    "T203",  # Drug Delivery Device, ex.: Nordette 21 Day Pack; {7 (Terazosin 1 MG Oral Tablet) / 7 (Terazosin 2 MG
     # 'T065', # Educational Activity, ex.: Academic Training; Family Planning Training; Preceptorship
     # 'T196', # Element, Ion, or Isotope, ex.: Carbon; Chromium Isotopes; Radioisotopes
     # 'T018', # Embryonic Structure, ex.: Blastoderm; Fetus; Neural Crest
@@ -62,7 +76,7 @@
     # 'T051', # Event, ex.: Anniversaries; Exposure to Mumps virus (event); Device Unattended
     # 'T050', # Experimental Model of Disease, ex.: Alloxan Diabetes; Liver Cirrhosis, Experimental; Transient Gene Knock-Out
     # 'T099', # Family Group, ex.: Daughter; Is an only child; Unmarried Fathers
-    'T033', # Finding, ex.: Birth History; Downward displacement of diaphragm; Decreased glucose level
+    "T033",  # Finding, ex.: Birth History; Downward displacement of diaphragm; Decreased glucose level
     # 'T013', # Fish, ex.: Bass; Salmonidae; Whitefish
     # 'T168', # Food, ex.: Beverages; Egg Yolk (Dietary); Ice Cream
     # 'T021', # Fully Formed Anatomical Structure, ex.: Entire body as a whole; Female human body; Set of parts of human body
@@ -75,32 +89,32 @@
     # 'T096', # Group, ex.: Focus Groups; jury; teams
     # 'T102', # Group Attribute, ex.: Family Size; Group Structure; Life Expectancy
     # 'T131', # Hazardous or Poisonous Substance, ex.: Carcinogens; Fumigant; Mutagens
-    'T058', # Health Care Activity, ex.: ambulatory care services; Clinic Activities; Preventive Health Services
+    "T058",  # Health Care Activity, ex.: ambulatory care services; Clinic Activities; Preventive Health Services
     # 'T093', # Health Care Related Organization, ex.: Centers for Disease Control and Prevention (U.S.); Halfway Houses;
     # 'T125', # Hormone, ex.: Enteric Hormones; thymic humoral factor; Prohormone
     # 'T016', # Human, ex.: Homo sapiens; jean piaget; Member of public
     # 'T068', # Human-caused Phenomenon or Process, ex.: Baby Boom; Cultural Evolution; Mass Media
     # 'T078', # Idea or Concept, ex.: Capitalism; Civil Rights; Ethics
     # 'T129', # Immunologic Factor, ex.: Antigens; Immunologic Factors; Blood group antigen P
-    'T130', # Indicator, Reagent, or Diagnostic Aid, ex.: Fluorescent Dyes; Indicators and Reagents; India ink stain
+    "T130",  # Indicator, Reagent, or Diagnostic Aid, ex.: Fluorescent Dyes; Indicators and Reagents; India ink stain
     # 'T055', # Individual Behavior, ex.: Assertiveness; Grooming; Risk-Taking
-    'T037', # Injury or Poisoning, ex.: Accidental Falls; Carbon Monoxide Poisoning; Snake Bites
+    "T037",  # Injury or Poisoning, ex.: Accidental Falls; Carbon Monoxide Poisoning; Snake Bites
     # 'T197', # Inorganic Chemical, ex.: Carbonic Acid; aluminum nitride; ferric citrate
-    'T170', # Intellectual Product, ex.: Decision Support Techniques; Information Systems; Literature
-    'T034', # Laboratory or Test Result, ex.: Blood Flow Velocity; Serum Calcium Level; Spinal Fluid Pressure
-    'T059', # Laboratory Procedure, ex.: Blood Protein Electrophoresis; Crystallography; Radioimmunoassay
+    "T170",  # Intellectual Product, ex.: Decision Support Techniques; Information Systems; Literature
+    "T034",  # Laboratory or Test Result, ex.: Blood Flow Velocity; Serum Calcium Level; Spinal Fluid Pressure
+    "T059",  # Laboratory Procedure, ex.: Blood Protein Electrophoresis; Crystallography; Radioimmunoassay
     # 'T171', # Language, ex.: Armenian language; braille; Bilingualism
     # 'T066', # Machine Activity, ex.: Computer Simulation; Equipment Failure; Natural Language Processing
     # 'T015', # Mammal, ex.: Ursidae Family; Hamsters; Macaca
     # 'T073', # Manufactured Object, ex.: car seat; Cooking and Eating Utensils; Goggles
-    'T074', # Medical Device, ex.: Bone Screws; Headgear, Orthodontic; Compression Stockings
-    'T048', # Mental or Behavioral Dysfunction, ex.: Agoraphobia; Cyclothymic Disorder; Frigidity
-    'T041', # Mental Process, ex.: Anger; Auditory Fatigue; Avoidance Learning
+    "T074",  # Medical Device, ex.: Bone Screws; Headgear, Orthodontic; Compression Stockings
+    "T048",  # Mental or Behavioral Dysfunction, ex.: Agoraphobia; Cyclothymic Disorder; Frigidity
+    "T041",  # Mental Process, ex.: Anger; Auditory Fatigue; Avoidance Learning
     # 'T063', # Molecular Biology Research Technique, ex.: Northern Blotting; Genetic Engineering; In Situ Hybridization
     # 'T044', # Molecular Function, ex.: Binding, Competitive; Electron Transport; Glycolysis
     # 'T085', # Molecular Sequence, ex.: Genetic Code; Homologous Sequences; Molecular Sequence
     # 'T070', # Natural Phenomenon or Process, ex.: Air Movements; Corrosion; Lightning (phenomenon)
-    'T191', # Neoplastic Process, ex.: Abdominal Neoplasms; Bowen's Disease; Polyp in nasopharynx
+    "T191",  # Neoplastic Process, ex.: Abdominal Neoplasms; Bowen's Disease; Polyp in nasopharynx
     # 'T114', # Nucleic Acid, Nucleoside, or Nucleotide, ex.: Cytosine Nucleotides; Guanine; Oligonucleotides
     # 'T086', # Nucleotide Sequence, ex.: Base Sequence; Direct Repeat; RNA Sequence
     # 'T090', # Occupation or Discipline, ex.: Aviation; Craniology; Ecology
@@ -109,14 +123,14 @@
     # 'T109', # Organic Chemical, ex.: Benzene Derivatives
     # 'T001', # Organism, ex.: Organism; Infectious agent; Heterotroph
     # 'T032', # Organism Attribute, ex.: Age; Birth Weight; Eye Color
-    'T040', # Organism Function, ex.: Breeding; Hibernation; Motor Skills
+    "T040",  # Organism Function, ex.: Breeding; Hibernation; Motor Skills
     # 'T092', # Organization, ex.: Labor Unions; United Nations; Boarding school
-    'T046', # Pathologic Function, ex.: Inflammation; Shock; Thrombosis
+    "T046",  # Pathologic Function, ex.: Inflammation; Shock; Thrombosis
     # 'T101', # Patient or Disabled Group, ex.: Amputees; Institutionalized Child; Mentally Ill Persons
-    'T121', # Pharmacologic Substance, ex.: Antiemetics; Cardiovascular Agents; Alka-Seltzer
-    'T067', # Phenomenon or Process, ex.: Disasters; Motor Traffic Accidents; Depolymerization
+    "T121",  # Pharmacologic Substance, ex.: Antiemetics; Cardiovascular Agents; Alka-Seltzer
+    "T067",  # Phenomenon or Process, ex.: Disasters; Motor Traffic Accidents; Depolymerization
     # 'T072', # Physical Object, ex.: Printed Media; Meteors; Physical object
-    'T039', # Physiologic Function, ex.: Biorhythms; Hearing; Vasodilation
+    "T039",  # Physiologic Function, ex.: Biorhythms; Hearing; Vasodilation
     # 'T002', # Plant, ex.: Aloe; Pollen; Helianthus species
     # 'T098', # Population Group, ex.: Asian Americans; Ethnic group; Adult Offenders
     # 'T097', # Professional or Occupational Group, ex.: Clergy; Demographers; Hospital Volunteers
@@ -129,12 +143,12 @@
     # 'T062', # Research Activity, ex.: Animal Experimentation; Biomedical Research; Experimental Replication
     # 'T075', # Research Device, ex.: Electrodes, Enzyme; DNA Microarray Chip; Particle Count and Size Analyzer
     # 'T095', # Self-help or Relief Organization, ex.: Alcoholics Anonymous; Charities - organization; Red Cross
-    'T184', # Sign or Symptom, ex.: Dyspnea; Nausea; Pain
+    "T184",  # Sign or Symptom, ex.: Dyspnea; Nausea; Pain
     # 'T054', # Social Behavior, ex.: Acculturation; Communication; Interpersonal Relations
     # 'T082', # Spatial Concept, ex.: Mandibular Rest Position; Lateral; Extrinsic
     # 'T167', # Substance, ex.: Air (substance); Fossils; Plastics
     # 'T079', # Temporal Concept, ex.: Birth Intervals; Half-Life; Postoperative Period
-    'T061', # Therapeutic or Preventive Procedure, ex.: Cesarean section; Dermabrasion; Family psychotherapy
+    "T061",  # Therapeutic or Preventive Procedure, ex.: Cesarean section; Dermabrasion; Family psychotherapy
     # 'T024', # Tissue, ex.: Cartilage; Endothelium; Epidermis
     # 'T010', # Vertebrate, ex.: Vertebrates; Gnathostomata vertebrate; Craniata
     # 'T005', # Virus, ex.: Coliphages; Echovirus; Parvoviridae
@@ -142,47 +156,70 @@
 }
 
 UNICODE_DASHES = {
-    u'\u002d', u'\u007e', u'\u00ad', u'\u058a', u'\u05be', u'\u1400',
-    u'\u1806', u'\u2010', u'\u2011', u'\u2010', u'\u2012', u'\u2013',
-    u'\u2014', u'\u2015', u'\u2053', u'\u207b', u'\u2212', u'\u208b',
-    u'\u2212', u'\u2212', u'\u2e17', u'\u2e3a', u'\u2e3b', u'\u301c',
-    u'\u3030', u'\u30a0', u'\ufe31', u'\ufe32', u'\ufe58', u'\ufe63',
-    u'\uff0d'
+    u"\u002d",
+    u"\u007e",
+    u"\u00ad",
+    u"\u058a",
+    u"\u05be",
+    u"\u1400",
+    u"\u1806",
+    u"\u2010",
+    u"\u2011",
+    u"\u2010",
+    u"\u2012",
+    u"\u2013",
+    u"\u2014",
+    u"\u2015",
+    u"\u2053",
+    u"\u207b",
+    u"\u2212",
+    u"\u208b",
+    u"\u2212",
+    u"\u2212",
+    u"\u2e17",
+    u"\u2e3a",
+    u"\u2e3b",
+    u"\u301c",
+    u"\u3030",
+    u"\u30a0",
+    u"\ufe31",
+    u"\ufe32",
+    u"\ufe58",
+    u"\ufe63",
+    u"\uff0d",
 }
 
 # language with missing value
 # will not have support for tokenization
 LANGUAGES = {
-    'BAQ': None,  # Basque
-    'CHI': None,  # Chinese
-    'CZE': None,  # Czech
-    'DAN': 'danish',  # Danish
-    'DUT': 'dutch',  # Dutch
-    'ENG': 'english',  # English
-    'EST': None,  # Estonian
-    'FIN': 'finnish',  # Finnish
-    'FRE': 'french',  # French
-    'GER': 'german',  # German
-    'GRE': 'greek',  # Greek
-    'HEB': None,  # Hebrew
-    'HUN': 'hungarian',  # Hungarian
-    'ITA': 'italian',  # Italian
-    'JPN': None,  # Japanese
-    'KOR': None,  # Korean
-    'LAV': None,  # Latvian
-    'NOR': 'norwegian',  # Norwegian
-    'POL': 'polish',  # Polish
-    'POR': 'portoguese',  # Portuguese
-    'RUS': 'russian',  # Russian
-    'SCR': None,  # Croatian
-    'SPA': 'spanish',  # Spanish
-    'SWE': 'swedish',  # Swedish
-    'TUR': 'turkish',  # Turkish
+    "BAQ": None,  # Basque
+    "CHI": None,  # Chinese
+    "CZE": None,  # Czech
+    "DAN": "danish",  # Danish
+    "DUT": "dutch",  # Dutch
+    "ENG": "english",  # English
+    "EST": None,  # Estonian
+    "FIN": "finnish",  # Finnish
+    "FRE": "french",  # French
+    "GER": "german",  # German
+    "GRE": "greek",  # Greek
+    "HEB": None,  # Hebrew
+    "HUN": "hungarian",  # Hungarian
+    "ITA": "italian",  # Italian
+    "JPN": None,  # Japanese
+    "KOR": None,  # Korean
+    "LAV": None,  # Latvian
+    "NOR": "norwegian",  # Norwegian
+    "POL": "polish",  # Polish
+    "POR": "portuguese",  # Portuguese
+    "RUS": "russian",  # Russian
+    "SCR": None,  # Croatian
+    "SPA": "spanish",  # Spanish
+    "SWE": "swedish",  # Swedish
+    "TUR": "turkish",  # Turkish
 }
 
-DOMAIN_SPECIFIC_STOPWORDS = {
-    'time'
-}
+DOMAIN_SPECIFIC_STOPWORDS = {"time"}
 
 SPACY_LANGUAGE_MAP = {
     'ENG': 'en_core_web_sm',
diff --git a/quickumls/core.py b/quickumls/core.py
index 7164d18..8a35579 100644
--- a/quickumls/core.py
+++ b/quickumls/core.py
@@ -1,33 +1,40 @@
 # future statements for Python 2 compatibility
-from __future__ import (
-    unicode_literals, division, print_function, absolute_import)
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import datetime
 
 # built in modules
 import os
 import sys
-import datetime
-from six.moves import xrange
+
+import nltk
 
 # installed modules
 import spacy
-import nltk
+from six.moves import xrange
 from unidecode import unidecode
 
 # project modules
-from . import toolbox
-from . import constants
+from . import constants, toolbox
 
 
 class QuickUMLS(object):
     """The main class to interact with the matcher.
""" + def __init__( - self, quickumls_fp, - overlapping_criteria='score', threshold=0.7, window=5, - similarity_name='jaccard', min_match_length=3, - accepted_semtypes=constants.ACCEPTED_SEMTYPES, - verbose=False, keep_uppercase=False, - spacy_component = False): + self, + quickumls_fp, + overlapping_criteria="score", + threshold=0.7, + window=5, + similarity_name="jaccard", + min_match_length=3, + accepted_semtypes=constants.ACCEPTED_SEMTYPES, + verbose=False, + keep_uppercase=False, + spacy_component=False, + ): """Instantiate QuickUMLS object This is the main interface through which text can be processed. @@ -61,24 +68,23 @@ def __init__( self.verbose = verbose - valid_criteria = {'length', 'score'} + valid_criteria = {"length", "score"} err_msg = ( '"{}" is not a valid overlapping_criteria. Choose ' - 'between {}'.format( - overlapping_criteria, ', '.join(valid_criteria) - ) + "between {}".format(overlapping_criteria, ", ".join(valid_criteria)) ) assert overlapping_criteria in valid_criteria, err_msg self.overlapping_criteria = overlapping_criteria - valid_similarities = {'dice', 'jaccard', 'cosine', 'overlap'} - err_msg = ('"{}" is not a valid similarity name. Choose between ' - '{}'.format(similarity_name, ', '.join(valid_similarities))) - assert not(valid_similarities in valid_similarities), err_msg + valid_similarities = {"dice", "jaccard", "cosine", "overlap"} + err_msg = '"{}" is not a valid similarity name. Choose between ' "{}".format( + similarity_name, ", ".join(valid_similarities) + ) + assert not (valid_similarities in valid_similarities), err_msg self.similarity_name = similarity_name - simstring_fp = os.path.join(quickumls_fp, 'umls-simstring.db') - cuisem_fp = os.path.join(quickumls_fp, 'cui-semtypes.db') + simstring_fp = os.path.join(quickumls_fp, "umls-simstring.db") + cuisem_fp = os.path.join(quickumls_fp, "cui-semtypes.db") self.valid_punct = constants.UNICODE_DASHES self.negations = constants.NEGATIONS @@ -88,57 +94,61 @@ def __init__( self.threshold = threshold self.min_match_length = min_match_length self.to_lowercase_flag = os.path.exists( - os.path.join(quickumls_fp, 'lowercase.flag') + os.path.join(quickumls_fp, "lowercase.flag") ) self.normalize_unicode_flag = os.path.exists( - os.path.join(quickumls_fp, 'normalize-unicode.flag') + os.path.join(quickumls_fp, "normalize-unicode.flag") ) self.keep_uppercase = keep_uppercase # Check whether data is installed with lowercase flag and QuickUMLS initiated with keeping uppercase words if self.to_lowercase_flag and self.keep_uppercase: - raise ValueError('Database is installed with lowercase flag and QuickUMLS is initiated with ' - 'keep_uppercase flag. This would prevent identifying concepts that contain all uppercase' - 'characters. Please reinstall data without --lowercase or run QuickUMLS without' - '--keep_uppercase.') + raise ValueError( + "Database is installed with lowercase flag and QuickUMLS is initiated with " + "keep_uppercase flag. This would prevent identifying concepts that contain all uppercase" + "characters. Please reinstall data without --lowercase or run QuickUMLS without" + "--keep_uppercase." 
+            )
 
-        language_fp = os.path.join(quickumls_fp, 'language.flag')
+        language_fp = os.path.join(quickumls_fp, "language.flag")
 
         # download stopwords if necessary
         try:
             nltk.corpus.stopwords.words()
         except LookupError:
-            nltk.download('stopwords')
+            nltk.download("stopwords")
 
         if os.path.exists(language_fp):
             with open(language_fp) as f:
                 self.language_flag = f.read().strip()
         else:
-            self.language_flag = 'ENG'
+            self.language_flag = "ENG"
 
         if self.language_flag not in constants.LANGUAGES:
             raise ValueError('Language "{}" not supported'.format(self.language_flag))
         elif constants.LANGUAGES[self.language_flag] is None:
             self._stopwords = set()
-            spacy_lang = 'XXX'
+            spacy_lang = "XXX"
         else:
             self._stopwords = set(
                 nltk.corpus.stopwords.words(constants.LANGUAGES[self.language_flag])
             )
             spacy_lang = constants.SPACY_LANGUAGE_MAP[self.language_flag]
 
-        database_backend_fp = os.path.join(quickumls_fp, 'database_backend.flag')
+        database_backend_fp = os.path.join(quickumls_fp, "database_backend.flag")
 
         if os.path.exists(database_backend_fp):
            with open(database_backend_fp) as f:
                 self._database_backend = f.read().strip()
         else:
-            print('[WARNING] This installation was created with QuickUMLS v.1.3 or earlier, '
-                  'which does not support multiple database backends. For now, I\'ll '
-                  'assume that leveldb was used as default, implicit assumption will '
-                  'change in future versions of QuickUMLS. More info here: '
-                  'https://github.com/Georgetown-IR-Lab/QuickUMLS/wiki/Migration-QuickUMLS-1.3-to-1.4',
-                  file=sys.stderr)
-            self._database_backend = 'leveldb'
+            print(
+                "[WARNING] This installation was created with QuickUMLS v.1.3 or earlier, "
+                "which does not support multiple database backends. For now, I'll "
+                "assume that leveldb was used as the default; this implicit assumption will "
+                "change in future versions of QuickUMLS. More info here: "
+                "https://github.com/Georgetown-IR-Lab/QuickUMLS/wiki/Migration-QuickUMLS-1.3-to-1.4",
+                file=sys.stderr,
+            )
+            self._database_backend = "leveldb"
 
         # domain specific stopwords
         self._stopwords = self._stopwords.union(constants.DOMAIN_SPECIFIC_STOPWORDS)
@@ -158,19 +168,20 @@ def __init__(
             msg = (
                 'Model for language "{}" is not downloaded. Please '
                 'run "python -m spacy download {}" before launching '
-                'QuickUMLS'
+                "QuickUMLS"
             ).format(
                 self.language_flag,
-                constants.SPACY_LANGUAGE_MAP.get(self.language_flag, 'xx')
+                constants.SPACY_LANGUAGE_MAP.get(self.language_flag, "xx"),
             )
             raise OSError(msg)
 
-        self.ss_db = toolbox.SimstringDBReader(
-            simstring_fp, similarity_name, threshold
-        )
+        self.ss_db = toolbox.SimstringDBReader(simstring_fp, similarity_name, threshold)
         self.cuisem_db = toolbox.CuiSemTypesDB(
             cuisem_fp, database_backend=self._database_backend
         )
+        self.cuipref_db = toolbox.CuiPrefDB(
+            cuisem_fp, database_backend=self._database_backend
+        )
 
     def get_info(self):
         """Computes a summary of the matcher options.
@@ -194,44 +205,53 @@ def info(self):
 
         if self._info is None:
             self._info = {
-                'threshold': self.threshold,
-                'similarity_name': self.similarity_name,
-                'window': self.window,
-                'ngram_length': self.ngram_length,
-                'min_match_length': self.min_match_length,
-                'accepted_semtypes': sorted(self.accepted_semtypes),
-                'negations': sorted(self.negations),
-                'valid_punct': sorted(self.valid_punct)
+                "threshold": self.threshold,
+                "similarity_name": self.similarity_name,
+                "window": self.window,
+                "ngram_length": self.ngram_length,
+                "min_match_length": self.min_match_length,
+                "accepted_semtypes": sorted(self.accepted_semtypes),
+                "negations": sorted(self.negations),
+                "valid_punct": sorted(self.valid_punct),
             }
         return self._info
 
     def _is_valid_token(self, tok):
-        return not(
-            tok.is_punct or tok.is_space or
-            tok.pos_ == 'ADP' or tok.pos_ == 'DET' or tok.pos_ == 'CONJ'
+        return not (
+            tok.is_punct
+            or tok.is_space
+            or tok.pos_ == "ADP"
+            or tok.pos_ == "DET"
+            or tok.pos_ == "CONJ"
         )
 
     def _is_valid_start_token(self, tok):
-        return not(
-            tok.like_num or
-            (self._is_stop_term(tok) and tok.lemma_ not in self.negations) or
-            tok.pos_ == 'ADP' or tok.pos_ == 'DET' or tok.pos_ == 'CONJ'
+        return not (
+            tok.like_num
+            or (self._is_stop_term(tok) and tok.lemma_ not in self.negations)
+            or tok.pos_ == "ADP"
+            or tok.pos_ == "DET"
+            or tok.pos_ == "CONJ"
         )
 
     def _is_stop_term(self, tok):
         return tok.text in self._stopwords
 
     def _is_valid_end_token(self, tok):
-        return not(
-            tok.is_punct or tok.is_space or self._is_stop_term(tok) or
-            tok.pos_ == 'ADP' or tok.pos_ == 'DET' or tok.pos_ == 'CONJ'
+        return not (
+            tok.is_punct
+            or tok.is_space
+            or self._is_stop_term(tok)
+            or tok.pos_ == "ADP"
+            or tok.pos_ == "DET"
+            or tok.pos_ == "CONJ"
         )
 
     def _is_valid_middle_token(self, tok):
         return (
-            not(tok.is_punct or tok.is_space) or
-            tok.is_bracket or
-            tok.text in self.valid_punct
+            not (tok.is_punct or tok.is_space)
+            or tok.is_bracket
+            or tok.text in self.valid_punct
         )
 
     def _is_ok_semtype(self, target_semtypes):
@@ -248,7 +268,7 @@ def _make_ngrams(self, sent):
         sent_length = len(sent)
 
         # do not include determiners inside a span
-        skip_in_span = {token.i for token in sent if token.pos_ == 'DET'}
+        skip_in_span = {token.i for token in sent if token.pos_ == "DET"}
 
         # invalidate a span if it includes any of these symbols
         invalid_mid_tokens = {
@@ -273,11 +293,12 @@ def _make_ngrams(self, sent):
             # we take a shortcut if the token is the last one
             # in the sentence
             if (
-                i + 1 == sent_length and            # it's the last token
-                self._is_valid_end_token(tok) and   # it's a valid end token
-                len(tok) >= self.min_match_length   # it's of miminum length
+                i + 1 == sent_length  # it's the last token
+                and self._is_valid_end_token(tok)  # it's a valid end token
+                and len(tok) >= self.min_match_length  # it's of minimum length
             ):
-                yield(tok.idx, tok.idx + len(tok), tok.text)
+                yield (tok.idx, tok.idx + len(tok), tok.text)
 
             for j in xrange(i + 1, span_end):
                 if compensate:
@@ -296,11 +317,18 @@ def _make_ngrams(self, sent):
                     continue
 
                 yield (
-                    span.start_char, span.end_char,
-                    ''.join(token.text_with_ws for token in span
-                            if token.i not in skip_in_span).strip()
+                    span.start_char,
+                    span.end_char,
+                    "".join(
+                        token.text_with_ws
+                        for token in span
+                        if token.i not in skip_in_span
+                    ).strip(),
                 )
 
+    def get_preferred_term(self, cui):
+        return self.cuipref_db.get(cui)
+
     def _get_all_matches(self, ngrams):
         matches = []
         for start, end, ngram in ngrams:
@@ -317,11 +345,15 @@ def _get_all_matches(self, ngrams):
             # no match is found; so we convert to lowercase;
             # however, this is never needed if the string is lowercased
             # in the step above
-            if not self.to_lowercase_flag and ngram_normalized.isupper() and not self.keep_uppercase:
+            if (
+                not self.to_lowercase_flag
+                and ngram_normalized.isupper()
+                and not self.keep_uppercase
+            ):
                 ngram_normalized = ngram_normalized.lower()
 
             prev_cui = None
-            ngram_cands = list(self.ss_db.get(ngram_normalized))
+            ngram_cands = list(set(self.ss_db.get(ngram_normalized)))
 
             ngram_matches = []
 
@@ -332,11 +364,11 @@ def _get_all_matches(self, ngrams):
                     x=ngram_normalized,
                     y=match,
                     n=self.ngram_length,
-                    similarity_name=self.similarity_name
+                    similarity_name=self.similarity_name,
                 )
 
                 if match_similarity == 0:
-                    continue
+                    continue
 
                 for cui, semtypes, preferred in cuisem_match:
 
@@ -344,23 +376,25 @@ def _get_all_matches(self, ngrams):
                         continue
 
                     if prev_cui is not None and prev_cui == cui:
-                        if match_similarity > ngram_matches[-1]['similarity']:
+                        if match_similarity > ngram_matches[-1]["similarity"]:
                             ngram_matches.pop(-1)
                         else:
                             continue
 
                     prev_cui = cui
+                    preferred_term = self.cuipref_db.get(cui)
 
                     ngram_matches.append(
                         {
-                            'start': start,
-                            'end': end,
-                            'ngram': ngram,
-                            'term': toolbox.safe_unicode(match),
-                            'cui': cui,
-                            'similarity': match_similarity,
-                            'semtypes': semtypes,
-                            'preferred': preferred
+                            "start": start,
+                            "end": end,
+                            "ngram": ngram,
+                            "term": toolbox.safe_unicode(match),
+                            "cui": cui,
+                            "similarity": match_similarity,
+                            "semtypes": semtypes,
+                            "preferred": preferred,
+                            "preferred_term": preferred_term,
                         }
                     )
 
@@ -368,23 +402,24 @@ def _get_all_matches(self, ngrams):
             matches.append(
                 sorted(
                     ngram_matches,
-                    key=lambda m: m['similarity'] + m['preferred'],
-                    reverse=True
+                    key=lambda m: m["similarity"] + m["preferred"],
+                    reverse=True,
                 )
             )
         return matches
 
     @staticmethod
     def _select_score(match):
-        return (match[0]['similarity'], (match[0]['end'] - match[0]['start']))
+        return (match[0]["similarity"], (match[0]["end"] - match[0]["start"]))
 
     @staticmethod
     def _select_longest(match):
-        return ((match[0]['end'] - match[0]['start']), match[0]['similarity'])
+        return ((match[0]["end"] - match[0]["start"]), match[0]["similarity"])
 
     def _select_terms(self, matches):
         sort_func = (
-            self._select_longest if self.overlapping_criteria == 'length'
+            self._select_longest
+            if self.overlapping_criteria == "length"
             else self._select_score
         )
 
@@ -394,7 +429,7 @@ def _select_terms(self, matches):
         final_matches_subset = []
 
         for match in matches:
-            match_interval = (match[0]['start'], match[0]['end'])
+            match_interval = (match[0]["start"], match[0]["end"])
             if match_interval not in intervals:
                 final_matches_subset.append(match)
                 intervals.append(match_interval)
@@ -403,8 +438,7 @@ def _select_terms(self, matches):
 
     def _make_token_sequences(self, parsed):
         for i in range(len(parsed)):
-            for j in xrange(
-                    i + 1, min(i + self.window, len(parsed)) + 1):
+            for j in xrange(i + 1, min(i + self.window, len(parsed)) + 1):
                 span = parsed[i:j]
 
                 if not self._is_longer_than_min(span):
@@ -417,12 +451,12 @@ def _print_verbose_status(self, parsed, matches):
             return False
 
         print(
-            '[{}] {:,} extracted from {:,} tokens'.format(
+            "[{}] {:,} extracted from {:,} tokens".format(
                 datetime.datetime.now().isoformat(),
                 sum(len(match_group) for match_group in matches),
-                len(parsed)
+                len(parsed),
             ),
-            file=sys.stderr
+            file=sys.stderr,
         )
         return True
 
@@ -442,13 +476,13 @@ def match(self, text, best_match=True, ignore_syntax=False):
             TODO: Describe format
         """
 
-        parsed = self.nlp(u'{}'.format(text))
-
+        parsed = self.nlp("{}".format(text))
+
+        # pass in parsed spacy doc to get concept matches
-        matches = self._match(parsed)
+        matches = self._match(parsed, best_match, ignore_syntax)
 
         return matches
-
+
     def _match(self, doc, best_match=True, ignore_syntax=False):
         """Gathers ngram matches given a spaCy document object.
 
@@ -464,7 +498,7 @@ def _match(self, doc, best_match=True, ignore_syntax=False):
             List: List of all matches in the text
             TODO: Describe format
         """
-
+
         ngrams = None
         if ignore_syntax:
             ngrams = self._make_token_sequences(doc)
@@ -477,5 +511,5 @@ def _match(self, doc, best_match=True, ignore_syntax=False):
             matches = self._select_terms(matches)
 
         self._print_verbose_status(doc, matches)
-
+
         return matches
diff --git a/quickumls/install.py b/quickumls/install.py
index 0c1998c..f09c01d 100644
--- a/quickumls/install.py
+++ b/quickumls/install.py
@@ -1,85 +1,74 @@
-from __future__ import unicode_literals, division, print_function
+from __future__ import division, print_function, unicode_literals
 
 # built in modules
 import argparse
 import codecs
 import os
-from six.moves import input
 import shutil
 import sys
 import time
+
+import spacy
+import tqdm
+from six.moves import input
+
 try:
     from unidecode import unidecode
 except ImportError:
     pass
 
-
-# third party-dependencies
-import spacy
-
-
-# project modules
-from .toolbox import countlines, CuiSemTypesDB, SimstringDBWriter, mkdir
 from .constants import HEADERS_MRCONSO, HEADERS_MRSTY, LANGUAGES, SPACY_LANGUAGE_MAP
+from .toolbox import CuiPrefDB, CuiSemTypesDB, SimstringDBWriter, countlines, mkdir
 
 
 def get_semantic_types(path, headers):
     sem_types = {}
-    with codecs.open(path, encoding='utf-8') as f:
-        for i, ln in enumerate(f):
-            content = dict(zip(headers, ln.strip().split('|')))
-
-            sem_types.setdefault(content['cui'], []).append(content['sty'])
+    with codecs.open(path, encoding="utf-8") as f:
+        for line in tqdm.tqdm(f, total=countlines(path)):
+            content = dict(zip(headers, line.strip().split("|")))
+            sem_types.setdefault(content["cui"], []).append(content["sty"])
 
     return sem_types
 
 
-def get_mrconso_iterator(path, headers, lang='ENG'):
-    with codecs.open(path, encoding='utf-8') as f:
-        for i, ln in enumerate(f):
-            content = dict(zip(headers, ln.strip().split('|')))
+def get_mrconso_iterator(path, headers, lang="ENG"):
+    with codecs.open(path, encoding="utf-8") as f:
+        for ln in f:
+            content = dict(zip(headers, ln.strip().split("|")))
 
-            if content['lat'] != lang:
+            if content["lat"] != lang:
                 continue
 
             yield content
 
 
 def extract_from_mrconso(
-        mrconso_path, mrsty_path, opts,
-        mrconso_header=HEADERS_MRCONSO, mrsty_header=HEADERS_MRSTY):
+    mrconso_path,
+    mrsty_path,
+    opts,
+    mrconso_header=HEADERS_MRCONSO,
+    mrsty_header=HEADERS_MRSTY,
+):
 
     start = time.time()
 
-    print('loading semantic types...', end=' ')
+    print("loading semantic types...", end=" ")
     sys.stdout.flush()
     sem_types = get_semantic_types(mrsty_path, mrsty_header)
-    print('done in {:.2f} s'.format(time.time() - start))
+    print("done in {:.2f} s".format(time.time() - start))
 
     start = time.time()
 
-    mrconso_iterator = get_mrconso_iterator(
-        mrconso_path, mrconso_header, opts.language
-    )
+    mrconso_iterator = get_mrconso_iterator(mrconso_path, mrconso_header, opts.language)
 
     total = countlines(mrconso_path)
 
-    processed = set()
-    i = 0
-
-    for content in mrconso_iterator:
-        i += 1
-
-        if i % 100000 == 0:
-            delta = time.time() - start
-            status = (
-                '{:,} in {:.2f} s ({:.2%}, {:.1e} s / term)'
-                ''.format(i, delta, i / total, delta / i if i > 0 else 0)
-            )
-            print(status)
-
-        concept_text = content['str'].strip()
-        cui = content['cui']
-        preferred = 1 if content['ispref'] == 'Y' else 0
+    for content in tqdm.tqdm(mrconso_iterator, total=total):
+        concept_text = content["str"].strip()
+        cui = content["cui"]
+        preferred = 1 if content["ispref"] == "Y" else 0
+        preferred_term = 1 if content["ts"] == "P" else 0
+        preferred_string = 1 if content["stt"] == "PF" else 0
+        source = content["sab"]
 
         if opts.lowercase:
             concept_text = concept_text.lower()
@@ -87,37 +76,72 @@ def extract_from_mrconso(
         if opts.normalize_unicode:
             concept_text = unidecode(concept_text)
 
-        if (cui, concept_text) in processed:
-            continue
-        else:
-            processed.add((cui, concept_text))
+        yield (
+            concept_text,
+            cui,
+            sem_types[cui],
+            preferred,
+            preferred_term,
+            preferred_string,
+            source,
+        )
 
-        yield (concept_text, cui, sem_types[cui], preferred)
-
-    delta = time.time() - start
-    status = (
-        '\nCOMPLETED: {:,} in {:.2f} s ({:.1e} s / term)'
-        ''.format(i, delta, i / total, delta / i if i > 0 else 0)
-    )
-    print(status)
-
-
-def parse_and_encode_ngrams(extracted_it, simstring_dir, cuisty_dir, database_backend):
+
+def parse_and_encode_ngrams(
+    extracted_it, simstring_dir, cuisty_dir, database_backend, sources
+):
     # Create destination directories for the two databases
     mkdir(simstring_dir)
     mkdir(cuisty_dir)
 
     ss_db = SimstringDBWriter(simstring_dir)
     cuisty_db = CuiSemTypesDB(cuisty_dir, database_backend=database_backend)
-
-    simstring_terms = set()
-
-    for i, (term, cui, stys, preferred) in enumerate(extracted_it, start=1):
-        if term not in simstring_terms:
-            ss_db.insert(term)
-            simstring_terms.add(term)
-
-        cuisty_db.insert(term, cui, stys, preferred)
+    cuipref_db = CuiPrefDB(cuisty_dir, database_backend=database_backend)
+
+    # rows in MRCONSO are grouped by cui; buffer each cui's entries and only
+    # write them out once the whole group has been seen
+    prev_cui = None
+    pref_term = False
+    prev_term = None
+    cui_terms = set()
+    ss_db_terms = []
+    cuisty_terms = []
+    pref_terms = []
+    cui_sources = set()
+    for data in extracted_it:
+        term, cui, stys, preferred, preferred_term, preferred_string, source = data
+        if cui != prev_cui:
+            if prev_cui is not None:
+                if not pref_term:
+                    raise RuntimeError(
+                        f"did not find preferred term for cui {prev_cui}"
+                    )
+                if sources is None or cui_sources.intersection(sources):
+                    for _term in ss_db_terms:
+                        ss_db.insert(_term)
+                    for _data in cuisty_terms:
+                        cuisty_db.insert(*_data)
+                    for _data in pref_terms:
+                        cuipref_db.insert(*_data)
+
+            prev_cui = cui
+            pref_term = False
+            cui_terms = set()
+            ss_db_terms = []
+            cuisty_terms = []
+            pref_terms = []
+            cui_sources = set()
+
+        if prev_term != term and term not in cui_terms:
+            ss_db_terms.append(term)
+            prev_term = term
+        cui_terms.add(term)
+        cui_sources.add(source)
+
+        cuisty_terms.append((term, cui, stys, preferred))
+        if preferred_term and preferred and preferred_string:
+            pref_terms.append((term, cui))
+            pref_term = True
+
+    # flush the buffer for the final cui: the loop above only writes a group
+    # when it encounters the next cui, so the last group needs this pass
+    if prev_cui is not None:
+        if not pref_term:
+            raise RuntimeError(f"did not find preferred term for cui {prev_cui}")
+        if sources is None or cui_sources.intersection(sources):
+            for _term in ss_db_terms:
+                ss_db.insert(_term)
+            for _data in cuisty_terms:
+                cuisty_db.insert(*_data)
+            for _data in pref_terms:
+                cuipref_db.insert(*_data)
 
 
 def install_spacy(lang):
@@ -128,38 +152,57 @@ def install_spacy(lang):
     if lang in SPACY_LANGUAGE_MAP:
         try:
             spacy.load(SPACY_LANGUAGE_MAP[lang])
-            print(f'SpaCy is installed and avaliable for {lang}!')
+            print(f"SpaCy is installed and available for {lang}!")
         except OSError:
-            print(f'SpaCy is not available! Attempting to download and install...')
+            print("SpaCy is not available! Attempting to download and install...")
             spacy.cli.download(SPACY_LANGUAGE_MAP[lang])
 
 
 def parse_args():
     ap = argparse.ArgumentParser()
 
     ap.add_argument(
-        'umls_installation_path',
-        help=('Location of UMLS installation files (`MRCONSO.RRF` and '
-              '`MRSTY.RRF` files)')
+        "umls_installation_path",
+        help=(
+            "Location of UMLS installation files (`MRCONSO.RRF` and "
+            "`MRSTY.RRF` files)"
+        ),
+    )
+    ap.add_argument(
+        "destination_path",
+        help="Location where the necessary QuickUMLS files are installed",
     )
     ap.add_argument(
-        'destination_path',
-        help='Location where the necessary QuickUMLS files are installed'
+        "-L",
+        "--lowercase",
+        action="store_true",
+        help="Consider only lowercase version of tokens",
     )
     ap.add_argument(
-        '-L', '--lowercase', action='store_true',
-        help='Consider only lowercase version of tokens'
+        "-U",
+        "--normalize-unicode",
+        action="store_true",
+        help="Normalize unicode strings to their closest ASCII representation",
    )
     ap.add_argument(
-        '-U', '--normalize-unicode', action='store_true',
-        help='Normalize unicode strings to their closest ASCII representation'
+        "-d",
+        "--database-backend",
+        choices=("leveldb", "unqlite"),
+        default="unqlite",
+        help="KV database to use to store CUIs and semantic types",
     )
     ap.add_argument(
-        '-d', '--database-backend', choices=('leveldb', 'unqlite'), default='unqlite',
-        help='KV database to use to store CUIs and semantic types'
+        "-E",
+        "--language",
+        default="ENG",
+        choices=LANGUAGES,
+        help="Extract concepts of the specified language",
     )
     ap.add_argument(
-        '-E', '--language', default='ENG', choices=LANGUAGES,
-        help='Extract concepts of the specified language'
+        "-S",
+        "--sources",
+        default=None,
+        nargs="*",
+        help="Only include concepts that have at least one term from one of these sources",
     )
     opts = ap.parse_args()
     return opts
@@ -171,63 +214,72 @@ def main():
     install_spacy(opts.language)
 
     if not os.path.exists(opts.destination_path):
-        msg = ('Directory "{}" does not exists; should I create it? [y/N] '
-               ''.format(opts.destination_path))
-        create = input(msg).lower().strip() == 'y'
+        msg = 'Directory "{}" does not exist; should I create it? [y/N] '.format(
+            opts.destination_path
+        )
+        create = input(msg).lower().strip() == "y"
 
         if create:
             os.makedirs(opts.destination_path)
         else:
-            print('Aborting.')
+            print("Aborting.")
             exit(1)
 
     if len(os.listdir(opts.destination_path)) > 0:
-        msg = ('Directory "{}" is not empty; should I empty it? [y/N] '
-               ''.format(opts.destination_path))
-        empty = input(msg).lower().strip() == 'y'
+        msg = 'Directory "{}" is not empty; should I empty it? [y/N] '.format(
+            opts.destination_path
+        )
+        empty = input(msg).lower().strip() == "y"
 
         if empty:
             shutil.rmtree(opts.destination_path)
             os.mkdir(opts.destination_path)
         else:
-            print('Aborting.')
+            print("Aborting.")
             exit(1)
 
     if opts.normalize_unicode:
         try:
             unidecode
         except NameError:
-            err = ('`unidecode` is needed for unicode normalization'
-                   'please install it via the `[sudo] pip install '
-                   'unidecode` command.')
+            err = (
+                "`unidecode` is needed for unicode normalization; "
+                "please install it via the `[sudo] pip install "
+                "unidecode` command."
+            )
             print(err, file=sys.stderr)
             exit(1)
 
-        flag_fp = os.path.join(opts.destination_path, 'normalize-unicode.flag')
-        open(flag_fp, 'w').close()
+        flag_fp = os.path.join(opts.destination_path, "normalize-unicode.flag")
+        open(flag_fp, "w").close()
 
     if opts.lowercase:
-        flag_fp = os.path.join(opts.destination_path, 'lowercase.flag')
-        open(flag_fp, 'w').close()
+        flag_fp = os.path.join(opts.destination_path, "lowercase.flag")
+        open(flag_fp, "w").close()
 
-    flag_fp = os.path.join(opts.destination_path, 'language.flag')
-    with open(flag_fp, 'w') as f:
+    flag_fp = os.path.join(opts.destination_path, "language.flag")
+    with open(flag_fp, "w") as f:
         f.write(opts.language)
 
-    flag_fp = os.path.join(opts.destination_path, 'database_backend.flag')
-    with open(flag_fp, 'w') as f:
+    flag_fp = os.path.join(opts.destination_path, "database_backend.flag")
+    with open(flag_fp, "w") as f:
         f.write(opts.database_backend)
 
-    mrconso_path = os.path.join(opts.umls_installation_path, 'MRCONSO.RRF')
-    mrsty_path = os.path.join(opts.umls_installation_path, 'MRSTY.RRF')
+    mrconso_path = os.path.join(opts.umls_installation_path, "MRCONSO.RRF")
+    mrsty_path = os.path.join(opts.umls_installation_path, "MRSTY.RRF")
 
     mrconso_iterator = extract_from_mrconso(mrconso_path, mrsty_path, opts)
 
-    simstring_dir = os.path.join(opts.destination_path, 'umls-simstring.db')
-    cuisty_dir = os.path.join(opts.destination_path, 'cui-semtypes.db')
+    simstring_dir = os.path.join(opts.destination_path, "umls-simstring.db")
+    cuisty_dir = os.path.join(opts.destination_path, "cui-semtypes.db")
 
-    parse_and_encode_ngrams(mrconso_iterator, simstring_dir, cuisty_dir,
-                            database_backend=opts.database_backend)
+    parse_and_encode_ngrams(
+        mrconso_iterator,
+        simstring_dir,
+        cuisty_dir,
+        database_backend=opts.database_backend,
+        sources=opts.sources,
+    )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/quickumls/network.py b/quickumls/network.py
index 425289d..5af3504 100644
--- a/quickumls/network.py
+++ b/quickumls/network.py
@@ -1,27 +1,27 @@
-'''Minimal client server through sockets
-https://github.com/lucasoldaini/MinimalServer'''
+"""Minimal client server through sockets
+https://github.com/lucasoldaini/MinimalServer"""
 
-import six
-import sys
-import time
+import datetime
+import inspect
 import math
 import socket
-import inspect
-import datetime
+import sys
 import threading
+import time
+
+import six
 
 try:
-    import SocketServer as socketserver
     import cPickle as pickle
+    import SocketServer as socketserver
 except ImportError:
-    import socketserver
     import pickle
+    import socketserver
 
 
 def pad_message(message, blocklength):
     """Pad a message so its length is a multiple of blocklength."""
-    message_padded_length = (
-        int(math.ceil(len(message) / blocklength)) * blocklength)
+    message_padded_length = int(math.ceil(len(message) / blocklength)) * blocklength
     padded_message = message.ljust(message_padded_length)
 
     return padded_message
@@ -36,7 +36,7 @@ def receive_data_in_chunks(sock, buffersize):
         chunk = sock.recv(buffersize)
         chunks.append(chunk)
 
-    data = b''.join(chunks).strip()
+    data = b"".join(chunks).strip()
     return data
 
 
@@ -46,7 +46,7 @@ def send_data_in_chunks(data, sock, buffersize):
 
     # We send an empty chunk to signal that we're done
     # transmitting the message.
-    sock.send(b' ' * buffersize)
+    sock.send(b" " * buffersize)
 
 
 class MinimalServerHandler(socketserver.BaseRequestHandler):
@@ -66,19 +66,21 @@ def handle(self):
         # fails, pass the error as response (the client will raise
         # the exception)
         try:
-            response = getattr(
-                self.server.served_object, method_name)(*args, **kwargs)
+            response = getattr(self.server.served_object, method_name)(*args, **kwargs)
         except Exception as ex:
             response = ex
 
         # send the response to the client in chunks
         send_data_in_chunks(
             pickle.dumps(response, protocol=self.server.pickle_protocol),
-            self.request, self.server.buffersize)
+            self.request,
+            self.server.buffersize,
+        )
 
 
 class MinimalServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
     """TCP Server"""
+
     served_object = None
     buffersize = 2048
     pickle_protocol = None
@@ -87,8 +89,14 @@ class MinimalServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
 
 class MinimalClient(object):
     """Minimal client to provide communication with the server"""
 
-    def __init__(self, target_class, host='localhost', port=4444,
-                 buffersize=2048, pickle_protocol=None):
+    def __init__(
+        self,
+        target_class,
+        host="localhost",
+        port=4444,
+        buffersize=2048,
+        pickle_protocol=None,
+    ):
         """Initialize the client
         Args:
             target_class (object): the class to be served by the
@@ -118,8 +126,9 @@ def __init__(self, target_class, host='localhost', port=4444,
 
         # bind public methods on target_class here
         for method_name, method in inspect.getmembers(
-                target_class, predicate=predicate):
-            if method_name.startswith('_'):
+            target_class, predicate=predicate
+        ):
+            if method_name.startswith("_"):
                 continue
 
             setattr(self, method_name, self._func_req_wrapper(method_name))
@@ -131,8 +140,9 @@ def func_request(*args, **kwargs):
             """Send the request to the server"""
 
             # prepare the data
-            data = pickle.dumps((method_name, args, kwargs),
-                                protocol=self.pickle_protocol)
+            data = pickle.dumps(
+                (method_name, args, kwargs), protocol=self.pickle_protocol
+            )
 
             # open the socket
             sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
@@ -150,7 +160,7 @@ def func_request(*args, **kwargs):
                 data = pickle.loads(response)
             except EOFError:
                 # server sent an empty message
-                msg = 'empty message received from the server.'
+                msg = "empty message received from the server."
                 raise RuntimeError(msg)
 
             # raises an exception if an exception was raised by the
@@ -164,8 +174,9 @@ def func_request(*args, **kwargs):
         return func_request
 
 
-def run_server(served_object, host='localhost',
-               port=4444, buffersize=2048, pickle_protocol=None):
+def run_server(
+    served_object, host="localhost", port=4444, buffersize=2048, pickle_protocol=None
+):
     """Runs the server
     Args:
         served_object (object): the object to be served by the
@@ -199,7 +210,7 @@ def run_server(served_object, host='localhost',
     server_thread.daemon = True
     server_thread.start()
     print(
-        '[{}] server running at {}:{} (press ^C to interrupt)'.format(
+        "[{}] server running at {}:{} (press ^C to interrupt)".format(
             datetime.datetime.now().isoformat(), host, port, server_thread.name
         )
     )
@@ -209,9 +220,7 @@ def run_server(served_object, host='localhost',
         while True:
             time.sleep(1)
     except KeyboardInterrupt:
-        print(
-            '\n[{}] server stopped'.format(datetime.datetime.now().isoformat())
-        )
+        print("\n[{}] server stopped".format(datetime.datetime.now().isoformat()))
 
         # Terminate the server
         server.shutdown()
diff --git a/quickumls/server.py b/quickumls/server.py
index cc32c2c..29deee6 100644
--- a/quickumls/server.py
+++ b/quickumls/server.py
@@ -13,7 +13,7 @@ def run_quickumls_server(opts):
         window=opts.window,
         min_match_length=opts.min_match_length,
         verbose=opts.verbose,
-        keep_uppercase=opts.keep_uppercase
+        keep_uppercase=opts.keep_uppercase,
     )
 
     run_server(matcher, host=opts.host, port=opts.port, buffersize=4096)
@@ -21,63 +21,75 @@ def run_quickumls_server(opts):
 
 def parse_args():
     ap = ArgumentParser(
-        prog='QuickUMLS server',
+        prog="QuickUMLS server",
         description=(
-            'For more detailed instructions, visit '
-            'github.com/Georgetown-IR-Lab/QuickUMLS'
-        )
+            "For more detailed instructions, visit "
+            "github.com/Georgetown-IR-Lab/QuickUMLS"
+        ),
     )
 
     # required arguments
     ap.add_argument(
-        'quickumls_fp',
-        help='directory where the QuickUMLS data files are installed.'
+        "quickumls_fp", help="directory where the QuickUMLS data files are installed."
     )
 
     # server configuration
+    ap.add_argument("-H", "--host", default="localhost", help="host of the server")
     ap.add_argument(
-        '-H', '--host', default='localhost',
-        help='host of the server'
-    )
-    ap.add_argument(
-        '-P', '--port', default=4645, type=int,
-        help='port on which the script responds'
+        "-P", "--port", default=4645, type=int, help="port on which the script responds"
     )
 
     # QuickUMLS options
     ap.add_argument(
-        '-t', '--threshold', default=0.7, type=float,
-        help='minimum similarity value between strings'
+        "-t",
+        "--threshold",
+        default=0.7,
+        type=float,
+        help="minimum similarity value between strings",
     )
     ap.add_argument(
-        '-o', '--overlapping_criteria', default='score',
-        choices=['score', 'length'],
-        help='criteria used to deal with overlapping concepts'
+        "-o",
+        "--overlapping_criteria",
+        default="score",
+        choices=["score", "length"],
+        help="criteria used to deal with overlapping concepts",
    )
     ap.add_argument(
-        '-s', '--similarity_name', default='jaccard',
-        choices=['dice', 'jaccard', 'cosine', 'overlap'],
-        help='name of similarity to use'
+        "-s",
+        "--similarity_name",
+        default="jaccard",
+        choices=["dice", "jaccard", "cosine", "overlap"],
+        help="name of similarity to use",
     )
     ap.add_argument(
-        '-w', '--window', default=5, type=int,
-        help='maximum number of tokens to consider for matching'
+        "-w",
+        "--window",
+        default=5,
+        type=int,
+        help="maximum number of tokens to consider for matching",
     )
     ap.add_argument(
-        '-l', '--min-match-length', default=3, type=int,
-        help='minimum length of a match'
+        "-l",
+        "--min-match-length",
+        default=3,
+        type=int,
+        help="minimum length of a match",
     )
     ap.add_argument(
-        '-v', '--verbose', action='store_true',
-        help='return verbose information while running'
+        "-v",
+        "--verbose",
+        action="store_true",
+        help="return verbose information while running",
     )
     ap.add_argument(
-        '-u', '--keep_uppercase', action='store_true',
-        help='By default QuickUMLS converts all uppercase strings to lowercase'
-        '. This option disables that functionality, which makes QuickUMLS '
-        'useful for distinguishing acronyms from normal words. For this '
-        'the database should be installed without the -L option.'
+        "-u",
+        "--keep_uppercase",
+        action="store_true",
+        help="By default QuickUMLS converts all uppercase strings to lowercase. "
+        "This option disables that functionality, which makes QuickUMLS "
+        "useful for distinguishing acronyms from normal words. For this, "
+        "the database should be installed without the -L option.",
     )
 
     return ap.parse_args()
@@ -87,5 +99,5 @@ def main():
     run_quickumls_server(opts)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/quickumls/spacy_component.py b/quickumls/spacy_component.py
index f64982f..f268c4e 100644
--- a/quickumls/spacy_component.py
+++ b/quickumls/spacy_component.py
@@ -1,14 +1,17 @@
 import spacy
-from spacy.tokens import Span
 from spacy.strings import StringStore
+from spacy.tokens import Span
 
-from .core import QuickUMLS
 from . import constants
+from .core import QuickUMLS
+
 
 class SpacyQuickUMLS(object):
-    name = 'QuickUMLS matcher'
-    
-    def __init__(self, nlp, quickumls_fp, best_match=True, ignore_syntax=False, **kwargs):
+    name = "QuickUMLS matcher"
+
+    def __init__(
+        self, nlp, quickumls_fp, best_match=True, ignore_syntax=False, **kwargs
+    ):
         """Instantiate SpacyQuickUMLS object
 
         This creates a QuickUMLS spaCy component which can be used in modular pipelines.
@@ -21,47 +24,53 @@ def __init__(self, nlp, quickumls_fp, best_match=True, ignore_syntax=False, **kw
             ignore_syntax (bool, optional): Whether to use the heuristics introduced in the paper (Soldaini and Goharian, 2016). TODO: clarify. Defaults to False
             **kwargs: QuickUMLS keyword arguments (see QuickUMLS in core.py)
         """
-        
-        self.quickumls = QuickUMLS(quickumls_fp, 
+
+        self.quickumls = QuickUMLS(
+            quickumls_fp,
             # By default, the QuickUMLS object creates its own internal spacy pipeline but this is not needed
             # when we're using it as a component in a pipeline
-            spacy_component = True,
-            **kwargs)
-        
+            spacy_component=True,
+            **kwargs
+        )
+
         # save this off so that we can get vocab values of labels later
         self.nlp = nlp
-        
+
         # keep these for matching
         self.best_match = best_match
         self.ignore_syntax = ignore_syntax
 
         # let's extend this with some properties that we want
-        Span.set_extension('similarity', default = -1.0)
-        Span.set_extension('semtypes', default = -1.0)
-        
+        Span.set_extension("similarity", default=-1.0)
+        Span.set_extension("semtypes", default=-1.0)
+
     def __call__(self, doc):
         # pass in the document which has been parsed to this point in the pipeline for ngrams and matches
-        matches = self.quickumls._match(doc, best_match=self.best_match, ignore_syntax=self.ignore_syntax)
-        
+        matches = self.quickumls._match(
+            doc, best_match=self.best_match, ignore_syntax=self.ignore_syntax
+        )
+
         # Convert QuickUMLS match objects into Spans
         for match in matches:
             # each match may match multiple ngrams
             for ngram_match_dict in match:
-                start_char_idx = int(ngram_match_dict['start'])
-                end_char_idx = int(ngram_match_dict['end'])
-                
-                cui = ngram_match_dict['cui']
+                start_char_idx = int(ngram_match_dict["start"])
+                end_char_idx = int(ngram_match_dict["end"])
+
+                cui = ngram_match_dict["cui"]
                 # add the string to the spacy vocab
                 self.nlp.vocab.strings.add(cui)
 
                 # pull out the value
                 cui_label_value = self.nlp.vocab.strings[cui]
-                
+
                 # char_span() creates a Span from these character indices
                 # UMLS CUI should work well as the label here
-                span = doc.char_span(start_char_idx, end_char_idx, label = cui_label_value)
+                span = doc.char_span(
+                    start_char_idx, end_char_idx, label=cui_label_value
+                )
 
                 # add some custom metadata to the spans
-                span._.similarity = ngram_match_dict['similarity']
-                span._.semtypes = ngram_match_dict['semtypes']
+                span._.similarity = ngram_match_dict["similarity"]
+                span._.semtypes = ngram_match_dict["semtypes"]
                 doc.ents = list(doc.ents) + [span]
-        
-        return doc
\ No newline at end of file
+
+        return doc
diff --git a/quickumls/toolbox.py b/quickumls/toolbox.py
index 3c7ea34..a8ffbb8 100644
--- a/quickumls/toolbox.py
+++ b/quickumls/toolbox.py
@@ -1,20 +1,22 @@
-from __future__ import unicode_literals, division, print_function
+from __future__ import division, print_function, unicode_literals
 
+import os
 
 # build-in modules
 import re
-import os
-from functools import wraps
-import six
 import unicodedata
+from functools import wraps
+from itertools import repeat, takewhile
 from string import punctuation
-from itertools import takewhile, repeat
-from six.moves import xrange
+
+import leveldb
 
 # installed modules
 import numpy
-import leveldb
+import six
+from six.moves import xrange
+
 try:
     import unqlite
+
     UNQLITE_AVAILABLE = True
 except ImportError:
     UNQLITE_AVAILABLE = False
@@ -22,7 +24,6 @@
 # project imports
 from quickumls_simstring import simstring
 
-
 # Python version specific imports
 if six.PY2:
     import cPickle as pickle
@@ -55,16 +56,16 @@ def safe_unicode(s):
     # in python 3, there's no ambiguity on whether
     # a string is encoded in bytes format or not
     try:
-        s = u'%s' % s
+        s = "%s" % s
     except UnicodeDecodeError:
-        s = u'%s' % s.decode('utf-8')
+        s = "%s" % s.decode("utf-8")
 
-    return u'{}'.format(unicodedata.normalize('NFKD', s))
+    return "{}".format(unicodedata.normalize("NFKD", s))
 
 
 def prepare_string_for_db_input(s):
     if six.PY2:
-        return s.encode('utf-8')
+        return s.encode("utf-8")
     else:
         return s
 
@@ -72,38 +73,60 @@ def prepare_string_for_db_input(s):
 def make_ngrams(s, n):
     # s = u'{t}{s}{t}'.format(s=safe_unicode(s), t=('$' * (n - 1)))
     n = len(s) if len(s) < n else n
-    return (s[i:i + n] for i in xrange(len(s) - n + 1))
+    return (s[i : i + n] for i in xrange(len(s) - n + 1))
 
 
 def get_similarity(x, y, n, similarity_name):
     if len(x) == 0 or len(y) == 0:
         # we define similarity between two strings
         # to be 0 if any of the two is empty.
-        return 0.
+        return 0.0
 
     X, Y = set(make_ngrams(x, n)), set(make_ngrams(y, n))
     intersec = len(X.intersection(Y))
 
-    if similarity_name == 'dice':
+    if similarity_name == "dice":
         return 2 * intersec / (len(X) + len(Y))
-    elif similarity_name == 'jaccard':
+    elif similarity_name == "jaccard":
         return intersec / (len(X) + len(Y) - intersec)
-    elif similarity_name == 'cosine':
+    elif similarity_name == "cosine":
         return intersec / numpy.sqrt(len(X) * len(Y))
-    elif similarity_name == 'overlap':
+    elif similarity_name == "overlap":
         return intersec
     else:
-        msg = 'Similarity {} not recognized'.format(similarity_name)
+        msg = "Similarity {} not recognized".format(similarity_name)
         raise TypeError(msg)
 
 
 class SimpleTokenizer(object):
     def __init__(self, stopwords=None, min_length=1, split_sym=None):
-        if stopwords == 'default':
+        if stopwords == "default":
             stopwords = [
-                'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for',
-                'from', 'has', 'he', 'in', 'is', 'its', 'of', 'on', 'or',
-                'that', 'the', 'to', 'was ', 'were', 'will', 'with'
+                "a",
+                "an",
+                "and",
+                "are",
+                "as",
+                "at",
+                "be",
+                "by",
+                "for",
+                "from",
+                "has",
+                "he",
+                "in",
+                "is",
+                "its",
+                "of",
+                "on",
+                "or",
+                "that",
+                "the",
+                "to",
+                "was",
+                "were",
+                "will",
+                "with",
             ]
         elif stopwords is None:
             stopwords = []
@@ -113,10 +136,10 @@ def __init__(self, stopwords=None, min_length=1, split_sym=None):
         if split_sym is None:
             split_sym = []
 
-        split_sym = punctuation + ''.join(split_sym)
+        split_sym = punctuation + "".join(split_sym)
 
         self.min_length = min_length
-        self.re_tokenize = re.compile(r'&\w+;|\W+|_')
+        self.re_tokenize = re.compile(r"&\w+;|\W+|_")
 
     def tokenize(self, text, lower=True):
         """Tokenize text"""
@@ -130,7 +153,8 @@ def tokenize_list(self, text, lower=True):
         if lower:
             text = text.lower()
         return [
-            tok for tok in self.re_tokenize.split(text)
+            tok
+            for tok in self.re_tokenize.split(text)
             if len(tok) >= self.min_length and tok not in self.stopwords
         ]
 
@@ -139,26 +163,23 @@ def db_key_encode(term):
     if six.PY2:
         return term
     else:
-        return term.encode('utf-8')
+        return term.encode("utf-8")
 
 
 def countlines(fn):
     """Count lines in fn.
    Slightly modified version of http://stackoverflow.com/a/27518377"""
-    with open(fn, 'rb') as f:
-        bufgen = takewhile(
-            lambda x: x, (f.read(1024 * 1024) for _ in repeat(None)))
-        ln = sum(buf.count(b'\n') for buf in bufgen)
+    with open(fn, "rb") as f:
+        bufgen = takewhile(lambda x: x, (f.read(1024 * 1024) for _ in repeat(None)))
+        ln = sum(buf.count(b"\n") for buf in bufgen)
     return ln


 class SimstringDBWriter(object):
     def __init__(self, path):
-        if not(os.path.exists(path)) or not(os.path.isdir(path)):
-            err_msg = (
-                '"{}" does not exists or it is not a directory.'
-            ).format(path)
+        if not (os.path.exists(path)) or not (os.path.isdir(path)):
+            err_msg = ('"{}" does not exist or is not a directory.').format(path)
             raise IOError(err_msg)
         else:
             try:
@@ -167,10 +188,10 @@ def __init__(self, path):
                 pass

             self.db = simstring.writer(
-                prepare_string_for_db_input(
-                    os.path.join(path, 'umls-terms.simstring')
-                ),
-                3, False, True
+                prepare_string_for_db_input(os.path.join(path, "umls-terms.simstring")),
+                3,
+                False,
+                True,
             )

     def insert(self, term):
@@ -180,16 +201,12 @@ def insert(self, term):

 class SimstringDBReader(object):
     def __init__(self, path, similarity_name, threshold):
-        if not(os.path.exists(path)) or not(os.path.isdir(path)):
-            err_msg = (
-                '"{}" does not exists or it is not a directory.'
-            ).format(path)
+        if not (os.path.exists(path)) or not (os.path.isdir(path)):
+            err_msg = ('"{}" does not exist or is not a directory.').format(path)
             raise IOError(err_msg)

         self.db = simstring.reader(
-            prepare_string_for_db_input(
-                os.path.join(path, 'umls-terms.simstring')
-            )
+            prepare_string_for_db_input(os.path.join(path, "umls-terms.simstring"))
         )
         self.db.measure = getattr(simstring, similarity_name)
         self.db.threshold = threshold
@@ -213,8 +230,7 @@ def _is_overlapping_intervals(self, a, b):

     def __contains__(self, interval):
         return any(
-            self._is_overlapping_intervals(interval, other)
-            for other in self.intervals
+            self._is_overlapping_intervals(interval, other) for other in self.intervals
         )

     def append(self, interval):
@@ -222,32 +238,31 @@ def append(self, interval):

 class CuiSemTypesDB(object):
-    def __init__(self, path, database_backend='leveldb'):
+    def __init__(self, path, database_backend="leveldb"):
         if not (os.path.exists(path) or os.path.isdir(path)):
-            err_msg = (
-                '"{}" is not a valid directory').format(path)
+            err_msg = ('"{}" is not a valid directory').format(path)
             raise IOError(err_msg)

-        if database_backend == 'unqlite':
+        if database_backend == "unqlite":
             assert UNQLITE_AVAILABLE, (
-                'You selected unqlite as database backend, but it is not '
-                'installed. Please install it via `pip install unqlite`'
+                "You selected unqlite as database backend, but it is not "
+                "installed. Please install it via `pip install unqlite`"
             )
-            self.cui_db = unqlite.UnQLite(os.path.join(path, 'cui.unqlite'))
+            self.cui_db = unqlite.UnQLite(os.path.join(path, "cui.unqlite"))
             self.cui_db_put = self.cui_db.store
             self.cui_db_get = self.cui_db.fetch
-            self.semtypes_db = unqlite.UnQLite(os.path.join(path, 'semtypes.unqlite'))
+            self.semtypes_db = unqlite.UnQLite(os.path.join(path, "semtypes.unqlite"))
             self.semtypes_db_put = self.semtypes_db.store
             self.semtypes_db_get = self.semtypes_db.fetch
-        elif database_backend == 'leveldb':
-            self.cui_db = leveldb.LevelDB(os.path.join(path, 'cui.leveldb'))
+        elif database_backend == "leveldb":
+            self.cui_db = leveldb.LevelDB(os.path.join(path, "cui.leveldb"))
             self.cui_db_put = self.cui_db.Put
             self.cui_db_get = self.cui_db.Get
-            self.semtypes_db = leveldb.LevelDB(os.path.join(path, 'semtypes.leveldb'))
+            self.semtypes_db = leveldb.LevelDB(os.path.join(path, "semtypes.leveldb"))
             self.semtypes_db_put = self.semtypes_db.Put
             self.semtypes_db_get = self.semtypes_db.Get
         else:
-            raise ValueError(f'database_backend {database_backend} not recognized')
+            raise ValueError(f"database_backend {database_backend} not recognized")

     def has_term(self, term):
         term = prepare_string_for_db_input(safe_unicode(term))
@@ -274,9 +289,7 @@ def insert(self, term, cui, semtypes, is_preferred):
         try:
             self.semtypes_db_get(db_key_encode(cui))
         except KeyError:
-            self.semtypes_db_put(
-                db_key_encode(cui), pickle.dumps(set(semtypes))
-            )
+            self.semtypes_db_put(db_key_encode(cui), pickle.dumps(set(semtypes)))

     def get(self, term):
         term = prepare_string_for_db_input(safe_unicode(term))
@@ -286,11 +299,56 @@ def get(self, term):
         cuis = set()

         matches = (
-            (
-                cui,
-                pickle.loads(self.semtypes_db_get(db_key_encode(cui))),
-                is_preferred
-            )
+            (cui, pickle.loads(self.semtypes_db_get(db_key_encode(cui))), is_preferred)
             for cui, is_preferred in cuis
         )
         return matches
+
+
+class CuiPrefDB(object):
+    def __init__(self, path, database_backend="leveldb"):
+        if not (os.path.exists(path) or os.path.isdir(path)):
+            err_msg = ('"{}" is not a valid directory').format(path)
+            raise IOError(err_msg)
+
+        if database_backend == "unqlite":
+            assert UNQLITE_AVAILABLE, (
+                "You selected unqlite as database backend, but it is not "
+                "installed. Please install it via `pip install unqlite`"
+            )
+            self.cui_db = unqlite.UnQLite(os.path.join(path, "cui_pref.unqlite"))
+            self.cui_db_put = self.cui_db.store
+            self.cui_db_get = self.cui_db.fetch
+        elif database_backend == "leveldb":
+            self.cui_db = leveldb.LevelDB(os.path.join(path, "cui_pref.leveldb"))
+            self.cui_db_put = self.cui_db.Put
+            self.cui_db_get = self.cui_db.Get
+        else:
+            raise ValueError(f"database_backend {database_backend} not recognized")
+
+    def has_cui(self, cui):
+        cui = prepare_string_for_db_input(safe_unicode(cui))
+        try:
+            self.cui_db_get(db_key_encode(cui))
+            return True
+        except KeyError:
+            return False
+
+    def insert(self, term, cui):
+        term = prepare_string_for_db_input(safe_unicode(term))
+        cui = prepare_string_for_db_input(safe_unicode(cui))
+
+        if self.has_cui(cui):
+            db_term = pickle.loads(self.cui_db_get(db_key_encode(cui)))
+            raise RuntimeError(
+                f"DB shouldn't contain preferred term for CUI twice: {cui}, {term}, {db_term}"
+            )
+
+        self.cui_db_put(db_key_encode(cui), pickle.dumps(term))
+
+    def get(self, cui):
+        cui = prepare_string_for_db_input(safe_unicode(cui))
+        try:
+            return pickle.loads(self.cui_db_get(db_key_encode(cui)))
+        except KeyError:
+            return
diff --git a/setup.py b/setup.py
index 35e23cd..6c163b4 100644
--- a/setup.py
+++ b/setup.py
@@ -1,8 +1,9 @@
-from setuptools import setup, find_packages
-import os
+import contextlib
 import io
+import os
 import sys
-import contextlib
+
+from setuptools import find_packages, setup

 PACKAGES = find_packages()
@@ -22,21 +23,20 @@ def chdir(new_dir):
 def setup_package():
     root = os.path.abspath(os.path.dirname(__file__))

-    with open('README.md') as reader:
+    with open("README.md") as reader:
         readme = reader.read()

-    with open('requirements.txt') as f:
+    with open("requirements.txt") as f:
         requirements = f.read().splitlines()

     dependency_links = []
     i = 0
     while i < len(requirements):
-        if requirements[i].startswith('https://'):
+        if requirements[i].startswith("https://"):
             dependency_links.append(requirements.pop(i))
         else:
             i += 1

-
     # From https://github.com/explosion/spaCy/blob/master/setup.py
     with chdir(root):
         with io.open(os.path.join(root, "quickumls", "about.py"), encoding="utf8") as f:
@@ -44,19 +44,19 @@ def setup_package():
             exec(f.read(), about)

         setup(
-            name=about['__title__'],
-            version=about['__version__'],
+            name=about["__title__"],
+            version=about["__version__"],
             description=(
-                'QuickUMLS is a tool for fast, unsupervised biomedical '
-                'concept extraction from medical text'
+                "QuickUMLS is a tool for fast, unsupervised biomedical "
+                "concept extraction from medical text"
             ),
             packages=PACKAGES,
             long_description=readme,
-            long_description_content_type='text/markdown',
-            author=about['__author__'],
-            author_email=about['__email__'],
-            url='https://github.com/Georgetown-IR-Lab/QuickUMLS',
-            license=about['__license__'],
+            long_description_content_type="text/markdown",
+            author=about["__author__"],
+            author_email=about["__email__"],
+            url="https://github.com/Georgetown-IR-Lab/QuickUMLS",
+            license=about["__license__"],
             install_requires=requirements,
             dependency_links=dependency_links,
             classifiers=[
@@ -67,8 +67,9 @@ def setup_package():
                 "Development Status :: 5 - Production/Stable",
                 "Topic :: Scientific/Engineering :: Artificial Intelligence",
                 "Topic :: Scientific/Engineering :: Bio-Informatics",
-            ]
+            ],
         )

-if __name__ == '__main__':
+
+if __name__ == "__main__":
     setup_package()
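
A quick usage sketch for the spaCy component reformatted above. This is a minimal sketch, not part of the patch: the class name and module path are assumptions (the class definition sits outside this hunk; this release exposes it as SpacyQuickUMLS in quickumls/spacy_component.py), the QuickUMLS data directory is a placeholder, and the add_pipe call is the spaCy v2-style API this code targets.

    import spacy
    from quickumls.spacy_component import SpacyQuickUMLS  # assumed export

    nlp = spacy.load("en_core_web_sm")  # any installed spaCy model works
    # the component builds its own internal matcher (spacy_component=True)
    component = SpacyQuickUMLS(nlp, "/path/to/quickumls/install")  # placeholder path
    nlp.add_pipe(component, last=True)

    doc = nlp("Patient denies chest pain but reports persistent headaches.")
    for ent in doc.ents:
        # label_ holds the CUI; the custom extensions registered in __init__
        # carry the match similarity and the UMLS semantic types
        print(ent.text, ent.label_, ent._.similarity, ent._.semtypes)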
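The similarity measures in get_similarity are easy to sanity-check by hand, since each one is computed from the two sets of character n-grams. With trigrams, "heart" yields {hea, ear, art} and "hearts" yields {hea, ear, art, rts}, so the intersection has size 3:

    from quickumls.toolbox import get_similarity, make_ngrams

    print(sorted(make_ngrams("heart", 3)))                  # ['art', 'ear', 'hea']
    print(get_similarity("heart", "hearts", 3, "jaccard"))  # 3 / (3 + 4 - 3) = 0.75
    print(get_similarity("heart", "hearts", 3, "dice"))     # 2 * 3 / (3 + 4) ≈ 0.857
    print(get_similarity("heart", "hearts", 3, "overlap"))  # 3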
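Finally, a sketch of how the new CuiPrefDB is meant to be used: it stores exactly one preferred term per CUI, raises on duplicate inserts, and returns None for unknown CUIs. The install directory and the CUI/term pair below are illustrative only.

    from quickumls.toolbox import CuiPrefDB

    db = CuiPrefDB("/path/to/quickumls/install")  # directory must already exist
    db.insert("Diabetes Mellitus", "C0011849")    # note the (term, cui) argument order
    print(db.get("C0011849"))                     # -> 'Diabetes Mellitus'
    print(db.get("C9999999"))                     # unknown CUI -> None
    db.insert("Diabetes", "C0011849")             # raises RuntimeError (duplicate CUI)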