@staticmethod
+def load(filename:str) -> "DictDatabase":
+ """Load db from a file
+
+ Args:
+ filename (str): Name of the file to load
+
+ Returns:
+ DictDatabase: the db
+ """
+ with open(filename, "rb") as f:
+ db = pickle.load(f)
+ return db
+
+
+
+
+
+
+
+
+
+
+loads(binary_data)
+
+
+ staticmethod
+
+
+
+
+
+
+
+
Load a binary string representing a database
+
Initially only unpickles the data
+
+
Parameters:
+
+
+
+
Name
+
Type
+
Description
+
Default
+
+
+
+
+
binary_data
+
+ str
+
+
String of data to unpickle
+
+ required
+
+
+
+
+
+
Returns:
+
+
+
+
Type
+
Description
+
+
+
+
+
+ DictDatabase
+
+
Model object
+
+
+
+
+
+ Source code in simstring\database\dict.py
+
@staticmethod
+def loads(binary_data: bytes) -> "DictDatabase":
+ """Load a binary string representing a database
+
+ Initially only unpickles the data
+
+ Args:
+ binary_data (str): String of data to unpickle
+
+ Returns:
+ Model object
+ """
+ return pickle.loads(binary_data)
+
+
+
+
+
+
+
+
+
+
+save(filename)
+
+
+
+
+
+
+
Save the database to a file as defined by filename.
+
+
Parameters:
+
+
+
+
Name
+
Type
+
Description
+
Default
+
+
+
+
+
filename
+
+ str
+
+
Filename to save the db at. Should include file extention.
+
+ required
+
+
+
+
+
+
Returns:
+
+
+
+
Type
+
Description
+
+
+
+
+
+
+
None
+
+
+
+
+
+ Source code in simstring\database\dict.py
+
def save(self, filename:str):
+ """Save the database to a file as defined by filename.
+
+ Args:
+ filename: Filename to save the db at. Should include file extention.
+
+ Returns:
+ None
+ """
+ with open(filename, "wb") as f:
+ pickle.dump(self, f)
+
+
+
+
+
+
+
+
+
+
+
+
PyMongo based database
+
+
+
+
+
+
+
+
+ Bases: BaseDatabase
+
+
+
+
+ Source code in simstring\database\mongo.py
+
+
+
+
+
+ Source code in simstring\feature_extractor\word_ngram.py
+
class WordNgramFeatureExtractor(BaseFeatureExtractor):
+ def __init__(self, n=2, splitter=" "):
+ self.n = n
+ self.splitter = splitter
+
+ def features(self, text: str) -> List[str]:
+ # Split text by white space.
+ # If you want to extract words from text in more complicated way or using your favorite library like NLTK, please implement in your own.
+ words = text.split(self.splitter)
+ return self._words_ngram(words, self.n, SENTINAL_CHAR)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Bases: BaseFeatureExtractor
+
+
+
+
+ Source code in simstring\feature_extractor\mecab_ngram.py
+
class MecabNgramFeatureExtractor(BaseFeatureExtractor):
+ def __init__(self, n=2, user_dic_path='', sys_dic_path=''):
+ self.n = n
+ self.mecab = MecabTokenizer(user_dic_path, sys_dic_path)
+
+ def features(self, text: str) -> List[str]:
+ words = [x.surface() for x in self.mecab.tokenize(text)]
+ return self._words_ngram(words, self.n, SENTINAL_CHAR)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Search
+
+
+
+
From here you can search these documents. Enter your search terms below.
+
+
+
+
+
+
+
+
+
+
+
Keyboard Shortcuts
+
+
+
+
+
+
+
Keys
+
Action
+
+
+
+
+
?
+
Open this help
+
+
+
n
+
Next page
+
+
+
p
+
Previous page
+
+
+
s
+
Search
+
+
+
+
+
+
+
+
+
+
+
diff --git a/docs/fonts/fontawesome-webfont.eot b/docs/fonts/fontawesome-webfont.eot
new file mode 100644
index 0000000..e9f60ca
Binary files /dev/null and b/docs/fonts/fontawesome-webfont.eot differ
diff --git a/docs/fonts/fontawesome-webfont.svg b/docs/fonts/fontawesome-webfont.svg
new file mode 100644
index 0000000..855c845
--- /dev/null
+++ b/docs/fonts/fontawesome-webfont.svg
@@ -0,0 +1,2671 @@
+
+
+
diff --git a/docs/fonts/fontawesome-webfont.ttf b/docs/fonts/fontawesome-webfont.ttf
new file mode 100644
index 0000000..35acda2
Binary files /dev/null and b/docs/fonts/fontawesome-webfont.ttf differ
diff --git a/docs/fonts/fontawesome-webfont.woff b/docs/fonts/fontawesome-webfont.woff
new file mode 100644
index 0000000..400014a
Binary files /dev/null and b/docs/fonts/fontawesome-webfont.woff differ
diff --git a/docs/fonts/fontawesome-webfont.woff2 b/docs/fonts/fontawesome-webfont.woff2
new file mode 100644
index 0000000..4d13fc6
Binary files /dev/null and b/docs/fonts/fontawesome-webfont.woff2 differ
diff --git a/docs/img/favicon.ico b/docs/img/favicon.ico
new file mode 100644
index 0000000..e85006a
Binary files /dev/null and b/docs/img/favicon.ico differ
diff --git a/docs/img/grid.png b/docs/img/grid.png
new file mode 100644
index 0000000..878c3ed
Binary files /dev/null and b/docs/img/grid.png differ
diff --git a/docs/index.html b/docs/index.html
new file mode 100644
index 0000000..7594253
--- /dev/null
+++ b/docs/index.html
@@ -0,0 +1,215 @@
+
+
+
+
+
+
+
+
+
+
+ Simstring docs
+
+
+
+
+
+
+
+
+
+
+
+
+
+
The original method is described in this paper. There is an even faster C++ implimentation by the original authors available here
+
This module is a fork of this repo which is no longer actively maintained. This module adds documentation, speedups and more measures and features such saving compiled databases.
+
Banchmarks
+
+Without compilation the code takes 14 seconds to run through this particular banchmark, which is only on the data retrieval.
+
+With compiltion this time is dropped to below 5 seconds.
+
+
+
+
+
+
+
+
+
+
+
+
+
Search
+
+
+
+
From here you can search these documents. Enter your search terms below.
a",n=d.getElementsByTagName("*")||[],r=d.getElementsByTagName("a")[0],!r||!r.style||!n.length)return t;s=a.createElement("select"),u=s.appendChild(a.createElement("option")),o=d.getElementsByTagName("input")[0],r.style.cssText="top:1px;float:left;opacity:.5",t.getSetAttribute="t"!==d.className,t.leadingWhitespace=3===d.firstChild.nodeType,t.tbody=!d.getElementsByTagName("tbody").length,t.htmlSerialize=!!d.getElementsByTagName("link").length,t.style=/top/.test(r.getAttribute("style")),t.hrefNormalized="/a"===r.getAttribute("href"),t.opacity=/^0.5/.test(r.style.opacity),t.cssFloat=!!r.style.cssFloat,t.checkOn=!!o.value,t.optSelected=u.selected,t.enctype=!!a.createElement("form").enctype,t.html5Clone="<:nav>"!==a.createElement("nav").cloneNode(!0).outerHTML,t.inlineBlockNeedsLayout=!1,t.shrinkWrapBlocks=!1,t.pixelPosition=!1,t.deleteExpando=!0,t.noCloneEvent=!0,t.reliableMarginRight=!0,t.boxSizingReliable=!0,o.checked=!0,t.noCloneChecked=o.cloneNode(!0).checked,s.disabled=!0,t.optDisabled=!u.disabled;try{delete d.test}catch(h){t.deleteExpando=!1}o=a.createElement("input"),o.setAttribute("value",""),t.input=""===o.getAttribute("value"),o.value="t",o.setAttribute("type","radio"),t.radioValue="t"===o.value,o.setAttribute("checked","t"),o.setAttribute("name","t"),l=a.createDocumentFragment(),l.appendChild(o),t.appendChecked=o.checked,t.checkClone=l.cloneNode(!0).cloneNode(!0).lastChild.checked,d.attachEvent&&(d.attachEvent("onclick",function(){t.noCloneEvent=!1}),d.cloneNode(!0).click());for(f in{submit:!0,change:!0,focusin:!0})d.setAttribute(c="on"+f,"t"),t[f+"Bubbles"]=c in e||d.attributes[c].expando===!1;d.style.backgroundClip="content-box",d.cloneNode(!0).style.backgroundClip="",t.clearCloneStyle="content-box"===d.style.backgroundClip;for(f in x(t))break;return t.ownLast="0"!==f,x(function(){var n,r,o,s="padding:0;margin:0;border:0;display:block;box-sizing:content-box;-moz-box-sizing:content-box;-webkit-box-sizing:content-box;",l=a.getElementsByTagName("body")[0];l&&(n=a.createElement("div"),n.style.cssText="border:0;width:0;height:0;position:absolute;top:0;left:-9999px;margin-top:1px",l.appendChild(n).appendChild(d),d.innerHTML="
The measure defines the formula by which the distance between strings is measured.
+
Use as:
+
from simstring.measure import CosineMeasure, JaccardMeasure, OverlapMeasure, DiceMeasure
+
+
+
But be carefull, they are not identical to the normal definitions of these measures.
+
Cosine Measure is different to scipy.spatial.distance.cosine as it works on strings and not vectors.
+
Jaccard distance does not discard duplicates in its sets, unlike in the normally used definition. This means that 'fooo' is seen as more different from 'fo' than 'foo', which is a more useful way of lookng at the string difference, but is not the usual definition of the distance as implimanted by scipy.spatial.distance.jaccard or wikipedia or any public calculator.
+
Cosine Measure
+
+
+
+
+
+
+
+
+ Bases: BaseMeasure
+
+
+
+
+ Source code in simstring\measure\cosine.py
+
From here you can search these documents. Enter your search terms below.
+
+
+
+
+
+
+
+
+
+
+
Keyboard Shortcuts
+
+
+
+
+
+
+
Keys
+
Action
+
+
+
+
+
?
+
Open this help
+
+
+
n
+
Next page
+
+
+
p
+
Previous page
+
+
+
s
+
Search
+
+
+
+
+
+
+
+
+
+
+
diff --git a/docs/objects.inv b/docs/objects.inv
new file mode 100644
index 0000000..24d7ec7
--- /dev/null
+++ b/docs/objects.inv
@@ -0,0 +1,6 @@
+# Sphinx inventory version 2
+# Project: Simstring docs
+# Version: 0.0.0
+# The remainder of this file is compressed using zlib.
+xڭ0w^ЭMM*E_T!ߧ$2/Uꌤ}49J#\3h;V9qOT<.pXK|b@P 6gUH!;{;O쌺
}5wjy^q;0ƃ?
+xpLx$#9#c_!|)w}v(
\ No newline at end of file
diff --git a/docs/search/lunr.js b/docs/search/lunr.js
new file mode 100644
index 0000000..6aa370f
--- /dev/null
+++ b/docs/search/lunr.js
@@ -0,0 +1,3475 @@
+/**
+ * lunr - http://lunrjs.com - A bit like Solr, but much smaller and not as bright - 2.3.9
+ * Copyright (C) 2020 Oliver Nightingale
+ * @license MIT
+ */
+
+;(function(){
+
+/**
+ * A convenience function for configuring and constructing
+ * a new lunr Index.
+ *
+ * A lunr.Builder instance is created and the pipeline setup
+ * with a trimmer, stop word filter and stemmer.
+ *
+ * This builder object is yielded to the configuration function
+ * that is passed as a parameter, allowing the list of fields
+ * and other builder parameters to be customised.
+ *
+ * All documents _must_ be added within the passed config function.
+ *
+ * @example
+ * var idx = lunr(function () {
+ * this.field('title')
+ * this.field('body')
+ * this.ref('id')
+ *
+ * documents.forEach(function (doc) {
+ * this.add(doc)
+ * }, this)
+ * })
+ *
+ * @see {@link lunr.Builder}
+ * @see {@link lunr.Pipeline}
+ * @see {@link lunr.trimmer}
+ * @see {@link lunr.stopWordFilter}
+ * @see {@link lunr.stemmer}
+ * @namespace {function} lunr
+ */
+var lunr = function (config) {
+ var builder = new lunr.Builder
+
+ builder.pipeline.add(
+ lunr.trimmer,
+ lunr.stopWordFilter,
+ lunr.stemmer
+ )
+
+ builder.searchPipeline.add(
+ lunr.stemmer
+ )
+
+ config.call(builder, builder)
+ return builder.build()
+}
+
+lunr.version = "2.3.9"
+/*!
+ * lunr.utils
+ * Copyright (C) 2020 Oliver Nightingale
+ */
+
+/**
+ * A namespace containing utils for the rest of the lunr library
+ * @namespace lunr.utils
+ */
+lunr.utils = {}
+
+/**
+ * Print a warning message to the console.
+ *
+ * @param {String} message The message to be printed.
+ * @memberOf lunr.utils
+ * @function
+ */
+lunr.utils.warn = (function (global) {
+ /* eslint-disable no-console */
+ return function (message) {
+ if (global.console && console.warn) {
+ console.warn(message)
+ }
+ }
+ /* eslint-enable no-console */
+})(this)
+
+/**
+ * Convert an object to a string.
+ *
+ * In the case of `null` and `undefined` the function returns
+ * the empty string, in all other cases the result of calling
+ * `toString` on the passed object is returned.
+ *
+ * @param {Any} obj The object to convert to a string.
+ * @return {String} string representation of the passed object.
+ * @memberOf lunr.utils
+ */
+lunr.utils.asString = function (obj) {
+ if (obj === void 0 || obj === null) {
+ return ""
+ } else {
+ return obj.toString()
+ }
+}
+
+/**
+ * Clones an object.
+ *
+ * Will create a copy of an existing object such that any mutations
+ * on the copy cannot affect the original.
+ *
+ * Only shallow objects are supported, passing a nested object to this
+ * function will cause a TypeError.
+ *
+ * Objects with primitives, and arrays of primitives are supported.
+ *
+ * @param {Object} obj The object to clone.
+ * @return {Object} a clone of the passed object.
+ * @throws {TypeError} when a nested object is passed.
+ * @memberOf Utils
+ */
+lunr.utils.clone = function (obj) {
+ if (obj === null || obj === undefined) {
+ return obj
+ }
+
+ var clone = Object.create(null),
+ keys = Object.keys(obj)
+
+ for (var i = 0; i < keys.length; i++) {
+ var key = keys[i],
+ val = obj[key]
+
+ if (Array.isArray(val)) {
+ clone[key] = val.slice()
+ continue
+ }
+
+ if (typeof val === 'string' ||
+ typeof val === 'number' ||
+ typeof val === 'boolean') {
+ clone[key] = val
+ continue
+ }
+
+ throw new TypeError("clone is not deep and does not support nested objects")
+ }
+
+ return clone
+}
+lunr.FieldRef = function (docRef, fieldName, stringValue) {
+ this.docRef = docRef
+ this.fieldName = fieldName
+ this._stringValue = stringValue
+}
+
+lunr.FieldRef.joiner = "/"
+
+lunr.FieldRef.fromString = function (s) {
+ var n = s.indexOf(lunr.FieldRef.joiner)
+
+ if (n === -1) {
+ throw "malformed field ref string"
+ }
+
+ var fieldRef = s.slice(0, n),
+ docRef = s.slice(n + 1)
+
+ return new lunr.FieldRef (docRef, fieldRef, s)
+}
+
+lunr.FieldRef.prototype.toString = function () {
+ if (this._stringValue == undefined) {
+ this._stringValue = this.fieldName + lunr.FieldRef.joiner + this.docRef
+ }
+
+ return this._stringValue
+}
+/*!
+ * lunr.Set
+ * Copyright (C) 2020 Oliver Nightingale
+ */
+
+/**
+ * A lunr set.
+ *
+ * @constructor
+ */
+lunr.Set = function (elements) {
+ this.elements = Object.create(null)
+
+ if (elements) {
+ this.length = elements.length
+
+ for (var i = 0; i < this.length; i++) {
+ this.elements[elements[i]] = true
+ }
+ } else {
+ this.length = 0
+ }
+}
+
+/**
+ * A complete set that contains all elements.
+ *
+ * @static
+ * @readonly
+ * @type {lunr.Set}
+ */
+lunr.Set.complete = {
+ intersect: function (other) {
+ return other
+ },
+
+ union: function () {
+ return this
+ },
+
+ contains: function () {
+ return true
+ }
+}
+
+/**
+ * An empty set that contains no elements.
+ *
+ * @static
+ * @readonly
+ * @type {lunr.Set}
+ */
+lunr.Set.empty = {
+ intersect: function () {
+ return this
+ },
+
+ union: function (other) {
+ return other
+ },
+
+ contains: function () {
+ return false
+ }
+}
+
+/**
+ * Returns true if this set contains the specified object.
+ *
+ * @param {object} object - Object whose presence in this set is to be tested.
+ * @returns {boolean} - True if this set contains the specified object.
+ */
+lunr.Set.prototype.contains = function (object) {
+ return !!this.elements[object]
+}
+
+/**
+ * Returns a new set containing only the elements that are present in both
+ * this set and the specified set.
+ *
+ * @param {lunr.Set} other - set to intersect with this set.
+ * @returns {lunr.Set} a new set that is the intersection of this and the specified set.
+ */
+
+lunr.Set.prototype.intersect = function (other) {
+ var a, b, elements, intersection = []
+
+ if (other === lunr.Set.complete) {
+ return this
+ }
+
+ if (other === lunr.Set.empty) {
+ return other
+ }
+
+ if (this.length < other.length) {
+ a = this
+ b = other
+ } else {
+ a = other
+ b = this
+ }
+
+ elements = Object.keys(a.elements)
+
+ for (var i = 0; i < elements.length; i++) {
+ var element = elements[i]
+ if (element in b.elements) {
+ intersection.push(element)
+ }
+ }
+
+ return new lunr.Set (intersection)
+}
+
+/**
+ * Returns a new set combining the elements of this and the specified set.
+ *
+ * @param {lunr.Set} other - set to union with this set.
+ * @return {lunr.Set} a new set that is the union of this and the specified set.
+ */
+
+lunr.Set.prototype.union = function (other) {
+ if (other === lunr.Set.complete) {
+ return lunr.Set.complete
+ }
+
+ if (other === lunr.Set.empty) {
+ return this
+ }
+
+ return new lunr.Set(Object.keys(this.elements).concat(Object.keys(other.elements)))
+}
+/**
+ * A function to calculate the inverse document frequency for
+ * a posting. This is shared between the builder and the index
+ *
+ * @private
+ * @param {object} posting - The posting for a given term
+ * @param {number} documentCount - The total number of documents.
+ */
+lunr.idf = function (posting, documentCount) {
+ var documentsWithTerm = 0
+
+ for (var fieldName in posting) {
+ if (fieldName == '_index') continue // Ignore the term index, its not a field
+ documentsWithTerm += Object.keys(posting[fieldName]).length
+ }
+
+ var x = (documentCount - documentsWithTerm + 0.5) / (documentsWithTerm + 0.5)
+
+ return Math.log(1 + Math.abs(x))
+}
+
+/**
+ * A token wraps a string representation of a token
+ * as it is passed through the text processing pipeline.
+ *
+ * @constructor
+ * @param {string} [str=''] - The string token being wrapped.
+ * @param {object} [metadata={}] - Metadata associated with this token.
+ */
+lunr.Token = function (str, metadata) {
+ this.str = str || ""
+ this.metadata = metadata || {}
+}
+
+/**
+ * Returns the token string that is being wrapped by this object.
+ *
+ * @returns {string}
+ */
+lunr.Token.prototype.toString = function () {
+ return this.str
+}
+
+/**
+ * A token update function is used when updating or optionally
+ * when cloning a token.
+ *
+ * @callback lunr.Token~updateFunction
+ * @param {string} str - The string representation of the token.
+ * @param {Object} metadata - All metadata associated with this token.
+ */
+
+/**
+ * Applies the given function to the wrapped string token.
+ *
+ * @example
+ * token.update(function (str, metadata) {
+ * return str.toUpperCase()
+ * })
+ *
+ * @param {lunr.Token~updateFunction} fn - A function to apply to the token string.
+ * @returns {lunr.Token}
+ */
+lunr.Token.prototype.update = function (fn) {
+ this.str = fn(this.str, this.metadata)
+ return this
+}
+
+/**
+ * Creates a clone of this token. Optionally a function can be
+ * applied to the cloned token.
+ *
+ * @param {lunr.Token~updateFunction} [fn] - An optional function to apply to the cloned token.
+ * @returns {lunr.Token}
+ */
+lunr.Token.prototype.clone = function (fn) {
+ fn = fn || function (s) { return s }
+ return new lunr.Token (fn(this.str, this.metadata), this.metadata)
+}
+/*!
+ * lunr.tokenizer
+ * Copyright (C) 2020 Oliver Nightingale
+ */
+
+/**
+ * A function for splitting a string into tokens ready to be inserted into
+ * the search index. Uses `lunr.tokenizer.separator` to split strings, change
+ * the value of this property to change how strings are split into tokens.
+ *
+ * This tokenizer will convert its parameter to a string by calling `toString` and
+ * then will split this string on the character in `lunr.tokenizer.separator`.
+ * Arrays will have their elements converted to strings and wrapped in a lunr.Token.
+ *
+ * Optional metadata can be passed to the tokenizer, this metadata will be cloned and
+ * added as metadata to every token that is created from the object to be tokenized.
+ *
+ * @static
+ * @param {?(string|object|object[])} obj - The object to convert into tokens
+ * @param {?object} metadata - Optional metadata to associate with every token
+ * @returns {lunr.Token[]}
+ * @see {@link lunr.Pipeline}
+ */
+lunr.tokenizer = function (obj, metadata) {
+ if (obj == null || obj == undefined) {
+ return []
+ }
+
+ if (Array.isArray(obj)) {
+ return obj.map(function (t) {
+ return new lunr.Token(
+ lunr.utils.asString(t).toLowerCase(),
+ lunr.utils.clone(metadata)
+ )
+ })
+ }
+
+ var str = obj.toString().toLowerCase(),
+ len = str.length,
+ tokens = []
+
+ for (var sliceEnd = 0, sliceStart = 0; sliceEnd <= len; sliceEnd++) {
+ var char = str.charAt(sliceEnd),
+ sliceLength = sliceEnd - sliceStart
+
+ if ((char.match(lunr.tokenizer.separator) || sliceEnd == len)) {
+
+ if (sliceLength > 0) {
+ var tokenMetadata = lunr.utils.clone(metadata) || {}
+ tokenMetadata["position"] = [sliceStart, sliceLength]
+ tokenMetadata["index"] = tokens.length
+
+ tokens.push(
+ new lunr.Token (
+ str.slice(sliceStart, sliceEnd),
+ tokenMetadata
+ )
+ )
+ }
+
+ sliceStart = sliceEnd + 1
+ }
+
+ }
+
+ return tokens
+}
+
+/**
+ * The separator used to split a string into tokens. Override this property to change the behaviour of
+ * `lunr.tokenizer` behaviour when tokenizing strings. By default this splits on whitespace and hyphens.
+ *
+ * @static
+ * @see lunr.tokenizer
+ */
+lunr.tokenizer.separator = /[\s\-]+/
+/*!
+ * lunr.Pipeline
+ * Copyright (C) 2020 Oliver Nightingale
+ */
+
+/**
+ * lunr.Pipelines maintain an ordered list of functions to be applied to all
+ * tokens in documents entering the search index and queries being ran against
+ * the index.
+ *
+ * An instance of lunr.Index created with the lunr shortcut will contain a
+ * pipeline with a stop word filter and an English language stemmer. Extra
+ * functions can be added before or after either of these functions or these
+ * default functions can be removed.
+ *
+ * When run the pipeline will call each function in turn, passing a token, the
+ * index of that token in the original list of all tokens and finally a list of
+ * all the original tokens.
+ *
+ * The output of functions in the pipeline will be passed to the next function
+ * in the pipeline. To exclude a token from entering the index the function
+ * should return undefined, the rest of the pipeline will not be called with
+ * this token.
+ *
+ * For serialisation of pipelines to work, all functions used in an instance of
+ * a pipeline should be registered with lunr.Pipeline. Registered functions can
+ * then be loaded. If trying to load a serialised pipeline that uses functions
+ * that are not registered an error will be thrown.
+ *
+ * If not planning on serialising the pipeline then registering pipeline functions
+ * is not necessary.
+ *
+ * @constructor
+ */
+lunr.Pipeline = function () {
+ this._stack = []
+}
+
+lunr.Pipeline.registeredFunctions = Object.create(null)
+
+/**
+ * A pipeline function maps lunr.Token to lunr.Token. A lunr.Token contains the token
+ * string as well as all known metadata. A pipeline function can mutate the token string
+ * or mutate (or add) metadata for a given token.
+ *
+ * A pipeline function can indicate that the passed token should be discarded by returning
+ * null, undefined or an empty string. This token will not be passed to any downstream pipeline
+ * functions and will not be added to the index.
+ *
+ * Multiple tokens can be returned by returning an array of tokens. Each token will be passed
+ * to any downstream pipeline functions and all will returned tokens will be added to the index.
+ *
+ * Any number of pipeline functions may be chained together using a lunr.Pipeline.
+ *
+ * @interface lunr.PipelineFunction
+ * @param {lunr.Token} token - A token from the document being processed.
+ * @param {number} i - The index of this token in the complete list of tokens for this document/field.
+ * @param {lunr.Token[]} tokens - All tokens for this document/field.
+ * @returns {(?lunr.Token|lunr.Token[])}
+ */
+
+/**
+ * Register a function with the pipeline.
+ *
+ * Functions that are used in the pipeline should be registered if the pipeline
+ * needs to be serialised, or a serialised pipeline needs to be loaded.
+ *
+ * Registering a function does not add it to a pipeline, functions must still be
+ * added to instances of the pipeline for them to be used when running a pipeline.
+ *
+ * @param {lunr.PipelineFunction} fn - The function to check for.
+ * @param {String} label - The label to register this function with
+ */
+lunr.Pipeline.registerFunction = function (fn, label) {
+ if (label in this.registeredFunctions) {
+ lunr.utils.warn('Overwriting existing registered function: ' + label)
+ }
+
+ fn.label = label
+ lunr.Pipeline.registeredFunctions[fn.label] = fn
+}
+
+/**
+ * Warns if the function is not registered as a Pipeline function.
+ *
+ * @param {lunr.PipelineFunction} fn - The function to check for.
+ * @private
+ */
+lunr.Pipeline.warnIfFunctionNotRegistered = function (fn) {
+ var isRegistered = fn.label && (fn.label in this.registeredFunctions)
+
+ if (!isRegistered) {
+ lunr.utils.warn('Function is not registered with pipeline. This may cause problems when serialising the index.\n', fn)
+ }
+}
+
+/**
+ * Loads a previously serialised pipeline.
+ *
+ * All functions to be loaded must already be registered with lunr.Pipeline.
+ * If any function from the serialised data has not been registered then an
+ * error will be thrown.
+ *
+ * @param {Object} serialised - The serialised pipeline to load.
+ * @returns {lunr.Pipeline}
+ */
+lunr.Pipeline.load = function (serialised) {
+ var pipeline = new lunr.Pipeline
+
+ serialised.forEach(function (fnName) {
+ var fn = lunr.Pipeline.registeredFunctions[fnName]
+
+ if (fn) {
+ pipeline.add(fn)
+ } else {
+ throw new Error('Cannot load unregistered function: ' + fnName)
+ }
+ })
+
+ return pipeline
+}
+
+/**
+ * Adds new functions to the end of the pipeline.
+ *
+ * Logs a warning if the function has not been registered.
+ *
+ * @param {lunr.PipelineFunction[]} functions - Any number of functions to add to the pipeline.
+ */
+lunr.Pipeline.prototype.add = function () {
+ var fns = Array.prototype.slice.call(arguments)
+
+ fns.forEach(function (fn) {
+ lunr.Pipeline.warnIfFunctionNotRegistered(fn)
+ this._stack.push(fn)
+ }, this)
+}
+
+/**
+ * Adds a single function after a function that already exists in the
+ * pipeline.
+ *
+ * Logs a warning if the function has not been registered.
+ *
+ * @param {lunr.PipelineFunction} existingFn - A function that already exists in the pipeline.
+ * @param {lunr.PipelineFunction} newFn - The new function to add to the pipeline.
+ */
+lunr.Pipeline.prototype.after = function (existingFn, newFn) {
+ lunr.Pipeline.warnIfFunctionNotRegistered(newFn)
+
+ var pos = this._stack.indexOf(existingFn)
+ if (pos == -1) {
+ throw new Error('Cannot find existingFn')
+ }
+
+ pos = pos + 1
+ this._stack.splice(pos, 0, newFn)
+}
+
+/**
+ * Adds a single function before a function that already exists in the
+ * pipeline.
+ *
+ * Logs a warning if the function has not been registered.
+ *
+ * @param {lunr.PipelineFunction} existingFn - A function that already exists in the pipeline.
+ * @param {lunr.PipelineFunction} newFn - The new function to add to the pipeline.
+ */
+lunr.Pipeline.prototype.before = function (existingFn, newFn) {
+ lunr.Pipeline.warnIfFunctionNotRegistered(newFn)
+
+ var pos = this._stack.indexOf(existingFn)
+ if (pos == -1) {
+ throw new Error('Cannot find existingFn')
+ }
+
+ this._stack.splice(pos, 0, newFn)
+}
+
+/**
+ * Removes a function from the pipeline.
+ *
+ * @param {lunr.PipelineFunction} fn The function to remove from the pipeline.
+ */
+lunr.Pipeline.prototype.remove = function (fn) {
+ var pos = this._stack.indexOf(fn)
+ if (pos == -1) {
+ return
+ }
+
+ this._stack.splice(pos, 1)
+}
+
+/**
+ * Runs the current list of functions that make up the pipeline against the
+ * passed tokens.
+ *
+ * @param {Array} tokens The tokens to run through the pipeline.
+ * @returns {Array}
+ */
+lunr.Pipeline.prototype.run = function (tokens) {
+ var stackLength = this._stack.length
+
+ for (var i = 0; i < stackLength; i++) {
+ var fn = this._stack[i]
+ var memo = []
+
+ for (var j = 0; j < tokens.length; j++) {
+ var result = fn(tokens[j], j, tokens)
+
+ if (result === null || result === void 0 || result === '') continue
+
+ if (Array.isArray(result)) {
+ for (var k = 0; k < result.length; k++) {
+ memo.push(result[k])
+ }
+ } else {
+ memo.push(result)
+ }
+ }
+
+ tokens = memo
+ }
+
+ return tokens
+}
+
+/**
+ * Convenience method for passing a string through a pipeline and getting
+ * strings out. This method takes care of wrapping the passed string in a
+ * token and mapping the resulting tokens back to strings.
+ *
+ * @param {string} str - The string to pass through the pipeline.
+ * @param {?object} metadata - Optional metadata to associate with the token
+ * passed to the pipeline.
+ * @returns {string[]}
+ */
+lunr.Pipeline.prototype.runString = function (str, metadata) {
+ var token = new lunr.Token (str, metadata)
+
+ return this.run([token]).map(function (t) {
+ return t.toString()
+ })
+}
+
+/**
+ * Resets the pipeline by removing any existing processors.
+ *
+ */
+lunr.Pipeline.prototype.reset = function () {
+ this._stack = []
+}
+
+/**
+ * Returns a representation of the pipeline ready for serialisation.
+ *
+ * Logs a warning if the function has not been registered.
+ *
+ * @returns {Array}
+ */
+lunr.Pipeline.prototype.toJSON = function () {
+ return this._stack.map(function (fn) {
+ lunr.Pipeline.warnIfFunctionNotRegistered(fn)
+
+ return fn.label
+ })
+}
+/*!
+ * lunr.Vector
+ * Copyright (C) 2020 Oliver Nightingale
+ */
+
+/**
+ * A vector is used to construct the vector space of documents and queries. These
+ * vectors support operations to determine the similarity between two documents or
+ * a document and a query.
+ *
+ * Normally no parameters are required for initializing a vector, but in the case of
+ * loading a previously dumped vector the raw elements can be provided to the constructor.
+ *
+ * For performance reasons vectors are implemented with a flat array, where an elements
+ * index is immediately followed by its value. E.g. [index, value, index, value]. This
+ * allows the underlying array to be as sparse as possible and still offer decent
+ * performance when being used for vector calculations.
+ *
+ * @constructor
+ * @param {Number[]} [elements] - The flat list of element index and element value pairs.
+ */
+lunr.Vector = function (elements) {
+ this._magnitude = 0
+ this.elements = elements || []
+}
+
+
+/**
+ * Calculates the position within the vector to insert a given index.
+ *
+ * This is used internally by insert and upsert. If there are duplicate indexes then
+ * the position is returned as if the value for that index were to be updated, but it
+ * is the callers responsibility to check whether there is a duplicate at that index
+ *
+ * @param {Number} insertIdx - The index at which the element should be inserted.
+ * @returns {Number}
+ */
+lunr.Vector.prototype.positionForIndex = function (index) {
+ // For an empty vector the tuple can be inserted at the beginning
+ if (this.elements.length == 0) {
+ return 0
+ }
+
+ var start = 0,
+ end = this.elements.length / 2,
+ sliceLength = end - start,
+ pivotPoint = Math.floor(sliceLength / 2),
+ pivotIndex = this.elements[pivotPoint * 2]
+
+ while (sliceLength > 1) {
+ if (pivotIndex < index) {
+ start = pivotPoint
+ }
+
+ if (pivotIndex > index) {
+ end = pivotPoint
+ }
+
+ if (pivotIndex == index) {
+ break
+ }
+
+ sliceLength = end - start
+ pivotPoint = start + Math.floor(sliceLength / 2)
+ pivotIndex = this.elements[pivotPoint * 2]
+ }
+
+ if (pivotIndex == index) {
+ return pivotPoint * 2
+ }
+
+ if (pivotIndex > index) {
+ return pivotPoint * 2
+ }
+
+ if (pivotIndex < index) {
+ return (pivotPoint + 1) * 2
+ }
+}
+
+/**
+ * Inserts an element at an index within the vector.
+ *
+ * Does not allow duplicates, will throw an error if there is already an entry
+ * for this index.
+ *
+ * @param {Number} insertIdx - The index at which the element should be inserted.
+ * @param {Number} val - The value to be inserted into the vector.
+ */
+lunr.Vector.prototype.insert = function (insertIdx, val) {
+ this.upsert(insertIdx, val, function () {
+ throw "duplicate index"
+ })
+}
+
+/**
+ * Inserts or updates an existing index within the vector.
+ *
+ * @param {Number} insertIdx - The index at which the element should be inserted.
+ * @param {Number} val - The value to be inserted into the vector.
+ * @param {function} fn - A function that is called for updates, the existing value and the
+ * requested value are passed as arguments
+ */
+lunr.Vector.prototype.upsert = function (insertIdx, val, fn) {
+ this._magnitude = 0
+ var position = this.positionForIndex(insertIdx)
+
+ if (this.elements[position] == insertIdx) {
+ this.elements[position + 1] = fn(this.elements[position + 1], val)
+ } else {
+ this.elements.splice(position, 0, insertIdx, val)
+ }
+}
+
+/**
+ * Calculates the magnitude of this vector.
+ *
+ * @returns {Number}
+ */
+lunr.Vector.prototype.magnitude = function () {
+ if (this._magnitude) return this._magnitude
+
+ var sumOfSquares = 0,
+ elementsLength = this.elements.length
+
+ for (var i = 1; i < elementsLength; i += 2) {
+ var val = this.elements[i]
+ sumOfSquares += val * val
+ }
+
+ return this._magnitude = Math.sqrt(sumOfSquares)
+}
+
+/**
+ * Calculates the dot product of this vector and another vector.
+ *
+ * @param {lunr.Vector} otherVector - The vector to compute the dot product with.
+ * @returns {Number}
+ */
+lunr.Vector.prototype.dot = function (otherVector) {
+ var dotProduct = 0,
+ a = this.elements, b = otherVector.elements,
+ aLen = a.length, bLen = b.length,
+ aVal = 0, bVal = 0,
+ i = 0, j = 0
+
+ while (i < aLen && j < bLen) {
+ aVal = a[i], bVal = b[j]
+ if (aVal < bVal) {
+ i += 2
+ } else if (aVal > bVal) {
+ j += 2
+ } else if (aVal == bVal) {
+ dotProduct += a[i + 1] * b[j + 1]
+ i += 2
+ j += 2
+ }
+ }
+
+ return dotProduct
+}
+
+/**
+ * Calculates the similarity between this vector and another vector.
+ *
+ * @param {lunr.Vector} otherVector - The other vector to calculate the
+ * similarity with.
+ * @returns {Number}
+ */
+lunr.Vector.prototype.similarity = function (otherVector) {
+ return this.dot(otherVector) / this.magnitude() || 0
+}
+
+/**
+ * Converts the vector to an array of the elements within the vector.
+ *
+ * @returns {Number[]}
+ */
+lunr.Vector.prototype.toArray = function () {
+ var output = new Array (this.elements.length / 2)
+
+ for (var i = 1, j = 0; i < this.elements.length; i += 2, j++) {
+ output[j] = this.elements[i]
+ }
+
+ return output
+}
+
+/**
+ * A JSON serializable representation of the vector.
+ *
+ * @returns {Number[]}
+ */
+lunr.Vector.prototype.toJSON = function () {
+ return this.elements
+}
+/* eslint-disable */
+/*!
+ * lunr.stemmer
+ * Copyright (C) 2020 Oliver Nightingale
+ * Includes code from - http://tartarus.org/~martin/PorterStemmer/js.txt
+ */
+
+/**
+ * lunr.stemmer is an english language stemmer, this is a JavaScript
+ * implementation of the PorterStemmer taken from http://tartarus.org/~martin
+ *
+ * @static
+ * @implements {lunr.PipelineFunction}
+ * @param {lunr.Token} token - The string to stem
+ * @returns {lunr.Token}
+ * @see {@link lunr.Pipeline}
+ * @function
+ */
+lunr.stemmer = (function(){
+ var step2list = {
+ "ational" : "ate",
+ "tional" : "tion",
+ "enci" : "ence",
+ "anci" : "ance",
+ "izer" : "ize",
+ "bli" : "ble",
+ "alli" : "al",
+ "entli" : "ent",
+ "eli" : "e",
+ "ousli" : "ous",
+ "ization" : "ize",
+ "ation" : "ate",
+ "ator" : "ate",
+ "alism" : "al",
+ "iveness" : "ive",
+ "fulness" : "ful",
+ "ousness" : "ous",
+ "aliti" : "al",
+ "iviti" : "ive",
+ "biliti" : "ble",
+ "logi" : "log"
+ },
+
+ step3list = {
+ "icate" : "ic",
+ "ative" : "",
+ "alize" : "al",
+ "iciti" : "ic",
+ "ical" : "ic",
+ "ful" : "",
+ "ness" : ""
+ },
+
+ c = "[^aeiou]", // consonant
+ v = "[aeiouy]", // vowel
+ C = c + "[^aeiouy]*", // consonant sequence
+ V = v + "[aeiou]*", // vowel sequence
+
+ mgr0 = "^(" + C + ")?" + V + C, // [C]VC... is m>0
+ meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$", // [C]VC[V] is m=1
+ mgr1 = "^(" + C + ")?" + V + C + V + C, // [C]VCVC... is m>1
+ s_v = "^(" + C + ")?" + v; // vowel in stem
+
+ var re_mgr0 = new RegExp(mgr0);
+ var re_mgr1 = new RegExp(mgr1);
+ var re_meq1 = new RegExp(meq1);
+ var re_s_v = new RegExp(s_v);
+
+ var re_1a = /^(.+?)(ss|i)es$/;
+ var re2_1a = /^(.+?)([^s])s$/;
+ var re_1b = /^(.+?)eed$/;
+ var re2_1b = /^(.+?)(ed|ing)$/;
+ var re_1b_2 = /.$/;
+ var re2_1b_2 = /(at|bl|iz)$/;
+ var re3_1b_2 = new RegExp("([^aeiouylsz])\\1$");
+ var re4_1b_2 = new RegExp("^" + C + v + "[^aeiouwxy]$");
+
+ var re_1c = /^(.+?[^aeiou])y$/;
+ var re_2 = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
+
+ var re_3 = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
+
+ var re_4 = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
+ var re2_4 = /^(.+?)(s|t)(ion)$/;
+
+ var re_5 = /^(.+?)e$/;
+ var re_5_1 = /ll$/;
+ var re3_5 = new RegExp("^" + C + v + "[^aeiouwxy]$");
+
+ var porterStemmer = function porterStemmer(w) {
+ var stem,
+ suffix,
+ firstch,
+ re,
+ re2,
+ re3,
+ re4;
+
+ if (w.length < 3) { return w; }
+
+ firstch = w.substr(0,1);
+ if (firstch == "y") {
+ w = firstch.toUpperCase() + w.substr(1);
+ }
+
+ // Step 1a
+ re = re_1a
+ re2 = re2_1a;
+
+ if (re.test(w)) { w = w.replace(re,"$1$2"); }
+ else if (re2.test(w)) { w = w.replace(re2,"$1$2"); }
+
+ // Step 1b
+ re = re_1b;
+ re2 = re2_1b;
+ if (re.test(w)) {
+ var fp = re.exec(w);
+ re = re_mgr0;
+ if (re.test(fp[1])) {
+ re = re_1b_2;
+ w = w.replace(re,"");
+ }
+ } else if (re2.test(w)) {
+ var fp = re2.exec(w);
+ stem = fp[1];
+ re2 = re_s_v;
+ if (re2.test(stem)) {
+ w = stem;
+ re2 = re2_1b_2;
+ re3 = re3_1b_2;
+ re4 = re4_1b_2;
+ if (re2.test(w)) { w = w + "e"; }
+ else if (re3.test(w)) { re = re_1b_2; w = w.replace(re,""); }
+ else if (re4.test(w)) { w = w + "e"; }
+ }
+ }
+
+ // Step 1c - replace suffix y or Y by i if preceded by a non-vowel which is not the first letter of the word (so cry -> cri, by -> by, say -> say)
+ re = re_1c;
+ if (re.test(w)) {
+ var fp = re.exec(w);
+ stem = fp[1];
+ w = stem + "i";
+ }
+
+ // Step 2
+ re = re_2;
+ if (re.test(w)) {
+ var fp = re.exec(w);
+ stem = fp[1];
+ suffix = fp[2];
+ re = re_mgr0;
+ if (re.test(stem)) {
+ w = stem + step2list[suffix];
+ }
+ }
+
+ // Step 3
+ re = re_3;
+ if (re.test(w)) {
+ var fp = re.exec(w);
+ stem = fp[1];
+ suffix = fp[2];
+ re = re_mgr0;
+ if (re.test(stem)) {
+ w = stem + step3list[suffix];
+ }
+ }
+
+ // Step 4
+ re = re_4;
+ re2 = re2_4;
+ if (re.test(w)) {
+ var fp = re.exec(w);
+ stem = fp[1];
+ re = re_mgr1;
+ if (re.test(stem)) {
+ w = stem;
+ }
+ } else if (re2.test(w)) {
+ var fp = re2.exec(w);
+ stem = fp[1] + fp[2];
+ re2 = re_mgr1;
+ if (re2.test(stem)) {
+ w = stem;
+ }
+ }
+
+ // Step 5
+ re = re_5;
+ if (re.test(w)) {
+ var fp = re.exec(w);
+ stem = fp[1];
+ re = re_mgr1;
+ re2 = re_meq1;
+ re3 = re3_5;
+ if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) {
+ w = stem;
+ }
+ }
+
+ re = re_5_1;
+ re2 = re_mgr1;
+ if (re.test(w) && re2.test(w)) {
+ re = re_1b_2;
+ w = w.replace(re,"");
+ }
+
+ // and turn initial Y back to y
+
+ if (firstch == "y") {
+ w = firstch.toLowerCase() + w.substr(1);
+ }
+
+ return w;
+ };
+
+ return function (token) {
+ return token.update(porterStemmer);
+ }
+})();
+
+lunr.Pipeline.registerFunction(lunr.stemmer, 'stemmer')
+/*!
+ * lunr.stopWordFilter
+ * Copyright (C) 2020 Oliver Nightingale
+ */
+
+/**
+ * lunr.generateStopWordFilter builds a stopWordFilter function from the provided
+ * list of stop words.
+ *
+ * The built in lunr.stopWordFilter is built using this generator and can be used
+ * to generate custom stopWordFilters for applications or non English languages.
+ *
+ * @function
+ * @param {Array} token The token to pass through the filter
+ * @returns {lunr.PipelineFunction}
+ * @see lunr.Pipeline
+ * @see lunr.stopWordFilter
+ */
+lunr.generateStopWordFilter = function (stopWords) {
+ var words = stopWords.reduce(function (memo, stopWord) {
+ memo[stopWord] = stopWord
+ return memo
+ }, {})
+
+ return function (token) {
+ if (token && words[token.toString()] !== token.toString()) return token
+ }
+}
+
+/**
+ * lunr.stopWordFilter is an English language stop word list filter, any words
+ * contained in the list will not be passed through the filter.
+ *
+ * This is intended to be used in the Pipeline. If the token does not pass the
+ * filter then undefined will be returned.
+ *
+ * @function
+ * @implements {lunr.PipelineFunction}
+ * @params {lunr.Token} token - A token to check for being a stop word.
+ * @returns {lunr.Token}
+ * @see {@link lunr.Pipeline}
+ */
+lunr.stopWordFilter = lunr.generateStopWordFilter([
+ 'a',
+ 'able',
+ 'about',
+ 'across',
+ 'after',
+ 'all',
+ 'almost',
+ 'also',
+ 'am',
+ 'among',
+ 'an',
+ 'and',
+ 'any',
+ 'are',
+ 'as',
+ 'at',
+ 'be',
+ 'because',
+ 'been',
+ 'but',
+ 'by',
+ 'can',
+ 'cannot',
+ 'could',
+ 'dear',
+ 'did',
+ 'do',
+ 'does',
+ 'either',
+ 'else',
+ 'ever',
+ 'every',
+ 'for',
+ 'from',
+ 'get',
+ 'got',
+ 'had',
+ 'has',
+ 'have',
+ 'he',
+ 'her',
+ 'hers',
+ 'him',
+ 'his',
+ 'how',
+ 'however',
+ 'i',
+ 'if',
+ 'in',
+ 'into',
+ 'is',
+ 'it',
+ 'its',
+ 'just',
+ 'least',
+ 'let',
+ 'like',
+ 'likely',
+ 'may',
+ 'me',
+ 'might',
+ 'most',
+ 'must',
+ 'my',
+ 'neither',
+ 'no',
+ 'nor',
+ 'not',
+ 'of',
+ 'off',
+ 'often',
+ 'on',
+ 'only',
+ 'or',
+ 'other',
+ 'our',
+ 'own',
+ 'rather',
+ 'said',
+ 'say',
+ 'says',
+ 'she',
+ 'should',
+ 'since',
+ 'so',
+ 'some',
+ 'than',
+ 'that',
+ 'the',
+ 'their',
+ 'them',
+ 'then',
+ 'there',
+ 'these',
+ 'they',
+ 'this',
+ 'tis',
+ 'to',
+ 'too',
+ 'twas',
+ 'us',
+ 'wants',
+ 'was',
+ 'we',
+ 'were',
+ 'what',
+ 'when',
+ 'where',
+ 'which',
+ 'while',
+ 'who',
+ 'whom',
+ 'why',
+ 'will',
+ 'with',
+ 'would',
+ 'yet',
+ 'you',
+ 'your'
+])
+
+lunr.Pipeline.registerFunction(lunr.stopWordFilter, 'stopWordFilter')
+/*!
+ * lunr.trimmer
+ * Copyright (C) 2020 Oliver Nightingale
+ */
+
+/**
+ * lunr.trimmer is a pipeline function for trimming non word
+ * characters from the beginning and end of tokens before they
+ * enter the index.
+ *
+ * This implementation may not work correctly for non latin
+ * characters and should either be removed or adapted for use
+ * with languages with non-latin characters.
+ *
+ * @static
+ * @implements {lunr.PipelineFunction}
+ * @param {lunr.Token} token The token to pass through the filter
+ * @returns {lunr.Token}
+ * @see lunr.Pipeline
+ */
+lunr.trimmer = function (token) {
+ return token.update(function (s) {
+ return s.replace(/^\W+/, '').replace(/\W+$/, '')
+ })
+}
+
+lunr.Pipeline.registerFunction(lunr.trimmer, 'trimmer')
+/*!
+ * lunr.TokenSet
+ * Copyright (C) 2020 Oliver Nightingale
+ */
+
+/**
+ * A token set is used to store the unique list of all tokens
+ * within an index. Token sets are also used to represent an
+ * incoming query to the index, this query token set and index
+ * token set are then intersected to find which tokens to look
+ * up in the inverted index.
+ *
+ * A token set can hold multiple tokens, as in the case of the
+ * index token set, or it can hold a single token as in the
+ * case of a simple query token set.
+ *
+ * Additionally token sets are used to perform wildcard matching.
+ * Leading, contained and trailing wildcards are supported, and
+ * from this edit distance matching can also be provided.
+ *
+ * Token sets are implemented as a minimal finite state automata,
+ * where both common prefixes and suffixes are shared between tokens.
+ * This helps to reduce the space used for storing the token set.
+ *
+ * @constructor
+ */
+lunr.TokenSet = function () {
+ this.final = false
+ this.edges = {}
+ this.id = lunr.TokenSet._nextId
+ lunr.TokenSet._nextId += 1
+}
+
+/**
+ * Keeps track of the next, auto increment, identifier to assign
+ * to a new tokenSet.
+ *
+ * TokenSets require a unique identifier to be correctly minimised.
+ *
+ * @private
+ */
+lunr.TokenSet._nextId = 1
+
+/**
+ * Creates a TokenSet instance from the given sorted array of words.
+ *
+ * @param {String[]} arr - A sorted array of strings to create the set from.
+ * @returns {lunr.TokenSet}
+ * @throws Will throw an error if the input array is not sorted.
+ */
+lunr.TokenSet.fromArray = function (arr) {
+ var builder = new lunr.TokenSet.Builder
+
+ for (var i = 0, len = arr.length; i < len; i++) {
+ builder.insert(arr[i])
+ }
+
+ builder.finish()
+ return builder.root
+}
+
+/**
+ * Creates a token set from a query clause.
+ *
+ * @private
+ * @param {Object} clause - A single clause from lunr.Query.
+ * @param {string} clause.term - The query clause term.
+ * @param {number} [clause.editDistance] - The optional edit distance for the term.
+ * @returns {lunr.TokenSet}
+ */
+lunr.TokenSet.fromClause = function (clause) {
+ if ('editDistance' in clause) {
+ return lunr.TokenSet.fromFuzzyString(clause.term, clause.editDistance)
+ } else {
+ return lunr.TokenSet.fromString(clause.term)
+ }
+}
+
+/**
+ * Creates a token set representing a single string with a specified
+ * edit distance.
+ *
+ * Insertions, deletions, substitutions and transpositions are each
+ * treated as an edit distance of 1.
+ *
+ * Increasing the allowed edit distance will have a dramatic impact
+ * on the performance of both creating and intersecting these TokenSets.
+ * It is advised to keep the edit distance less than 3.
+ *
+ * @param {string} str - The string to create the token set from.
+ * @param {number} editDistance - The allowed edit distance to match.
+ * @returns {lunr.Vector}
+ */
+lunr.TokenSet.fromFuzzyString = function (str, editDistance) {
+ var root = new lunr.TokenSet
+
+ var stack = [{
+ node: root,
+ editsRemaining: editDistance,
+ str: str
+ }]
+
+ while (stack.length) {
+ var frame = stack.pop()
+
+ // no edit
+ if (frame.str.length > 0) {
+ var char = frame.str.charAt(0),
+ noEditNode
+
+ if (char in frame.node.edges) {
+ noEditNode = frame.node.edges[char]
+ } else {
+ noEditNode = new lunr.TokenSet
+ frame.node.edges[char] = noEditNode
+ }
+
+ if (frame.str.length == 1) {
+ noEditNode.final = true
+ }
+
+ stack.push({
+ node: noEditNode,
+ editsRemaining: frame.editsRemaining,
+ str: frame.str.slice(1)
+ })
+ }
+
+ if (frame.editsRemaining == 0) {
+ continue
+ }
+
+ // insertion
+ if ("*" in frame.node.edges) {
+ var insertionNode = frame.node.edges["*"]
+ } else {
+ var insertionNode = new lunr.TokenSet
+ frame.node.edges["*"] = insertionNode
+ }
+
+ if (frame.str.length == 0) {
+ insertionNode.final = true
+ }
+
+ stack.push({
+ node: insertionNode,
+ editsRemaining: frame.editsRemaining - 1,
+ str: frame.str
+ })
+
+ // deletion
+ // can only do a deletion if we have enough edits remaining
+ // and if there are characters left to delete in the string
+ if (frame.str.length > 1) {
+ stack.push({
+ node: frame.node,
+ editsRemaining: frame.editsRemaining - 1,
+ str: frame.str.slice(1)
+ })
+ }
+
+ // deletion
+ // just removing the last character from the str
+ if (frame.str.length == 1) {
+ frame.node.final = true
+ }
+
+ // substitution
+ // can only do a substitution if we have enough edits remaining
+ // and if there are characters left to substitute
+ if (frame.str.length >= 1) {
+ if ("*" in frame.node.edges) {
+ var substitutionNode = frame.node.edges["*"]
+ } else {
+ var substitutionNode = new lunr.TokenSet
+ frame.node.edges["*"] = substitutionNode
+ }
+
+ if (frame.str.length == 1) {
+ substitutionNode.final = true
+ }
+
+ stack.push({
+ node: substitutionNode,
+ editsRemaining: frame.editsRemaining - 1,
+ str: frame.str.slice(1)
+ })
+ }
+
+ // transposition
+ // can only do a transposition if there are edits remaining
+ // and there are enough characters to transpose
+ if (frame.str.length > 1) {
+ var charA = frame.str.charAt(0),
+ charB = frame.str.charAt(1),
+ transposeNode
+
+ if (charB in frame.node.edges) {
+ transposeNode = frame.node.edges[charB]
+ } else {
+ transposeNode = new lunr.TokenSet
+ frame.node.edges[charB] = transposeNode
+ }
+
+ if (frame.str.length == 1) {
+ transposeNode.final = true
+ }
+
+ stack.push({
+ node: transposeNode,
+ editsRemaining: frame.editsRemaining - 1,
+ str: charA + frame.str.slice(2)
+ })
+ }
+ }
+
+ return root
+}
+
+/**
+ * Creates a TokenSet from a string.
+ *
+ * The string may contain one or more wildcard characters (*)
+ * that will allow wildcard matching when intersecting with
+ * another TokenSet.
+ *
+ * @param {string} str - The string to create a TokenSet from.
+ * @returns {lunr.TokenSet}
+ */
+lunr.TokenSet.fromString = function (str) {
+ var node = new lunr.TokenSet,
+ root = node
+
+ /*
+ * Iterates through all characters within the passed string
+ * appending a node for each character.
+ *
+ * When a wildcard character is found then a self
+ * referencing edge is introduced to continually match
+ * any number of any characters.
+ */
+ for (var i = 0, len = str.length; i < len; i++) {
+ var char = str[i],
+ final = (i == len - 1)
+
+ if (char == "*") {
+ node.edges[char] = node
+ node.final = final
+
+ } else {
+ var next = new lunr.TokenSet
+ next.final = final
+
+ node.edges[char] = next
+ node = next
+ }
+ }
+
+ return root
+}
+
+/**
+ * Converts this TokenSet into an array of strings
+ * contained within the TokenSet.
+ *
+ * This is not intended to be used on a TokenSet that
+ * contains wildcards, in these cases the results are
+ * undefined and are likely to cause an infinite loop.
+ *
+ * @returns {string[]}
+ */
+lunr.TokenSet.prototype.toArray = function () {
+ var words = []
+
+ var stack = [{
+ prefix: "",
+ node: this
+ }]
+
+ while (stack.length) {
+ var frame = stack.pop(),
+ edges = Object.keys(frame.node.edges),
+ len = edges.length
+
+ if (frame.node.final) {
+ /* In Safari, at this point the prefix is sometimes corrupted, see:
+ * https://github.com/olivernn/lunr.js/issues/279 Calling any
+ * String.prototype method forces Safari to "cast" this string to what
+ * it's supposed to be, fixing the bug. */
+ frame.prefix.charAt(0)
+ words.push(frame.prefix)
+ }
+
+ for (var i = 0; i < len; i++) {
+ var edge = edges[i]
+
+ stack.push({
+ prefix: frame.prefix.concat(edge),
+ node: frame.node.edges[edge]
+ })
+ }
+ }
+
+ return words
+}
+
+/**
+ * Generates a string representation of a TokenSet.
+ *
+ * This is intended to allow TokenSets to be used as keys
+ * in objects, largely to aid the construction and minimisation
+ * of a TokenSet. As such it is not designed to be a human
+ * friendly representation of the TokenSet.
+ *
+ * @returns {string}
+ */
+lunr.TokenSet.prototype.toString = function () {
+ // NOTE: Using Object.keys here as this.edges is very likely
+ // to enter 'hash-mode' with many keys being added
+ //
+ // avoiding a for-in loop here as it leads to the function
+ // being de-optimised (at least in V8). From some simple
+ // benchmarks the performance is comparable, but allowing
+ // V8 to optimize may mean easy performance wins in the future.
+
+ if (this._str) {
+ return this._str
+ }
+
+ var str = this.final ? '1' : '0',
+ labels = Object.keys(this.edges).sort(),
+ len = labels.length
+
+ for (var i = 0; i < len; i++) {
+ var label = labels[i],
+ node = this.edges[label]
+
+ str = str + label + node.id
+ }
+
+ return str
+}
+
+/**
+ * Returns a new TokenSet that is the intersection of
+ * this TokenSet and the passed TokenSet.
+ *
+ * This intersection will take into account any wildcards
+ * contained within the TokenSet.
+ *
+ * @param {lunr.TokenSet} b - An other TokenSet to intersect with.
+ * @returns {lunr.TokenSet}
+ */
+lunr.TokenSet.prototype.intersect = function (b) {
+ var output = new lunr.TokenSet,
+ frame = undefined
+
+ var stack = [{
+ qNode: b,
+ output: output,
+ node: this
+ }]
+
+ while (stack.length) {
+ frame = stack.pop()
+
+ // NOTE: As with the #toString method, we are using
+ // Object.keys and a for loop instead of a for-in loop
+ // as both of these objects enter 'hash' mode, causing
+ // the function to be de-optimised in V8
+ var qEdges = Object.keys(frame.qNode.edges),
+ qLen = qEdges.length,
+ nEdges = Object.keys(frame.node.edges),
+ nLen = nEdges.length
+
+ for (var q = 0; q < qLen; q++) {
+ var qEdge = qEdges[q]
+
+ for (var n = 0; n < nLen; n++) {
+ var nEdge = nEdges[n]
+
+ if (nEdge == qEdge || qEdge == '*') {
+ var node = frame.node.edges[nEdge],
+ qNode = frame.qNode.edges[qEdge],
+ final = node.final && qNode.final,
+ next = undefined
+
+ if (nEdge in frame.output.edges) {
+ // an edge already exists for this character
+ // no need to create a new node, just set the finality
+ // bit unless this node is already final
+ next = frame.output.edges[nEdge]
+ next.final = next.final || final
+
+ } else {
+ // no edge exists yet, must create one
+ // set the finality bit and insert it
+ // into the output
+ next = new lunr.TokenSet
+ next.final = final
+ frame.output.edges[nEdge] = next
+ }
+
+ stack.push({
+ qNode: qNode,
+ output: next,
+ node: node
+ })
+ }
+ }
+ }
+ }
+
+ return output
+}
+lunr.TokenSet.Builder = function () {
+ this.previousWord = ""
+ this.root = new lunr.TokenSet
+ this.uncheckedNodes = []
+ this.minimizedNodes = {}
+}
+
+lunr.TokenSet.Builder.prototype.insert = function (word) {
+ var node,
+ commonPrefix = 0
+
+ if (word < this.previousWord) {
+ throw new Error ("Out of order word insertion")
+ }
+
+ for (var i = 0; i < word.length && i < this.previousWord.length; i++) {
+ if (word[i] != this.previousWord[i]) break
+ commonPrefix++
+ }
+
+ this.minimize(commonPrefix)
+
+ if (this.uncheckedNodes.length == 0) {
+ node = this.root
+ } else {
+ node = this.uncheckedNodes[this.uncheckedNodes.length - 1].child
+ }
+
+ for (var i = commonPrefix; i < word.length; i++) {
+ var nextNode = new lunr.TokenSet,
+ char = word[i]
+
+ node.edges[char] = nextNode
+
+ this.uncheckedNodes.push({
+ parent: node,
+ char: char,
+ child: nextNode
+ })
+
+ node = nextNode
+ }
+
+ node.final = true
+ this.previousWord = word
+}
+
+lunr.TokenSet.Builder.prototype.finish = function () {
+ this.minimize(0)
+}
+
+lunr.TokenSet.Builder.prototype.minimize = function (downTo) {
+ for (var i = this.uncheckedNodes.length - 1; i >= downTo; i--) {
+ var node = this.uncheckedNodes[i],
+ childKey = node.child.toString()
+
+ if (childKey in this.minimizedNodes) {
+ node.parent.edges[node.char] = this.minimizedNodes[childKey]
+ } else {
+ // Cache the key for this node since
+ // we know it can't change anymore
+ node.child._str = childKey
+
+ this.minimizedNodes[childKey] = node.child
+ }
+
+ this.uncheckedNodes.pop()
+ }
+}
+/*!
+ * lunr.Index
+ * Copyright (C) 2020 Oliver Nightingale
+ */
+
+/**
+ * An index contains the built index of all documents and provides a query interface
+ * to the index.
+ *
+ * Usually instances of lunr.Index will not be created using this constructor, instead
+ * lunr.Builder should be used to construct new indexes, or lunr.Index.load should be
+ * used to load previously built and serialized indexes.
+ *
+ * @constructor
+ * @param {Object} attrs - The attributes of the built search index.
+ * @param {Object} attrs.invertedIndex - An index of term/field to document reference.
+ * @param {Object} attrs.fieldVectors - Field vectors
+ * @param {lunr.TokenSet} attrs.tokenSet - An set of all corpus tokens.
+ * @param {string[]} attrs.fields - The names of indexed document fields.
+ * @param {lunr.Pipeline} attrs.pipeline - The pipeline to use for search terms.
+ */
+lunr.Index = function (attrs) {
+ this.invertedIndex = attrs.invertedIndex
+ this.fieldVectors = attrs.fieldVectors
+ this.tokenSet = attrs.tokenSet
+ this.fields = attrs.fields
+ this.pipeline = attrs.pipeline
+}
+
+/**
+ * A result contains details of a document matching a search query.
+ * @typedef {Object} lunr.Index~Result
+ * @property {string} ref - The reference of the document this result represents.
+ * @property {number} score - A number between 0 and 1 representing how similar this document is to the query.
+ * @property {lunr.MatchData} matchData - Contains metadata about this match including which term(s) caused the match.
+ */
+
+/**
+ * Although lunr provides the ability to create queries using lunr.Query, it also provides a simple
+ * query language which itself is parsed into an instance of lunr.Query.
+ *
+ * For programmatically building queries it is advised to directly use lunr.Query, the query language
+ * is best used for human entered text rather than program generated text.
+ *
+ * At its simplest queries can just be a single term, e.g. `hello`, multiple terms are also supported
+ * and will be combined with OR, e.g `hello world` will match documents that contain either 'hello'
+ * or 'world', though those that contain both will rank higher in the results.
+ *
+ * Wildcards can be included in terms to match one or more unspecified characters, these wildcards can
+ * be inserted anywhere within the term, and more than one wildcard can exist in a single term. Adding
+ * wildcards will increase the number of documents that will be found but can also have a negative
+ * impact on query performance, especially with wildcards at the beginning of a term.
+ *
+ * Terms can be restricted to specific fields, e.g. `title:hello`, only documents with the term
+ * hello in the title field will match this query. Using a field not present in the index will lead
+ * to an error being thrown.
+ *
+ * Modifiers can also be added to terms, lunr supports edit distance and boost modifiers on terms. A term
+ * boost will make documents matching that term score higher, e.g. `foo^5`. Edit distance is also supported
+ * to provide fuzzy matching, e.g. 'hello~2' will match documents with hello with an edit distance of 2.
+ * Avoid large values for edit distance to improve query performance.
+ *
+ * Each term also supports a presence modifier. By default a term's presence in document is optional, however
+ * this can be changed to either required or prohibited. For a term's presence to be required in a document the
+ * term should be prefixed with a '+', e.g. `+foo bar` is a search for documents that must contain 'foo' and
+ * optionally contain 'bar'. Conversely a leading '-' sets the terms presence to prohibited, i.e. it must not
+ * appear in a document, e.g. `-foo bar` is a search for documents that do not contain 'foo' but may contain 'bar'.
+ *
+ * To escape special characters the backslash character '\' can be used, this allows searches to include
+ * characters that would normally be considered modifiers, e.g. `foo\~2` will search for a term "foo~2" instead
+ * of attempting to apply a boost of 2 to the search term "foo".
+ *
+ * @typedef {string} lunr.Index~QueryString
+ * @example
Simple single term query
+ * hello
+ * @example
Multiple term query
+ * hello world
+ * @example
term scoped to a field
+ * title:hello
+ * @example
term with a boost of 10
+ * hello^10
+ * @example
term with an edit distance of 2
+ * hello~2
+ * @example
terms with presence modifiers
+ * -foo +bar baz
+ */
+
+/**
+ * Performs a search against the index using lunr query syntax.
+ *
+ * Results will be returned sorted by their score, the most relevant results
+ * will be returned first. For details on how the score is calculated, please see
+ * the {@link https://lunrjs.com/guides/searching.html#scoring|guide}.
+ *
+ * For more programmatic querying use lunr.Index#query.
+ *
+ * @param {lunr.Index~QueryString} queryString - A string containing a lunr query.
+ * @throws {lunr.QueryParseError} If the passed query string cannot be parsed.
+ * @returns {lunr.Index~Result[]}
+ */
+lunr.Index.prototype.search = function (queryString) {
+ return this.query(function (query) {
+ var parser = new lunr.QueryParser(queryString, query)
+ parser.parse()
+ })
+}
+
+/**
+ * A query builder callback provides a query object to be used to express
+ * the query to perform on the index.
+ *
+ * @callback lunr.Index~queryBuilder
+ * @param {lunr.Query} query - The query object to build up.
+ * @this lunr.Query
+ */
+
+/**
+ * Performs a query against the index using the yielded lunr.Query object.
+ *
+ * If performing programmatic queries against the index, this method is preferred
+ * over lunr.Index#search so as to avoid the additional query parsing overhead.
+ *
+ * A query object is yielded to the supplied function which should be used to
+ * express the query to be run against the index.
+ *
+ * Note that although this function takes a callback parameter it is _not_ an
+ * asynchronous operation, the callback is just yielded a query object to be
+ * customized.
+ *
+ * @param {lunr.Index~queryBuilder} fn - A function that is used to build the query.
+ * @returns {lunr.Index~Result[]}
+ */
+lunr.Index.prototype.query = function (fn) {
+ // for each query clause
+ // * process terms
+ // * expand terms from token set
+ // * find matching documents and metadata
+ // * get document vectors
+ // * score documents
+
+ var query = new lunr.Query(this.fields),
+ matchingFields = Object.create(null),
+ queryVectors = Object.create(null),
+ termFieldCache = Object.create(null),
+ requiredMatches = Object.create(null),
+ prohibitedMatches = Object.create(null)
+
+ /*
+ * To support field level boosts a query vector is created per
+ * field. An empty vector is eagerly created to support negated
+ * queries.
+ */
+ for (var i = 0; i < this.fields.length; i++) {
+ queryVectors[this.fields[i]] = new lunr.Vector
+ }
+
+ fn.call(query, query)
+
+ for (var i = 0; i < query.clauses.length; i++) {
+ /*
+ * Unless the pipeline has been disabled for this term, which is
+ * the case for terms with wildcards, we need to pass the clause
+ * term through the search pipeline. A pipeline returns an array
+ * of processed terms. Pipeline functions may expand the passed
+ * term, which means we may end up performing multiple index lookups
+ * for a single query term.
+ */
+ var clause = query.clauses[i],
+ terms = null,
+ clauseMatches = lunr.Set.empty
+
+ if (clause.usePipeline) {
+ terms = this.pipeline.runString(clause.term, {
+ fields: clause.fields
+ })
+ } else {
+ terms = [clause.term]
+ }
+
+ for (var m = 0; m < terms.length; m++) {
+ var term = terms[m]
+
+ /*
+ * Each term returned from the pipeline needs to use the same query
+ * clause object, e.g. the same boost and or edit distance. The
+ * simplest way to do this is to re-use the clause object but mutate
+ * its term property.
+ */
+ clause.term = term
+
+ /*
+ * From the term in the clause we create a token set which will then
+ * be used to intersect the indexes token set to get a list of terms
+ * to lookup in the inverted index
+ */
+ var termTokenSet = lunr.TokenSet.fromClause(clause),
+ expandedTerms = this.tokenSet.intersect(termTokenSet).toArray()
+
+ /*
+ * If a term marked as required does not exist in the tokenSet it is
+ * impossible for the search to return any matches. We set all the field
+ * scoped required matches set to empty and stop examining any further
+ * clauses.
+ */
+ if (expandedTerms.length === 0 && clause.presence === lunr.Query.presence.REQUIRED) {
+ for (var k = 0; k < clause.fields.length; k++) {
+ var field = clause.fields[k]
+ requiredMatches[field] = lunr.Set.empty
+ }
+
+ break
+ }
+
+ for (var j = 0; j < expandedTerms.length; j++) {
+ /*
+ * For each term get the posting and termIndex, this is required for
+ * building the query vector.
+ */
+ var expandedTerm = expandedTerms[j],
+ posting = this.invertedIndex[expandedTerm],
+ termIndex = posting._index
+
+ for (var k = 0; k < clause.fields.length; k++) {
+ /*
+ * For each field that this query term is scoped by (by default
+ * all fields are in scope) we need to get all the document refs
+ * that have this term in that field.
+ *
+ * The posting is the entry in the invertedIndex for the matching
+ * term from above.
+ */
+ var field = clause.fields[k],
+ fieldPosting = posting[field],
+ matchingDocumentRefs = Object.keys(fieldPosting),
+ termField = expandedTerm + "/" + field,
+ matchingDocumentsSet = new lunr.Set(matchingDocumentRefs)
+
+ /*
+ * if the presence of this term is required ensure that the matching
+ * documents are added to the set of required matches for this clause.
+ *
+ */
+ if (clause.presence == lunr.Query.presence.REQUIRED) {
+ clauseMatches = clauseMatches.union(matchingDocumentsSet)
+
+ if (requiredMatches[field] === undefined) {
+ requiredMatches[field] = lunr.Set.complete
+ }
+ }
+
+ /*
+ * if the presence of this term is prohibited ensure that the matching
+ * documents are added to the set of prohibited matches for this field,
+ * creating that set if it does not yet exist.
+ */
+ if (clause.presence == lunr.Query.presence.PROHIBITED) {
+ if (prohibitedMatches[field] === undefined) {
+ prohibitedMatches[field] = lunr.Set.empty
+ }
+
+ prohibitedMatches[field] = prohibitedMatches[field].union(matchingDocumentsSet)
+
+ /*
+ * Prohibited matches should not be part of the query vector used for
+ * similarity scoring and no metadata should be extracted so we continue
+ * to the next field
+ */
+ continue
+ }
+
+ /*
+ * The query field vector is populated using the termIndex found for
+ * the term and a unit value with the appropriate boost applied.
+ * Using upsert because there could already be an entry in the vector
+ * for the term we are working with. In that case we just add the scores
+ * together.
+ */
+ queryVectors[field].upsert(termIndex, clause.boost, function (a, b) { return a + b })
+
+ /**
+ * If we've already seen this term, field combo then we've already collected
+ * the matching documents and metadata, no need to go through all that again
+ */
+ if (termFieldCache[termField]) {
+ continue
+ }
+
+ for (var l = 0; l < matchingDocumentRefs.length; l++) {
+ /*
+ * All metadata for this term/field/document triple
+ * are then extracted and collected into an instance
+ * of lunr.MatchData ready to be returned in the query
+ * results
+ */
+ var matchingDocumentRef = matchingDocumentRefs[l],
+ matchingFieldRef = new lunr.FieldRef (matchingDocumentRef, field),
+ metadata = fieldPosting[matchingDocumentRef],
+ fieldMatch
+
+ if ((fieldMatch = matchingFields[matchingFieldRef]) === undefined) {
+ matchingFields[matchingFieldRef] = new lunr.MatchData (expandedTerm, field, metadata)
+ } else {
+ fieldMatch.add(expandedTerm, field, metadata)
+ }
+
+ }
+
+ termFieldCache[termField] = true
+ }
+ }
+ }
+
+ /**
+ * If the presence was required we need to update the requiredMatches field sets.
+ * We do this after all fields for the term have collected their matches because
+ * the clause terms presence is required in _any_ of the fields not _all_ of the
+ * fields.
+ */
+ if (clause.presence === lunr.Query.presence.REQUIRED) {
+ for (var k = 0; k < clause.fields.length; k++) {
+ var field = clause.fields[k]
+ requiredMatches[field] = requiredMatches[field].intersect(clauseMatches)
+ }
+ }
+ }
+
+ /**
+ * Need to combine the field scoped required and prohibited
+ * matching documents into a global set of required and prohibited
+ * matches
+ */
+ var allRequiredMatches = lunr.Set.complete,
+ allProhibitedMatches = lunr.Set.empty
+
+ for (var i = 0; i < this.fields.length; i++) {
+ var field = this.fields[i]
+
+ if (requiredMatches[field]) {
+ allRequiredMatches = allRequiredMatches.intersect(requiredMatches[field])
+ }
+
+ if (prohibitedMatches[field]) {
+ allProhibitedMatches = allProhibitedMatches.union(prohibitedMatches[field])
+ }
+ }
+
+ var matchingFieldRefs = Object.keys(matchingFields),
+ results = [],
+ matches = Object.create(null)
+
+ /*
+ * If the query is negated (contains only prohibited terms)
+ * we need to get _all_ fieldRefs currently existing in the
+ * index. This is only done when we know that the query is
+ * entirely prohibited terms to avoid any cost of getting all
+ * fieldRefs unnecessarily.
+ *
+ * Additionally, blank MatchData must be created to correctly
+ * populate the results.
+ */
+ if (query.isNegated()) {
+ matchingFieldRefs = Object.keys(this.fieldVectors)
+
+ for (var i = 0; i < matchingFieldRefs.length; i++) {
+ var matchingFieldRef = matchingFieldRefs[i]
+ var fieldRef = lunr.FieldRef.fromString(matchingFieldRef)
+ matchingFields[matchingFieldRef] = new lunr.MatchData
+ }
+ }
+
+ for (var i = 0; i < matchingFieldRefs.length; i++) {
+ /*
+ * Currently we have document fields that match the query, but we
+ * need to return documents. The matchData and scores are combined
+ * from multiple fields belonging to the same document.
+ *
+ * Scores are calculated by field, using the query vectors created
+ * above, and combined into a final document score using addition.
+ */
+ var fieldRef = lunr.FieldRef.fromString(matchingFieldRefs[i]),
+ docRef = fieldRef.docRef
+
+ if (!allRequiredMatches.contains(docRef)) {
+ continue
+ }
+
+ if (allProhibitedMatches.contains(docRef)) {
+ continue
+ }
+
+ var fieldVector = this.fieldVectors[fieldRef],
+ score = queryVectors[fieldRef.fieldName].similarity(fieldVector),
+ docMatch
+
+ if ((docMatch = matches[docRef]) !== undefined) {
+ docMatch.score += score
+ docMatch.matchData.combine(matchingFields[fieldRef])
+ } else {
+ var match = {
+ ref: docRef,
+ score: score,
+ matchData: matchingFields[fieldRef]
+ }
+ matches[docRef] = match
+ results.push(match)
+ }
+ }
+
+ /*
+ * Sort the results objects by score, highest first.
+ */
+ return results.sort(function (a, b) {
+ return b.score - a.score
+ })
+}
+
+/**
+ * Prepares the index for JSON serialization.
+ *
+ * The schema for this JSON blob will be described in a
+ * separate JSON schema file.
+ *
+ * @returns {Object}
+ */
+lunr.Index.prototype.toJSON = function () {
+ var invertedIndex = Object.keys(this.invertedIndex)
+ .sort()
+ .map(function (term) {
+ return [term, this.invertedIndex[term]]
+ }, this)
+
+ var fieldVectors = Object.keys(this.fieldVectors)
+ .map(function (ref) {
+ return [ref, this.fieldVectors[ref].toJSON()]
+ }, this)
+
+ return {
+ version: lunr.version,
+ fields: this.fields,
+ fieldVectors: fieldVectors,
+ invertedIndex: invertedIndex,
+ pipeline: this.pipeline.toJSON()
+ }
+}
+
+/**
+ * Loads a previously serialized lunr.Index
+ *
+ * @param {Object} serializedIndex - A previously serialized lunr.Index
+ * @returns {lunr.Index}
+ */
+lunr.Index.load = function (serializedIndex) {
+ var attrs = {},
+ fieldVectors = {},
+ serializedVectors = serializedIndex.fieldVectors,
+ invertedIndex = Object.create(null),
+ serializedInvertedIndex = serializedIndex.invertedIndex,
+ tokenSetBuilder = new lunr.TokenSet.Builder,
+ pipeline = lunr.Pipeline.load(serializedIndex.pipeline)
+
+ if (serializedIndex.version != lunr.version) {
+ lunr.utils.warn("Version mismatch when loading serialised index. Current version of lunr '" + lunr.version + "' does not match serialized index '" + serializedIndex.version + "'")
+ }
+
+ for (var i = 0; i < serializedVectors.length; i++) {
+ var tuple = serializedVectors[i],
+ ref = tuple[0],
+ elements = tuple[1]
+
+ fieldVectors[ref] = new lunr.Vector(elements)
+ }
+
+ for (var i = 0; i < serializedInvertedIndex.length; i++) {
+ var tuple = serializedInvertedIndex[i],
+ term = tuple[0],
+ posting = tuple[1]
+
+ tokenSetBuilder.insert(term)
+ invertedIndex[term] = posting
+ }
+
+ tokenSetBuilder.finish()
+
+ attrs.fields = serializedIndex.fields
+
+ attrs.fieldVectors = fieldVectors
+ attrs.invertedIndex = invertedIndex
+ attrs.tokenSet = tokenSetBuilder.root
+ attrs.pipeline = pipeline
+
+ return new lunr.Index(attrs)
+}
+/*!
+ * lunr.Builder
+ * Copyright (C) 2020 Oliver Nightingale
+ */
+
+/**
+ * lunr.Builder performs indexing on a set of documents and
+ * returns instances of lunr.Index ready for querying.
+ *
+ * All configuration of the index is done via the builder, the
+ * fields to index, the document reference, the text processing
+ * pipeline and document scoring parameters are all set on the
+ * builder before indexing.
+ *
+ * @constructor
+ * @property {string} _ref - Internal reference to the document reference field.
+ * @property {string[]} _fields - Internal reference to the document fields to index.
+ * @property {object} invertedIndex - The inverted index maps terms to document fields.
+ * @property {object} documentTermFrequencies - Keeps track of document term frequencies.
+ * @property {object} documentLengths - Keeps track of the length of documents added to the index.
+ * @property {lunr.tokenizer} tokenizer - Function for splitting strings into tokens for indexing.
+ * @property {lunr.Pipeline} pipeline - The pipeline performs text processing on tokens before indexing.
+ * @property {lunr.Pipeline} searchPipeline - A pipeline for processing search terms before querying the index.
+ * @property {number} documentCount - Keeps track of the total number of documents indexed.
+ * @property {number} _b - A parameter to control field length normalization, setting this to 0 disabled normalization, 1 fully normalizes field lengths, the default value is 0.75.
+ * @property {number} _k1 - A parameter to control how quickly an increase in term frequency results in term frequency saturation, the default value is 1.2.
+ * @property {number} termIndex - A counter incremented for each unique term, used to identify a terms position in the vector space.
+ * @property {array} metadataWhitelist - A list of metadata keys that have been whitelisted for entry in the index.
+ */
+lunr.Builder = function () {
+ this._ref = "id"
+ this._fields = Object.create(null)
+ this._documents = Object.create(null)
+ this.invertedIndex = Object.create(null)
+ this.fieldTermFrequencies = {}
+ this.fieldLengths = {}
+ this.tokenizer = lunr.tokenizer
+ this.pipeline = new lunr.Pipeline
+ this.searchPipeline = new lunr.Pipeline
+ this.documentCount = 0
+ this._b = 0.75
+ this._k1 = 1.2
+ this.termIndex = 0
+ this.metadataWhitelist = []
+}
+
+/**
+ * Sets the document field used as the document reference. Every document must have this field.
+ * The type of this field in the document should be a string, if it is not a string it will be
+ * coerced into a string by calling toString.
+ *
+ * The default ref is 'id'.
+ *
+ * The ref should _not_ be changed during indexing, it should be set before any documents are
+ * added to the index. Changing it during indexing can lead to inconsistent results.
+ *
+ * @param {string} ref - The name of the reference field in the document.
+ */
+lunr.Builder.prototype.ref = function (ref) {
+ this._ref = ref
+}
+
+/**
+ * A function that is used to extract a field from a document.
+ *
+ * Lunr expects a field to be at the top level of a document, if however the field
+ * is deeply nested within a document an extractor function can be used to extract
+ * the right field for indexing.
+ *
+ * @callback fieldExtractor
+ * @param {object} doc - The document being added to the index.
+ * @returns {?(string|object|object[])} obj - The object that will be indexed for this field.
+ * @example
Extracting a nested field
+ * function (doc) { return doc.nested.field }
+ */
+
+/**
+ * Adds a field to the list of document fields that will be indexed. Every document being
+ * indexed should have this field. Null values for this field in indexed documents will
+ * not cause errors but will limit the chance of that document being retrieved by searches.
+ *
+ * All fields should be added before adding documents to the index. Adding fields after
+ * a document has been indexed will have no effect on already indexed documents.
+ *
+ * Fields can be boosted at build time. This allows terms within that field to have more
+ * importance when ranking search results. Use a field boost to specify that matches within
+ * one field are more important than other fields.
+ *
+ * @param {string} fieldName - The name of a field to index in all documents.
+ * @param {object} attributes - Optional attributes associated with this field.
+ * @param {number} [attributes.boost=1] - Boost applied to all terms within this field.
+ * @param {fieldExtractor} [attributes.extractor] - Function to extract a field from a document.
+ * @throws {RangeError} fieldName cannot contain unsupported characters '/'
+ */
+lunr.Builder.prototype.field = function (fieldName, attributes) {
+ if (/\//.test(fieldName)) {
+ throw new RangeError ("Field '" + fieldName + "' contains illegal character '/'")
+ }
+
+ this._fields[fieldName] = attributes || {}
+}
+
+/**
+ * A parameter to tune the amount of field length normalisation that is applied when
+ * calculating relevance scores. A value of 0 will completely disable any normalisation
+ * and a value of 1 will fully normalise field lengths. The default is 0.75. Values of b
+ * will be clamped to the range 0 - 1.
+ *
+ * @param {number} number - The value to set for this tuning parameter.
+ */
+lunr.Builder.prototype.b = function (number) {
+ if (number < 0) {
+ this._b = 0
+ } else if (number > 1) {
+ this._b = 1
+ } else {
+ this._b = number
+ }
+}
+
+/**
+ * A parameter that controls the speed at which a rise in term frequency results in term
+ * frequency saturation. The default value is 1.2. Setting this to a higher value will give
+ * slower saturation levels, a lower value will result in quicker saturation.
+ *
+ * @param {number} number - The value to set for this tuning parameter.
+ */
+lunr.Builder.prototype.k1 = function (number) {
+ this._k1 = number
+}
+
+/**
+ * Adds a document to the index.
+ *
+ * Before adding fields to the index the index should have been fully setup, with the document
+ * ref and all fields to index already having been specified.
+ *
+ * The document must have a field name as specified by the ref (by default this is 'id') and
+ * it should have all fields defined for indexing, though null or undefined values will not
+ * cause errors.
+ *
+ * Entire documents can be boosted at build time. Applying a boost to a document indicates that
+ * this document should rank higher in search results than other documents.
+ *
+ * @param {object} doc - The document to add to the index.
+ * @param {object} attributes - Optional attributes associated with this document.
+ * @param {number} [attributes.boost=1] - Boost applied to all terms within this document.
+ */
+lunr.Builder.prototype.add = function (doc, attributes) {
+ var docRef = doc[this._ref],
+ fields = Object.keys(this._fields)
+
+ this._documents[docRef] = attributes || {}
+ this.documentCount += 1
+
+ for (var i = 0; i < fields.length; i++) {
+ var fieldName = fields[i],
+ extractor = this._fields[fieldName].extractor,
+ field = extractor ? extractor(doc) : doc[fieldName],
+ tokens = this.tokenizer(field, {
+ fields: [fieldName]
+ }),
+ terms = this.pipeline.run(tokens),
+ fieldRef = new lunr.FieldRef (docRef, fieldName),
+ fieldTerms = Object.create(null)
+
+ this.fieldTermFrequencies[fieldRef] = fieldTerms
+ this.fieldLengths[fieldRef] = 0
+
+ // store the length of this field for this document
+ this.fieldLengths[fieldRef] += terms.length
+
+ // calculate term frequencies for this field
+ for (var j = 0; j < terms.length; j++) {
+ var term = terms[j]
+
+ if (fieldTerms[term] == undefined) {
+ fieldTerms[term] = 0
+ }
+
+ fieldTerms[term] += 1
+
+ // add to inverted index
+ // create an initial posting if one doesn't exist
+ if (this.invertedIndex[term] == undefined) {
+ var posting = Object.create(null)
+ posting["_index"] = this.termIndex
+ this.termIndex += 1
+
+ for (var k = 0; k < fields.length; k++) {
+ posting[fields[k]] = Object.create(null)
+ }
+
+ this.invertedIndex[term] = posting
+ }
+
+ // add an entry for this term/fieldName/docRef to the invertedIndex
+ if (this.invertedIndex[term][fieldName][docRef] == undefined) {
+ this.invertedIndex[term][fieldName][docRef] = Object.create(null)
+ }
+
+ // store all whitelisted metadata about this token in the
+ // inverted index
+ for (var l = 0; l < this.metadataWhitelist.length; l++) {
+ var metadataKey = this.metadataWhitelist[l],
+ metadata = term.metadata[metadataKey]
+
+ if (this.invertedIndex[term][fieldName][docRef][metadataKey] == undefined) {
+ this.invertedIndex[term][fieldName][docRef][metadataKey] = []
+ }
+
+ this.invertedIndex[term][fieldName][docRef][metadataKey].push(metadata)
+ }
+ }
+
+ }
+}
+
+/**
+ * Calculates the average document length for this index
+ *
+ * @private
+ */
+lunr.Builder.prototype.calculateAverageFieldLengths = function () {
+
+ var fieldRefs = Object.keys(this.fieldLengths),
+ numberOfFields = fieldRefs.length,
+ accumulator = {},
+ documentsWithField = {}
+
+ for (var i = 0; i < numberOfFields; i++) {
+ var fieldRef = lunr.FieldRef.fromString(fieldRefs[i]),
+ field = fieldRef.fieldName
+
+ documentsWithField[field] || (documentsWithField[field] = 0)
+ documentsWithField[field] += 1
+
+ accumulator[field] || (accumulator[field] = 0)
+ accumulator[field] += this.fieldLengths[fieldRef]
+ }
+
+ var fields = Object.keys(this._fields)
+
+ for (var i = 0; i < fields.length; i++) {
+ var fieldName = fields[i]
+ accumulator[fieldName] = accumulator[fieldName] / documentsWithField[fieldName]
+ }
+
+ this.averageFieldLength = accumulator
+}
+
+/**
+ * Builds a vector space model of every document using lunr.Vector
+ *
+ * @private
+ */
+lunr.Builder.prototype.createFieldVectors = function () {
+ var fieldVectors = {},
+ fieldRefs = Object.keys(this.fieldTermFrequencies),
+ fieldRefsLength = fieldRefs.length,
+ termIdfCache = Object.create(null)
+
+ for (var i = 0; i < fieldRefsLength; i++) {
+ var fieldRef = lunr.FieldRef.fromString(fieldRefs[i]),
+ fieldName = fieldRef.fieldName,
+ fieldLength = this.fieldLengths[fieldRef],
+ fieldVector = new lunr.Vector,
+ termFrequencies = this.fieldTermFrequencies[fieldRef],
+ terms = Object.keys(termFrequencies),
+ termsLength = terms.length
+
+
+ var fieldBoost = this._fields[fieldName].boost || 1,
+ docBoost = this._documents[fieldRef.docRef].boost || 1
+
+ for (var j = 0; j < termsLength; j++) {
+ var term = terms[j],
+ tf = termFrequencies[term],
+ termIndex = this.invertedIndex[term]._index,
+ idf, score, scoreWithPrecision
+
+ if (termIdfCache[term] === undefined) {
+ idf = lunr.idf(this.invertedIndex[term], this.documentCount)
+ termIdfCache[term] = idf
+ } else {
+ idf = termIdfCache[term]
+ }
+
+ score = idf * ((this._k1 + 1) * tf) / (this._k1 * (1 - this._b + this._b * (fieldLength / this.averageFieldLength[fieldName])) + tf)
+ score *= fieldBoost
+ score *= docBoost
+ scoreWithPrecision = Math.round(score * 1000) / 1000
+ // Converts 1.23456789 to 1.234.
+ // Reducing the precision so that the vectors take up less
+ // space when serialised. Doing it now so that they behave
+ // the same before and after serialisation. Also, this is
+ // the fastest approach to reducing a number's precision in
+ // JavaScript.
+
+ fieldVector.insert(termIndex, scoreWithPrecision)
+ }
+
+ fieldVectors[fieldRef] = fieldVector
+ }
+
+ this.fieldVectors = fieldVectors
+}
+
+/**
+ * Creates a token set of all tokens in the index using lunr.TokenSet
+ *
+ * @private
+ */
+lunr.Builder.prototype.createTokenSet = function () {
+ this.tokenSet = lunr.TokenSet.fromArray(
+ Object.keys(this.invertedIndex).sort()
+ )
+}
+
+/**
+ * Builds the index, creating an instance of lunr.Index.
+ *
+ * This completes the indexing process and should only be called
+ * once all documents have been added to the index.
+ *
+ * @returns {lunr.Index}
+ */
+lunr.Builder.prototype.build = function () {
+ this.calculateAverageFieldLengths()
+ this.createFieldVectors()
+ this.createTokenSet()
+
+ return new lunr.Index({
+ invertedIndex: this.invertedIndex,
+ fieldVectors: this.fieldVectors,
+ tokenSet: this.tokenSet,
+ fields: Object.keys(this._fields),
+ pipeline: this.searchPipeline
+ })
+}
+
+/**
+ * Applies a plugin to the index builder.
+ *
+ * A plugin is a function that is called with the index builder as its context.
+ * Plugins can be used to customise or extend the behaviour of the index
+ * in some way. A plugin is just a function, that encapsulated the custom
+ * behaviour that should be applied when building the index.
+ *
+ * The plugin function will be called with the index builder as its argument, additional
+ * arguments can also be passed when calling use. The function will be called
+ * with the index builder as its context.
+ *
+ * @param {Function} plugin The plugin to apply.
+ */
+lunr.Builder.prototype.use = function (fn) {
+ var args = Array.prototype.slice.call(arguments, 1)
+ args.unshift(this)
+ fn.apply(this, args)
+}
+/**
+ * Contains and collects metadata about a matching document.
+ * A single instance of lunr.MatchData is returned as part of every
+ * lunr.Index~Result.
+ *
+ * @constructor
+ * @param {string} term - The term this match data is associated with
+ * @param {string} field - The field in which the term was found
+ * @param {object} metadata - The metadata recorded about this term in this field
+ * @property {object} metadata - A cloned collection of metadata associated with this document.
+ * @see {@link lunr.Index~Result}
+ */
+lunr.MatchData = function (term, field, metadata) {
+ var clonedMetadata = Object.create(null),
+ metadataKeys = Object.keys(metadata || {})
+
+ // Cloning the metadata to prevent the original
+ // being mutated during match data combination.
+ // Metadata is kept in an array within the inverted
+ // index so cloning the data can be done with
+ // Array#slice
+ for (var i = 0; i < metadataKeys.length; i++) {
+ var key = metadataKeys[i]
+ clonedMetadata[key] = metadata[key].slice()
+ }
+
+ this.metadata = Object.create(null)
+
+ if (term !== undefined) {
+ this.metadata[term] = Object.create(null)
+ this.metadata[term][field] = clonedMetadata
+ }
+}
+
+/**
+ * An instance of lunr.MatchData will be created for every term that matches a
+ * document. However only one instance is required in a lunr.Index~Result. This
+ * method combines metadata from another instance of lunr.MatchData with this
+ * objects metadata.
+ *
+ * @param {lunr.MatchData} otherMatchData - Another instance of match data to merge with this one.
+ * @see {@link lunr.Index~Result}
+ */
+lunr.MatchData.prototype.combine = function (otherMatchData) {
+ var terms = Object.keys(otherMatchData.metadata)
+
+ for (var i = 0; i < terms.length; i++) {
+ var term = terms[i],
+ fields = Object.keys(otherMatchData.metadata[term])
+
+ if (this.metadata[term] == undefined) {
+ this.metadata[term] = Object.create(null)
+ }
+
+ for (var j = 0; j < fields.length; j++) {
+ var field = fields[j],
+ keys = Object.keys(otherMatchData.metadata[term][field])
+
+ if (this.metadata[term][field] == undefined) {
+ this.metadata[term][field] = Object.create(null)
+ }
+
+ for (var k = 0; k < keys.length; k++) {
+ var key = keys[k]
+
+ if (this.metadata[term][field][key] == undefined) {
+ this.metadata[term][field][key] = otherMatchData.metadata[term][field][key]
+ } else {
+ this.metadata[term][field][key] = this.metadata[term][field][key].concat(otherMatchData.metadata[term][field][key])
+ }
+
+ }
+ }
+ }
+}
+
+/**
+ * Add metadata for a term/field pair to this instance of match data.
+ *
+ * @param {string} term - The term this match data is associated with
+ * @param {string} field - The field in which the term was found
+ * @param {object} metadata - The metadata recorded about this term in this field
+ */
+lunr.MatchData.prototype.add = function (term, field, metadata) {
+ if (!(term in this.metadata)) {
+ this.metadata[term] = Object.create(null)
+ this.metadata[term][field] = metadata
+ return
+ }
+
+ if (!(field in this.metadata[term])) {
+ this.metadata[term][field] = metadata
+ return
+ }
+
+ var metadataKeys = Object.keys(metadata)
+
+ for (var i = 0; i < metadataKeys.length; i++) {
+ var key = metadataKeys[i]
+
+ if (key in this.metadata[term][field]) {
+ this.metadata[term][field][key] = this.metadata[term][field][key].concat(metadata[key])
+ } else {
+ this.metadata[term][field][key] = metadata[key]
+ }
+ }
+}
+/**
+ * A lunr.Query provides a programmatic way of defining queries to be performed
+ * against a {@link lunr.Index}.
+ *
+ * Prefer constructing a lunr.Query using the {@link lunr.Index#query} method
+ * so the query object is pre-initialized with the right index fields.
+ *
+ * @constructor
+ * @property {lunr.Query~Clause[]} clauses - An array of query clauses.
+ * @property {string[]} allFields - An array of all available fields in a lunr.Index.
+ */
+lunr.Query = function (allFields) {
+ this.clauses = []
+ this.allFields = allFields
+}
+
+/**
+ * Constants for indicating what kind of automatic wildcard insertion will be used when constructing a query clause.
+ *
+ * This allows wildcards to be added to the beginning and end of a term without having to manually do any string
+ * concatenation.
+ *
+ * The wildcard constants can be bitwise combined to select both leading and trailing wildcards.
+ *
+ * @constant
+ * @default
+ * @property {number} wildcard.NONE - The term will have no wildcards inserted, this is the default behaviour
+ * @property {number} wildcard.LEADING - Prepend the term with a wildcard, unless a leading wildcard already exists
+ * @property {number} wildcard.TRAILING - Append a wildcard to the term, unless a trailing wildcard already exists
+ * @see lunr.Query~Clause
+ * @see lunr.Query#clause
+ * @see lunr.Query#term
+ * @example
+ * query.term('foo', {
+ * wildcard: lunr.Query.wildcard.LEADING | lunr.Query.wildcard.TRAILING
+ * })
+ */
+
+lunr.Query.wildcard = new String ("*")
+lunr.Query.wildcard.NONE = 0
+lunr.Query.wildcard.LEADING = 1
+lunr.Query.wildcard.TRAILING = 2
+
+/**
+ * Constants for indicating what kind of presence a term must have in matching documents.
+ *
+ * @constant
+ * @enum {number}
+ * @see lunr.Query~Clause
+ * @see lunr.Query#clause
+ * @see lunr.Query#term
+ * @example
query term with required presence
+ * query.term('foo', { presence: lunr.Query.presence.REQUIRED })
+ */
+lunr.Query.presence = {
+ /**
+ * Term's presence in a document is optional, this is the default value.
+ */
+ OPTIONAL: 1,
+
+ /**
+ * Term's presence in a document is required, documents that do not contain
+ * this term will not be returned.
+ */
+ REQUIRED: 2,
+
+ /**
+ * Term's presence in a document is prohibited, documents that do contain
+ * this term will not be returned.
+ */
+ PROHIBITED: 3
+}
+
+/**
+ * A single clause in a {@link lunr.Query} contains a term and details on how to
+ * match that term against a {@link lunr.Index}.
+ *
+ * @typedef {Object} lunr.Query~Clause
+ * @property {string[]} fields - The fields in an index this clause should be matched against.
+ * @property {number} [boost=1] - Any boost that should be applied when matching this clause.
+ * @property {number} [editDistance] - Whether the term should have fuzzy matching applied, and how fuzzy the match should be.
+ * @property {boolean} [usePipeline] - Whether the term should be passed through the search pipeline.
+ * @property {number} [wildcard=lunr.Query.wildcard.NONE] - Whether the term should have wildcards appended or prepended.
+ * @property {number} [presence=lunr.Query.presence.OPTIONAL] - The terms presence in any matching documents.
+ */
+
+/**
+ * Adds a {@link lunr.Query~Clause} to this query.
+ *
+ * Unless the clause contains the fields to be matched all fields will be matched. In addition
+ * a default boost of 1 is applied to the clause.
+ *
+ * @param {lunr.Query~Clause} clause - The clause to add to this query.
+ * @see lunr.Query~Clause
+ * @returns {lunr.Query}
+ */
+lunr.Query.prototype.clause = function (clause) {
+ if (!('fields' in clause)) {
+ clause.fields = this.allFields
+ }
+
+ if (!('boost' in clause)) {
+ clause.boost = 1
+ }
+
+ if (!('usePipeline' in clause)) {
+ clause.usePipeline = true
+ }
+
+ if (!('wildcard' in clause)) {
+ clause.wildcard = lunr.Query.wildcard.NONE
+ }
+
+ if ((clause.wildcard & lunr.Query.wildcard.LEADING) && (clause.term.charAt(0) != lunr.Query.wildcard)) {
+ clause.term = "*" + clause.term
+ }
+
+ if ((clause.wildcard & lunr.Query.wildcard.TRAILING) && (clause.term.slice(-1) != lunr.Query.wildcard)) {
+ clause.term = "" + clause.term + "*"
+ }
+
+ if (!('presence' in clause)) {
+ clause.presence = lunr.Query.presence.OPTIONAL
+ }
+
+ this.clauses.push(clause)
+
+ return this
+}
+
+/**
+ * A negated query is one in which every clause has a presence of
+ * prohibited. These queries require some special processing to return
+ * the expected results.
+ *
+ * @returns boolean
+ */
+lunr.Query.prototype.isNegated = function () {
+ for (var i = 0; i < this.clauses.length; i++) {
+ if (this.clauses[i].presence != lunr.Query.presence.PROHIBITED) {
+ return false
+ }
+ }
+
+ return true
+}
+
+/**
+ * Adds a term to the current query, under the covers this will create a {@link lunr.Query~Clause}
+ * to the list of clauses that make up this query.
+ *
+ * The term is used as is, i.e. no tokenization will be performed by this method. Instead conversion
+ * to a token or token-like string should be done before calling this method.
+ *
+ * The term will be converted to a string by calling `toString`. Multiple terms can be passed as an
+ * array, each term in the array will share the same options.
+ *
+ * @param {object|object[]} term - The term(s) to add to the query.
+ * @param {object} [options] - Any additional properties to add to the query clause.
+ * @returns {lunr.Query}
+ * @see lunr.Query#clause
+ * @see lunr.Query~Clause
+ * @example
adding a single term to a query
+ * query.term("foo")
+ * @example
adding a single term to a query and specifying search fields, term boost and automatic trailing wildcard
';
+}
+
+function displayResults (results) {
+ var search_results = document.getElementById("mkdocs-search-results");
+ while (search_results.firstChild) {
+ search_results.removeChild(search_results.firstChild);
+ }
+ if (results.length > 0){
+ for (var i=0; i < results.length; i++){
+ var result = results[i];
+ var html = formatResult(result.location, result.title, result.summary);
+ search_results.insertAdjacentHTML('beforeend', html);
+ }
+ } else {
+ var noResultsText = search_results.getAttribute('data-no-results-text');
+ if (!noResultsText) {
+ noResultsText = "No results found";
+ }
+ search_results.insertAdjacentHTML('beforeend', '
' + noResultsText + '
');
+ }
+}
+
+function doSearch () {
+ var query = document.getElementById('mkdocs-search-query').value;
+ if (query.length > min_search_length) {
+ if (!window.Worker) {
+ displayResults(search(query));
+ } else {
+ searchWorker.postMessage({query: query});
+ }
+ } else {
+ // Clear results for short queries
+ displayResults([]);
+ }
+}
+
+function initSearch () {
+ var search_input = document.getElementById('mkdocs-search-query');
+ if (search_input) {
+ search_input.addEventListener("keyup", doSearch);
+ }
+ var term = getSearchTermFromLocation();
+ if (term) {
+ search_input.value = term;
+ doSearch();
+ }
+}
+
+function onWorkerMessage (e) {
+ if (e.data.allowSearch) {
+ initSearch();
+ } else if (e.data.results) {
+ var results = e.data.results;
+ displayResults(results);
+ } else if (e.data.config) {
+ min_search_length = e.data.config.min_search_length-1;
+ }
+}
+
+if (!window.Worker) {
+ console.log('Web Worker API not supported');
+ // load index in main thread
+ $.getScript(joinUrl(base_url, "search/worker.js")).done(function () {
+ console.log('Loaded worker');
+ init();
+ window.postMessage = function (msg) {
+ onWorkerMessage({data: msg});
+ };
+ }).fail(function (jqxhr, settings, exception) {
+ console.error('Could not load worker.js');
+ });
+} else {
+ // Wrap search in a web worker
+ var searchWorker = new Worker(joinUrl(base_url, "search/worker.js"));
+ searchWorker.postMessage({init: true});
+ searchWorker.onmessage = onWorkerMessage;
+}
diff --git a/docs/search/search_index.json b/docs/search/search_index.json
new file mode 100644
index 0000000..e550f02
--- /dev/null
+++ b/docs/search/search_index.json
@@ -0,0 +1 @@
+{"config":{"indexing":"full","lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"Simstring Getting started Install with pip install simstring-fast from simstring.feature_extractor import CharacterNgramFeatureExtractor from simstring.measure import CosineMeasure from simstring.database import DictDatabase from simstring.searcher import Searcher db = DictDatabase(CharacterNgramFeatureExtractor(2)) db.add('foo') db.add('bar') db.add('fooo') searcher = Searcher(db, CosineMeasure()) results = searcher.search('foo', 0.8) print(results) Simstring The original method is described in this paper . There is an even faster C++ implimentation by the original authors available here This module is a fork of this repo which is no longer actively maintained. This module adds documentation, speedups and more measures and features such saving compiled databases. Banchmarks Without compilation the code takes 14 seconds to run through this particular banchmark, which is only on the data retrieval. With compiltion this time is dropped to below 5 seconds.","title":"Home"},{"location":"#simstring","text":"","title":"Simstring"},{"location":"#getting-started","text":"Install with pip install simstring-fast from simstring.feature_extractor import CharacterNgramFeatureExtractor from simstring.measure import CosineMeasure from simstring.database import DictDatabase from simstring.searcher import Searcher db = DictDatabase(CharacterNgramFeatureExtractor(2)) db.add('foo') db.add('bar') db.add('fooo') searcher = Searcher(db, CosineMeasure()) results = searcher.search('foo', 0.8) print(results)","title":"Getting started"},{"location":"#simstring_1","text":"The original method is described in this paper . There is an even faster C++ implimentation by the original authors available here This module is a fork of this repo which is no longer actively maintained. This module adds documentation, speedups and more measures and features such saving compiled databases.","title":"Simstring"},{"location":"#banchmarks","text":"Without compilation the code takes 14 seconds to run through this particular banchmark, which is only on the data retrieval. With compiltion this time is dropped to below 5 seconds.","title":"Banchmarks"},{"location":"database/","text":"Database Dict based database Bases: BaseDatabase Source code in simstring\\database\\dict.py class DictDatabase(BaseDatabase): def __init__(self, feature_extractor): self.feature_extractor = feature_extractor self.strings: List[str] = [] self.feature_set_size_to_string_map: Dict[int, Set[str]] = defaultdict(set) # 3.10 and up only self.feature_set_size_and_feature_to_string_map: dict = defaultdict(defaultdict_set) def add(self, string: str): features = self.feature_extractor.features(string) size = len(features) self.strings.append(string) self.feature_set_size_to_string_map[size].add(string) for feature in features: self.feature_set_size_and_feature_to_string_map[size][feature].add(string) def all(self) -> List[str]: return self.strings def lookup_strings_by_feature_set_size_and_feature(self, size: int, feature: str) -> Set[str]: return self.feature_set_size_and_feature_to_string_map[size][feature] def min_feature_size(self) -> int: return min(self.feature_set_size_to_string_map.keys()) def max_feature_size(self) -> int: return max(self.feature_set_size_to_string_map.keys()) # def __getstate__(self): # \"\"\"To pickle the object\"\"\" # return self.__dict__ # def __setstate__(self, d): # \"\"\"To unpickle the object\"\"\" # self.__dict__ = d def save(self, filename:str): \"\"\"Save the database to a file as defined by filename. Args: filename: Filename to save the db at. Should include file extention. Returns: None \"\"\" with open(filename, \"wb\") as f: pickle.dump(self, f) @staticmethod def load(filename:str) -> \"DictDatabase\": \"\"\"Load db from a file Args: filename (str): Name of the file to load Returns: DictDatabase: the db \"\"\" with open(filename, \"rb\") as f: db = pickle.load(f) return db def dumps(self) -> bytes: \"\"\"Generate pickle byte stream Returns: _type_: _description_ \"\"\" return pickle.dumps(self) @staticmethod def loads(binary_data: bytes) -> \"DictDatabase\": \"\"\"Load a binary string representing a database Initially only unpickles the data Args: binary_data (str): String of data to unpickle Returns: Model object \"\"\" return pickle.loads(binary_data) dumps() Generate pickle byte stream Returns: Name Type Description _type_ bytes description Source code in simstring\\database\\dict.py def dumps(self) -> bytes: \"\"\"Generate pickle byte stream Returns: _type_: _description_ \"\"\" return pickle.dumps(self) load(filename) staticmethod Load db from a file Parameters: Name Type Description Default filename str Name of the file to load required Returns: Name Type Description DictDatabase DictDatabase the db Source code in simstring\\database\\dict.py @staticmethod def load(filename:str) -> \"DictDatabase\": \"\"\"Load db from a file Args: filename (str): Name of the file to load Returns: DictDatabase: the db \"\"\" with open(filename, \"rb\") as f: db = pickle.load(f) return db loads(binary_data) staticmethod Load a binary string representing a database Initially only unpickles the data Parameters: Name Type Description Default binary_data str String of data to unpickle required Returns: Type Description DictDatabase Model object Source code in simstring\\database\\dict.py @staticmethod def loads(binary_data: bytes) -> \"DictDatabase\": \"\"\"Load a binary string representing a database Initially only unpickles the data Args: binary_data (str): String of data to unpickle Returns: Model object \"\"\" return pickle.loads(binary_data) save(filename) Save the database to a file as defined by filename. Parameters: Name Type Description Default filename str Filename to save the db at. Should include file extention. required Returns: Type Description None Source code in simstring\\database\\dict.py def save(self, filename:str): \"\"\"Save the database to a file as defined by filename. Args: filename: Filename to save the db at. Should include file extention. Returns: None \"\"\" with open(filename, \"wb\") as f: pickle.dump(self, f) PyMongo based database Bases: BaseDatabase Source code in simstring\\database\\mongo.py class MongoDatabase(BaseDatabase): def __init__(self, feature_extractor, host=(os.environ[\"MONGO_HOST\"] if \"MONGO_HOST\" in os.environ else 'localhost'), port=27017, database='simstring'): self.feature_extractor = feature_extractor client = MongoClient(host, port) db = client[database] self.collection = db.strings self.ensure_index() def add(self, string): features = self.feature_extractor.features(string) self.collection.insert_one({\"string\": string, \"features\": features, \"size\": len(features)}) def all(self): return list(map(lambda x: x['string'], self.all_documents())) def all_documents(self): return list(self.collection.find()) def ensure_index(self): self.collection.create_index('size') self.collection.create_index('features') def lookup_strings_by_feature_set_size_and_feature(self, size, feature): documents = list(self.collection.find({\"size\": size, \"features\": feature})) return set(list(map(lambda x: x['string'], documents))) def reset_collection(self): self.collection.remove() self.ensure_index()","title":"Database"},{"location":"database/#database","text":"","title":"Database"},{"location":"database/#dict-based-database","text":"Bases: BaseDatabase Source code in simstring\\database\\dict.py class DictDatabase(BaseDatabase): def __init__(self, feature_extractor): self.feature_extractor = feature_extractor self.strings: List[str] = [] self.feature_set_size_to_string_map: Dict[int, Set[str]] = defaultdict(set) # 3.10 and up only self.feature_set_size_and_feature_to_string_map: dict = defaultdict(defaultdict_set) def add(self, string: str): features = self.feature_extractor.features(string) size = len(features) self.strings.append(string) self.feature_set_size_to_string_map[size].add(string) for feature in features: self.feature_set_size_and_feature_to_string_map[size][feature].add(string) def all(self) -> List[str]: return self.strings def lookup_strings_by_feature_set_size_and_feature(self, size: int, feature: str) -> Set[str]: return self.feature_set_size_and_feature_to_string_map[size][feature] def min_feature_size(self) -> int: return min(self.feature_set_size_to_string_map.keys()) def max_feature_size(self) -> int: return max(self.feature_set_size_to_string_map.keys()) # def __getstate__(self): # \"\"\"To pickle the object\"\"\" # return self.__dict__ # def __setstate__(self, d): # \"\"\"To unpickle the object\"\"\" # self.__dict__ = d def save(self, filename:str): \"\"\"Save the database to a file as defined by filename. Args: filename: Filename to save the db at. Should include file extention. Returns: None \"\"\" with open(filename, \"wb\") as f: pickle.dump(self, f) @staticmethod def load(filename:str) -> \"DictDatabase\": \"\"\"Load db from a file Args: filename (str): Name of the file to load Returns: DictDatabase: the db \"\"\" with open(filename, \"rb\") as f: db = pickle.load(f) return db def dumps(self) -> bytes: \"\"\"Generate pickle byte stream Returns: _type_: _description_ \"\"\" return pickle.dumps(self) @staticmethod def loads(binary_data: bytes) -> \"DictDatabase\": \"\"\"Load a binary string representing a database Initially only unpickles the data Args: binary_data (str): String of data to unpickle Returns: Model object \"\"\" return pickle.loads(binary_data)","title":"Dict based database"},{"location":"database/#simstring.database.dict.DictDatabase.dumps","text":"Generate pickle byte stream Returns: Name Type Description _type_ bytes description Source code in simstring\\database\\dict.py def dumps(self) -> bytes: \"\"\"Generate pickle byte stream Returns: _type_: _description_ \"\"\" return pickle.dumps(self)","title":"dumps()"},{"location":"database/#simstring.database.dict.DictDatabase.load","text":"Load db from a file Parameters: Name Type Description Default filename str Name of the file to load required Returns: Name Type Description DictDatabase DictDatabase the db Source code in simstring\\database\\dict.py @staticmethod def load(filename:str) -> \"DictDatabase\": \"\"\"Load db from a file Args: filename (str): Name of the file to load Returns: DictDatabase: the db \"\"\" with open(filename, \"rb\") as f: db = pickle.load(f) return db","title":"load()"},{"location":"database/#simstring.database.dict.DictDatabase.loads","text":"Load a binary string representing a database Initially only unpickles the data Parameters: Name Type Description Default binary_data str String of data to unpickle required Returns: Type Description DictDatabase Model object Source code in simstring\\database\\dict.py @staticmethod def loads(binary_data: bytes) -> \"DictDatabase\": \"\"\"Load a binary string representing a database Initially only unpickles the data Args: binary_data (str): String of data to unpickle Returns: Model object \"\"\" return pickle.loads(binary_data)","title":"loads()"},{"location":"database/#simstring.database.dict.DictDatabase.save","text":"Save the database to a file as defined by filename. Parameters: Name Type Description Default filename str Filename to save the db at. Should include file extention. required Returns: Type Description None Source code in simstring\\database\\dict.py def save(self, filename:str): \"\"\"Save the database to a file as defined by filename. Args: filename: Filename to save the db at. Should include file extention. Returns: None \"\"\" with open(filename, \"wb\") as f: pickle.dump(self, f)","title":"save()"},{"location":"database/#pymongo-based-database","text":"Bases: BaseDatabase Source code in simstring\\database\\mongo.py class MongoDatabase(BaseDatabase): def __init__(self, feature_extractor, host=(os.environ[\"MONGO_HOST\"] if \"MONGO_HOST\" in os.environ else 'localhost'), port=27017, database='simstring'): self.feature_extractor = feature_extractor client = MongoClient(host, port) db = client[database] self.collection = db.strings self.ensure_index() def add(self, string): features = self.feature_extractor.features(string) self.collection.insert_one({\"string\": string, \"features\": features, \"size\": len(features)}) def all(self): return list(map(lambda x: x['string'], self.all_documents())) def all_documents(self): return list(self.collection.find()) def ensure_index(self): self.collection.create_index('size') self.collection.create_index('features') def lookup_strings_by_feature_set_size_and_feature(self, size, feature): documents = list(self.collection.find({\"size\": size, \"features\": feature})) return set(list(map(lambda x: x['string'], documents))) def reset_collection(self): self.collection.remove() self.ensure_index()","title":"PyMongo based database"},{"location":"features/","text":"Feature extractors Bases: BaseFeatureExtractor Source code in simstring\\feature_extractor\\character_ngram.py class CharacterNgramFeatureExtractor(BaseFeatureExtractor): def __init__(self, n:int=2): self.n = n def features(self, string: str) -> List[str]: list_of_ngrams = self._each_cons('$' * (self.n - 1) + string + '$' * (self.n - 1), self.n) return self.uniquify_list(list_of_ngrams) Bases: BaseFeatureExtractor Source code in simstring\\feature_extractor\\word_ngram.py class WordNgramFeatureExtractor(BaseFeatureExtractor): def __init__(self, n=2, splitter=\" \"): self.n = n self.splitter = splitter def features(self, text: str) -> List[str]: # Split text by white space. # If you want to extract words from text in more complicated way or using your favorite library like NLTK, please implement in your own. words = text.split(self.splitter) return self._words_ngram(words, self.n, SENTINAL_CHAR) Bases: BaseFeatureExtractor Source code in simstring\\feature_extractor\\mecab_ngram.py class MecabNgramFeatureExtractor(BaseFeatureExtractor): def __init__(self, n=2, user_dic_path='', sys_dic_path=''): self.n = n self.mecab = MecabTokenizer(user_dic_path, sys_dic_path) def features(self, text: str) -> List[str]: words = [x.surface() for x in self.mecab.tokenize(text)] return self._words_ngram(words, self.n, SENTINAL_CHAR)","title":"Features"},{"location":"features/#feature-extractors","text":"Bases: BaseFeatureExtractor Source code in simstring\\feature_extractor\\character_ngram.py class CharacterNgramFeatureExtractor(BaseFeatureExtractor): def __init__(self, n:int=2): self.n = n def features(self, string: str) -> List[str]: list_of_ngrams = self._each_cons('$' * (self.n - 1) + string + '$' * (self.n - 1), self.n) return self.uniquify_list(list_of_ngrams) Bases: BaseFeatureExtractor Source code in simstring\\feature_extractor\\word_ngram.py class WordNgramFeatureExtractor(BaseFeatureExtractor): def __init__(self, n=2, splitter=\" \"): self.n = n self.splitter = splitter def features(self, text: str) -> List[str]: # Split text by white space. # If you want to extract words from text in more complicated way or using your favorite library like NLTK, please implement in your own. words = text.split(self.splitter) return self._words_ngram(words, self.n, SENTINAL_CHAR) Bases: BaseFeatureExtractor Source code in simstring\\feature_extractor\\mecab_ngram.py class MecabNgramFeatureExtractor(BaseFeatureExtractor): def __init__(self, n=2, user_dic_path='', sys_dic_path=''): self.n = n self.mecab = MecabTokenizer(user_dic_path, sys_dic_path) def features(self, text: str) -> List[str]: words = [x.surface() for x in self.mecab.tokenize(text)] return self._words_ngram(words, self.n, SENTINAL_CHAR)","title":"Feature extractors"},{"location":"measure/","text":"Measure The measure defines the formula by which the distance between strings is measured. Use as: from simstring.measure import CosineMeasure, JaccardMeasure, OverlapMeasure, DiceMeasure But be carefull, they are not identical to the normal definitions of these measures. Cosine Measure is different to scipy.spatial.distance.cosine as it works on strings and not vectors. Jaccard distance does not discard duplicates in its sets, unlike in the normally used definition. This means that 'fooo' is seen as more different from 'fo' than 'foo', which is a more useful way of lookng at the string difference, but is not the usual definition of the distance as implimanted by scipy.spatial.distance.jaccard or wikipedia or any public calculator . Cosine Measure Bases: BaseMeasure Source code in simstring\\measure\\cosine.py class CosineMeasure(BaseMeasure): def min_feature_size(self, query_size:int, alpha:float) -> int: return int(math.ceil(alpha * alpha * query_size)) def max_feature_size(self, query_size:int, alpha:float) -> int: return int(math.floor(query_size / (alpha * alpha))) def minimum_common_feature_count(self, query_size: int, y_size:int , alpha: float) -> int: return int(math.ceil(alpha * math.sqrt(query_size * y_size))) def similarity(self, X: Iterable[str], Y: Iterable[str]) -> float: return len(set(X) & set(Y)) / math.sqrt(len(set(X)) * len(set(Y))) Jaccard Measure Bases: BaseMeasure Source code in simstring\\measure\\jaccard.py class JaccardMeasure(BaseMeasure): def min_feature_size(self, query_size:int, alpha:float) -> int: return int(math.ceil(alpha * query_size)) def max_feature_size(self, query_size:int, alpha:float) -> int: return int(math.floor(query_size / alpha)) def minimum_common_feature_count(self, query_size: int, y_size:int , alpha: float) -> int: return int(math.ceil(alpha * (query_size + y_size) * 1.0 / (1 + alpha))) def similarity(self, X: Iterable[str], Y: Iterable[str]) -> float: return len(set(X) & set(Y)) * 1.0 / len(set(X) | set(Y)) OverlapMeasures Bases: BaseMeasure Source code in simstring\\measure\\overlap.py class OverlapMeasure(BaseMeasure): def __init__(self, db=None, maxsize: int=100) -> None: super().__init__() if db: self.maxsize = db.max_feature_size() else: self.maxsize = maxsize def min_feature_size(self, query_size, alpha) -> int: # return 1 # Not sure the below isn't sufficient return math.floor(query_size*alpha) or 1 def max_feature_size(self, query_size, alpha) -> int: return self.maxsize def minimum_common_feature_count(self, query_size: int, y_size: int, alpha: float) -> int: return int(math.ceil(alpha * min(query_size, y_size))) def similarity(self, X: Iterable[str], Y: Iterable[str]) -> int: return min(len(set(X)), len(set(Y))) Bases: BaseMeasure Source code in simstring\\measure\\overlap.py class LeftOverlapMeasure(BaseMeasure): def __init__(self, db=None, maxsize: int=100) -> None: super().__init__() if db: self.maxsize = db.max_feature_size() else: self.maxsize = maxsize def min_feature_size(self, query_size, alpha) -> int: return math.floor(query_size*alpha) or 1 def max_feature_size(self, query_size, alpha) -> int: return self.maxsize def minimum_common_feature_count(self, query_size: int, y_size: int, alpha: float) -> int: return math.floor(query_size*alpha) or 1 def similarity(self, X: Iterable[str], Y: Iterable[str]) -> float: return 1- len(set(X) - set(Y) )/len(set(X)) DiceMeasure Bases: BaseMeasure Source code in simstring\\measure\\dice.py class DiceMeasure(BaseMeasure): def min_feature_size(self, query_size:int, alpha:float) -> int: return int(math.ceil(alpha * 1.0 / (2 - alpha) * query_size)) def max_feature_size(self, query_size:int, alpha:float) -> int: return int(math.floor((2 - alpha) * query_size * 1.0 / alpha)) def minimum_common_feature_count(self, query_size: int, y_size:int , alpha: float) -> int: return int(math.ceil(0.5 * alpha * query_size * y_size)) def similarity(self, X: Iterable[str], Y: Iterable[str]) -> float: return len(set(X) & set(Y)) * 2.0 / (len(set(X)) + len(set(Y)))","title":"Measure"},{"location":"measure/#measure","text":"The measure defines the formula by which the distance between strings is measured. Use as: from simstring.measure import CosineMeasure, JaccardMeasure, OverlapMeasure, DiceMeasure But be carefull, they are not identical to the normal definitions of these measures. Cosine Measure is different to scipy.spatial.distance.cosine as it works on strings and not vectors. Jaccard distance does not discard duplicates in its sets, unlike in the normally used definition. This means that 'fooo' is seen as more different from 'fo' than 'foo', which is a more useful way of lookng at the string difference, but is not the usual definition of the distance as implimanted by scipy.spatial.distance.jaccard or wikipedia or any public calculator .","title":"Measure"},{"location":"measure/#cosine-measure","text":"Bases: BaseMeasure Source code in simstring\\measure\\cosine.py class CosineMeasure(BaseMeasure): def min_feature_size(self, query_size:int, alpha:float) -> int: return int(math.ceil(alpha * alpha * query_size)) def max_feature_size(self, query_size:int, alpha:float) -> int: return int(math.floor(query_size / (alpha * alpha))) def minimum_common_feature_count(self, query_size: int, y_size:int , alpha: float) -> int: return int(math.ceil(alpha * math.sqrt(query_size * y_size))) def similarity(self, X: Iterable[str], Y: Iterable[str]) -> float: return len(set(X) & set(Y)) / math.sqrt(len(set(X)) * len(set(Y)))","title":"Cosine Measure"},{"location":"measure/#jaccard-measure","text":"Bases: BaseMeasure Source code in simstring\\measure\\jaccard.py class JaccardMeasure(BaseMeasure): def min_feature_size(self, query_size:int, alpha:float) -> int: return int(math.ceil(alpha * query_size)) def max_feature_size(self, query_size:int, alpha:float) -> int: return int(math.floor(query_size / alpha)) def minimum_common_feature_count(self, query_size: int, y_size:int , alpha: float) -> int: return int(math.ceil(alpha * (query_size + y_size) * 1.0 / (1 + alpha))) def similarity(self, X: Iterable[str], Y: Iterable[str]) -> float: return len(set(X) & set(Y)) * 1.0 / len(set(X) | set(Y))","title":"Jaccard Measure"},{"location":"measure/#overlapmeasures","text":"Bases: BaseMeasure Source code in simstring\\measure\\overlap.py class OverlapMeasure(BaseMeasure): def __init__(self, db=None, maxsize: int=100) -> None: super().__init__() if db: self.maxsize = db.max_feature_size() else: self.maxsize = maxsize def min_feature_size(self, query_size, alpha) -> int: # return 1 # Not sure the below isn't sufficient return math.floor(query_size*alpha) or 1 def max_feature_size(self, query_size, alpha) -> int: return self.maxsize def minimum_common_feature_count(self, query_size: int, y_size: int, alpha: float) -> int: return int(math.ceil(alpha * min(query_size, y_size))) def similarity(self, X: Iterable[str], Y: Iterable[str]) -> int: return min(len(set(X)), len(set(Y))) Bases: BaseMeasure Source code in simstring\\measure\\overlap.py class LeftOverlapMeasure(BaseMeasure): def __init__(self, db=None, maxsize: int=100) -> None: super().__init__() if db: self.maxsize = db.max_feature_size() else: self.maxsize = maxsize def min_feature_size(self, query_size, alpha) -> int: return math.floor(query_size*alpha) or 1 def max_feature_size(self, query_size, alpha) -> int: return self.maxsize def minimum_common_feature_count(self, query_size: int, y_size: int, alpha: float) -> int: return math.floor(query_size*alpha) or 1 def similarity(self, X: Iterable[str], Y: Iterable[str]) -> float: return 1- len(set(X) - set(Y) )/len(set(X))","title":"OverlapMeasures"},{"location":"measure/#dicemeasure","text":"Bases: BaseMeasure Source code in simstring\\measure\\dice.py class DiceMeasure(BaseMeasure): def min_feature_size(self, query_size:int, alpha:float) -> int: return int(math.ceil(alpha * 1.0 / (2 - alpha) * query_size)) def max_feature_size(self, query_size:int, alpha:float) -> int: return int(math.floor((2 - alpha) * query_size * 1.0 / alpha)) def minimum_common_feature_count(self, query_size: int, y_size:int , alpha: float) -> int: return int(math.ceil(0.5 * alpha * query_size * y_size)) def similarity(self, X: Iterable[str], Y: Iterable[str]) -> float: return len(set(X) & set(Y)) * 2.0 / (len(set(X)) + len(set(Y)))","title":"DiceMeasure"},{"location":"searcher/","text":"Searcher Source code in simstring\\searcher.py class Searcher: def __init__(self, db, measure) -> None: \"\"\"Searcher class This is the main way of interacting with the simsting search. Args: db (database): A database, can be a dict or mongo one as defined by the `database` modeule measure (measure): The similarity measure as defined by `measure` \"\"\" self.db = db self.measure = measure self.feature_extractor = db.feature_extractor self.lookup_strings_result: dict = defaultdict(dict) def search(self, query_string: str, alpha: float) -> List[str]: features = self.feature_extractor.features(query_string) lf = len(features) min_feature_size = self.measure.min_feature_size(lf, alpha) max_feature_size = self.measure.max_feature_size(lf, alpha) results = [] for candidate_feature_size in range(min_feature_size, max_feature_size + 1): tau = self.__min_overlap(lf, candidate_feature_size, alpha) results.extend(self.__overlap_join(features, tau, candidate_feature_size)) return results def ranked_search(self, query_string: str, alpha: float) -> List[Tuple[float, str]]: results = self.search(query_string, alpha) features = self.feature_extractor.features(query_string) results_with_score = list( map( lambda x: [ self.measure.similarity( features, self.feature_extractor.features(x) ), x, ], results, ) ) # Why change the signature? is this used in ASAP? # return { # name: score # for score, name in sorted(results_with_score, key=lambda x: (-x[0], x[1])) # } return [(score, name) for score, name in sorted(results_with_score, key=lambda x: (-x[0], x[1])) ] def __min_overlap( self, query_size: int, candidate_feature_size: int, alpha: float ) -> int: return self.measure.minimum_common_feature_count( query_size, candidate_feature_size, alpha ) def __overlap_join(self, features, tau, candidate_feature_size: int) -> List[str]: query_feature_size = len(features) features_mapped_to_lookup_strings_sets = { x: self.__lookup_strings_by_feature_set_size_and_feature( candidate_feature_size, x ) for x in features } features.sort(key=lambda x: len(features_mapped_to_lookup_strings_sets[x])) #candidate_string_to_matched_count : Dict[str,int] = defaultdict(int) # Only in 3.10 and later candidate_string_to_matched_count : Dict = defaultdict(int) results = [] for feature in features[0 : query_feature_size - tau + 1]: for s in features_mapped_to_lookup_strings_sets[feature]: candidate_string_to_matched_count[s] += 1 # The next loop does not run for tau = 1, hence candidates are never checked, while all satisfies the criteria if tau == 1: results = list(candidate_string_to_matched_count.keys()) for ( candidate, candidate_match_count, ) in candidate_string_to_matched_count.items(): for i in range(query_feature_size - tau + 1, query_feature_size): feature = features[i] if candidate in features_mapped_to_lookup_strings_sets[feature]: candidate_match_count += 1 if candidate_match_count >= tau: results.append(candidate) break remaining_feature_count = query_feature_size - i - 1 if candidate_match_count + remaining_feature_count < tau: break return results def __lookup_strings_by_feature_set_size_and_feature(self, feature_size: int, feature: str): if feature not in self.lookup_strings_result[feature_size]: self.lookup_strings_result[feature_size][ feature ] = self.db.lookup_strings_by_feature_set_size_and_feature( feature_size, feature ) return self.lookup_strings_result[feature_size][feature] __init__(db, measure) Searcher class This is the main way of interacting with the simsting search. Parameters: Name Type Description Default db database A database, can be a dict or mongo one as defined by the database modeule required measure measure The similarity measure as defined by measure required Source code in simstring\\searcher.py def __init__(self, db, measure) -> None: \"\"\"Searcher class This is the main way of interacting with the simsting search. Args: db (database): A database, can be a dict or mongo one as defined by the `database` modeule measure (measure): The similarity measure as defined by `measure` \"\"\" self.db = db self.measure = measure self.feature_extractor = db.feature_extractor self.lookup_strings_result: dict = defaultdict(dict)","title":"Searcher"},{"location":"searcher/#searcher","text":"Source code in simstring\\searcher.py class Searcher: def __init__(self, db, measure) -> None: \"\"\"Searcher class This is the main way of interacting with the simsting search. Args: db (database): A database, can be a dict or mongo one as defined by the `database` modeule measure (measure): The similarity measure as defined by `measure` \"\"\" self.db = db self.measure = measure self.feature_extractor = db.feature_extractor self.lookup_strings_result: dict = defaultdict(dict) def search(self, query_string: str, alpha: float) -> List[str]: features = self.feature_extractor.features(query_string) lf = len(features) min_feature_size = self.measure.min_feature_size(lf, alpha) max_feature_size = self.measure.max_feature_size(lf, alpha) results = [] for candidate_feature_size in range(min_feature_size, max_feature_size + 1): tau = self.__min_overlap(lf, candidate_feature_size, alpha) results.extend(self.__overlap_join(features, tau, candidate_feature_size)) return results def ranked_search(self, query_string: str, alpha: float) -> List[Tuple[float, str]]: results = self.search(query_string, alpha) features = self.feature_extractor.features(query_string) results_with_score = list( map( lambda x: [ self.measure.similarity( features, self.feature_extractor.features(x) ), x, ], results, ) ) # Why change the signature? is this used in ASAP? # return { # name: score # for score, name in sorted(results_with_score, key=lambda x: (-x[0], x[1])) # } return [(score, name) for score, name in sorted(results_with_score, key=lambda x: (-x[0], x[1])) ] def __min_overlap( self, query_size: int, candidate_feature_size: int, alpha: float ) -> int: return self.measure.minimum_common_feature_count( query_size, candidate_feature_size, alpha ) def __overlap_join(self, features, tau, candidate_feature_size: int) -> List[str]: query_feature_size = len(features) features_mapped_to_lookup_strings_sets = { x: self.__lookup_strings_by_feature_set_size_and_feature( candidate_feature_size, x ) for x in features } features.sort(key=lambda x: len(features_mapped_to_lookup_strings_sets[x])) #candidate_string_to_matched_count : Dict[str,int] = defaultdict(int) # Only in 3.10 and later candidate_string_to_matched_count : Dict = defaultdict(int) results = [] for feature in features[0 : query_feature_size - tau + 1]: for s in features_mapped_to_lookup_strings_sets[feature]: candidate_string_to_matched_count[s] += 1 # The next loop does not run for tau = 1, hence candidates are never checked, while all satisfies the criteria if tau == 1: results = list(candidate_string_to_matched_count.keys()) for ( candidate, candidate_match_count, ) in candidate_string_to_matched_count.items(): for i in range(query_feature_size - tau + 1, query_feature_size): feature = features[i] if candidate in features_mapped_to_lookup_strings_sets[feature]: candidate_match_count += 1 if candidate_match_count >= tau: results.append(candidate) break remaining_feature_count = query_feature_size - i - 1 if candidate_match_count + remaining_feature_count < tau: break return results def __lookup_strings_by_feature_set_size_and_feature(self, feature_size: int, feature: str): if feature not in self.lookup_strings_result[feature_size]: self.lookup_strings_result[feature_size][ feature ] = self.db.lookup_strings_by_feature_set_size_and_feature( feature_size, feature ) return self.lookup_strings_result[feature_size][feature]","title":"Searcher"},{"location":"searcher/#simstring.searcher.Searcher.__init__","text":"Searcher class This is the main way of interacting with the simsting search. Parameters: Name Type Description Default db database A database, can be a dict or mongo one as defined by the database modeule required measure measure The similarity measure as defined by measure required Source code in simstring\\searcher.py def __init__(self, db, measure) -> None: \"\"\"Searcher class This is the main way of interacting with the simsting search. Args: db (database): A database, can be a dict or mongo one as defined by the `database` modeule measure (measure): The similarity measure as defined by `measure` \"\"\" self.db = db self.measure = measure self.feature_extractor = db.feature_extractor self.lookup_strings_result: dict = defaultdict(dict)","title":"__init__()"}]}
\ No newline at end of file
diff --git a/docs/search/worker.js b/docs/search/worker.js
new file mode 100644
index 0000000..8628dbc
--- /dev/null
+++ b/docs/search/worker.js
@@ -0,0 +1,133 @@
+var base_path = 'function' === typeof importScripts ? '.' : '/search/';
+var allowSearch = false;
+var index;
+var documents = {};
+var lang = ['en'];
+var data;
+
+function getScript(script, callback) {
+ console.log('Loading script: ' + script);
+ $.getScript(base_path + script).done(function () {
+ callback();
+ }).fail(function (jqxhr, settings, exception) {
+ console.log('Error: ' + exception);
+ });
+}
+
+function getScriptsInOrder(scripts, callback) {
+ if (scripts.length === 0) {
+ callback();
+ return;
+ }
+ getScript(scripts[0], function() {
+ getScriptsInOrder(scripts.slice(1), callback);
+ });
+}
+
+function loadScripts(urls, callback) {
+ if( 'function' === typeof importScripts ) {
+ importScripts.apply(null, urls);
+ callback();
+ } else {
+ getScriptsInOrder(urls, callback);
+ }
+}
+
+function onJSONLoaded () {
+ data = JSON.parse(this.responseText);
+ var scriptsToLoad = ['lunr.js'];
+ if (data.config && data.config.lang && data.config.lang.length) {
+ lang = data.config.lang;
+ }
+ if (lang.length > 1 || lang[0] !== "en") {
+ scriptsToLoad.push('lunr.stemmer.support.js');
+ if (lang.length > 1) {
+ scriptsToLoad.push('lunr.multi.js');
+ }
+ if (lang.includes("ja") || lang.includes("jp")) {
+ scriptsToLoad.push('tinyseg.js');
+ }
+ for (var i=0; i < lang.length; i++) {
+ if (lang[i] != 'en') {
+ scriptsToLoad.push(['lunr', lang[i], 'js'].join('.'));
+ }
+ }
+ }
+ loadScripts(scriptsToLoad, onScriptsLoaded);
+}
+
+function onScriptsLoaded () {
+ console.log('All search scripts loaded, building Lunr index...');
+ if (data.config && data.config.separator && data.config.separator.length) {
+ lunr.tokenizer.separator = new RegExp(data.config.separator);
+ }
+
+ if (data.index) {
+ index = lunr.Index.load(data.index);
+ data.docs.forEach(function (doc) {
+ documents[doc.location] = doc;
+ });
+ console.log('Lunr pre-built index loaded, search ready');
+ } else {
+ index = lunr(function () {
+ if (lang.length === 1 && lang[0] !== "en" && lunr[lang[0]]) {
+ this.use(lunr[lang[0]]);
+ } else if (lang.length > 1) {
+ this.use(lunr.multiLanguage.apply(null, lang)); // spread operator not supported in all browsers: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Spread_operator#Browser_compatibility
+ }
+ this.field('title');
+ this.field('text');
+ this.ref('location');
+
+ for (var i=0; i < data.docs.length; i++) {
+ var doc = data.docs[i];
+ this.add(doc);
+ documents[doc.location] = doc;
+ }
+ });
+ console.log('Lunr index built, search ready');
+ }
+ allowSearch = true;
+ postMessage({config: data.config});
+ postMessage({allowSearch: allowSearch});
+}
+
+function init () {
+ var oReq = new XMLHttpRequest();
+ oReq.addEventListener("load", onJSONLoaded);
+ var index_path = base_path + '/search_index.json';
+ if( 'function' === typeof importScripts ){
+ index_path = 'search_index.json';
+ }
+ oReq.open("GET", index_path);
+ oReq.send();
+}
+
+function search (query) {
+ if (!allowSearch) {
+ console.error('Assets for search still loading');
+ return;
+ }
+
+ var resultDocuments = [];
+ var results = index.search(query);
+ for (var i=0; i < results.length; i++){
+ var result = results[i];
+ doc = documents[result.ref];
+ doc.summary = doc.text.substring(0, 200);
+ resultDocuments.push(doc);
+ }
+ return resultDocuments;
+}
+
+if( 'function' === typeof importScripts ) {
+ onmessage = function (e) {
+ if (e.data.init) {
+ init();
+ } else if (e.data.query) {
+ postMessage({ results: search(e.data.query) });
+ } else {
+ console.error("Worker - Unrecognized message: " + e);
+ }
+ };
+}
diff --git a/docs/searcher/index.html b/docs/searcher/index.html
new file mode 100644
index 0000000..735b97d
--- /dev/null
+++ b/docs/searcher/index.html
@@ -0,0 +1,387 @@
+
+
+
+
+
+
+
+
+
+
+ Searcher - Simstring docs
+
+
+
+
+
+
+
+
+
+
+
+
+
+
class Searcher:
+ def __init__(self, db, measure) -> None:
+ """Searcher class
+
+ This is the main way of interacting with the simsting search.
+
+ Args:
+ db (database): A database, can be a dict or mongo one as defined by the `database` modeule
+ measure (measure): The similarity measure as defined by `measure`
+ """
+ self.db = db
+ self.measure = measure
+ self.feature_extractor = db.feature_extractor
+ self.lookup_strings_result: dict = defaultdict(dict)
+
+ def search(self, query_string: str, alpha: float) -> List[str]:
+ features = self.feature_extractor.features(query_string)
+ lf = len(features)
+ min_feature_size = self.measure.min_feature_size(lf, alpha)
+ max_feature_size = self.measure.max_feature_size(lf, alpha)
+ results = []
+
+ for candidate_feature_size in range(min_feature_size, max_feature_size + 1):
+ tau = self.__min_overlap(lf, candidate_feature_size, alpha)
+ results.extend(self.__overlap_join(features, tau, candidate_feature_size))
+ return results
+
+ def ranked_search(self, query_string: str, alpha: float) -> List[Tuple[float, str]]:
+ results = self.search(query_string, alpha)
+ features = self.feature_extractor.features(query_string)
+ results_with_score = list(
+ map(
+ lambda x: [
+ self.measure.similarity(
+ features, self.feature_extractor.features(x)
+ ),
+ x,
+ ],
+ results,
+ )
+ )
+ # Why change the signature? is this used in ASAP?
+ # return {
+ # name: score
+ # for score, name in sorted(results_with_score, key=lambda x: (-x[0], x[1]))
+ # }
+ return [(score, name) for score, name in sorted(results_with_score, key=lambda x: (-x[0], x[1])) ]
+
+ def __min_overlap(
+ self, query_size: int, candidate_feature_size: int, alpha: float
+ ) -> int:
+ return self.measure.minimum_common_feature_count(
+ query_size, candidate_feature_size, alpha
+ )
+
+ def __overlap_join(self, features, tau, candidate_feature_size: int) -> List[str]:
+ query_feature_size = len(features)
+
+ features_mapped_to_lookup_strings_sets = {
+ x: self.__lookup_strings_by_feature_set_size_and_feature(
+ candidate_feature_size, x
+ )
+ for x in features
+ }
+
+ features.sort(key=lambda x: len(features_mapped_to_lookup_strings_sets[x]))
+
+ #candidate_string_to_matched_count : Dict[str,int] = defaultdict(int) # Only in 3.10 and later
+ candidate_string_to_matched_count : Dict = defaultdict(int)
+ results = []
+ for feature in features[0 : query_feature_size - tau + 1]:
+ for s in features_mapped_to_lookup_strings_sets[feature]:
+ candidate_string_to_matched_count[s] += 1
+
+ # The next loop does not run for tau = 1, hence candidates are never checked, while all satisfies the criteria
+ if tau == 1:
+ results = list(candidate_string_to_matched_count.keys())
+
+ for (
+ candidate,
+ candidate_match_count,
+ ) in candidate_string_to_matched_count.items():
+ for i in range(query_feature_size - tau + 1, query_feature_size):
+ feature = features[i]
+ if candidate in features_mapped_to_lookup_strings_sets[feature]:
+ candidate_match_count += 1
+ if candidate_match_count >= tau:
+ results.append(candidate)
+ break
+ remaining_feature_count = query_feature_size - i - 1
+ if candidate_match_count + remaining_feature_count < tau:
+ break
+
+ return results
+
+ def __lookup_strings_by_feature_set_size_and_feature(self, feature_size: int, feature: str):
+ if feature not in self.lookup_strings_result[feature_size]:
+ self.lookup_strings_result[feature_size][
+ feature
+ ] = self.db.lookup_strings_by_feature_set_size_and_feature(
+ feature_size, feature
+ )
+ return self.lookup_strings_result[feature_size][feature]
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+__init__(db, measure)
+
+
+
+
+
+
+
Searcher class
+
This is the main way of interacting with the simsting search.
+
+
Parameters:
+
+
+
+
Name
+
Type
+
Description
+
Default
+
+
+
+
+
db
+
+ database
+
+
A database, can be a dict or mongo one as defined by the database modeule
+
+ required
+
+
+
+
measure
+
+ measure
+
+
The similarity measure as defined by measure
+
+ required
+
+
+
+
+
+
+ Source code in simstring\searcher.py
+
def __init__(self, db, measure) -> None:
+ """Searcher class
+
+ This is the main way of interacting with the simsting search.
+
+ Args:
+ db (database): A database, can be a dict or mongo one as defined by the `database` modeule
+ measure (measure): The similarity measure as defined by `measure`
+ """
+ self.db = db
+ self.measure = measure
+ self.feature_extractor = db.feature_extractor
+ self.lookup_strings_result: dict = defaultdict(dict)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Search
+
+
+
+
From here you can search these documents. Enter your search terms below.
+
+
+
+
+
+
+
+
+
+
+
Keyboard Shortcuts
+
+
+
+
+
+
+
Keys
+
Action
+
+
+
+
+
?
+
Open this help
+
+
+
n
+
Next page
+
+
+
p
+
Previous page
+
+
+
s
+
Search
+
+
+
+
+
+
+
+
+
+
+
diff --git a/docs/sitemap.xml b/docs/sitemap.xml
new file mode 100644
index 0000000..c40a912
--- /dev/null
+++ b/docs/sitemap.xml
@@ -0,0 +1,28 @@
+
+
+
+ https://icfly2.github.io/simstring/
+ 2022-06-13
+ daily
+
+
+ https://icfly2.github.io/simstring/database/
+ 2022-06-13
+ daily
+
+
+ https://icfly2.github.io/simstring/features/
+ 2022-06-13
+ daily
+
+
+ https://icfly2.github.io/simstring/measure/
+ 2022-06-13
+ daily
+
+
+ https://icfly2.github.io/simstring/searcher/
+ 2022-06-13
+ daily
+
+
\ No newline at end of file
diff --git a/docs/sitemap.xml.gz b/docs/sitemap.xml.gz
new file mode 100644
index 0000000..d7f0b06
Binary files /dev/null and b/docs/sitemap.xml.gz differ
diff --git a/docs/slow.png b/docs/slow.png
new file mode 100644
index 0000000..d89e020
Binary files /dev/null and b/docs/slow.png differ
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..7fad644
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,125 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "simstring-fast"
+description = "A fork of the Python implementation of the SimString by (Katsuma Narisawa), a simple and efficient algorithm for approximate string matching. Uses mypyc to improve speed"
+readme = "README.md"
+requires-python = ">=3.9"
+license = "MIT"
+keywords = []
+authors = [
+ { name = "Ruben Menke", email = "rum@bankingcircle.com" },
+]
+classifiers = [
+ "Development Status :: 4 - Beta",
+ "Programming Language :: Python",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
+ "Programming Language :: Python :: 3.13",
+ "Programming Language :: Python :: Implementation :: CPython",
+]
+dependencies = ["diskcache==5.6.3", "setuptools>=70.0.0"]
+dynamic = ["version"]
+
+[project.urls]
+Documentation = "https://banking-circle-advanced-analytics.github.io/simstring-fast/"
+Issues = "https://github.com/banking-circle-advanced-analytics/simstring-fast"
+Source = "https://github.com/banking-circle-advanced-analytics/simstring-fast/issues"
+
+[tool.hatch.version]
+path = "simstring/__init__.py"
+
+
+[tool.hatch.build]
+include = [
+ "simstring/*.py",
+ "simstring/**/*.py",
+]
+
+[tool.hatch.build.targets.wheel.hooks.mypyc]
+dependencies = ["hatch-mypyc==0.16.0"]
+mypy-args = [
+ "--check-untyped-defs",
+ "--install-types"
+]
+exclude = [
+ "simstring/database/disk.py",
+ "simstring/database/base.py",
+]
+
+
+[tool.hatch.envs.default.scripts]
+pre-install-commands = [
+ "python -m ensurepip --upgrade",
+ "python -m pip install --upgrade setuptools",
+]
+version = "python --version"
+cov = "pytest --cov-report=term --cov-config=pyproject.toml {args}"
+test = "pytest"
+build = "python -m build"
+
+[tool.hatch.envs.test]
+dependencies = [
+ "pytest==7.4.2",
+ "pytest-cov==4.1.0",
+ "build",
+ "cython==3.0.12",
+ "faker",
+ "tqdm",
+ "pdbpp",
+ "setuptools"
+]
+
+
+
+[[tool.hatch.envs.test.matrix]]
+python = ["310", "311", "312", "3.13"]
+
+[tool.coverage.run]
+branch = true
+parallel = true
+include = [
+ "simsting/**/*.py",
+ "simsting/*.py"
+]
+omit = [
+ "simstring/__init__.py",
+ "simstring/**/__init__.py",
+]
+
+[tool.coverage.report]
+exclude_lines = [
+ "no cov",
+ "if __name__ == .__main__.:",
+ "if TYPE_CHECKING:",
+]
+
+[tool.hatch.envs.docs]
+dependencies = [
+ "mkdocs", "mkdocstrings[python]"
+]
+[tool.hatch.envs.docs.scripts]
+build = "cd simstring && mkdocs build --clean --strict"
+serve = "cd simstring && mkdocs serve --dev-addr localhost:8000"
+
+
+
+[tool.hatch.envs.benchmark]
+pre-install-commands = [
+ "python -m ensurepip --upgrade",
+ "python -m pip install --upgrade setuptools",
+]
+dependencies = [
+ "pyinstrument", "benchmarker" , "numpy", "tqdm"
+]
+[[tool.hatch.envs.benchmark.matrix]]
+python = [ "310", "311", "312", "313"]
+
+[tool.hatch.envs.benchmark.scripts]
+
+run = "python dev/benchmark.py"
+instrument = "pip install . && python dev/company_names.py"
+adds = "pip install . && python dev/benchmarking_adds.py"
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 1f8fce1..0000000
--- a/setup.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import setuptools
-
-with open("README.md", "r") as fh:
- long_description = fh.read()
-
-setuptools.setup(
- name="simstring-pure",
- version="1.0.0",
- author="Katsuma Narisawa",
- author_email="katsuma.narisawa@gmail.com",
- description="A Python implementation of the SimString, a simple and efficient algorithm for approximate string matching.",
- long_description=long_description,
- long_description_content_type="text/markdown",
- url="https://github.com/nullnull/simstring",
- packages=setuptools.find_packages(),
- classifiers=(
- "Development Status :: 5 - Production/Stable",
- "Programming Language :: Python :: 3",
- "Programming Language :: Python :: 3.7",
- "License :: OSI Approved :: MIT License",
- "Operating System :: OS Independent",
- ),
-)
diff --git a/simstring/__init__.py b/simstring/__init__.py
index e69de29..5a28280 100644
--- a/simstring/__init__.py
+++ b/simstring/__init__.py
@@ -0,0 +1 @@
+__version__ = "0.5.0"
\ No newline at end of file
diff --git a/simstring/database/__init__.py b/simstring/database/__init__.py
index e69de29..433d548 100644
--- a/simstring/database/__init__.py
+++ b/simstring/database/__init__.py
@@ -0,0 +1 @@
+# from .dict import DictDatabase
diff --git a/simstring/database/base.py b/simstring/database/base.py
index 53020ea..a2bbe7e 100644
--- a/simstring/database/base.py
+++ b/simstring/database/base.py
@@ -1,15 +1,9 @@
class BaseDatabase:
def __init__(self, feature_extractor):
- raise 'Not Implemented'
+ raise NotImplementedError
def add(self, string):
- raise 'Not Implemented'
-
- def min_feature_size(self):
- raise 'Not Implemented'
-
- def max_feature_size(self):
- raise 'Not Implemented'
+ raise NotImplementedError
def lookup_strings_by_feature_set_size_and_feature(self, size, feature):
- raise 'Not Implemented'
+ raise NotImplementedError
diff --git a/simstring/database/dict.py b/simstring/database/dict.py
index 14205fd..a777078 100644
--- a/simstring/database/dict.py
+++ b/simstring/database/dict.py
@@ -1,33 +1,141 @@
from collections import defaultdict
+from typing import Union
from .base import BaseDatabase
+from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
+from simstring.feature_extractor.word_ngram import WordNgramFeatureExtractor
+import pickle
+import ast
+from io import BufferedWriter
+
def defaultdict_set():
return defaultdict(set)
+
class DictDatabase(BaseDatabase):
- def __init__(self, feature_extractor):
+ def __init__(
+ self,
+ feature_extractor: Union[
+ CharacterNgramFeatureExtractor, WordNgramFeatureExtractor
+ ],
+ ):
self.feature_extractor = feature_extractor
- self.strings = []
- self.feature_set_size_to_string_map = defaultdict(set)
- self.feature_set_size_and_feature_to_string_map = defaultdict(defaultdict_set)
+ self.strings: list[str] = []
+ self.feature_set_size_to_string_map: dict[int, set[str]] = dict()
+ self.feature_set_size_and_feature_to_string_map: dict = defaultdict(
+ defaultdict_set
+ )
+ self._min_feature_size = 9999999
+ self._max_feature_size = 0
- def add(self, string):
+ def add(self, string: str) -> None:
features = self.feature_extractor.features(string)
size = len(features)
self.strings.append(string)
+
+ if size not in self.feature_set_size_to_string_map:
+ self.feature_set_size_to_string_map[size] = set()
+
self.feature_set_size_to_string_map[size].add(string)
+
for feature in features:
self.feature_set_size_and_feature_to_string_map[size][feature].add(string)
- def all(self):
+ def all(self) -> list[str]:
return self.strings
- def lookup_strings_by_feature_set_size_and_feature(self, size, feature):
+ def lookup_strings_by_feature_set_size_and_feature(
+ self, size: int, feature: str
+ ) -> set[str]:
return self.feature_set_size_and_feature_to_string_map[size][feature]
- def min_feature_size(self):
- return min(self.feature_set_size_to_string_map.keys())
+ def to_pickle(self, f: BufferedWriter) -> None:
+ """Hack to get object savable with mypyc
+
+ Save a db object to pickle with:
+
+ >>> with open("test.pkl", "wb") as f:
+ ... db.to_pickle(f)
+
+ Args:
+ f (BufferedWriter): File object writer, where to save the data
+ """
+ data = {
+ "feature_extractor": self.feature_extractor.__define__(),
+ "strings": self.strings,
+ "feature_set_size_to_string_map": self.feature_set_size_to_string_map,
+ "feature_set_size_and_feature_to_string_map": self.feature_set_size_and_feature_to_string_map,
+ "_min_feature_size": self._min_feature_size,
+ "_max_feature_size": self._max_feature_size,
+ }
+ pickle.dump(data, f)
+
+ @classmethod
+ def from_dict(cls, data: dict) -> "DictDatabase":
+ """Hack to get object loadable with mypyc
+
+ Careful, this runs eval on data["feature_extractor"], so only use pickles you trust.
+
+ Load a saved DB as a dict and then instatiate an object from that dict:
+
+ Example:
+ >>> with open("test.pkl", "rb") as f:
+ ... data = pickle.load(f)
+
+ >>> new = DictDatabase.from_dict(data)
+
+ Args:
+ data (dict): A dictionary as created by `to_pickle`
+
+ """
+ obj = cls(eval(data["feature_extractor"]))
+ obj.strings = data["strings"]
+ obj.feature_set_size_to_string_map.update(
+ data["feature_set_size_to_string_map"]
+ )
+ obj.feature_set_size_and_feature_to_string_map.update(
+ data["feature_set_size_and_feature_to_string_map"]
+ )
+ obj._min_feature_size = data["_min_feature_size"]
+ obj._max_feature_size = data["_max_feature_size"]
+ return obj
+
+ def save(self, filename: str) -> None:
+ """Save the database to a file as defined by filename.
+
+ Args:
+ filename: Filename to save the db at. Should include file extension. Saves as pickled json
+
+ Returns:
+ None
+ """
+ with open(filename, "wb") as f:
+ self.to_pickle(f)
+
+
+ @classmethod
+ def load(cls, filename: str) -> "DictDatabase":
+ """Load db from a file
+
+ Loads what you saved with the save function.
+
+ Args:
+ filename (str): Name of the file to load
+
+ Returns:
+ DictDatabase: the db
+ """
+ with open(filename, "rb") as f:
+ data = pickle.load(f)
+
+ return cls.from_dict(data)
+
+
+ def dumps(self) -> bytes:
+ """Generate pickle byte stream
- def max_feature_size(self):
- return max(self.feature_set_size_to_string_map.keys())
+ Returns:
+ _type_: _description_
+ """
+ return pickle.dumps(self)
diff --git a/simstring/database/disk.py b/simstring/database/disk.py
new file mode 100644
index 0000000..1568e73
--- /dev/null
+++ b/simstring/database/disk.py
@@ -0,0 +1,81 @@
+
+from typing import Union
+from .base import BaseDatabase
+from collections import defaultdict
+
+from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
+from simstring.feature_extractor.word_ngram import WordNgramFeatureExtractor
+
+from io import BufferedWriter
+import diskcache as dc
+from multiprocessing import Pool, cpu_count
+
+from functools import lru_cache
+
+import os
+
+FeatureExtractor = Union[
+ CharacterNgramFeatureExtractor, WordNgramFeatureExtractor
+ ]
+
+class DiskDatabase(BaseDatabase):
+ def __init__(
+ self,
+ feature_extractor: FeatureExtractor,
+ path:str= 'tmp'
+ ):
+ self.feature_extractor = feature_extractor
+ self.feature_set_size_to_string_map: dc.Cache = dc.Cache(os.path.join(path,'feature_set_size_to_string_map'))
+ self.feature_set_size_and_feature_to_string_map: dc.Cache = dc.Cache(os.path.join(path,'feature_set_size_and_feature_to_string_map'))
+ self.path = path
+
+ @staticmethod
+ def _make_key(size: int, feature: str) -> str:
+ return f"{size}-{feature}"
+
+ def add_feature_set_size_and_feature_to_string_map(self, size, feature, string)-> None:
+ key = self._make_key(size, feature)
+ with self.feature_set_size_and_feature_to_string_map.transact():
+ if key in self.feature_set_size_and_feature_to_string_map:
+ d = self.feature_set_size_and_feature_to_string_map[key]
+ if string in d:
+ return
+ else:
+ d = set()
+
+ d.add(string)
+ self.feature_set_size_and_feature_to_string_map[key] = d
+
+ def get_feature_set_size_and_feature_to_string_map(self, size: int, feature: str
+ ) -> set[str]:
+ try:
+ return self.feature_set_size_and_feature_to_string_map[self._make_key(size,feature)]
+ except KeyError:
+ return set()
+
+ def add(self, string: str) -> None:
+ features = self.feature_extractor.features(string)
+
+ size = len(features)
+ with self.feature_set_size_to_string_map.transact():
+ if size not in self.feature_set_size_to_string_map:
+ size_to_string_map = set()
+ else:
+ size_to_string_map = self.feature_set_size_to_string_map[size]
+
+ size_to_string_map.add(string)
+ self.feature_set_size_to_string_map[size] = size_to_string_map
+
+ for feature in features:
+ self.add_feature_set_size_and_feature_to_string_map(size, feature, string)
+
+ def all(self) -> list[str]:
+ strings = []
+ for k in self.feature_set_size_to_string_map.iterkeys():
+ strings.extend(self.feature_set_size_to_string_map[k])
+ return strings
+
+ def lookup_strings_by_feature_set_size_and_feature(
+ self, size: int, feature: str
+ ) -> set[str]:
+ return self.get_feature_set_size_and_feature_to_string_map(size,feature)
\ No newline at end of file
diff --git a/simstring/database/mongo.py b/simstring/database/mongo.py
deleted file mode 100644
index 43a7a76..0000000
--- a/simstring/database/mongo.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import os
-from pymongo import MongoClient
-from .base import BaseDatabase
-
-class MongoDatabase(BaseDatabase):
- def __init__(self, feature_extractor, host=(os.environ["MONGO_HOST"] if "MONGO_HOST" in os.environ else 'localhost'), port=27017, database='simstring'):
- self.feature_extractor = feature_extractor
-
- client = MongoClient(host, port)
- db = client[database]
- self.collection = db.strings
- self.ensure_index()
-
- def add(self, string):
- features = self.feature_extractor.features(string)
- self.collection.insert_one({"string": string, "features": features, "size": len(features)})
-
- def all(self):
- return list(map(lambda x: x['string'], self.all_documents()))
-
- def all_documents(self):
- return list(self.collection.find())
-
- def ensure_index(self):
- self.collection.create_index('size')
- self.collection.create_index('features')
-
- def lookup_strings_by_feature_set_size_and_feature(self, size, feature):
- documents = list(self.collection.find({"size": size, "features": feature}))
- return set(list(map(lambda x: x['string'], documents)))
-
- def reset_collection(self):
- self.collection.remove()
- self.ensure_index()
diff --git a/simstring/docs/database.md b/simstring/docs/database.md
new file mode 100644
index 0000000..fa05a7c
--- /dev/null
+++ b/simstring/docs/database.md
@@ -0,0 +1,8 @@
+# Database
+
+
+## Dict based database
+::: simstring.database.dict.DictDatabase
+ :docstring:
+ :members:
+
diff --git a/simstring/docs/fast.png b/simstring/docs/fast.png
new file mode 100644
index 0000000..4952a66
Binary files /dev/null and b/simstring/docs/fast.png differ
diff --git a/simstring/docs/features.md b/simstring/docs/features.md
new file mode 100644
index 0000000..ac92dfa
--- /dev/null
+++ b/simstring/docs/features.md
@@ -0,0 +1,14 @@
+# Feature extractors
+
+
+::: simstring.feature_extractor.character_ngram.CharacterNgramFeatureExtractor
+ :docstring:
+ :members:
+
+::: simstring.feature_extractor.word_ngram.WordNgramFeatureExtractor
+ :docstring:
+ :members:
+
+
diff --git a/simstring/docs/index.md b/simstring/docs/index.md
new file mode 100644
index 0000000..27e1134
--- /dev/null
+++ b/simstring/docs/index.md
@@ -0,0 +1,34 @@
+# Simstring
+
+
+## Getting started
+
+Install with `pip install simstring-fast`
+
+```python
+from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
+from simstring.measure.cosine import CosineMeasure
+from simstring.database.dict import DictDatabase
+from simstring.searcher import Searcher
+
+db = DictDatabase(CharacterNgramFeatureExtractor(2))
+db.add('foo')
+db.add('bar')
+db.add('fooo')
+
+searcher = Searcher(db, CosineMeasure())
+results = searcher.search('foo', 0.8)
+print(results)
+```
+
+## Simstring
+The original method is described in this [paper](https://aclanthology.org/C10-1096.pdf). There is an even faster C++ implimentation by the original authors available [here](http://chokkan.org/software/simstring/)
+
+This module is a fork of [this repo](https://github.com/nullnull/simstring) which is no longer actively maintained. This module adds documentation, speedups and more measures and features such saving compiled databases.
+
+## Banchmarks
+
+Without compilation the code takes 14 seconds to run through this particular banchmark, which is only on the data retrieval.
+
+
+With compiltion this time is dropped to below 5 seconds.
\ No newline at end of file
diff --git a/simstring/docs/measure.md b/simstring/docs/measure.md
new file mode 100644
index 0000000..cb49be3
--- /dev/null
+++ b/simstring/docs/measure.md
@@ -0,0 +1,52 @@
+# Measure
+
+The measure defines the formula by which the distance between strings is measured.
+
+Use as:
+
+```python
+from simstring.measure.cosine import CosineMeasure
+from simstring.measure.jaccard import JaccardMeasure
+from simstring.measure.overlap import OverlapMeasure
+from from simstring.measure.dice import DiceMeasure
+
+```
+
+But be carefull, they are not identical to the normal definitions of these measures.
+
+
+Cosine Measure is different to `scipy.spatial.distance.cosine` as it works on strings and not vectors.
+
+
+Jaccard distance does not discard duplicates in its sets, unlike in the normally used definition. This means that 'fooo' is seen as more different from 'fo' than 'foo', which is a more useful way of lookng at the string difference, but is not the usual definition of the distance as implimanted by `scipy.spatial.distance.jaccard` or [wikipedia](https://en.wikipedia.org/wiki/Jaccard_index) or any public [calculator](https://planetcalc.com/1664/).
+
+
+## Cosine Measure
+
+::: simstring.measure.cosine.CosineMeasure
+ handler: python
+ options:
+ show_root_heading: false
+
+## Jaccard Measure
+
+::: simstring.measure.jaccard.JaccardMeasure
+ :docstring:
+ :members:
+
+## OverlapMeasures
+
+::: simstring.measure.overlap.OverlapMeasure
+ :docstring:
+ :members:
+
+::: simstring.measure.overlap.LeftOverlapMeasure
+ :docstring:
+ :members:
+
+
+## DiceMeasure
+
+::: simstring.measure.dice.DiceMeasure
+ :docstring:
+ :members:
diff --git a/simstring/docs/searcher.md b/simstring/docs/searcher.md
new file mode 100644
index 0000000..f492486
--- /dev/null
+++ b/simstring/docs/searcher.md
@@ -0,0 +1,5 @@
+# Searcher
+
+::: simstring.searcher.Searcher
+ :docstring:
+ :members:
diff --git a/simstring/docs/slow.png b/simstring/docs/slow.png
new file mode 100644
index 0000000..d89e020
Binary files /dev/null and b/simstring/docs/slow.png differ
diff --git a/simstring/docs/strings_icon.png b/simstring/docs/strings_icon.png
new file mode 100644
index 0000000..5cc08e4
Binary files /dev/null and b/simstring/docs/strings_icon.png differ
diff --git a/simstring/feature_extractor/__init__.py b/simstring/feature_extractor/__init__.py
index e69de29..0cf9706 100644
--- a/simstring/feature_extractor/__init__.py
+++ b/simstring/feature_extractor/__init__.py
@@ -0,0 +1,2 @@
+# from .character_ngram import CharacterNgramFeatureExtractor
+# from .word_ngram import WordNgramFeatureExtractor
diff --git a/simstring/feature_extractor/base.py b/simstring/feature_extractor/base.py
index 46db86a..b10a450 100644
--- a/simstring/feature_extractor/base.py
+++ b/simstring/feature_extractor/base.py
@@ -1,9 +1,40 @@
+from collections import defaultdict
+
+SENTINAL_CHAR = " " # non breaking space
+
+
class BaseFeatureExtractor:
- def features(self, _string):
+ def features(self, string: str) -> list[str]:
raise NotImplementedError()
- def _each_cons(self, xs, n):
- return [xs[i:i+n] for i in range(len(xs)-n+1)]
- def _words_ngram(self, words, n, SENTINAL_CHAR):
- return [tuple(x) for x in self._each_cons([SENTINAL_CHAR] + words + [SENTINAL_CHAR], n)]
+ def _words_ngram(self, words: list[str], n: int, SENTINAL_CHAR: str):
+ xs = [SENTINAL_CHAR] + words + [SENTINAL_CHAR]
+ combinations = [xs[i : i + n] for i in range(len(xs) - n + 1)]
+ return [tuple(x) for x in combinations]
+
+ def uniquify_list(self, non_unique_list: list[str]) -> list[str]:
+ """Function to ensure a list has only unique values
+
+ All values get "_n" appended where n is the number that entry occurred
+ Args:
+ non_unique_list (list): list to be uniquefied
+
+ Returns:
+ list: uniquified list
+
+ Example:
+ ['a', 'b', 'a'] -> ['a_1', 'b_1', 'a_2']
+
+ """
+ counter: dict[str, int] = defaultdict(int)
+ unique_list = []
+ for val in non_unique_list:
+ counter[val] += 1
+ unique_list.append(f"{val}_{counter[val]}")
+
+ return unique_list
+
+ def __define__(self) -> str:
+ "Custom representation string"
+ raise NotImplementedError()
diff --git a/simstring/feature_extractor/character_ngram.py b/simstring/feature_extractor/character_ngram.py
index 7f68dd8..c2a10ea 100644
--- a/simstring/feature_extractor/character_ngram.py
+++ b/simstring/feature_extractor/character_ngram.py
@@ -1,10 +1,17 @@
-from .base import BaseFeatureExtractor
+from .base import BaseFeatureExtractor, SENTINAL_CHAR
-SENTINAL_CHAR = " " # non breaking space
class CharacterNgramFeatureExtractor(BaseFeatureExtractor):
- def __init__(self, n=2):
+ def __init__(self, n: int = 2, endmarker: str="$"):
self.n = n
+ self.endmarker = endmarker
- def features(self, string):
- return self._each_cons(SENTINAL_CHAR + string + SENTINAL_CHAR, self.n)
+ def features(self, string: str) -> list[str]:
+ xs = self.endmarker * (self.n - 1) + string + self.endmarker * (self.n - 1)
+ list_of_ngrams = [xs[i : i + self.n] for i in range(len(xs) - self.n + 1)]
+
+ return self.uniquify_list(list_of_ngrams)
+
+ def __define__(self) -> str:
+ "Custom representation string"
+ return f"CharacterNgramFeatureExtractor({self.n})"
diff --git a/simstring/feature_extractor/mecab_ngram.py b/simstring/feature_extractor/mecab_ngram.py
deleted file mode 100644
index b064173..0000000
--- a/simstring/feature_extractor/mecab_ngram.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import MeCab
-from collections import namedtuple
-from .base import BaseFeatureExtractor
-
-SENTINAL_CHAR = " " # non breaking space
-
-class MecabNgramFeatureExtractor(BaseFeatureExtractor):
- def __init__(self, n=2, user_dic_path='', sys_dic_path=''):
- self.n = n
- self.mecab = MecabTokenizer(user_dic_path, sys_dic_path)
-
- def features(self, text):
- words = [x.surface() for x in self.mecab.tokenize(text)]
- return self._words_ngram(words, self.n, SENTINAL_CHAR)
-
-class Token:
- def __init__(self, surface, feature):
- token = namedtuple('Token', 'surface, pos, pos_detail1, pos_detail2, pos_detail3, infl_type, infl_form, base_form, reading, phonetic')
- self.token = token(surface, *feature)
-
- def baseform_or_surface(self):
- return self.token.base_form if self.token.base_form != '*' else self.token.surface
-
- def pos(self):
- return self.token.pos
-
- def pos_detail1(self):
- return self.token.pos_detail1
-
- def surface(self):
- return self.token.surface
-
-class MecabTokenizer:
- def __init__(self, user_dic_path='', sys_dic_path=''):
- option = ''
- if user_dic_path:
- option += ' -d {0}'.format(user_dic_path)
- if sys_dic_path:
- option += ' -u {0}'.format(sys_dic_path)
- self._tagger = MeCab.Tagger(option)
-
- def tokenize(self, text):
- self._tagger.parse('')
- chunks = self._tagger.parse(text.rstrip()).splitlines()[:-1] # Skip EOS
-
- tokens = []
- for chunk in chunks:
- if chunk == '':
- continue
- surface, feature = chunk.split('\t')
- feature = feature.split(',')
- if len(feature) <= 7: # 読みがない
- feature.append('')
- if len(feature) <= 8: # 発音がない
- feature.append('')
- tokens.append(Token(surface, feature))
- return tokens
diff --git a/simstring/feature_extractor/word_ngram.py b/simstring/feature_extractor/word_ngram.py
index 8bcf464..9966f63 100644
--- a/simstring/feature_extractor/word_ngram.py
+++ b/simstring/feature_extractor/word_ngram.py
@@ -1,14 +1,16 @@
-from .base import BaseFeatureExtractor
+from .base import BaseFeatureExtractor, SENTINAL_CHAR
-SENTINAL_CHAR = " " # non breaking space
class WordNgramFeatureExtractor(BaseFeatureExtractor):
def __init__(self, n=2, splitter=" "):
self.n = n
self.splitter = splitter
- def features(self, text):
+ def features(self, text: str) -> list[str]:
# Split text by white space.
# If you want to extract words from text in more complicated way or using your favorite library like NLTK, please implement in your own.
words = text.split(self.splitter)
return self._words_ngram(words, self.n, SENTINAL_CHAR)
+
+ def __define__(self) -> str:
+ return f"WordNgramFeatureExtractor({self.n},{self.splitter})"
diff --git a/simstring/measure/__init__.py b/simstring/measure/__init__.py
index e69de29..535934a 100644
--- a/simstring/measure/__init__.py
+++ b/simstring/measure/__init__.py
@@ -0,0 +1,4 @@
+# from .cosine import CosineMeasure
+# from .jaccard import JaccardMeasure
+# from .dice import DiceMeasure
+# from .overlap import OverlapMeasure, LeftOverlapMeasure
diff --git a/simstring/measure/base.py b/simstring/measure/base.py
index d23fc19..b84d374 100644
--- a/simstring/measure/base.py
+++ b/simstring/measure/base.py
@@ -1,12 +1,12 @@
class BaseMeasure:
- def min_feature_size(self, _query_size, _alpha):
- raise 'Not Implemented'
+ def min_feature_size(self, _query_size, _alpha) -> int:
+ raise NotImplementedError
- def max_feature_size(self, _query_size, _alpha):
- raise 'Not Implemented'
+ def max_feature_size(self, _query_size, _alpha) -> int:
+ raise NotImplementedError
- def minimum_common_feature_count(self, _query_size, _y_size, _alpha):
- raise 'Not Implemented'
+ def minimum_common_feature_count(self, _query_size, _y_size, _alpha) -> int:
+ raise NotImplementedError
- def similarity(self, X, Y):
- raise 'Not Implemented'
+ def similarity(self, X, Y) -> float:
+ raise NotImplementedError
diff --git a/simstring/measure/cosine.py b/simstring/measure/cosine.py
index 0475258..6d7d113 100644
--- a/simstring/measure/cosine.py
+++ b/simstring/measure/cosine.py
@@ -1,15 +1,18 @@
import math
-from .base import BaseMeasure
+from typing import Iterable
-class CosineMeasure(BaseMeasure):
- def min_feature_size(self, query_size, alpha):
+
+class CosineMeasure:
+ def min_feature_size(self, query_size: int, alpha: float) -> int:
return int(math.ceil(alpha * alpha * query_size))
- def max_feature_size(self, query_size, alpha):
- return int(math.floor(query_size * 1.0 / (alpha * alpha)))
+ def max_feature_size(self, query_size: int, alpha: float) -> int:
+ return int(math.floor(query_size / (alpha * alpha)))
- def minimum_common_feature_count(self, query_size, y_size, alpha):
+ def minimum_common_feature_count(
+ self, query_size: int, y_size: int, alpha: float
+ ) -> int:
return int(math.ceil(alpha * math.sqrt(query_size * y_size)))
- def similarity(self, X, Y):
- return len(set(X) & set(Y)) * 1.0 / math.sqrt(len(set(X)) * len(set(Y)))
+ def similarity(self, X: Iterable[str], Y: Iterable[str]) -> float:
+ return len(set(X) & set(Y)) / math.sqrt(len(set(X)) * len(set(Y)))
diff --git a/simstring/measure/dice.py b/simstring/measure/dice.py
index 97b0fba..04976d3 100644
--- a/simstring/measure/dice.py
+++ b/simstring/measure/dice.py
@@ -1,15 +1,18 @@
import math
-from .base import BaseMeasure
+from typing import Iterable
-class DiceMeasure(BaseMeasure):
- def min_feature_size(self, query_size, alpha):
+
+class DiceMeasure:
+ def min_feature_size(self, query_size: int, alpha: float) -> int:
return int(math.ceil(alpha * 1.0 / (2 - alpha) * query_size))
- def max_feature_size(self, query_size, alpha):
+ def max_feature_size(self, query_size: int, alpha: float) -> int:
return int(math.floor((2 - alpha) * query_size * 1.0 / alpha))
- def minimum_common_feature_count(self, query_size, y_size, alpha):
+ def minimum_common_feature_count(
+ self, query_size: int, y_size: int, alpha: float
+ ) -> int:
return int(math.ceil(0.5 * alpha * query_size * y_size))
- def similarity(self, X, Y):
+ def similarity(self, X: Iterable[str], Y: Iterable[str]) -> float:
return len(set(X) & set(Y)) * 2.0 / (len(set(X)) + len(set(Y)))
diff --git a/simstring/measure/jaccard.py b/simstring/measure/jaccard.py
index d12abb7..4ca54d4 100644
--- a/simstring/measure/jaccard.py
+++ b/simstring/measure/jaccard.py
@@ -1,15 +1,18 @@
import math
-from .base import BaseMeasure
+from typing import Iterable
-class JaccardMeasure(BaseMeasure):
- def min_feature_size(self, query_size, alpha):
+
+class JaccardMeasure:
+ def min_feature_size(self, query_size: int, alpha: float) -> int:
return int(math.ceil(alpha * query_size))
- def max_feature_size(self, query_size, alpha):
+ def max_feature_size(self, query_size: int, alpha: float) -> int:
return int(math.floor(query_size / alpha))
- def minimum_common_feature_count(self, query_size, y_size, alpha):
+ def minimum_common_feature_count(
+ self, query_size: int, y_size: int, alpha: float
+ ) -> int:
return int(math.ceil(alpha * (query_size + y_size) * 1.0 / (1 + alpha)))
- def similarity(self, X, Y):
+ def similarity(self, X: Iterable[str], Y: Iterable[str]) -> float:
return len(set(X) & set(Y)) * 1.0 / len(set(X) | set(Y))
diff --git a/simstring/measure/overlap.py b/simstring/measure/overlap.py
new file mode 100644
index 0000000..c000477
--- /dev/null
+++ b/simstring/measure/overlap.py
@@ -0,0 +1,49 @@
+import math
+from typing import Iterable
+
+
+class OverlapMeasure:
+ def __init__(self, db=None, maxsize: int = 100) -> None:
+ super().__init__()
+ if db:
+ self.maxsize = db.max_feature_size()
+ else:
+ self.maxsize = maxsize
+
+ def min_feature_size(self, query_size: int, alpha: float) -> int:
+ # return 1 # Not sure the below isn't sufficient
+ return math.floor(query_size * alpha) or 1
+
+ def max_feature_size(self, query_size: int, alpha: float) -> int:
+ return self.maxsize
+
+ def minimum_common_feature_count(
+ self, query_size: int, y_size: int, alpha: float
+ ) -> int:
+ return int(math.ceil(alpha * min(query_size, y_size)))
+
+ def similarity(self, X: Iterable[str], Y: Iterable[str]) -> int:
+ return min(len(set(X)), len(set(Y)))
+
+
+class LeftOverlapMeasure:
+ def __init__(self, db=None, maxsize: int = 100) -> None:
+ super().__init__()
+ if db:
+ self.maxsize = db.max_feature_size()
+ else:
+ self.maxsize = maxsize
+
+ def min_feature_size(self, query_size: int, alpha: float) -> int:
+ return math.floor(query_size * alpha) or 1
+
+ def max_feature_size(self, query_size: int, alpha: float) -> int:
+ return self.maxsize
+
+ def minimum_common_feature_count(
+ self, query_size: int, y_size: int, alpha: float
+ ) -> int:
+ return math.floor(query_size * alpha) or 1
+
+ def similarity(self, X: Iterable[str], Y: Iterable[str]) -> float:
+ return 1 - len(set(X) - set(Y)) / len(set(X))
diff --git a/simstring/mkdocs.yml b/simstring/mkdocs.yml
new file mode 100644
index 0000000..3a95fb3
--- /dev/null
+++ b/simstring/mkdocs.yml
@@ -0,0 +1,30 @@
+site_name: Simstring docs
+site_url: https://icfly2.github.io/simstring/
+nav:
+ - Home: index.md
+ - Measure: measure.md
+ - Searcher: searcher.md
+ - Database: database.md
+ - Features: features.md
+
+theme:
+ name: mkdocs
+ shortcuts:
+ help: 191 # ?
+ next: 78 # n
+ previous: 80 # p
+ search: 83 # s
+
+plugins:
+- search
+- mkdocstrings
+
+
+# for local instal:
+# pip install mkdocs mkdocs mkdocstrings[python]
+
+# for local development and checking cd insto this folder and run:
+# mkdocs serve -w ../docs
+
+# build with:
+# mkdocs build -d ../docs
diff --git a/simstring/searcher.py b/simstring/searcher.py
index 8365706..ca65efa 100644
--- a/simstring/searcher.py
+++ b/simstring/searcher.py
@@ -1,60 +1,127 @@
# -*- coding:utf-8 -*-
+from collections import defaultdict, OrderedDict
+from typing import OrderedDict as OrderedDictType
-from collections import defaultdict
-from operator import itemgetter
class Searcher:
- def __init__(self, db, measure):
+ def __init__(self, db, measure) -> None:
+ """Searcher class
+
+ This is the main way of interacting with the simsting search.
+
+ Args:
+ db (database): A database, can be a dict or mongo one as defined by the `database` modeule
+ measure (measure): The similarity measure as defined by `measure`
+ """
self.db = db
self.measure = measure
self.feature_extractor = db.feature_extractor
- self.lookup_strings_result = defaultdict(dict)
+ self.lookup_strings_result: dict = defaultdict(dict)
- def search(self, query_string, alpha):
+ def search(self, query_string: str, alpha: float) -> list[str]:
features = self.feature_extractor.features(query_string)
- min_feature_size = self.measure.min_feature_size(len(features), alpha)
- max_feature_size = self.measure.max_feature_size(len(features), alpha)
+ lf = len(features)
+ min_feature_size = self.measure.min_feature_size(lf, alpha)
+ max_feature_size = self.measure.max_feature_size(lf, alpha)
results = []
for candidate_feature_size in range(min_feature_size, max_feature_size + 1):
- tau = self.__min_overlap(len(features), candidate_feature_size, alpha)
+ tau = self.__min_overlap(lf, candidate_feature_size, alpha)
results.extend(self.__overlap_join(features, tau, candidate_feature_size))
-
return results
- def ranked_search(self, query_string, alpha):
+ def ranked_search(
+ self, query_string: str, alpha: float
+ ) -> OrderedDictType[str, float]:
+ """Find matches for sting returning multiple ranked matches.
+
+ Args:
+ query_string (str): string to match
+ alpha (float): min similarity
+
+ Returns:
+ OrderedDict[str, float]: Matched string with similarity
+ """
results = self.search(query_string, alpha)
features = self.feature_extractor.features(query_string)
- results_with_score = list(map(lambda x: [self.measure.similarity(features, self.feature_extractor.features(x)), x], results))
- return sorted(results_with_score, key=lambda x: (-x[0], x[1]))
+ results_with_score = list(
+ map(
+ lambda x: [
+ self.measure.similarity(
+ features, self.feature_extractor.features(x)
+ ),
+ x,
+ ],
+ results,
+ )
+ )
+ return OrderedDict(
+ (
+ (name, score)
+ for score, name in sorted(
+ results_with_score, key=lambda x: (-x[0], x[1])
+ )
+ )
+ )
- def __min_overlap(self, query_size, candidate_feature_size, alpha):
- return self.measure.minimum_common_feature_count(query_size, candidate_feature_size, alpha)
+ def __min_overlap(
+ self, query_size: int, candidate_feature_size: int, alpha: float
+ ) -> int:
+ return self.measure.minimum_common_feature_count(
+ query_size, candidate_feature_size, alpha
+ )
- def __overlap_join(self, features, tau, candidate_feature_size):
+ def __overlap_join(
+ self, features: list[str], tau: int, candidate_feature_size: int
+ ) -> list[str]:
query_feature_size = len(features)
- sorted_features = sorted(features, key=lambda x: len(self.__lookup_strings_by_feature_set_size_and_feature(candidate_feature_size, x)))
- candidate_string_to_matched_count = defaultdict(int)
- results = []
- for feature in sorted_features[0:query_feature_size - tau + 1]:
- for s in self.__lookup_strings_by_feature_set_size_and_feature(candidate_feature_size, feature):
+ features_mapped_to_lookup_strings_sets = {
+ x: self.__lookup_strings_by_feature_set_size_and_feature(
+ candidate_feature_size, x
+ )
+ for x in features
+ }
+
+ features.sort(key=lambda x: len(features_mapped_to_lookup_strings_sets[x]))
+
+ # candidate_string_to_matched_count : dict[str,int] = defaultdict(int) # Only in 3.10 and later
+ candidate_string_to_matched_count: dict[str, int] = dict()
+ results = []
+ for feature in features[0 : query_feature_size - tau + 1]:
+ for s in features_mapped_to_lookup_strings_sets[feature]:
+ if s not in candidate_string_to_matched_count:
+ candidate_string_to_matched_count[s] = 0
candidate_string_to_matched_count[s] += 1
- for s in candidate_string_to_matched_count.keys():
+ # The next loop does not run for tau = 1, hence candidates are never checked, while all satisfies the criteria
+ if tau == 1:
+ results = list(candidate_string_to_matched_count.keys())
+
+ for (
+ candidate,
+ candidate_match_count,
+ ) in candidate_string_to_matched_count.items():
for i in range(query_feature_size - tau + 1, query_feature_size):
- feature = sorted_features[i]
- if s in self.__lookup_strings_by_feature_set_size_and_feature(candidate_feature_size, feature):
- candidate_string_to_matched_count[s] += 1
- if candidate_string_to_matched_count[s] >= tau:
- results.append(s)
+ feature = features[i]
+ if candidate in features_mapped_to_lookup_strings_sets[feature]:
+ candidate_match_count += 1
+ if candidate_match_count >= tau:
+ results.append(candidate)
break
remaining_feature_count = query_feature_size - i - 1
- if candidate_string_to_matched_count[s] + remaining_feature_count < tau:
+ if candidate_match_count + remaining_feature_count < tau:
break
+
return results
- def __lookup_strings_by_feature_set_size_and_feature(self, feature_size, feature):
- if not (feature in self.lookup_strings_result[feature_size]):
- self.lookup_strings_result[feature_size][feature] = self.db.lookup_strings_by_feature_set_size_and_feature(feature_size, feature)
+ def __lookup_strings_by_feature_set_size_and_feature(
+ self, feature_size: int, feature: str
+ ) -> set[str]:
+ if feature not in self.lookup_strings_result[feature_size]:
+ self.lookup_strings_result[feature_size][
+ feature
+ ] = self.db.lookup_strings_by_feature_set_size_and_feature(
+ feature_size, feature
+ )
return self.lookup_strings_result[feature_size][feature]
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/database/test_dbs.py b/tests/database/test_dbs.py
new file mode 100644
index 0000000..43ccac2
--- /dev/null
+++ b/tests/database/test_dbs.py
@@ -0,0 +1,65 @@
+## -*- coding:utf-8 -*-
+
+import pytest
+from simstring.database.dict import DictDatabase
+from simstring.database.disk import DiskDatabase
+from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
+import os
+import shutil
+from multiprocessing import Pool
+from faker import Faker
+import random
+
+# Set up Faker
+f = Faker()
+Faker.seed(0)
+
+# Fixture to create random strings
+@pytest.fixture
+def strings():
+ return [f.name().replace('-', ' ') for _ in range(100)]
+
+# Fixture for DictDatabase
+@pytest.fixture
+def dict_db(strings):
+ db = DictDatabase(CharacterNgramFeatureExtractor(2))
+ for string in strings:
+ db.add(string)
+ return db
+
+# Fixture for DiskDatabase with setup and teardown
+@pytest.fixture
+def disk_db(strings):
+ path = f"tmp_db_for_tests-{random.randint(1000,10000)}"
+ db = DiskDatabase(CharacterNgramFeatureExtractor(2), path=path)
+ with Pool(processes=8) as pool:
+ for _ in pool.imap_unordered(db.add, strings):
+ pass
+ yield db
+ shutil.rmtree(path, ignore_errors=True)
+
+# Test to compare the contents of dict_db and disk_db
+def test_strings(dict_db, disk_db):
+ assert set(dict_db.all()) == set(disk_db.all())
+
+# Test for equivalence from disk_db to dict_db
+def test_equivalence_disk_to_dict(dict_db, disk_db):
+
+ for key in disk_db.feature_set_size_to_string_map.iterkeys():
+ assert dict_db.feature_set_size_to_string_map[key] == disk_db.feature_set_size_to_string_map[key]
+
+ for key in disk_db.feature_set_size_and_feature_to_string_map.iterkeys():
+ disk_val = disk_db.feature_set_size_and_feature_to_string_map[key]
+ k1, k2 = key.split('-')
+ dict_val = dict_db.feature_set_size_and_feature_to_string_map[int(k1)][k2]
+ assert disk_val == dict_val
+
+# Test for equivalence from dict_db to disk_db
+def test_equivalence_dict_to_disk(dict_db, disk_db):
+ for size, value in dict_db.feature_set_size_and_feature_to_string_map.items():
+ for feature, dict_value in value.items():
+ disk_value = disk_db.get_feature_set_size_and_feature_to_string_map(size, feature)
+ assert dict_value == disk_value
+
+ for size, string_set in dict_db.feature_set_size_to_string_map.items():
+ assert string_set == disk_db.feature_set_size_to_string_map[size]
diff --git a/tests/database/test_dict.py b/tests/database/test_dict.py
index 9985ad7..db8a2af 100644
--- a/tests/database/test_dict.py
+++ b/tests/database/test_dict.py
@@ -1,27 +1,60 @@
# -*- coding:utf-8 -*-
-from unittest import TestCase
+import pytest
from simstring.database.dict import DictDatabase
from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
+import pickle
+import os
-class TestDict(TestCase):
- strings = ['a', 'ab', 'abc', 'abcd', 'abcde']
+# Sample strings used in multiple tests
+strings = ["a", "ab", "abc", "abcd", "abcde"]
- def setUp(self):
- self.db = DictDatabase(CharacterNgramFeatureExtractor(2))
- for string in self.strings:
- self.db.add(string)
+@pytest.fixture
+def db():
+ db = DictDatabase(CharacterNgramFeatureExtractor(2))
+ for string in strings:
+ db.add(string)
+ return db
- def test_strings(self):
- self.assertEqual(self.db.strings, self.strings)
+def test_strings(db):
+ assert sorted(db.all()) == sorted(strings)
- def test_min_feature_size(self):
- self.assertEqual(self.db.min_feature_size(), min(map(lambda x: len(x) + 1, self.strings)))
+def test_lookup_strings_by_feature_set_size_and_feature(db):
+ assert db.lookup_strings_by_feature_set_size_and_feature(4, "ab_1") == set(["abc"])
+ assert db.lookup_strings_by_feature_set_size_and_feature(3, "ab_1") == set(["ab"])
+ assert db.lookup_strings_by_feature_set_size_and_feature(2, "ab_1") == set([])
- def test_max_feature_size(self):
- self.assertEqual(self.db.max_feature_size(), max(map(lambda x: len(x) + 1, self.strings)))
+@pytest.fixture
+def setup_multistep_save(db):
+ with open("test.pkl", "wb") as f:
+ db.to_pickle(f)
+ yield "test.pkl"
+ os.remove("test.pkl")
- def test_lookup_strings_by_feature_set_size_and_feature(self):
- self.assertEqual(self.db.lookup_strings_by_feature_set_size_and_feature(4, 'ab'), set(['abc']))
- self.assertEqual(self.db.lookup_strings_by_feature_set_size_and_feature(3, 'ab'), set(['ab']))
- self.assertEqual(self.db.lookup_strings_by_feature_set_size_and_feature(2, 'ab'), set([]))
+def test_multistep_save(db, setup_multistep_save):
+ with open(setup_multistep_save, "rb") as f:
+ data2 = pickle.load(f)
+ new = DictDatabase.from_dict(data2)
+
+ assert db._min_feature_size == new._min_feature_size
+ assert db._max_feature_size == new._max_feature_size
+ assert db.feature_extractor.__class__ == new.feature_extractor.__class__
+ assert db.feature_extractor.n == new.feature_extractor.n
+ assert db.feature_set_size_to_string_map == new.feature_set_size_to_string_map
+ assert db.feature_set_size_and_feature_to_string_map == new.feature_set_size_and_feature_to_string_map
+
+@pytest.fixture
+def setup_compact_save(db):
+ db.save("test2.pkl")
+ yield "test2.pkl"
+ os.remove("test2.pkl")
+
+def test_compact_save(db, setup_compact_save):
+ new = DictDatabase.load(setup_compact_save)
+
+ assert db._min_feature_size == new._min_feature_size
+ assert db._max_feature_size == new._max_feature_size
+ assert db.feature_extractor.__class__ == new.feature_extractor.__class__
+ assert db.feature_extractor.n == new.feature_extractor.n
+ assert db.feature_set_size_to_string_map == new.feature_set_size_to_string_map
+ assert db.feature_set_size_and_feature_to_string_map == new.feature_set_size_and_feature_to_string_map
diff --git a/tests/database/test_disk.py b/tests/database/test_disk.py
new file mode 100644
index 0000000..b927c45
--- /dev/null
+++ b/tests/database/test_disk.py
@@ -0,0 +1,58 @@
+# -*- coding:utf-8 -*-
+
+import pytest
+import random
+import pickle
+import os
+import shutil
+from simstring.database.disk import DiskDatabase
+from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
+
+
+@pytest.fixture
+def disk_db():
+ # Setup the DiskDatabase with a random temporary path and add strings
+ db = DiskDatabase(CharacterNgramFeatureExtractor(2), path=f"tmp_db_for_tests-{random.randint(1000, 10000)}")
+ strings = ["a", "ab", "abc", "abcd", "abcde"]
+ for string in strings:
+ db.add(string)
+ yield db
+ # Teardown: Remove the database folder after test is completed
+ shutil.rmtree(db.path, ignore_errors=True)
+
+
+# Test case to check the strings stored in the database
+def test_strings(disk_db):
+ expected_strings = ["a", "ab", "abc", "abcd", "abcde"]
+ assert sorted(disk_db.all()) == sorted(expected_strings)
+
+
+# Test case for lookup functionality by feature set size and feature
+@pytest.mark.parametrize("feature_size, feature, expected_result", [
+ (4, "ab_1", {"abc"}),
+ (3, "ab_1", {"ab"}),
+ (2, "ab_1", set()),
+])
+def test_lookup_strings_by_feature_set_size_and_feature(disk_db, feature_size, feature, expected_result):
+ result = disk_db.lookup_strings_by_feature_set_size_and_feature(feature_size, feature)
+ assert result == expected_result
+
+
+# Test case to test saving and loading the database using pickle
+def test_load_from_folder(disk_db):
+ # Save the database to a pickle file
+ with open("test.pkl", "wb") as f:
+ pickle.dump(disk_db, f)
+
+ # Load the database from the pickle file
+ with open("test.pkl", "rb") as f:
+ loaded_db = pickle.load(f)
+
+ # Validate the features and mappings
+ assert disk_db.feature_extractor.__class__ == loaded_db.feature_extractor.__class__
+ assert disk_db.feature_extractor.n == loaded_db.feature_extractor.n
+ assert set(disk_db.feature_set_size_to_string_map.iterkeys()) == set(loaded_db.feature_set_size_to_string_map.iterkeys())
+ assert set(disk_db.feature_set_size_and_feature_to_string_map.iterkeys()) == set(loaded_db.feature_set_size_and_feature_to_string_map.iterkeys())
+
+ # Clean up the pickle file after test
+ os.remove("test.pkl")
diff --git a/tests/database/test_mongo.py b/tests/database/test_mongo.py.broken
similarity index 100%
rename from tests/database/test_mongo.py
rename to tests/database/test_mongo.py.broken
diff --git a/tests/feature_extractor/test_character_ngram.py b/tests/feature_extractor/test_character_ngram.py
index aead57e..521f4ec 100644
--- a/tests/feature_extractor/test_character_ngram.py
+++ b/tests/feature_extractor/test_character_ngram.py
@@ -1,12 +1,26 @@
# -*- coding:utf-8 -*-
-from unittest import TestCase
+import pytest
from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
-class TestNgram(TestCase):
- def test_features(self):
- self.assertEqual(CharacterNgramFeatureExtractor().features('abcde'), [' a', 'ab', 'bc', 'cd', 'de', 'e '])
- self.assertEqual(CharacterNgramFeatureExtractor(3).features('abcde'), [' ab', 'abc', 'bcd', 'cde', 'de '])
- self.assertEqual(CharacterNgramFeatureExtractor(4).features('abcde'), [' abc', 'abcd', 'bcde', 'cde '])
- self.assertEqual(CharacterNgramFeatureExtractor(5).features('abcde'), [' abcd', 'abcde', 'bcde '])
- self.assertEqual(CharacterNgramFeatureExtractor().features(u'あいうえお'), [' あ', 'あい', 'いう', 'うえ', 'えお', 'お ']) # Japanese
+@pytest.mark.parametrize("n, input_text, expected_features", [
+ (2, "abcde", ["$a_1", "ab_1", "bc_1", "cd_1", "de_1", "e$_1"]),
+ (3, "abcde", ["$$a_1", "$ab_1", "abc_1", "bcd_1", "cde_1", "de$_1", "e$$_1"]),
+ (2, "あいうえお", ["$あ_1", "あい_1", "いう_1", "うえ_1", "えお_1", "お$_1"]), # Japanese text
+ (2, "marc anthony", [
+ '$m_1', 'ma_1', 'ar_1', 'rc_1', 'c _1', ' a_1', 'an_1', 'nt_1', 'th_1',
+ 'ho_1', 'on_1', 'ny_1', 'y$_1'
+ ]),
+ (2, "anthony marc", [
+ '$a_1', 'an_1', 'nt_1', 'th_1', 'ho_1', 'on_1', 'ny_1', 'y _1',
+ ' m_1', 'ma_1', 'ar_1', 'rc_1', 'c$_1'
+ ])
+])
+def test_features(n, input_text, expected_features):
+ extractor = CharacterNgramFeatureExtractor(n)
+ assert extractor.features(input_text) == expected_features
+
+
+def test_endmarker():
+ c_end = CharacterNgramFeatureExtractor(endmarker=" ")
+ assert set(c_end.features("marc anthony")) == set(c_end.features("anthony marc"))
\ No newline at end of file
diff --git a/tests/feature_extractor/test_mecab_ngram.py b/tests/feature_extractor/test_mecab_ngram.py.broken
similarity index 100%
rename from tests/feature_extractor/test_mecab_ngram.py
rename to tests/feature_extractor/test_mecab_ngram.py.broken
diff --git a/tests/feature_extractor/test_word_ngram.py b/tests/feature_extractor/test_word_ngram.py
new file mode 100644
index 0000000..3506379
--- /dev/null
+++ b/tests/feature_extractor/test_word_ngram.py
@@ -0,0 +1,13 @@
+# -*- coding:utf-8 -*-
+
+import pytest
+from simstring.feature_extractor.word_ngram import WordNgramFeatureExtractor
+
+@pytest.mark.parametrize("n, input_text, expected_features", [
+ (2, "abcd", [(" ", "abcd"), ("abcd", " ")]),
+ (2, "hello world", [(" ", "hello"), ("hello", "world"), ("world", " ")]),
+ (3, "hello world", [(" ", "hello", "world"), ("hello", "world", " ")])
+])
+def test_features(n, input_text, expected_features):
+ extractor = WordNgramFeatureExtractor(n)
+ assert extractor.features(input_text) == expected_features
diff --git a/tests/measure/test_cosine.py b/tests/measure/test_cosine.py
index 195edc2..dbfab53 100644
--- a/tests/measure/test_cosine.py
+++ b/tests/measure/test_cosine.py
@@ -1,29 +1,49 @@
# -*- coding:utf-8 -*-
-from unittest import TestCase
+import pytest
from simstring.measure.cosine import CosineMeasure
-class TestCosine(TestCase):
- measure = CosineMeasure()
+# Instantiate the measure only once for all tests
+measure = CosineMeasure()
- def test_min_feature_size(self):
- self.assertEqual(self.measure.min_feature_size(5, 1.0), 5)
- self.assertEqual(self.measure.min_feature_size(5, 0.5), 2)
+@pytest.mark.parametrize("feature_count, similarity, expected_min_size", [
+ (5, 1.0, 5),
+ (5, 0.5, 2)
+])
+def test_min_feature_size(feature_count, similarity, expected_min_size):
+ assert measure.min_feature_size(feature_count, similarity) == expected_min_size
- def test_max_feature_size(self):
- self.assertEqual(self.measure.max_feature_size(5, 1.0), 5)
- self.assertEqual(self.measure.max_feature_size(5, 0.5), 20)
+@pytest.mark.parametrize("feature_count, similarity, expected_max_size", [
+ (5, 1.0, 5),
+ (5, 0.5, 20)
+])
+def test_max_feature_size(feature_count, similarity, expected_max_size):
+ assert measure.max_feature_size(feature_count, similarity) == expected_max_size
- def test_minimum_common_feature_count(self):
- self.assertEqual(self.measure.minimum_common_feature_count(5, 5, 1.0), 5)
- self.assertEqual(self.measure.minimum_common_feature_count(5, 20, 1.0), 10)
- self.assertEqual(self.measure.minimum_common_feature_count(5, 5, 0.5), 3)
+@pytest.mark.parametrize("x_size, y_size, similarity, expected_count", [
+ (5, 5, 1.0, 5),
+ (5, 20, 1.0, 10),
+ (5, 5, 0.5, 3)
+])
+def test_minimum_common_feature_count(x_size, y_size, similarity, expected_count):
+ assert measure.minimum_common_feature_count(x_size, y_size, similarity) == expected_count
- def test_similarity(self):
- x = [1, 2, 3]
- y = [1, 2, 3, 4]
- self.assertEqual(round(self.measure.similarity(x, x), 2), 1.0)
- self.assertEqual(round(self.measure.similarity(x, y), 2), 0.87)
-
- z = [1, 1, 2, 3]
- self.assertEqual(round(self.measure.similarity(z, z), 2), 1.0)
+@pytest.mark.parametrize("x, y, expected_similarity", [
+ (["a", "ab", "bc", "c"], ["a", "ab", "bc", "c"], 1.0),
+ (["a", "ab", "bc", "c"], ["a", "ab", "bc", "cd", "e"], 0.67),
+ (["a", "ab", "ba", "ab", "a"], ["a", "ab", "ba", "ab", "a"], 1.0),
+ (["a", "ab", "bc", "c"], ["a", "ab", "ba", "ab", "a"], 0.58),
+ (
+ [
+ ' "m', '"me', "met", "eth", "thy", "hyl", "yl ", "l s", " su", "sul",
+ "ulf", "lfo", "fon", "one", 'ne"', 'e" '
+ ],
+ [
+ ' "m', '"me', "met", "eth", "thy", "hyl", "yl ", "l s", " su", "sul",
+ "ulp", "lph", "pho", "hon", "one", 'ne"', 'e" '
+ ],
+ 0.79
+ )
+])
+def test_similarity(x, y, expected_similarity):
+ assert round(measure.similarity(x, y), 2) == expected_similarity
diff --git a/tests/measure/test_dice.py b/tests/measure/test_dice.py
index e835f70..53c1ee8 100644
--- a/tests/measure/test_dice.py
+++ b/tests/measure/test_dice.py
@@ -1,26 +1,37 @@
# -*- coding:utf-8 -*-
-from unittest import TestCase
+import pytest
from simstring.measure.dice import DiceMeasure
-class TestCosine(TestCase):
- measure = DiceMeasure()
+# Instantiate the measure once for all tests
+measure = DiceMeasure()
- def test_min_feature_size(self):
- self.assertEqual(self.measure.min_feature_size(5, 1.0), 5)
- self.assertEqual(self.measure.min_feature_size(5, 0.5), 2)
+@pytest.mark.parametrize("feature_count, similarity, expected_min_size", [
+ (5, 1.0, 5),
+ (5, 0.5, 2)
+])
+def test_min_feature_size(feature_count, similarity, expected_min_size):
+ assert measure.min_feature_size(feature_count, similarity) == expected_min_size
- def test_max_feature_size(self):
- self.assertEqual(self.measure.max_feature_size(5, 1.0), 5)
- self.assertEqual(self.measure.max_feature_size(5, 0.5), 15)
+@pytest.mark.parametrize("feature_count, similarity, expected_max_size", [
+ (5, 1.0, 5),
+ (5, 0.5, 15)
+])
+def test_max_feature_size(feature_count, similarity, expected_max_size):
+ assert measure.max_feature_size(feature_count, similarity) == expected_max_size
- def test_minimum_common_feature_count(self):
- self.assertEqual(self.measure.minimum_common_feature_count(5, 5, 1.0), 13)
- self.assertEqual(self.measure.minimum_common_feature_count(5, 20, 1.0), 50)
- self.assertEqual(self.measure.minimum_common_feature_count(5, 5, 0.5), 7)
+@pytest.mark.parametrize("x_size, y_size, similarity, expected_count", [
+ (5, 5, 1.0, 13),
+ (5, 20, 1.0, 50),
+ (5, 5, 0.5, 7)
+])
+def test_minimum_common_feature_count(x_size, y_size, similarity, expected_count):
+ assert measure.minimum_common_feature_count(x_size, y_size, similarity) == expected_count
- def test_similarity(self):
- x = [1, 2, 3]
- y = [1, 2, 3, 4]
- self.assertEqual(round(self.measure.similarity(x, x), 2), 1.0)
- self.assertEqual(round(self.measure.similarity(x, y), 2), 0.86)
+@pytest.mark.parametrize("x, y, expected_similarity", [
+ (["1", "2", "3"], ["1", "2", "3"], 1.0),
+ (["1", "2", "3"], ["1", "2", "3", "4"], 0.86),
+ (["ni", "ig", "gh", "ht"], ["na", "ac", "ch", "ht"], 0.25)
+])
+def test_similarity(x, y, expected_similarity):
+ assert round(measure.similarity(x, y), 2) == expected_similarity
diff --git a/tests/measure/test_jaccard.py b/tests/measure/test_jaccard.py
index 53ecd9d..0aec4d8 100644
--- a/tests/measure/test_jaccard.py
+++ b/tests/measure/test_jaccard.py
@@ -1,26 +1,37 @@
# -*- coding:utf-8 -*-
-from unittest import TestCase
+import pytest
from simstring.measure.jaccard import JaccardMeasure
-class TestCosine(TestCase):
- measure = JaccardMeasure()
+# Instantiate the measure once for all tests
+measure = JaccardMeasure()
- def test_min_feature_size(self):
- self.assertEqual(self.measure.min_feature_size(5, 1.0), 5)
- self.assertEqual(self.measure.min_feature_size(5, 0.5), 3)
+@pytest.mark.parametrize("feature_count, similarity, expected_min_size", [
+ (5, 1.0, 5),
+ (5, 0.5, 3)
+])
+def test_min_feature_size(feature_count, similarity, expected_min_size):
+ assert measure.min_feature_size(feature_count, similarity) == expected_min_size
- def test_max_feature_size(self):
- self.assertEqual(self.measure.max_feature_size(5, 1.0), 5)
- self.assertEqual(self.measure.max_feature_size(5, 0.5), 10)
+@pytest.mark.parametrize("feature_count, similarity, expected_max_size", [
+ (5, 1.0, 5),
+ (5, 0.5, 10)
+])
+def test_max_feature_size(feature_count, similarity, expected_max_size):
+ assert measure.max_feature_size(feature_count, similarity) == expected_max_size
- def test_minimum_common_feature_count(self):
- self.assertEqual(self.measure.minimum_common_feature_count(5, 5, 1.0), 5)
- self.assertEqual(self.measure.minimum_common_feature_count(5, 20, 1.0), 13)
- self.assertEqual(self.measure.minimum_common_feature_count(5, 5, 0.5), 4)
+@pytest.mark.parametrize("x_size, y_size, similarity, expected_count", [
+ (5, 5, 1.0, 5),
+ (5, 20, 1.0, 13),
+ (5, 5, 0.5, 4)
+])
+def test_minimum_common_feature_count(x_size, y_size, similarity, expected_count):
+ assert measure.minimum_common_feature_count(x_size, y_size, similarity) == expected_count
- def test_similarity(self):
- x = [1, 2, 3]
- y = [1, 2, 3, 4]
- self.assertEqual(round(self.measure.similarity(x, x), 2), 1.0)
- self.assertEqual(round(self.measure.similarity(x, y), 2), 0.75)
+@pytest.mark.parametrize("x, y, expected_similarity", [
+ (["1", "2", "3"], ["1", "2", "3"], 1.0),
+ (["1", "2", "3"], ["1", "2", "3", "4"], 0.75),
+ (["A", "AB", "BC", "C"], ["B", "BC", "CD", "DE", "E"], 0.125)
+])
+def test_similarity(x, y, expected_similarity):
+ assert round(measure.similarity(x, y), 3) == expected_similarity
diff --git a/tests/measure/test_overlap.py b/tests/measure/test_overlap.py
new file mode 100644
index 0000000..571b2fd
--- /dev/null
+++ b/tests/measure/test_overlap.py
@@ -0,0 +1,76 @@
+# -*- coding:utf-8 -*-
+
+import pytest
+from simstring.measure.overlap import OverlapMeasure, LeftOverlapMeasure
+
+maxsize = 5
+
+# Initialize measure instances for both overlap types
+overlap_measure = OverlapMeasure(maxsize=maxsize)
+left_overlap_measure = LeftOverlapMeasure(maxsize=maxsize)
+
+# Test cases for OverlapMeasure
+@pytest.mark.parametrize("feature_count, similarity, expected_min_size", [
+ (5, 1.0, 5),
+ (5, 0.5, 2)
+])
+def test_overlap_min_feature_size(feature_count, similarity, expected_min_size):
+ assert overlap_measure.min_feature_size(feature_count, similarity) == expected_min_size
+
+@pytest.mark.parametrize("feature_count, similarity, expected_max_size", [
+ (5, 1.0, maxsize),
+ (5, 0.5, maxsize)
+])
+def test_overlap_max_feature_size(feature_count, similarity, expected_max_size):
+ assert overlap_measure.max_feature_size(feature_count, similarity) == expected_max_size
+
+@pytest.mark.parametrize("x_size, y_size, similarity, expected_count", [
+ (5, 5, 1.0, 5),
+ (5, 20, 1.0, 5),
+ (5, 5, 0.5, 3)
+])
+def test_overlap_minimum_common_feature_count(x_size, y_size, similarity, expected_count):
+ assert overlap_measure.minimum_common_feature_count(x_size, y_size, similarity) == expected_count
+
+@pytest.mark.parametrize("x, y, expected_similarity", [
+ ([1, 2, 3], [1, 2, 3], 3),
+ ([1, 2, 3], [1, 2, 3, 4], 3),
+ ([1, 2, 3], [1, 1, 2, 3], 3),
+ ([1, 2, 3, 4], [1, 1, 2, 3], 3),
+ ([1, 1, 2, 3], [1, 1, 2, 3], 3)
+])
+def test_overlap_similarity(x, y, expected_similarity):
+ assert round(overlap_measure.similarity(x, y), 2) == expected_similarity
+
+# Test cases for LeftOverlapMeasure
+@pytest.mark.parametrize("feature_count, similarity, expected_min_size", [
+ (5, 1.0, 5),
+ (5, 0.5, 2)
+])
+def test_left_overlap_min_feature_size(feature_count, similarity, expected_min_size):
+ assert left_overlap_measure.min_feature_size(feature_count, similarity) == expected_min_size
+
+@pytest.mark.parametrize("feature_count, similarity, expected_max_size", [
+ (5, 1.0, maxsize),
+ (5, 0.5, maxsize)
+])
+def test_left_overlap_max_feature_size(feature_count, similarity, expected_max_size):
+ assert left_overlap_measure.max_feature_size(feature_count, similarity) == expected_max_size
+
+@pytest.mark.parametrize("x_size, y_size, similarity, expected_count", [
+ (5, 5, 1.0, 5),
+ (5, 20, 1.0, 5),
+ (5, 5, 0.5, 2)
+])
+def test_left_overlap_minimum_common_feature_count(x_size, y_size, similarity, expected_count):
+ assert left_overlap_measure.minimum_common_feature_count(x_size, y_size, similarity) == expected_count
+
+@pytest.mark.parametrize("x, y, expected_similarity", [
+ ([1, 2, 3], [1, 2, 3], 1.0),
+ ([1, 2, 3], [1, 2, 3, 4], 1.0),
+ ([1, 2, 3], [1, 1, 2, 3], 1.0),
+ ([1, 2, 3, 4], [1, 1, 2, 3], 0.75),
+ ([1, 1, 2, 3], [1, 1, 2, 3], 1.0)
+])
+def test_left_overlap_similarity(x, y, expected_similarity):
+ assert round(left_overlap_measure.similarity(x, y), 2) == expected_similarity
diff --git a/tests/test_searcher.py b/tests/test_searcher.py
index cb910cd..bb426f8 100644
--- a/tests/test_searcher.py
+++ b/tests/test_searcher.py
@@ -1,22 +1,142 @@
# -*- coding:utf-8 -*-
-from unittest import TestCase
+import pytest
+from collections import OrderedDict
from simstring.searcher import Searcher
from simstring.database.dict import DictDatabase
from simstring.measure.cosine import CosineMeasure
+from simstring.measure.jaccard import JaccardMeasure
from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
-class TestSearcher(TestCase):
- strings = ['a', 'ab', 'abc', 'abcd', 'abcde']
- def setUp(self):
- db = DictDatabase(CharacterNgramFeatureExtractor(2))
- for string in self.strings:
- db.add(string)
- self.searcher = Searcher(db, CosineMeasure())
+@pytest.fixture
+def cosine_searcher():
+ strings = ["a", "ab", "abc", "abcd", "abcde"]
+ db = DictDatabase(CharacterNgramFeatureExtractor(2))
+ for string in strings:
+ db.add(string)
+ return Searcher(db, CosineMeasure())
- def test_search(self):
- self.assertEqual(self.searcher.search('a', 1.0), ['a'])
- self.assertEqual(self.searcher.search('ab', 1.0), ['ab'])
- self.assertEqual(self.searcher.search('ab', 0.9), ['ab'])
- self.assertEqual(self.searcher.search('ab', 0.5), ['ab', 'abc', 'abcd'])
+
+@pytest.fixture
+def ranked_cosine_searcher():
+ db = DictDatabase(CharacterNgramFeatureExtractor(2))
+ db.add("foo")
+ db.add("bar")
+ db.add("fooo")
+ db.add("food")
+ db.add("fool")
+ db.add("follow")
+ return Searcher(db, CosineMeasure())
+
+
+@pytest.fixture
+def ranked_cosine_long_searcher():
+ db = DictDatabase(CharacterNgramFeatureExtractor(2))
+ db.add("Amerikaplads 38 2200 Denmark")
+ db.add("Viktoriagade 8E 1655 Denmark")
+ db.add("Vesterbrogade 13 1655 Denmark")
+ return Searcher(db, CosineMeasure())
+
+
+@pytest.fixture
+def ranked_jaccard_searcher():
+ db = DictDatabase(CharacterNgramFeatureExtractor(2))
+ db.add("foo")
+ db.add("bar")
+ db.add("fooo")
+ db.add("food")
+ db.add("fool")
+ db.add("follow")
+ return Searcher(db, JaccardMeasure())
+
+
+# Test cases for CosineMeasure searcher
+@pytest.mark.parametrize("query, threshold, expected_results", [
+ ("a", 1.0, ["a"]),
+ ("ab", 0.5, ["ab", "abc", "abcd"]),
+ ("ab", 1.0, ["ab"]),
+ ("ab", 0.9, ["ab"]),
+ ("abc", 1.0, ["abc"]),
+ ("abc", 0.9, ["abc"]),
+ ("abcd", 1.0, ["abcd"]),
+ ("abcd", 0.9, ["abcd"])
+])
+def test_searcher(cosine_searcher, query, threshold, expected_results):
+ assert cosine_searcher.search(query, threshold) == expected_results
+
+
+@pytest.mark.parametrize("query, threshold, expected_ranked_results", [
+ ("abcd", 1.0, OrderedDict({"abcd": 1.0})),
+ ("ab", 0.41, OrderedDict({
+ "ab": 1.0,
+ "abc": 0.5773502691896258,
+ "abcd": 0.5163977794943222,
+ "abcde": 0.47140452079103173
+ }))
+])
+def test_ranked_search(cosine_searcher, query, threshold, expected_ranked_results):
+ assert cosine_searcher.ranked_search(query, threshold) == expected_ranked_results
+
+
+# Test cases for RankedSearchCosine
+@pytest.mark.parametrize("query, threshold, expected_results", [
+ ("fo", 0.5, OrderedDict({
+ "foo": 0.8660254037844387,
+ "fooo": 0.7745966692414834,
+ "food": 0.5163977794943222,
+ "fool": 0.5163977794943222
+ })),
+ ("fo", 0.6, OrderedDict({
+ "foo": 0.8660254037844387,
+ "fooo": 0.7745966692414834
+ }))
+])
+def test_ranked_search_example(ranked_cosine_searcher, query, threshold, expected_results):
+ assert ranked_cosine_searcher.ranked_search(query, threshold) == expected_results
+
+
+# Test cases for RankedSearchCosine with longer addresses
+@pytest.mark.parametrize("query, threshold, expected_results", [
+ ("Vesterbrogade 15 1655 Denmark", 0.7, OrderedDict({
+ "Vesterbrogade 13 1655 Denmark": 0.9333333333333333
+ }))
+])
+def test_ranked_search_example_long(ranked_cosine_long_searcher, query, threshold, expected_results):
+ assert ranked_cosine_long_searcher.ranked_search(query, threshold) == expected_results
+
+
+# Test cases for RankedSearchJaccard
+@pytest.mark.parametrize("query, threshold, expected_results", [
+ ("fo", 0.5, OrderedDict({"foo": 0.75, "fooo": 0.6})),
+ ("fo", 0.3, OrderedDict({
+ "foo": 0.75,
+ "fooo": 0.6,
+ "food": 0.3333333333333333,
+ "fool": 0.3333333333333333
+ }))
+])
+def test_ranked_search_jaccard(ranked_jaccard_searcher, query, threshold, expected_results):
+ assert ranked_jaccard_searcher.ranked_search(query, threshold) == expected_results
+
+
+def test_deteminism():
+ db = DictDatabase(CharacterNgramFeatureExtractor(2))
+ db.add("fo")
+ db.add("foo")
+ db.add("fooo")
+ db.add("foooo")
+ db.add("fooooo")
+ db.add("foooooo")
+ db.add("fooooooo")
+ db.add("foooooooo")
+ db.add("fooooooooo")
+ db.add("foooooooooo")
+ db.add("fooooooooooo")
+ db.add("foooooooooooo")
+ db.add("fooooooooooooo")
+ db.add("foooooooooooooo")
+ db.add("fooooooooooooooo")
+ searcher = Searcher(db, CosineMeasure())
+ result = searcher.search("foo", 0.8)
+ # breakpoint()
\ No newline at end of file