diff --git a/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParserAbstract.java b/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParserAbstract.java index ecd26e342..a19becfe6 100644 --- a/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParserAbstract.java +++ b/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParserAbstract.java @@ -469,6 +469,9 @@ private void indexFile(FileAbstractModel fileAbstractModel, ScanStatistic stats, } else if (fsSettings.getFs().isXmlSupport()) { // https://github.com/dadoonet/fscrawler/issues/185 : Support Xml files doc.setObject(XmlDocParser.generateMap(inputStream)); + } else if (fsSettings.getFs().isSkipTika()) { + // https://github.com/dadoonet/fscrawler/issues/846 : Skip Tika parser + doc.setContent(inputStreamToString(inputStream)); } else { // Extracting content with Tika TikaDocParser.generate(fsSettings, inputStream, filename, fullFilename, doc, messageDigest, filesize); @@ -592,4 +595,26 @@ private void esDelete(String index, String id) { } } + /** + * Read the stream as UTF-8 text. Line terminators stripped by readLine() are re-added as '\n' so adjacent lines do not run together. + * + * @param inputStream stream to read; it is not closed by this method + * @return the decoded content, or whatever was read before an IOException occurred (the error is logged) + */ + private String inputStreamToString(InputStream inputStream) { + InputStreamReader isReader = new InputStreamReader(inputStream, java.nio.charset.StandardCharsets.UTF_8); + BufferedReader reader = new BufferedReader(isReader); + StringBuilder sb = new StringBuilder(); + String str; + try { + while ((str = reader.readLine()) != null) { + sb.append(str).append('\n'); + } + } catch (IOException e) { + logger.error("Failed to read InputStream: {}", e.getMessage()); + logger.trace("Failed to read InputStream.", e); + } + + return sb.toString(); + } } diff --git a/docs/source/admin/fs/index.rst b/docs/source/admin/fs/index.rst index 65fd8ad18..2a50b3104 100644 --- a/docs/source/admin/fs/index.rst +++ b/docs/source/admin/fs/index.rst @@ -19,6 +19,7 @@ The job file must comply to the following ``yaml`` specifications: add_filesize: true remove_deleted: true add_as_inner_object: false + skip_tika: false store_source: true 
index_content: true indexed_chars: "10000.0" diff --git a/docs/source/admin/fs/local-fs.rst b/docs/source/admin/fs/local-fs.rst index ab7a8f822..bade4ab17 100644 --- a/docs/source/admin/fs/local-fs.rst +++ b/docs/source/admin/fs/local-fs.rst @@ -54,6 +54,8 @@ Here is a list of Local FS settings (under ``fs.`` prefix)`: +----------------------------+-----------------------+---------------------------------+ | ``fs.follow_symlinks`` | ``false`` | `Follow Symlinks`_ | +----------------------------+-----------------------+---------------------------------+ +| ``fs.skip_tika`` | ``false`` | `Skip Tika`_ | ++----------------------------+-----------------------+---------------------------------+ .. _root-directory: @@ -747,3 +749,16 @@ If you want FSCrawler to follow the symbolic links, you need to be explicit abou name: "test" fs: follow_symlink: true + +Skip Tika +^^^^^^^^^^^^^^^ + +.. versionadded:: 2.7 + +If you do not want FSCrawler to parse your documents with Tika, set ``skip_tika`` to ``true``. The raw text content of each file is then indexed as-is, without being parsed by Tika. + +.. 
code:: yaml + + name: "test" + fs: + skip_tika: true diff --git a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java index 3aef0eb33..bffd65c14 100644 --- a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java +++ b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java @@ -58,6 +58,7 @@ public class Fs { private Ocr ocr = new Ocr(); private ByteSizeValue ignoreAbove = null; private boolean followSymlinks = false; + private boolean skipTika = false; public static Builder builder() { return new Builder(); @@ -91,6 +92,7 @@ public static class Builder { private Ocr ocr = new Ocr(); private ByteSizeValue ignoreAbove = null; private boolean followSymlinks = false; + private boolean skipTika = false; public Builder setUrl(String url) { this.url = url; @@ -246,10 +248,15 @@ public Builder setFollowSymlinks(boolean followSymlinks) { return this; } + public Builder setSkipTika(boolean skipTika) { + this.skipTika = skipTika; + return this; + } + public Fs build() { return new Fs(url, updateRate, includes, excludes, filters, jsonSupport, filenameAsId, addFilesize, removeDeleted, addAsInnerObject, storeSource, indexedChars, indexContent, attributesSupport, rawMetadata, - checksum, xmlSupport, indexFolders, langDetect, continueOnError, ocr, ignoreAbove, followSymlinks); + checksum, xmlSupport, indexFolders, langDetect, continueOnError, ocr, ignoreAbove, followSymlinks, skipTika); } } @@ -260,7 +267,7 @@ public Fs( ) { private Fs(String url, TimeValue updateRate, List includes, List excludes, List filters, boolean jsonSupport, boolean filenameAsId, boolean addFilesize, boolean removeDeleted, boolean addAsInnerObject, boolean storeSource, Percentage indexedChars, boolean indexContent, boolean attributesSupport, boolean rawMetadata, String checksum, boolean xmlSupport, - boolean indexFolders, boolean langDetect, boolean continueOnError, 
Ocr ocr, ByteSizeValue ignoreAbove, boolean followSymlinks) { + boolean indexFolders, boolean langDetect, boolean continueOnError, Ocr ocr, ByteSizeValue ignoreAbove, boolean followSymlinks, boolean skipTika) { this.url = url; this.updateRate = updateRate; this.includes = includes; @@ -284,6 +291,7 @@ private Fs(String url, TimeValue updateRate, List includes, List this.ocr = ocr; this.ignoreAbove = ignoreAbove; this.followSymlinks = followSymlinks; + this.skipTika = skipTika; } public String getUrl() { @@ -478,6 +486,10 @@ public void setIgnoreAbove(ByteSizeValue ignoreAbove) { this.ignoreAbove = ignoreAbove; } + public boolean isSkipTika() { + return skipTika; + } + public boolean isFollowSymlinks() { return followSymlinks; } @@ -548,6 +560,7 @@ public String toString() { ", ocr=" + ocr + ", ignoreAbove=" + ignoreAbove + ", followSymlinks=" + followSymlinks + + ", skipTika=" + skipTika + '}'; } }