diff --git a/beans/src/main/java/fr/pilato/elasticsearch/crawler/fs/beans/Attributes.java b/beans/src/main/java/fr/pilato/elasticsearch/crawler/fs/beans/Attributes.java index 9012cbbf2..31e15f2db 100644 --- a/beans/src/main/java/fr/pilato/elasticsearch/crawler/fs/beans/Attributes.java +++ b/beans/src/main/java/fr/pilato/elasticsearch/crawler/fs/beans/Attributes.java @@ -19,6 +19,12 @@ package fr.pilato.elasticsearch.crawler.fs.beans; +import fr.pilato.elasticsearch.crawler.fs.framework.FileAcl; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + /** * Represents additional file attributes. */ @@ -27,6 +33,7 @@ public class Attributes { private String owner; private String group; private int permissions; + private List acl; public String getOwner() { return owner; @@ -51,4 +58,12 @@ public int getPermissions() { public void setPermissions(int permissions) { this.permissions = permissions; } + + public List getAcl() { + return acl; + } + + public void setAcl(List acl) { + this.acl = acl; + } } diff --git a/beans/src/main/java/fr/pilato/elasticsearch/crawler/fs/beans/Folder.java b/beans/src/main/java/fr/pilato/elasticsearch/crawler/fs/beans/Folder.java index 923971d35..365061f5c 100644 --- a/beans/src/main/java/fr/pilato/elasticsearch/crawler/fs/beans/Folder.java +++ b/beans/src/main/java/fr/pilato/elasticsearch/crawler/fs/beans/Folder.java @@ -32,6 +32,7 @@ public class Folder { private Path path; private File file; + private Attributes attributes; public Folder() { path = new Path(); @@ -78,4 +79,12 @@ public File getFile() { public void setFile(File file) { this.file = file; } + + public Attributes getAttributes() { + return attributes; + } + + public void setAttributes(Attributes attributes) { + this.attributes = attributes; + } } diff --git a/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParserAbstract.java b/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParserAbstract.java index bc2fbf95d..c764ce1d3 100644 
--- a/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParserAbstract.java +++ b/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParserAbstract.java @@ -45,6 +45,7 @@ import java.time.temporal.ChronoUnit; import java.util.ArrayList; import java.util.Collection; +import java.util.List; import static fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.*; import static fr.pilato.elasticsearch.crawler.fs.framework.JsonUtil.asMap; @@ -493,6 +494,12 @@ private void indexFile(FileAbstractModel fileAbstractModel, ScanStatistic stats, if (fileAbstractModel.getPermissions() >= 0) { doc.getAttributes().setPermissions(fileAbstractModel.getPermissions()); } + if (fsSettings.getFs().isAclSupport()) { + List fileAcls = fileAbstractModel.getAcls(); + if (!fileAcls.isEmpty()) { + doc.getAttributes().setAcl(fileAcls); + } + } } // Attributes @@ -634,6 +641,26 @@ private void indexDirectory(String path, String rootPath) throws Exception { getModificationTime(folderInfo), getLastAccessTime(folderInfo)); + if (fsSettings.getFs().isAttributesSupport() && (fsSettings.getServer() == null || PROTOCOL.LOCAL.equals(fsSettings.getServer().getProtocol()))) { + Attributes attributes = new Attributes(); + attributes.setOwner(getOwnerName(folderInfo)); + attributes.setGroup(getGroupName(folderInfo)); + int permissions = getFilePermissions(folderInfo); + if (permissions >= 0) { + attributes.setPermissions(permissions); + } + if (fsSettings.getFs().isAclSupport()) { + List folderAcls = getFileAcls(folderInfo.toPath()); + if (!folderAcls.isEmpty()) { + attributes.setAcl(folderAcls); + } + } + + if (attributes.getOwner() != null || attributes.getGroup() != null || attributes.getAcl() != null || permissions >= 0) { + folder.setAttributes(attributes); + } + } + indexDirectory(SignTool.sign(path), folder); } diff --git a/crawler/crawler-abstract/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/FileAbstractModel.java 
b/crawler/crawler-abstract/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/FileAbstractModel.java index e894fe654..7b645e906 100644 --- a/crawler/crawler-abstract/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/FileAbstractModel.java +++ b/crawler/crawler-abstract/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/FileAbstractModel.java @@ -38,8 +38,10 @@ package fr.pilato.elasticsearch.crawler.fs.crawler; +import fr.pilato.elasticsearch.crawler.fs.framework.FileAcl; import java.time.LocalDateTime; +import java.util.List; public class FileAbstractModel { private final String name; @@ -55,9 +57,11 @@ public class FileAbstractModel { private final String group; private final int permissions; private final String extension; + private final List acls; public FileAbstractModel(String name, boolean file, LocalDateTime lastModifiedDate, LocalDateTime creationDate, LocalDateTime accessDate, - String extension, String path, String fullpath, long size, String owner, String group, int permissions) { + String extension, String path, String fullpath, long size, String owner, String group, int permissions, + List acls) { this.name = name; this.file = file; this.directory = !file; @@ -71,6 +75,7 @@ public FileAbstractModel(String name, boolean file, LocalDateTime lastModifiedDa this.group = group; this.permissions = permissions; this.extension = extension; + this.acls = acls; } public String getName() { @@ -125,6 +130,10 @@ public String getExtension() { return extension; } + public List getAcls() { + return acls; + } + @Override public String toString() { return "FileAbstractModel{" + "name='" + name + '\'' + @@ -137,6 +146,7 @@ public String toString() { ", owner='" + owner + '\'' + ", group='" + group + '\'' + ", permissions=" + permissions + + ", acls=" + acls + ", extension='" + extension + '\'' + ", fullpath='" + fullpath + '\'' + ", size=" + size + diff --git 
a/crawler/crawler-fs/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/fs/FileAbstractorFile.java b/crawler/crawler-fs/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/fs/FileAbstractorFile.java index a2d1991c3..20132ef6d 100644 --- a/crawler/crawler-fs/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/fs/FileAbstractorFile.java +++ b/crawler/crawler-fs/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/fs/FileAbstractorFile.java @@ -34,6 +34,7 @@ import java.nio.file.Paths; import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.Comparator; import java.util.stream.Stream; @@ -73,7 +74,8 @@ public FileAbstractModel toFileAbstractModel(String path, File file) { file.length(), getOwnerName(file), getGroupName(file), - getFilePermissions(file)); + getFilePermissions(file), + fsSettings.getFs().isAclSupport() ? getFileAcls(file.toPath()) : Collections.emptyList()); } @Override diff --git a/crawler/crawler-ftp/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/ftp/FileAbstractorFTP.java b/crawler/crawler-ftp/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/ftp/FileAbstractorFTP.java index 441e67989..42e211551 100644 --- a/crawler/crawler-ftp/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/ftp/FileAbstractorFTP.java +++ b/crawler/crawler-ftp/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/ftp/FileAbstractorFTP.java @@ -108,7 +108,8 @@ public FileAbstractModel toFileAbstractModel(String path, FTPFile file) { file.getSize(), file.getUser(), file.getGroup(), - FTPUtils.getFilePermissions(file)); + FTPUtils.getFilePermissions(file), + Collections.emptyList()); } @Override diff --git a/crawler/crawler-ssh/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/ssh/FileAbstractorSSH.java b/crawler/crawler-ssh/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/ssh/FileAbstractorSSH.java index d4da386e9..ffaff726f 100644 --- 
a/crawler/crawler-ssh/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/ssh/FileAbstractorSSH.java +++ b/crawler/crawler-ssh/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/ssh/FileAbstractorSSH.java @@ -33,6 +33,7 @@ import java.time.LocalDateTime; import java.time.ZoneId; import java.util.Collection; +import java.util.Collections; import java.util.Comparator; import java.util.function.Predicate; import java.util.stream.Collectors; @@ -82,7 +83,8 @@ public FileAbstractModel toFileAbstractModel(String path, SftpClient.DirEntry fi file.getAttributes().getSize(), Integer.toString(file.getAttributes().getUserId()), Integer.toString(file.getAttributes().getGroupId()), - file.getAttributes().getPermissions()); + file.getAttributes().getPermissions(), + Collections.emptyList()); } @Override diff --git a/docs/source/admin/fs/index.rst b/docs/source/admin/fs/index.rst index 9a8e78bd8..0f15cee57 100644 --- a/docs/source/admin/fs/index.rst +++ b/docs/source/admin/fs/index.rst @@ -122,6 +122,9 @@ The job file (``~/.fscrawler/test/_settings.yaml``) for the job name ``test`` mu # inlcude user/group of file only if needed attributes_support: false + # collect ACL metadata when available + acl_support: false + # do you REALLY want to store every file as a copy in the index ? 
Then set this to true store_source: false diff --git a/docs/source/admin/fs/local-fs.rst b/docs/source/admin/fs/local-fs.rst index 4d5a2b9e9..89b372ddf 100644 --- a/docs/source/admin/fs/local-fs.rst +++ b/docs/source/admin/fs/local-fs.rst @@ -30,6 +30,8 @@ Here is a list of Local FS settings (under ``fs.`` prefix)`: +----------------------------+-----------------------+---------------------------------+ | ``fs.attributes_support`` | ``false`` | `Adding file attributes`_ | +----------------------------+-----------------------+---------------------------------+ +| ``fs.acl_support`` | ``false`` | `Collecting ACL metadata`_ | ++----------------------------+-----------------------+---------------------------------+ | ``fs.raw_metadata`` | ``false`` | `Enabling raw metadata`_ | +----------------------------+-----------------------+---------------------------------+ | ``fs.filename_as_id`` | ``false`` | :ref:`filename-as-id` | @@ -422,6 +424,22 @@ and ``attributes.permissions``, you can set ``attributes_support`` to ``true``. On Windows systems, ``attributes.group`` and ``attributes.permissions`` are not generated. +Collecting ACL metadata +^^^^^^^^^^^^^^^^^^^^^^^ + +To extract NTFS access control entries (principal, type, permissions and flags), +enable both ``attributes_support`` and ``acl_support``: + +.. code:: yaml + + name: "test" + fs: + attributes_support: true + acl_support: true + +When ``acl_support`` is disabled, FSCrawler skips resolving ACLs even if +``attributes_support`` is active. 
+ Enabling raw metadata ^^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/source/admin/fs/rest.rst b/docs/source/admin/fs/rest.rst index 14ab40880..252fb574f 100644 --- a/docs/source/admin/fs/rest.rst +++ b/docs/source/admin/fs/rest.rst @@ -59,6 +59,7 @@ It will give you a response similar to: "store_source" : false, "index_content" : true, "attributes_support" : false, + "acl_support" : false, "raw_metadata" : true, "xml_support" : false, "index_folders" : true, diff --git a/elasticsearch-client/src/main/java/fr/pilato/elasticsearch/crawler/fs/client/ElasticsearchClient.java b/elasticsearch-client/src/main/java/fr/pilato/elasticsearch/crawler/fs/client/ElasticsearchClient.java index 8bec31508..6236f5a29 100644 --- a/elasticsearch-client/src/main/java/fr/pilato/elasticsearch/crawler/fs/client/ElasticsearchClient.java +++ b/elasticsearch-client/src/main/java/fr/pilato/elasticsearch/crawler/fs/client/ElasticsearchClient.java @@ -545,6 +545,7 @@ public void createIndexAndComponentTemplates() throws Exception { // If needed, we create the component and index templates for the folder index if (settings.getFs().isIndexFolders()) { logger.debug("Creating/updating component templates for [{}]", settings.getElasticsearch().getIndexFolder()); + loadAndPushComponentTemplate(majorVersion, "fscrawler_mapping_attributes", settings.getElasticsearch().getIndexFolder()); loadAndPushComponentTemplate(majorVersion, "fscrawler_mapping_file", settings.getElasticsearch().getIndexFolder()); loadAndPushComponentTemplate(majorVersion, "fscrawler_mapping_path", settings.getElasticsearch().getIndexFolder()); diff --git a/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/7/_component_templates/fscrawler_mapping_attributes.json b/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/7/_component_templates/fscrawler_mapping_attributes.json index fcfb9543d..00b11bbbd 100644 --- 
a/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/7/_component_templates/fscrawler_mapping_attributes.json +++ b/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/7/_component_templates/fscrawler_mapping_attributes.json @@ -7,6 +7,22 @@ "group": { "type": "keyword" }, + "acl": { + "properties": { + "flags": { + "type": "keyword" + }, + "permissions": { + "type": "keyword" + }, + "principal": { + "type": "keyword" + }, + "type": { + "type": "keyword" + } + } + }, "owner": { "type": "keyword" } diff --git a/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/7/_index_templates/fscrawler_folders.json b/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/7/_index_templates/fscrawler_folders.json index 2a879746c..9e431aa00 100644 --- a/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/7/_index_templates/fscrawler_folders.json +++ b/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/7/_index_templates/fscrawler_folders.json @@ -4,6 +4,7 @@ ], "priority": 600, "composed_of": [ + "fscrawler_INDEX_NAME_mapping_attributes", "fscrawler_INDEX_NAME_mapping_file", "fscrawler_INDEX_NAME_mapping_path" ], diff --git a/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/8/_component_templates/fscrawler_mapping_attributes.json b/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/8/_component_templates/fscrawler_mapping_attributes.json index fcfb9543d..00b11bbbd 100644 --- a/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/8/_component_templates/fscrawler_mapping_attributes.json +++ b/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/8/_component_templates/fscrawler_mapping_attributes.json @@ -7,6 +7,22 @@ "group": { "type": "keyword" }, + "acl": { + "properties": { + 
"flags": { + "type": "keyword" + }, + "permissions": { + "type": "keyword" + }, + "principal": { + "type": "keyword" + }, + "type": { + "type": "keyword" + } + } + }, "owner": { "type": "keyword" } diff --git a/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/8/_index_templates/fscrawler_folders.json b/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/8/_index_templates/fscrawler_folders.json index 2a879746c..9e431aa00 100644 --- a/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/8/_index_templates/fscrawler_folders.json +++ b/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/8/_index_templates/fscrawler_folders.json @@ -4,6 +4,7 @@ ], "priority": 600, "composed_of": [ + "fscrawler_INDEX_NAME_mapping_attributes", "fscrawler_INDEX_NAME_mapping_file", "fscrawler_INDEX_NAME_mapping_path" ], diff --git a/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/9/_component_templates/fscrawler_mapping_attributes.json b/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/9/_component_templates/fscrawler_mapping_attributes.json index fcfb9543d..00b11bbbd 100644 --- a/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/9/_component_templates/fscrawler_mapping_attributes.json +++ b/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/9/_component_templates/fscrawler_mapping_attributes.json @@ -7,6 +7,22 @@ "group": { "type": "keyword" }, + "acl": { + "properties": { + "flags": { + "type": "keyword" + }, + "permissions": { + "type": "keyword" + }, + "principal": { + "type": "keyword" + }, + "type": { + "type": "keyword" + } + } + }, "owner": { "type": "keyword" } diff --git a/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/9/_index_templates/fscrawler_folders.json 
b/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/9/_index_templates/fscrawler_folders.json index 2a879746c..9e431aa00 100644 --- a/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/9/_index_templates/fscrawler_folders.json +++ b/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/9/_index_templates/fscrawler_folders.json @@ -4,6 +4,7 @@ ], "priority": 600, "composed_of": [ + "fscrawler_INDEX_NAME_mapping_attributes", "fscrawler_INDEX_NAME_mapping_file", "fscrawler_INDEX_NAME_mapping_path" ], diff --git a/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/99/_component_templates/fscrawler_mapping_attributes.json b/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/99/_component_templates/fscrawler_mapping_attributes.json index fcfb9543d..00b11bbbd 100644 --- a/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/99/_component_templates/fscrawler_mapping_attributes.json +++ b/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/99/_component_templates/fscrawler_mapping_attributes.json @@ -7,6 +7,22 @@ "group": { "type": "keyword" }, + "acl": { + "properties": { + "flags": { + "type": "keyword" + }, + "permissions": { + "type": "keyword" + }, + "principal": { + "type": "keyword" + }, + "type": { + "type": "keyword" + } + } + }, "owner": { "type": "keyword" } diff --git a/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/99/_index_templates/fscrawler_folders.json b/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/99/_index_templates/fscrawler_folders.json index 2a879746c..9e431aa00 100644 --- a/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/99/_index_templates/fscrawler_folders.json +++ 
b/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/99/_index_templates/fscrawler_folders.json @@ -4,6 +4,7 @@ ], "priority": 600, "composed_of": [ + "fscrawler_INDEX_NAME_mapping_attributes", "fscrawler_INDEX_NAME_mapping_file", "fscrawler_INDEX_NAME_mapping_path" ], diff --git a/framework/src/main/java/fr/pilato/elasticsearch/crawler/fs/framework/FileAcl.java b/framework/src/main/java/fr/pilato/elasticsearch/crawler/fs/framework/FileAcl.java new file mode 100644 index 000000000..2349600fa --- /dev/null +++ b/framework/src/main/java/fr/pilato/elasticsearch/crawler/fs/framework/FileAcl.java @@ -0,0 +1,78 @@ +/* + * Licensed to David Pilato (the "Author") under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Author licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package fr.pilato.elasticsearch.crawler.fs.framework; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * Represents a single Access Control List entry for a file. 
+ */ +public class FileAcl { + + private String principal; + private String type; + private List permissions; + private List flags; + + public FileAcl() { + // Default constructor needed for serialization + } + + public FileAcl(String principal, String type, List permissions, List flags) { + this.principal = principal; + this.type = type; + this.permissions = permissions != null ? new ArrayList<>(permissions) : null; + this.flags = flags != null ? new ArrayList<>(flags) : null; + } + + public String getPrincipal() { + return principal; + } + + public void setPrincipal(String principal) { + this.principal = principal; + } + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public List getPermissions() { + return permissions != null ? Collections.unmodifiableList(permissions) : null; + } + + public void setPermissions(List permissions) { + this.permissions = permissions != null ? new ArrayList<>(permissions) : null; + } + + public List getFlags() { + return flags != null ? Collections.unmodifiableList(flags) : null; + } + + public void setFlags(List flags) { + this.flags = flags != null ? 
new ArrayList<>(flags) : null; + } +} diff --git a/framework/src/main/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtil.java b/framework/src/main/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtil.java index 24d5a47cd..0af2d415e 100644 --- a/framework/src/main/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtil.java +++ b/framework/src/main/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtil.java @@ -36,6 +36,7 @@ import java.util.*; import java.util.function.Function; import java.util.regex.Pattern; +import java.util.stream.Collectors; public class FsCrawlerUtil { public static final String INDEX_SUFFIX_FOLDER = "_folder"; @@ -196,20 +197,62 @@ public static String computeRealPathName(String _dirname, String filename) { } public static String computeVirtualPathName(String rootPath, String realPath) { - String result = getPathSeparator(rootPath); - if (realPath.startsWith(rootPath) && realPath.length() > rootPath.length()) { - if (rootPath.equals("/")) { - // "/" is very common for FTP - result = realPath; + if (rootPath == null || realPath == null) { + return getPathSeparator(rootPath); + } + + String defaultSeparator = getPathSeparator(rootPath); + String result = defaultSeparator; + + String normalizedRoot = normalizeForComparison(rootPath); + String normalizedReal = normalizeForComparison(realPath); + + if (!normalizedRoot.isEmpty()) { + boolean matches; + if (OsValidator.WINDOWS) { + matches = normalizedReal.regionMatches(true, 0, normalizedRoot, 0, normalizedRoot.length()); } else { - result = realPath.substring(rootPath.length()); + matches = normalizedReal.startsWith(normalizedRoot); } + + if (matches && hasBoundary(normalizedRoot, normalizedReal)) { + if ("/".equals(normalizedRoot)) { + // "/" is very common for FTP + result = normalizedReal; + } else { + String suffix = normalizedReal.substring(normalizedRoot.length()); + result = suffix.isEmpty() ? 
defaultSeparator : suffix; + } + } + } + + if (!"/".equals(defaultSeparator)) { + result = result.replace('/', defaultSeparator.charAt(0)); } logger.debug("computeVirtualPathName({}, {}) = {}", rootPath, realPath, result); return result; } + private static String normalizeForComparison(String path) { + if (path == null) { + return ""; + } + String normalized = path.replace('\\', '/'); + return normalized; + } + + private static boolean hasBoundary(String normalizedRoot, String normalizedReal) { + if (normalizedReal.length() == normalizedRoot.length() || normalizedRoot.endsWith("/")) { /* a root that already ends with '/' (e.g. "/" or "C:/") is its own boundary */ + return true; + } + if (normalizedReal.length() > normalizedRoot.length()) { + char c = normalizedReal.charAt(normalizedRoot.length()); + return c == '/'; + } + return false; + } + private static LocalDateTime getFileTime(File file, Function timeFunction) { try { Path path = Paths.get(file.getAbsolutePath()); @@ -280,7 +323,7 @@ public static String getOwnerName(final File file) { */ public static String getGroupName(final File file) { if (OsValidator.WINDOWS) { - logger.trace("Determining 'group' is skipped for file [{}] on [{}]", file, OsValidator.OS); + logger.debug("Determining 'group' is skipped for file [{}] on [{}]", file, OsValidator.OS); return null; } try { @@ -326,6 +369,46 @@ public static int getFilePermissions(final File file) { } } + /** + * Determines Access Control List entries for the given file. 
+ */ + public static List getFileAcls(final Path path) { + logger.trace("Resolving ACLs for [{}]", path); + try { + final AclFileAttributeView aclView = Files.getFileAttributeView(path, AclFileAttributeView.class); + if (aclView == null) { + logger.trace("Determining 'acl' is skipped for file [{}] as ACL view is not supported", path); + return Collections.emptyList(); + } + + final List aclEntries = aclView.getAcl(); + if (aclEntries == null || aclEntries.isEmpty()) { + logger.trace("No ACL entries found for [{}]", path); + return Collections.emptyList(); + } + + final List result = new ArrayList<>(aclEntries.size()); + for (AclEntry entry : aclEntries) { + final String principal = entry.principal() != null ? entry.principal().getName() : null; + final String type = entry.type() != null ? entry.type().name() : null; + final List permissions = entry.permissions().stream() + .map(AclEntryPermission::name) + .sorted() + .collect(Collectors.toList()); + final List flags = entry.flags().stream() + .map(AclEntryFlag::name) + .sorted() + .collect(Collectors.toList()); + result.add(new FileAcl(principal, type, permissions, flags)); + } + logger.debug("ACL entries found for [{}]: {}", path, result); + return result; + } catch (Exception e) { + logger.debug("Failed to determine 'acl' of {}: {}", path, e); + return Collections.emptyList(); + } + } + public static int toOctalPermission(boolean read, boolean write, boolean execute) { return (read ? 4 : 0) + (write ? 2 : 0) + (execute ? 
1 : 0); } diff --git a/framework/src/test/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtilTest.java b/framework/src/test/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtilTest.java index 83d1061e2..80ed4cd1a 100644 --- a/framework/src/test/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtilTest.java +++ b/framework/src/test/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtilTest.java @@ -22,6 +22,7 @@ import fr.pilato.elasticsearch.crawler.fs.test.framework.AbstractFSCrawlerTestCase; import java.time.LocalDateTime; import java.util.Date; +import java.util.List; import java.util.TimeZone; import org.apache.logging.log4j.LogManager; @@ -79,6 +80,16 @@ public void permissions() { assertThat(permissions).isEqualTo(700); } + @Test + public void aclEntries() { + List aclEntries = FsCrawlerUtil.getFileAcls(file.toPath()); + if (OsValidator.WINDOWS) { + assertThat(aclEntries).as("ACL entries should exist on Windows").isNotEmpty(); + } else { + assertThat(aclEntries).as("ACL entries should be empty when ACL view is not supported").isEmpty(); + } + } + @Test public void isFileSizeUnderLimit() { assertThat(FsCrawlerUtil.isFileSizeUnderLimit(ByteSizeValue.parseBytesSizeValue("1mb"), 1)).isTrue(); diff --git a/framework/src/test/java/fr/pilato/elasticsearch/crawler/fs/framework/JsonUtilTest.java b/framework/src/test/java/fr/pilato/elasticsearch/crawler/fs/framework/JsonUtilTest.java index 87c4268d8..0cbb434ae 100644 --- a/framework/src/test/java/fr/pilato/elasticsearch/crawler/fs/framework/JsonUtilTest.java +++ b/framework/src/test/java/fr/pilato/elasticsearch/crawler/fs/framework/JsonUtilTest.java @@ -61,7 +61,13 @@ public void jsonPath() { " \"owner\":\"dpilato\",\n" + " \"group\":\"staff\",\n" + " \"permissions\":644,\n" + - " \"foobar\":null\n" + + " \"foobar\":null,\n" + + " \"acl\":[{\n" + + " \"principal\":\"dpilato\",\n" + + " \"type\":\"ALLOW\",\n" + + " \"permissions\":[\"READ_DATA\"],\n" + + " 
\"flags\":[\"FILE_INHERIT\"]\n" + + " }]\n" + " }\n" + "}"; @@ -69,6 +75,8 @@ public void jsonPath() { assertThat((String) context.read("$.attributes.owner")).isEqualTo("dpilato"); assertThat((Integer) context.read("$.attributes.permissions")).isEqualTo(644); assertThat((Integer) context.read("$.attributes.foobar")).isNull(); + assertThat((String) context.read("$.attributes.acl[0].principal")).isEqualTo("dpilato"); + assertThat((String) context.read("$.attributes.acl[0].type")).isEqualTo("ALLOW"); } public static class Country { diff --git a/integration-tests/src/test/java/fr/pilato/elasticsearch/crawler/fs/test/integration/elasticsearch/FsCrawlerTestAttributesIT.java b/integration-tests/src/test/java/fr/pilato/elasticsearch/crawler/fs/test/integration/elasticsearch/FsCrawlerTestAttributesIT.java index 5343b11a9..b8f8036ec 100644 --- a/integration-tests/src/test/java/fr/pilato/elasticsearch/crawler/fs/test/integration/elasticsearch/FsCrawlerTestAttributesIT.java +++ b/integration-tests/src/test/java/fr/pilato/elasticsearch/crawler/fs/test/integration/elasticsearch/FsCrawlerTestAttributesIT.java @@ -29,6 +29,8 @@ import fr.pilato.elasticsearch.crawler.fs.test.integration.AbstractFsCrawlerITCase; import org.junit.Test; +import java.util.List; + import static fr.pilato.elasticsearch.crawler.fs.framework.JsonUtil.parseJsonAsDocumentContext; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; @@ -57,4 +59,32 @@ public void attributes() throws Exception { } } } + + @Test + public void aclAttributes() throws Exception { + FsSettings fsSettings = createTestSettings(); + fsSettings.getFs().setAttributesSupport(true); + fsSettings.getFs().setAclSupport(true); + crawler = startCrawler(fsSettings); + + ESSearchResponse searchResponse = countTestHelper(new ESSearchRequest().withIndex(getCrawlerName()), 1L, null); + for (ESSearchHit hit : searchResponse.getHits()) { + DocumentContext document = 
parseJsonAsDocumentContext(hit.getSource()); + + List aclEntries; + try { + aclEntries = document.read("$.attributes.acl"); + } catch (PathNotFoundException e) { + aclEntries = null; + } + + if (OsValidator.WINDOWS) { + assertThat(aclEntries).as("ACL metadata should be collected on Windows").isNotNull().isNotEmpty(); + assertThat((String) document.read("$.attributes.acl[0].principal")).isNotBlank(); + assertThat((String) document.read("$.attributes.acl[0].type")).isNotBlank(); + } else { + assertThat(aclEntries).as("ACL metadata should not be present when the platform does not expose ACLs").isNullOrEmpty(); + } + } + } } diff --git a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java index 8ec81f2d3..c2c07bd2a 100644 --- a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java +++ b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java @@ -65,6 +65,8 @@ public class Fs { @Config(defaultVal = "false") private boolean attributesSupport; @Config(defaultVal = "false") + private boolean aclSupport; + @Config(defaultVal = "false") private boolean storeSource; @Config(defaultVal = "true") private boolean indexContent; @@ -198,6 +200,14 @@ public void setAttributesSupport(boolean attributesSupport) { this.attributesSupport = attributesSupport; } + public boolean isAclSupport() { + return aclSupport; + } + + public void setAclSupport(boolean aclSupport) { + this.aclSupport = aclSupport; + } + public boolean isRawMetadata() { return rawMetadata; } @@ -307,6 +317,7 @@ public boolean equals(Object o) { storeSource == fs.storeSource && indexContent == fs.indexContent && attributesSupport == fs.attributesSupport && + aclSupport == fs.aclSupport && rawMetadata == fs.rawMetadata && xmlSupport == fs.xmlSupport && indexFolders == fs.indexFolders && @@ -328,7 +339,7 @@ public boolean equals(Object o) { @Override public int hashCode() { return 
Objects.hash(url, updateRate, includes, excludes, filters, jsonSupport, filenameAsId, addFilesize, - removeDeleted, addAsInnerObject, storeSource, indexContent, indexedChars, attributesSupport, rawMetadata, xmlSupport, + removeDeleted, addAsInnerObject, storeSource, indexContent, indexedChars, attributesSupport, aclSupport, rawMetadata, xmlSupport, checksum, indexFolders, langDetect, continueOnError, ocr, ignoreAbove, followSymlinks, tikaConfigPath); } @@ -348,6 +359,7 @@ public String toString() { ", indexContent=" + indexContent + ", indexedChars=" + indexedChars + ", attributesSupport=" + attributesSupport + + ", aclSupport=" + aclSupport + ", rawMetadata=" + rawMetadata + ", xmlSupport=" + xmlSupport + ", checksum='" + checksum + '\'' + diff --git a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/FsCrawlerValidator.java b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/FsCrawlerValidator.java index de2f68fdc..ac20f555b 100644 --- a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/FsCrawlerValidator.java +++ b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/FsCrawlerValidator.java @@ -89,6 +89,10 @@ public static boolean validateSettings(Logger logger, FsSettings settings) { logger.info("attributes_support is set to true but getting group is not available on [{}].", OsValidator.OS); } + if (settings.getFs().isAclSupport() && !settings.getFs().isAttributesSupport()) { + logger.warn("acl_support is set to true but attributes_support is false. 
ACL collection is disabled."); + } + return false; } } diff --git a/settings/src/main/resources/fr/pilato/elasticsearch/crawler/fs/settings/fscrawler-default.properties b/settings/src/main/resources/fr/pilato/elasticsearch/crawler/fs/settings/fscrawler-default.properties index 7de1abfc3..ec2af78a9 100644 --- a/settings/src/main/resources/fr/pilato/elasticsearch/crawler/fs/settings/fscrawler-default.properties +++ b/settings/src/main/resources/fr/pilato/elasticsearch/crawler/fs/settings/fscrawler-default.properties @@ -10,6 +10,7 @@ fs.xml_support=false fs.add_as_inner_object=false fs.index_folders=true fs.attributes_support=false +fs.acl_support=false fs.raw_metadata=false fs.filename_as_id=false fs.add_filesize=true diff --git a/settings/src/main/resources/fr/pilato/elasticsearch/crawler/fs/settings/fscrawler-default.yaml b/settings/src/main/resources/fr/pilato/elasticsearch/crawler/fs/settings/fscrawler-default.yaml index 63081ec64..e53038cbc 100644 --- a/settings/src/main/resources/fr/pilato/elasticsearch/crawler/fs/settings/fscrawler-default.yaml +++ b/settings/src/main/resources/fr/pilato/elasticsearch/crawler/fs/settings/fscrawler-default.yaml @@ -47,6 +47,9 @@ # include user/group of file only if needed #attributes_support: false + # optional: collect file ACL entries when available (requires attributes_support) + #acl_support: false + # optional: this will try to detect the language of the extracted content #lang_detect: false diff --git a/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsLoaderTest.java b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsLoaderTest.java index 7bea4c9ac..3ff23557a 100644 --- a/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsLoaderTest.java +++ b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsLoaderTest.java @@ -130,6 +130,7 @@ private void testLoadFullSettings(String jobName) throws IOException { 
expected.getFs().setIndexContent(false); expected.getFs().setAddFilesize(false); expected.getFs().setAttributesSupport(true); + expected.getFs().setAclSupport(true); expected.getFs().setLangDetect(true); expected.getFs().setStoreSource(true); expected.getFs().setIndexedChars(new Percentage(10000.0)); diff --git a/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java index 4f631b9b9..dce8dcb0c 100644 --- a/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java +++ b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java @@ -97,4 +97,12 @@ public void parseSettingsWithStaticMetadata() throws IOException { fsSettings.getTags().setStaticMetaFilename("/path/to/metadatafile.yml"); settingsTester(fsSettings); } + + @Test + public void parseSettingsWithAclSupport() throws IOException { + FsSettings fsSettings = FsSettingsLoader.load(); + fsSettings.getFs().setAttributesSupport(true); + fsSettings.getFs().setAclSupport(true); + settingsTester(fsSettings); + } } diff --git a/settings/src/test/resources/config/json-full/_settings.json b/settings/src/test/resources/config/json-full/_settings.json index c955bf332..7e2648a70 100644 --- a/settings/src/test/resources/config/json-full/_settings.json +++ b/settings/src/test/resources/config/json-full/_settings.json @@ -25,6 +25,7 @@ "index_content": false, "add_filesize": false, "attributes_support": true, + "acl_support": true, "lang_detect": true, "store_source": true, "indexed_chars": "10000.0", diff --git a/settings/src/test/resources/config/yaml-full/_settings.yaml b/settings/src/test/resources/config/yaml-full/_settings.yaml index 55cc8e5ed..8b43a7288 100644 --- a/settings/src/test/resources/config/yaml-full/_settings.yaml +++ b/settings/src/test/resources/config/yaml-full/_settings.yaml @@ -47,6 +47,9 @@ fs: # include 
user/group of file only if needed attributes_support: true + # optional: collect file ACL entries when available (requires attributes_support) + acl_support: true + # optional: this will try to detect the language of the extracted content lang_detect: true diff --git a/settings/src/test/resources/config/yaml-split/_settings/fscrawler-split-fs.test.yml b/settings/src/test/resources/config/yaml-split/_settings/fscrawler-split-fs.test.yml index 0606752d6..cb9e0d32a 100644 --- a/settings/src/test/resources/config/yaml-split/_settings/fscrawler-split-fs.test.yml +++ b/settings/src/test/resources/config/yaml-split/_settings/fscrawler-split-fs.test.yml @@ -44,6 +44,9 @@ fs: # include user/group of file only if needed attributes_support: true + # optional: collect file ACL entries when available (requires attributes_support) + acl_support: true + # optional: this will try to detect the language of the extracted content lang_detect: true