├── .gitignore ├── CHANGELOG.md ├── DEBIAN │   ├── conffiles │   ├── control │   └── postinst ├── README.md ├── config.dist.yml ├── es-index-mapping.json ├── fs2es-indexer ├── fs2es-indexer.service ├── lib │   ├── ChangesWatcher │   │   ├── AuditLogChangesWatcher.py │   │   ├── ChangesWatcher.py │   │   ├── FanotifyChangesWatcher.py │   │   └── __init__.py │   ├── Fs2EsIndexer.py │   └── __init__.py ├── role.yml ├── rsyslog-smbd_audit.conf └── samba-audit-logrotate.conf /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | lib/pycache/ 3 | lib/__pycache__/ 4 | config.yml 5 | /workingdata/ 6 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # FileSystem To Elastic Search Indexer Changelog 2 | 3 | ## 0.11.0 4 | - You can now use fanotify (via pyfanotify) to watch for filesystem changes instead of parsing a custom samba audit.log file 5 | - This will capture all changes in the filesystem, even those not made by samba 6 | - See the README.md on how to set up this changes watcher 7 | - Removed the venv stuff from the `DEBIAN/postinst` file, because this should be done during packaging 8 | - Switch to the `logging` module completely, no more `print()` 9 | 10 | ## 0.10.0 11 | - Handle log rotation gracefully 12 | - Add a log rotation configuration suggestion 13 | - Add more documentation hints for the audit log monitoring (thanks @hondaspb!) 14 | 15 | ## 0.9.1 16 | - Provide a summary for the new action "analyze_index" stating whether the index must be recreated or not. 17 | 18 | ## 0.9.0 19 | - Add "lowercase" and "asciifolding" filters to the elasticsearch analyzer to make the search query case-insensitive. 20 | - Example: 21 | - You have a file named "My_wonderful_Document.pdf" with a capital D on Document. 22 | - Old behaviour: searching for "document" wouldn't get you any results. You'd have to search for "Document" 23 | - New behaviour: searching for "document" (in fact any case) will give you the one result. 24 | - A **full** reindex is necessary because these new filters only run during the indexing of a document and not after the fact. 25 | - This will be detected and done automatically. 26 | 27 | ## 0.8.0 28 | - Change the tokenizer of the elasticsearch index to our own in order to split the filename correctly into tokens. Explanation: 29 | - During indexing the tokenizer of elasticsearch splits the filename into (multiple) words (called "tokens" here). The normal tokenizer of elasticsearch does not interpret underscore ("_") as a word boundary! 30 | - The samba spotlight search works at the start of a word: it matches elasticsearch documents that have a token starting with the search term. 31 | - Example: 32 | - You have a file named "My_wonderful_document.pdf" 33 | - Old behaviour: 34 | - Elasticsearch splits this filename into 2 tokens: "My_wonderful_document" and "pdf" 35 | - Searching for "wonderful" wouldn't yield any results, because no token starts with "wonderful"! 36 | - New behaviour: 37 | - Elasticsearch splits this filename into 4 tokens: "My", "wonderful", "document" and "pdf" 38 | - Searching for "wonderful" would return the file, because its 2nd token starts with "wonderful". 39 | - A **full** reindex is necessary because the tokenizer only runs during the indexing of a document and not after the fact. 40 | - This will be detected and done automatically.
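The following sketch is purely illustrative (it is not part of the indexer): it mimics the new `simple_pattern` tokenizer with the pattern `[a-zA-Z0-9]+` that fs2es-indexer 0.8.0+ configures for the index (see `lib/Fs2EsIndexer.py`):

```python
import re

# Illustrative only: mimic the simple_pattern tokenizer "[a-zA-Z0-9]+"
# that fs2es-indexer 0.8.0+ configures for the elasticsearch index.
def tokenize(filename: str) -> list:
    """ Split a filename into the tokens elasticsearch would index. """
    return re.findall(r'[a-zA-Z0-9]+', filename)

print(tokenize('My_wonderful_document.pdf'))
# ['My', 'wonderful', 'document', 'pdf']
```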
41 | 42 | ## 0.7.1 43 | - add "--system-site-packages" to the creation of the venv to enable the access to the system packages (e. g. yaml) 44 | 45 | ## 0.7.0 46 | - Switch to installation in a virtual env 47 | - Changes in the debian packaging scripts and README only. No changes in functionality 48 | 49 | ## 0.6.0 50 | - Major rewrite of the indexer! 51 | - Instead of indexing all paths each time to elasticsearch (which takes a lot of time), the indexer will now retrieve 52 | which paths are already in ES and only add new ones and remove deleted ones. 53 | - This will massively speed up indexing runs (from ~ 20 min to ~ 1 min for 2 million paths) 54 | - Sadly the indexed paths need to be saved in the indexer (~ 500 MiB RAM usage for 2 million paths) 55 | - Removed the ability to add more metadata into ES (like filesize and last_modified), because 56 | - they are unused by Samba, 57 | - they slow down the indexer 58 | - and are incompatible with the aforementioned indexing algorithm. 59 | - Changed some mapping for the elasticsearch index. It will be automatically recreated if it's incompatible. 60 | - New feature: monitor the samba audit log during the wait_time! 61 | - See README.md for more information 62 | - Add `-v` or `--verbose` to a CLI call to get more information. 63 | 64 | ## 0.5.0 65 | - Instead of using the setuptools we're now using pip to install the dependencies 66 | - See README.md for more info. 67 | 68 | ## 0.4.9 69 | - revert change from 0.4.7: the format for "time" is once again "long" 70 | - report how long the bulk import into elasticsearch took 71 | - fields "last_modified" and "filesize" are not used yet by Samba 72 | - You can disable indexing them by setting `elasticsearch.add_additional_fields` to `false` (the default) 73 | - Enabling this has a non-zero performance impact and is (currently) not useful 74 | 75 | ## 0.4.8 76 | - fix error in "/opt/fs2es-indexer/es-index-mapping.json" 77 | 78 | ## 0.4.7 79 | - Put the elasticsearch index mapping into an extra file: "/opt/fs2es-indexer/es-index-mapping.json" 80 | - This path is configurable in the config.yml via the `elasticsearch.index_mapping` key 81 | - Change the "time" flag to an unsigned_long (because epoch can never be negative) and round it 82 | - Round the mtime of the files 83 | 84 | ## 0.4.6 85 | - Instead of blowing up the log, it will now dump the documents to a json file in /tmp 86 | - You can enable / disable this behavior in the config.yml via the `dump_documents_on_error` key (default is false) 87 | 88 | ## 0.4.5 89 | - Print the documents if the indexing into elasticsearch failed 90 | 91 | ## 0.4.4 92 | - Output the "objects indexed" count with a space as thousands separator 93 | 94 | ## 0.4.3 95 | - Add the last modified date to the index for samba / finder to display correct values 96 | - Output the "objects indexed" count with a thousands separator 97 | 98 | ## 0.4.2 99 | - Add "errors=surrogatepass" for path.encode() to properly treat UTF surrogate characters on some filesystems 100 | 101 | ## 0.4.1 102 | - Add the files for the debian packaging to this repo 103 | 104 | ## 0.4.0 105 | - Switch to ES-Lib v8 for ElasticSearch 8.0+ 106 | - Add configuration for which library version is currently in use 107 | - Fix problems in ES-Lib v8 108 | - Add README.md section on how to enable the user authentication 109 | - Remove "use_ssl" from the ES-constructor and from the configuration 110 | 111 | ## 0.3.5 112 | - Don't throw an error and abort if a file is deleted during indexing (2nd try) 113 | 114 | ## 0.3.4 115 | - 
Don't throw an error and abort if a file is deleted during indexing 116 | - Fix searching with elasticsearch lib 7 117 | 118 | ## 0.3.3 119 | - add options for connecting to elasticsearch via SSL 120 | 121 | ## 0.3.2 122 | - add `file.filesize` to elasticsearch index, so that Spotlight can see it 123 | - add `fs2es-indexer search --search-filename "my-document.pdf"` to search the index 124 | - (For the future) add an internal version switch for the elasticsearch library (current version 7 can access ES server 7+) 125 | 126 | ## 0.3.1 127 | - remove positional calls to the ElasticSearch lib methods to make it compatible with lib version 8.0 128 | 129 | ## 0.3.0 130 | - added config for exclusions (e.g. macOS Index files or the trash folder) 131 | 132 | ## 0.2.7 133 | - almost all config options can now be omitted and standard values will be used (exception: "directories") 134 | 135 | ## 0.2.6 136 | - print duration of whole indexing run 137 | 138 | ## 0.2.5 139 | - increase default bulk size 140 | 141 | ## 0.2.4 142 | - add more automatic retries 143 | 144 | ## 0.2.3 145 | - fix missing variable 146 | 147 | ## 0.2.2 148 | - Change application flow: first index all directories then clear old documents 149 | 150 | ## 0.2.1 151 | - add debug messages for connection problems 152 | 153 | ## 0.2.0 154 | - First public version 155 | -------------------------------------------------------------------------------- /DEBIAN/conffiles: -------------------------------------------------------------------------------- 1 | /etc/fs2es-indexer/config.yml 2 | -------------------------------------------------------------------------------- /DEBIAN/control: -------------------------------------------------------------------------------- 1 | Package: fs2es-indexer 2 | Version: {version} 3 | Section: comm 4 | Priority: optional 5 | Architecture: all 6 | Depends: python3-yaml, python3-venv 7 | Maintainer: Ellerhold IT 8 | Description: fs2es-indexer 9 | This tool indexes your directories into an ElasticSearch index and prepares them for searching via macOS Spotlight 10 | search in a samba file server. 11 | -------------------------------------------------------------------------------- /DEBIAN/postinst: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | /usr/bin/systemctl daemon-reload 6 | if /usr/bin/systemctl is-active --quiet fs2es-indexer.service; 7 | then 8 | /usr/bin/systemctl restart fs2es-indexer.service 9 | fi 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FileSystem To ElasticSearch Indexer 2 | 3 | This tool indexes your files and directories into an ElasticSearch index and prepares them for searching 4 | via macOS Spotlight search in a samba file server. 5 | 6 | ## Installation 7 | 8 | Install the dependencies: 9 | - Python3 (Debian package: `python3`) 10 | - PyYAML (Debian package: `python3-yaml`) 11 | - Python-ElasticSearch v8 or higher (Use a venv - see below) 12 | - Optional: Package `pyfanotify` (Use a venv - see below) if you want to use the fanotify changes watcher 13 | - a running ElasticSearch instance v8 or higher (see [ElasticSearch installation](https://www.elastic.co/guide/en/elasticsearch/reference/current/install-elasticsearch.html#install-elasticsearch)) 14 | 15 | And download the content of this repo to a directory (e. g. `/opt/fs2es-indexer`).
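Before continuing, you can quickly verify that the prerequisite ElasticSearch instance is actually reachable. This is an illustrative sketch only; it assumes the default URL `http://localhost:9200` from `config.dist.yml`, and with security enabled an HTTP 401 answer still proves the server is up:

```python
#!/usr/bin/env python3
# Illustrative check that ElasticSearch is reachable (not part of the indexer).
import urllib.error
import urllib.request

try:
    with urllib.request.urlopen('http://localhost:9200', timeout=5) as response:
        print('ElasticSearch is up: %s' % response.read().decode('utf-8'))
except urllib.error.HTTPError as err:
    # The server answered but wants credentials - it is up nonetheless.
    print('ElasticSearch answered with HTTP %d' % err.code)
except (urllib.error.URLError, OSError) as err:
    print('ElasticSearch is NOT reachable: %s' % err)
```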
16 | 17 | ### Installation in a virtual env 18 | 19 | Debian does not like it if you use a plain `pip install`, because you'd pollute the standard python environment with 20 | your dependencies. 21 | 22 | They recommend this (cleaner) way: 23 | 24 | ```bash 25 | # Install the venv module (if you don't have it already) 26 | apt install python3-venv 27 | 28 | # Create a virtual env for our dependencies, but enable the access to the system packages 29 | python3 -m venv --system-site-packages /opt/fs2es-indexer/ 30 | 31 | # Install our dependencies in this virtual env only 32 | /opt/fs2es-indexer/bin/pip3 install 'elasticsearch>=8,<9' 33 | 34 | # Optional if you want to use the fanotify watcher 35 | apt install python3-dev 36 | /opt/fs2es-indexer/bin/pip3 install 'pyfanotify' 37 | 38 | # Use our new virtual env to run the indexer 39 | /opt/fs2es-indexer/bin/python3 /opt/fs2es-indexer/fs2es-indexer 40 | ``` 41 | 42 | ### Configuration 43 | 44 | Copy the `config.dist.yml` to `/etc/fs2es-indexer/config.yml` and tweak it to your heart's content! 45 | 46 | You have to configure which directories should be indexed and the URL & credentials for your ElasticSearch instance. 47 | 48 | ### Running it 49 | 50 | ```bash 51 | # Index the configured directories once 52 | /opt/fs2es-indexer/fs2es-indexer index 53 | 54 | # Index the configured directories, wait for the specified amount of time and index again 55 | # Continuously! 56 | /opt/fs2es-indexer/fs2es-indexer daemon 57 | 58 | # Deletes all documents in the elasticsearch index 59 | /opt/fs2es-indexer/fs2es-indexer clear 60 | 61 | # You can test the Spotlight search with this indexer! 62 | 63 | # Shows the first 100 elasticsearch documents 64 | /opt/fs2es-indexer/fs2es-indexer search --search-path /srv/samba 65 | 66 | # Searches elasticsearch documents with a match on all attributes: 67 | /opt/fs2es-indexer/fs2es-indexer search --search-path /srv/samba --search-term "my-doc.pdf" 68 | 69 | # Searches elasticsearch documents with a match on the filename: 70 | /opt/fs2es-indexer/fs2es-indexer search --search-path /srv/samba --search-filename "my-doc.pdf" 71 | 72 | # Displays some help texts 73 | /opt/fs2es-indexer/fs2es-indexer --help 74 | ``` 75 | 76 | ### SystemD service 77 | 78 | You can use the `/opt/fs2es-indexer/fs2es-indexer.service` in order to register the daemon-mode as a SystemD service. 79 | 80 | ## Configuration of Samba 81 | Add this to your `[global]` section in your `smb.conf`: 82 | ```ini 83 | spotlight backend = elasticsearch 84 | elasticsearch:address = 127.0.0.1 85 | elasticsearch:port = 9200 86 | elasticsearch:ignore unknown attribute = yes 87 | elasticsearch:ignore unknown type = yes 88 | ``` 89 | 90 | If your elasticsearch instance is not on the local machine, use the correct IP address above. 91 | 92 | The last 2 options are entirely optional, but sometimes macOS sends queries with some weird attributes and types. The 93 | default behavior is then to fail the whole search. 94 | If you set both to "yes", samba will use what it can from the query and try the search regardless. So you may get 95 | results which you seemingly excluded. 96 | 97 | ## User authentication 98 | 99 | In elasticsearch v8, user authentication was made mandatory. 100 | 101 | ### 1. Add the roles 102 | 103 | Add the content of `role.yml` to the `roles.yml` of your elasticsearch (e. g. in Debian: `/etc/elasticsearch/roles.yml`). 104 | 105 | Possibly needed: restart your elasticsearch (e. g. in Debian: `systemctl restart elasticsearch`).
106 | 107 | ### 2. Add the user 108 | 109 | Navigate to the installation directory of elasticsearch (e. g. in Debian: `/usr/share/elasticsearch`). 110 | 111 | ```bash 112 | # Create a new user 113 | bin/elasticsearch-users useradd fs2es-indexer 114 | # Use a good password! 115 | 116 | # Add the new role to it 117 | bin/elasticsearch-users roles -a fs2es-indexer fs2es-indexer 118 | ``` 119 | 120 | ### 3. Configure fs2es-indexer 121 | 122 | Edit your `/etc/fs2es-indexer/config.yml` and insert your values for `user` and `password` in `elasticsearch`. 123 | See the template `config.dist.yml` for an example. 124 | 125 | ### 4. Configure ElasticSearch 126 | 127 | Samba as of 4.15.6 can't use user authentication yet. 128 | There is a [pull request](https://gitlab.com/samba-team/samba/-/merge_requests/1847) to add this feature, but it's not merged (yet). 129 | 130 | That's why we have to enable anonymous access to ES with a role that can read all indexed files. 131 | 132 | Add this to your `/etc/elasticsearch/elasticsearch.yml`: 133 | ```yaml 134 | # Allow access without user credentials for Samba 4.15 135 | # See https://www.elastic.co/guide/en/elasticsearch/reference/current/anonymous-access.html 136 | xpack.security.authc: 137 | anonymous: 138 | username: anonymous_user 139 | roles: fs2es-indexer-ro 140 | authz_exception: true 141 | ``` 142 | 143 | ## Debugging the search 144 | 145 | The whole macOS finder -> Spotlight -> Samba -> ES system is complex and a number of things can go wrong. 146 | 147 | Use this guideline to determine where the problem might be. 148 | 149 | ### 1. Is Elasticsearch running correctly? 150 | 151 | Is elasticsearch running / accepting any connections? In Debian run `systemctl status elasticsearch`. 152 | Additionally, look through the logs found in `/var/log/elasticsearch`. 153 | 154 | ### 2. Is fs2es-indexer running correctly? 155 | 156 | Did the tool correctly index your directories? Look through the output of `fs2es-indexer index` or `daemon`. 157 | 158 | Check your configuration in `/etc/fs2es-indexer/config.yml`, use the `config.dist.yml` as base. 159 | 160 | ### 3. Does the indexer find the files you're looking for? 161 | 162 | Try to find some files with `fs2es-indexer search --search-path <path> --search-term <search-term>`. 163 | 164 | If nothing is found: Did the indexer run correctly? Are there any auth or connection problems? 165 | Check your ES and indexer logs! 166 | 167 | Make sure your search term is the start of a word in the file name. E.g. searching for "Test" should find files 168 | named "Test123.pdf", "Testing-yesterday.doc" and "This_Is_My_Test.xml" (since 0.8.0) but *not* the file named "notestingdone.pdf". 169 | 170 | fs2es-indexer prior to 0.8.0 used the [standard tokenizer of elasticsearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-standard-tokenizer.html) 171 | which does not recognize certain symbols as word boundaries, e. g. the underscore "_". 172 | So the file "This_Is_My_Test.xml" will only be found with fs2es-indexer 0.8.0+. 173 | 174 | This constraint comes from the way samba (at least since 4.15+) creates the ES query and fs2es-indexer mimics this 175 | behavior as closely as possible. There is currently no way to change this in samba (and therefore it's impossible in 176 | fs2es-indexer too). 177 | 
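If you are unsure how a filename is split into tokens, you can ask elasticsearch directly via its analyze API. An illustrative sketch (assumptions: the default index name `files` and an anonymous-access setup as configured above; with 0.8.0+ the tokens are lowercased by the `lowercase` filter):

```python
# Illustrative: show the tokens elasticsearch produces for a filename.
import elasticsearch

es = elasticsearch.Elasticsearch(hosts='http://localhost:9200')
# Without an explicit analyzer the index's default analyzer is used.
result = es.indices.analyze(index='files', text='This_Is_My_Test.xml')
print([token['token'] for token in result['tokens']])
# Expected with fs2es-indexer 0.8.0+: ['this', 'is', 'my', 'test', 'xml']
```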
178 | ### 4. Does the server's mdsearch find the files? 179 | 180 | Try this on the server first: 181 | ```bash 182 | # Searches in all attributes: 183 | mdsearch localhost "<share>" '*=="<search-term>"' -U "<user>" 184 | 185 | # Searches just the filename: 186 | mdsearch localhost "<share>" 'kMDItemFSName=="<search-term>"' -U "<user>" 187 | ``` 188 | 189 | Does this yield results? 190 | 191 | ### 5. Does your Mac use the correct search index? 192 | 193 | Go on your macOS client and connect to the samba share (= mounting the share at /Volumes/my-share). 194 | 195 | Start a terminal and execute 196 | 197 | ```bash 198 | mdutil -s /Volumes/my-share 199 | ``` 200 | 201 | Does it say "Server search enabled"? 202 | 203 | If not: 204 | - Are you using Samba 4.12.0 or later? 205 | - Was Samba compiled with spotlight support (default for Debian packages)? 206 | - Is elasticsearch enabled in your smb.conf (on the server)? 207 | 208 | ### 6. Does your Mac's mdfind find anything? 209 | 210 | Start a terminal on your Mac client and execute 211 | ```bash 212 | mdfind -onlyin /Volumes/my-share "<search-term>" 213 | ``` 214 | 215 | Use the same search term as in step 3! 216 | 217 | If no output is produced: wait 5 seconds and try again. 218 | 219 | If this fails: check your samba logs on the server. Any entries with "rpc_server", "mds" or "mdssvc" in them? 220 | 221 | ### 7. Does your Mac's Finder find anything? 222 | 223 | Start the Finder on your Mac and navigate to the samba share. Use the search field at the top right and type in your 224 | search term. 225 | 226 | Wait for the spinner to finish. If no files are returned and step 5 succeeded: IDK (srsly). 227 | 228 | If your Finder hangs, then you have a problem with the `.DS_Store` files and `DOSATTRIB` xattrs on your server. This can happen 229 | if you rsync files from an old macOS server to the new linux samba server. 230 | 231 | In order to fix this you have to execute these on the samba server: 232 | ```bash 233 | find /my-storage-path -type f -name ".DS_Store" -delete 234 | find /my-storage-path -exec setfattr -x user.DOSATTRIB {} \; 235 | ``` 236 | 237 | And add these lines to your [global] section in the smb.conf on the samba server: 238 | ```ini 239 | veto files = /.DS_Store/ 240 | delete veto files = yes 241 | ``` 242 | 243 | By the way, you have to restart your macOS client, because it crashed and won't be usable otherwise. 244 | 
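To check how many files are affected (before or after the cleanup), a small sketch like this can list them. Illustrative only; `/my-storage-path` is the same placeholder as above:

```python
# Illustrative: list files and dirs that still carry the user.DOSATTRIB xattr.
import os

for root, dirs, files in os.walk('/my-storage-path'):
    for name in files + dirs:
        path = os.path.join(root, name)
        try:
            if 'user.DOSATTRIB' in os.listxattr(path):
                print(path)
        except OSError:
            # File vanished or xattrs unsupported - skip it.
            pass
```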
245 | ## How can I uninstall fs2es-indexer? 246 | 247 | Since 0.7.0 you can simply delete the installation directory; versions below 0.7.0 were installed via pip and need more steps: 248 | 249 | ```bash 250 | # Since 0.7.0: 251 | rm -Rf /opt/fs2es-indexer 252 | 253 | # For versions below 0.7.0 254 | 255 | # Uninstall the indexer itself: 256 | python3 -m pip uninstall fs2es-indexer 257 | 258 | # You can check what's installed via 259 | python3 -m pip list 260 | # or 261 | pip3 list 262 | 263 | # The dependencies 264 | python3 -m pip uninstall elasticsearch elastic-transport certifi urllib3 PyYAML 265 | 266 | # This may fail for version < 0.5 (where we switched to pip) 267 | # Look into these folders: 268 | ls -lAh /usr/local/lib/python3.9/dist-packages 269 | ls -lAh /usr/lib/python3/dist-packages 270 | rm /usr/bin/fs2es-indexer 271 | 272 | # If you delete anything there and it's still listed in `pip3 list`, then you have to edit these files: 273 | vi /usr/local/lib/python3.9/dist-packages/easy-install.pth 274 | vi /usr/lib/python3/dist-packages/easy-install.pth 275 | 276 | # After updating from < 0.5 to 0.5+ you may have to cleanup your /opt/fs2es-indexer 277 | rm -Rf /opt/fs2es-indexer/build /opt/fs2es-indexer/dist /opt/fs2es-indexer/files.txt /opt/fs2es-indexer/fs2es_indexer.egg-info 278 | ``` 279 | 280 | Please make sure that all the dependencies are ONLY used for the indexer and not for any other program. 281 | 282 | ## Advanced: Switch back to elasticsearch v7 283 | 284 | You have to install the elasticsearch-python library in version 7, e.g. via pip 285 | ``` 286 | python3 -m pip install 'elasticsearch>=7,<8' 287 | ``` 288 | 289 | And configure this in your `config.yml`: 290 | ```yaml 291 | elasticsearch: 292 | library_version: 7 293 | ``` 294 | 295 | This **should** work! 296 | 297 | ## Advanced: How does the daemon mode work? 298 | 299 | The daemon mode consists of two different activities: 300 | - indexing 301 | - waiting / watching for filesystem changes 302 | 303 | ### Indexing runs 304 | 305 | Directly after the start of the daemon the ElasticSearch index is set up and an indexing run is started. 306 | 307 | First elasticsearch is queried and all document IDs are retrieved and saved in RAM. These document IDs are unique and 308 | derived from the path of the file or directory. 309 | 310 | After that all directories are crawled and new elasticsearch documents are created for every path whose ID is not 311 | already in the index. Conversely, if a previously retrieved ID is not encountered during the crawl, it's presumed that the file or dir on this path was deleted and the 312 | document will be purged from elasticsearch too. 313 | 314 | After this indexing the waiting period begins. 315 | 316 | ### Waiting period: No changes watcher configured 317 | 318 | If no changes watcher is configured, nothing happens except waiting. 319 | Make sure to strike a balance between server load (indexing runs take a toll!) and freshness of the index. 320 | 321 | ### Waiting period: WITH samba audit log monitoring 322 | 323 | This new feature in version 0.6.0 can radically enhance your spotlight search experience! 324 | 325 | Normally during the configured `wait_time` no updates are written to elasticsearch. So if an indexing run is done and 326 | someone deletes, renames or creates a file, this change will only be picked up during the next run, after the `wait_time` is over. 327 | 328 | Version 0.6.0 introduces the monitoring of the samba audit log. If set up correctly, samba writes all changes into a separate file. 329 | During the wait time, this file is parsed and changes (creates, deletes and renames) are pushed to elasticsearch. 330 | So changes are visible in the spotlight search (and elasticsearch) almost immediately after doing them.
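The parsing itself is a regular expression match per log line. An illustrative sketch using the exact expression from `lib/ChangesWatcher/AuditLogChangesWatcher.py` (the sample log line is a made-up example of what the `full_audit` module produces; its exact prefix depends on your rsyslog template):

```python
import re

# Illustrative: how a samba audit.log line is dissected.
# The regular expression is the one used by AuditLogChangesWatcher;
# the sample line below is a made-up example.
line = 'smbd_audit: alice|192.168.1.10|share|mkdirat|ok|/srv/samba/new-folder'

re_match = re.match(r'^.*\|(openat|unlinkat|renameat|mkdirat)\|ok\|(.*)$', line)
if re_match:
    operation = re_match.group(1)          # 'mkdirat'
    values = re_match.group(2).split('|')  # ['/srv/samba/new-folder']
    print(operation, values)
```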
331 | 332 | #### How to set up the samba audit log 333 | Add these lines to your `/etc/samba/smb.conf`: 334 | ``` 335 | [global] 336 | # Add your current vfs objects after this 337 | vfs objects = full_audit ... 338 | full_audit:success = renameat unlinkat mkdirat 339 | 340 | # These may be necessary too: 341 | full_audit:facility = local5 342 | full_audit:priority = notice 343 | ``` 344 | 345 | Add the `rsyslog-smbd_audit.conf` to your syslog configuration. 346 | In Debian: copy it into `/etc/rsyslog.d/` and `systemctl restart rsyslog`. 347 | This will redirect all log entries to `/var/log/samba/audit.log`. 348 | 349 | This log file may need to be created manually: 350 | ```bash 351 | mkdir -p /var/log/samba 352 | touch /var/log/samba/audit.log 353 | chown syslog:adm /var/log/samba/audit.log 354 | ``` 355 | 356 | Add a logrotate configuration for this file, so it gets cleaned up. 357 | In Debian: copy the `samba-audit-logrotate.conf` to `/etc/logrotate.d/samba-auditlog`. 358 | fs2es-indexer handles log rotation (either with "copytruncate" or without) gracefully since 0.10.0. 359 | 360 | Currently, there is no good method to log the creation of files. There is "openat", which logs all read 361 | and write operations. Sadly we can't filter for the "w" flag of this operation directly in Samba, so all "openat" 362 | operations would be logged. This would generate a massive amount of log traffic on even a moderately used fileserver 363 | (gigabytes of text!). 364 | 365 | ### Waiting period: WITH fanotify to look for changes 366 | 367 | This new feature in version 0.11.0 is a better alternative to the samba audit.log monitoring. 368 | 369 | Instead of parsing the samba audit.log, this watcher uses [fanotify](https://man7.org/linux/man-pages/man7/fanotify.7.html) (a kernel feature since Linux 5.1) 370 | to detect changes in the directories and update the elasticsearch index. 371 | 372 | The big advantage over the audit.log monitoring is that we now get all dir/file creations reliably without blowing up the log file. 373 | In fact you can disable the audit logging entirely, save on IOPS / space and greatly reduce the server load this indexer produces. 374 | Additionally, because it's a Linux kernel feature and not samba-related, it can detect ALL changes, even those that are done by server-local scripts, ... 375 | 376 | Your kernel and filesystem must support fanotify and the indexer must run as `root`! I tested it successfully with Debian 12, ext4 and OpenZFS. 377 | 378 | You need to install the python package [pyfanotify](https://github.com/baskiton/pyfanotify) and set `use_fanotify` to `True` in your `config.yml`. 379 | For installation via pip under Debian I needed to install `python3-dev` too, because the C part of this package can't be compiled otherwise. 380 | 381 | You could set the `wait_time` in your `config.yml` to something really high (like 30m) to further reduce the load of the indexer on your system. 382 | 383 | And of course: if you used the audit.log watcher before, you can now remove all config for it from your samba, rsyslog etc. 384 | 
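To check whether fanotify works on your system at all, you can run a small standalone test. This is an illustrative sketch only, using the same pyfanotify calls as `lib/ChangesWatcher/FanotifyChangesWatcher.py`; the directory `/srv/samba` is an assumption, use one of your configured directories:

```python
#!/usr/bin/env python3
# Illustrative standalone fanotify test (not part of the indexer).
import os
import time

import pyfanotify as fan

assert os.geteuid() == 0, 'fanotify requires root!'

notifier = fan.Fanotify(init_fid=True)
notifier.mark('/srv/samba', is_type='fs', ev_types=fan.FAN_CREATE | fan.FAN_DELETE | fan.FAN_ONDIR)
notifier.start()

client = fan.FanotifyClient(notifier, path_pattern='*')
print('Create or delete a file below /srv/samba now...')
while True:
    for event in client.get_events():
        print(event.path, event.ev_types)
    time.sleep(1)
```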
385 | ## Advanced: Which fields are displayed in the finder result page? 386 | 387 | The basic mapping of elasticsearch to spotlight results can be found here: [elasticsearch_mappings.json](https://gitlab.com/samba-team/samba/-/blob/master/source3/rpc_server/mdssvc/elasticsearch_mappings.json) 388 | 389 | I'm currently unsure WHICH fields are really queried, mapped and returned to spotlight. 390 | In Samba versions prior to 4.21.4 (and backported to 4.20.8): 391 | - "filesize" is not returned, so it's empty in the result page. 392 | - "last_modified" is not returned, but the finder displays a date. Sometimes this date is well into the future (+ 5 - 6 years). 393 | 394 | Samba 4.21.4 & 4.20.8 change this behavior: 395 | filesize, birth date and last modified date are now returned by samba and will be correctly displayed. The "type" column is still empty though. 396 | Thanks to Ralph Böhme of SerNet for implementing this feature request! 397 | -------------------------------------------------------------------------------- /config.dist.yml: -------------------------------------------------------------------------------- 1 | # This configuration file is a template. 2 | # Copy it to "/etc/fs2es-indexer/config.yml" and tweak it! 3 | # The commented values are exemplary, the uncommented values are the default values! 4 | 5 | # The directories which should be indexed 6 | directories: 7 | # - "/my-storage-directory" 8 | 9 | # (Optional) Exclude directories / files from the index 10 | #exclusions: 11 | # Exclusion via a simple string search in the full path of the file / directory. 12 | # If any of the given strings are found in the full path, it won't be added to the index. 13 | # Usually faster than a regular expression! 14 | # partial_paths: 15 | # - ".DS_Store" 16 | # - "._.DS_Store" 17 | 18 | # Exclusion via testing if a regular expression matches the full path of the file / directory. 19 | # If any of the regular expressions matches, it won't be added to the index. 20 | # Usually slower than using a simple string search. 21 | # regular_expressions: 22 | # - "\.Trash-\d+" 23 | 24 | elasticsearch: 25 | # The URL of the elasticsearch index 26 | url: "http://localhost:9200" 27 | 28 | # See the README.md for more information on how to set up user authentication 29 | # (Optional): The user for elasticsearch 30 | # user: "fs2es-indexer" 31 | 32 | # (Optional): The password for elasticsearch 33 | # password: "secret!" 34 | 35 | # The name of the elasticsearch index 36 | index: "files" 37 | 38 | # The amount of records to insert in one go (bulk) 39 | bulk_size: 10000 40 | 41 | # Verify the SSL certificate presented by the server (only relevant for https URLs) 42 | verify_certs: True 43 | 44 | # ElasticSearch would show a warning if an https URL is used and verify_certs == False 45 | # This can be disabled here in order to keep the logs clean. 46 | ssl_show_warn: True 47 | 48 | # The path to the file containing all valid CA certificates 49 | # Shouldn't be necessary because "certifi" will be installed as a dependency 50 | #ca_certs: '/etc/ssl/certs/ca-certificates.crt' 51 | 52 | # The version of the elasticsearch-library 53 | # V8 has some pretty big changes, so we need to switch some statements. 54 | # Only change this if you don't use the installation via setup.py because there is a hardcoded dependency to v8! 55 | # Only v7 and v8 are valid. 56 | library_version: 8 57 | 58 | # The file where the mapping for the ElasticSearch index is saved.
59 | index_mapping: "/opt/fs2es-indexer/es-index-mapping.json" 60 | 61 | # The wait time between indexing runs in "daemon" mode 62 | # If you have no changes watcher, your users will only get stale data - so new files will show up in a spotlight search 63 | # later if you increase this wait_time. The same is true for deletions and renames. 64 | # But a very low wait_time will increase the load on the server massively! So please choose your interval accordingly. 65 | # Allowed suffixes: s (seconds), m (minutes), h (hours), d (days) 66 | # Recommended values: 67 | # - very low if you don't have any changes watcher (audit.log or fanotify) 68 | # - low (e. g. 5m) if you use the audit.log watcher 69 | # - high (e. g. 30m) if you use the fanotify watcher 70 | wait_time: "30m" 71 | 72 | # Options for the samba integration 73 | samba: 74 | # The "daemon" mode can parse the audit.log of samba during the "wait_time" to get changes while waiting 75 | # See README.md for more information 76 | audit_log: "/var/log/samba/audit.log" 77 | 78 | # How long should the Audit-Log-Watcher sleep() before looking into the audit.log file again (in seconds)? 79 | monitor_sleep_time: 1 80 | 81 | # Instead of monitoring the samba audit.log, fs2es-indexer can use fanotify to be informed about filesystem changes 82 | # in the monitored directories 83 | # See README.md for more information 84 | use_fanotify: False 85 | 86 | # Do you want to dump the raw documents JSON to /tmp/fs2es-indexer-failed-documents-%date%.json 87 | # in case they can't be indexed by elasticsearch? 88 | dump_documents_on_error: False 89 | -------------------------------------------------------------------------------- /es-index-mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "mappings": { 3 | "properties": { 4 | "path": { 5 | "properties": { 6 | "real": { 7 | "type": "keyword", 8 | "store": true, 9 | "fields": { 10 | "tree": { 11 | "type": "text", 12 | "fielddata": true 13 | }, 14 | "fulltext": { 15 | "type": "text" 16 | } 17 | } 18 | } 19 | } 20 | }, 21 | "file": { 22 | "properties": { 23 | "filename": { 24 | "type": "keyword", 25 | "store": true, 26 | "fields": { 27 | "tree": { 28 | "type": "text", 29 | "fielddata": true 30 | }, 31 | "fulltext": { 32 | "type": "text" 33 | } 34 | } 35 | } 36 | } 37 | } 38 | } 39 | } 40 | } -------------------------------------------------------------------------------- /fs2es-indexer: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import logging 6 | import re 7 | import sys 8 | import time 9 | import yaml 10 | 11 | from lib.Fs2EsIndexer import * 12 | 13 | 14 | parser = argparse.ArgumentParser(description='Indexes the names of files and directories into elasticsearch') 15 | 16 | parser.add_argument( 17 | 'action', 18 | default='index', 19 | nargs='?', 20 | help='What do you want to do? "index" (default), "daemon", "search", "clear", "analyze_index", "enable_slowlog" or "disable_slowlog"?'
21 | ) 22 | 23 | parser.add_argument( 24 | '--search-term', 25 | action='store', 26 | default=None, 27 | help='Action "search" only: The term we want to search for in the index' 28 | ) 29 | 30 | parser.add_argument( 31 | '--search-filename', 32 | action='store', 33 | default=None, 34 | help='Action "search" only: The filename we want to search for in the index' 35 | ) 36 | 37 | parser.add_argument( 38 | '--search-path', 39 | action='store', 40 | help='Action "search" only: The server(!) path we want to search in (use the samba share\'s "path")' 41 | ) 42 | 43 | parser.add_argument( 44 | '--config', 45 | action='store', 46 | dest='configFile', 47 | default='/etc/fs2es-indexer/config.yml', 48 | help='The configuration file to be read' 49 | ) 50 | 51 | parser.add_argument( 52 | '--verbose', 53 | '-v', 54 | action='store_true', 55 | dest='verbose', 56 | default=False, 57 | help='Print more verbose messages' 58 | ) 59 | 60 | parser.add_argument( 61 | '--log-level-es', 62 | action='store', 63 | dest='logLevelEs', 64 | default='ERROR', 65 | help='The logging level of the elasticsearch plugin (DEBUG, INFO, WARN, ERROR, FATAL).' 66 | ) 67 | 68 | args = parser.parse_args() 69 | 70 | logger = logging.getLogger('fs2es-indexer') 71 | logging.basicConfig( 72 | stream=sys.stdout, 73 | level=logging.INFO, 74 | format='%(asctime)s %(name)s %(levelname)s %(message)s' 75 | ) 76 | if args.verbose: 77 | logger.setLevel(logging.DEBUG) 78 | 79 | logging.getLogger('elasticsearch').setLevel(args.logLevelEs) 80 | logging.getLogger('elastic_transport').setLevel(args.logLevelEs) 81 | 82 | logger.info('Reading config file "%s"' % args.configFile) 83 | with open(args.configFile, 'r') as stream: 84 | config = yaml.safe_load(stream) 85 | 86 | indexer = Fs2EsIndexer(config, logger) 87 | 88 | if args.action == 'index': 89 | logger.info('Starting indexing run...') 90 | 91 | indexer.elasticsearch_prepare_index() 92 | indexer.elasticsearch_get_all_ids() 93 | indexer.index_directories() 94 | elif args.action == 'clear': 95 | indexer.clear_index() 96 | elif args.action == 'daemon': 97 | indexer.daemon() 98 | elif args.action == 'search': 99 | if args.search_path is None: 100 | parser.error('"search" requires --search-path') 101 | 102 | resp = indexer.search(args.search_path, args.search_term, args.search_filename) 103 | 104 | logger.info('Found %d elasticsearch documents:' % resp['hits']['total']['value']) 105 | for hit in resp['hits']['hits']: 106 | if args.verbose: 107 | logger.info('- "%s": %s' % (hit['_source']['file']['filename'], json.dumps(hit))) 108 | else: 109 | logger.info('- "%s"' % hit['_source']['path']['real']) 110 | 111 | elif args.action == 'enable_slowlog': 112 | indexer.enable_slowlog() 113 | elif args.action == 'disable_slowlog': 114 | indexer.disable_slowlog() 115 | elif args.action == 'analyze_index': 116 | reindex_necessary = indexer.elasticsearch_analyze_index() 117 | if reindex_necessary: 118 | logger.info('Recreating the elasticsearch index is necessary.') 119 | else: 120 | logger.info('Recreating the elasticsearch index is not necessary.') 121 | else: 122 | logger.info('Unknown action "%s", allowed are "index" (default), "daemon", "search", "clear", "analyze_index", "enable_slowlog" or "disable_slowlog".'
% args.action) 123 | -------------------------------------------------------------------------------- /fs2es-indexer.service: -------------------------------------------------------------------------------- 1 | # Move this file to '/lib/systemd/system/' and run 2 | # systemctl daemon-reload && systemctl enable fs2es-indexer && systemctl start fs2es-indexer 3 | 4 | [Unit] 5 | Description=Indexes files into elasticsearch 6 | 7 | # Comment this if you've installed elasticsearch on ANOTHER machine than where the fs2es-indexer runs. 8 | After=elasticsearch.service 9 | 10 | [Service] 11 | # Run the command unbuffered, so that we can see the log entries in realtime via journalctl -feu fs2es-indexer.service 12 | ExecStart=/opt/fs2es-indexer/bin/python3 -u /opt/fs2es-indexer/fs2es-indexer daemon 13 | 14 | # Always restart the daemon (even in case of errors) after 1 minute 15 | Restart=always 16 | RestartSec=60 17 | 18 | [Install] 19 | WantedBy=multi-user.target 20 | -------------------------------------------------------------------------------- /lib/ChangesWatcher/AuditLogChangesWatcher.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | import os 4 | import re 5 | import time 6 | import typing 7 | 8 | from lib.ChangesWatcher.ChangesWatcher import * 9 | 10 | 11 | class AuditLogChangesWatcher(ChangesWatcher): 12 | """ Watches the samba audit.log for filesystem changes """ 13 | 14 | def __init__(self, indexer, samba_config: dict[str, typing.Any]): 15 | super().__init__(indexer) 16 | 17 | self.samba_audit_log = samba_config.get('audit_log', None) 18 | self.samba_monitor_sleep_time = samba_config.get('monitor_sleep_time', 1) 19 | self.samba_audit_log_file = None 20 | 21 | def start(self) -> bool: 22 | """ Starts the changes watcher """ 23 | self.samba_audit_log_file = None 24 | if self.samba_audit_log is None: 25 | return False 26 | 27 | try: 28 | self.samba_audit_log_file = open(self.samba_audit_log, 'r') 29 | # Go to the end of the file - this is our start! 30 | self.samba_audit_log_file.seek(0, 2) 31 | 32 | self.logger.info('Successfully opened %s, will monitor it during wait time.' % self.samba_audit_log) 33 | return True 34 | except OSError: 35 | self.samba_audit_log_file = None 36 | self.logger.error('Error opening %s, can\'t monitor it.' % self.samba_audit_log) 37 | return False 38 | 39 | def watch(self, timeout: float) -> int: 40 | """ Monitors the given file descriptor for changes until the timeout is reached. """ 41 | 42 | stop_at = time.time() + timeout 43 | self.logger.info('Monitoring Samba audit log until next indexing run in %s seconds.' % timeout) 44 | 45 | changes = 0 46 | while time.time() <= stop_at: 47 | line = self.samba_audit_log_file.readline() 48 | if not line: 49 | # Was the file log rotated? 50 | # logrotate's copytruncate works by copying the file and removing the contents of the original 51 | # In this case the size of the file now would be drastically (!) less than our current position. 52 | # We'll reopen the file without (!) seeking to the end. 53 | # Without "copytruncate" the current file is renamed and a new file is created. 54 | # We need to close the old file handle (now pointing to the backup) and open the new file 55 | # (at the old location). The problem is, that this new file WILL be created AFTER the rename and 56 | # we could possibly try to read in between! So we have to test if the file exists and possibly wait a 57 | # bit before we try again.
58 | try: 59 | file_was_rotated = self.samba_audit_log_file.tell() > os.path.getsize(self.samba_audit_log) 60 | if file_was_rotated: 61 | self.logger.info('Samba audit log was rotated and a new file exists at "%s".' % self.samba_audit_log) 62 | except FileNotFoundError: 63 | # The new file does not exist yet! We need to wait a bit... 64 | file_was_rotated = True 65 | self.logger.info('Samba audit log was rotated and no new file exists at "%s".' % self.samba_audit_log) 66 | time.sleep(self.samba_monitor_sleep_time) 67 | 68 | if file_was_rotated: 69 | self.logger.info('Reopening Samba audit log "%s"...' % self.samba_audit_log) 70 | self.samba_audit_log_file.close() 71 | self.samba_audit_log_file = None 72 | while time.time() <= stop_at and self.samba_audit_log_file is None: 73 | try: 74 | self.samba_audit_log_file = open(self.samba_audit_log, 'r') 75 | self.logger.info('Samba audit log was successfully reopened.') 76 | except FileNotFoundError: 77 | # The new file does not exist yet ... wait a little bit and try again 78 | self.logger.info('Samba audit log couldn\'t be reopened...') 79 | time.sleep(self.samba_monitor_sleep_time) 80 | 81 | if self.samba_audit_log_file is None: 82 | self.logger.info('Samba audit log couldn\'t be reopened! Disabling the audit log monitoring.') 83 | 84 | continue 85 | 86 | else: 87 | # Nothing new in the audit log - sleep for X seconds 88 | 89 | time.sleep(self.samba_monitor_sleep_time) 90 | continue 91 | 92 | self.logger.debug('* Got new line: "%s"' % line.strip()) 93 | 94 | re_match = re.match(r'^.*\|(openat|unlinkat|renameat|mkdirat)\|ok\|(.*)$', line) 95 | if not re_match: 96 | self.logger.debug('*- not interested: regexp didn\'t match') 97 | continue 98 | 99 | # create a file: ||openat|ok|w| (w!) 100 | # rename a file / dir: ||renameat|ok|| 101 | # create a dir: ||mkdirat|ok| 102 | # delete a file / dir: ||unlinkat|ok| 103 | 104 | operation = re_match.group(1) 105 | values = re_match.group(2).split('|') 106 | 107 | if len(values) == 0: 108 | self.logger.debug('*- not interested: no values?!') 109 | continue 110 | 111 | # So we can use pop(), because python has no array_shift()! 112 | values.reverse() 113 | 114 | if operation == 'openat': 115 | # openat has another value "r" or "w", we only want to react to "w" 116 | openat_operation = values.pop() 117 | if openat_operation == 'w': 118 | changes += self.indexer.import_path(values.pop()) 119 | else: 120 | self.logger.debug('*- not interested: expected openat with w, but got "%s"' % openat_operation) 121 | 122 | elif operation == 'renameat': 123 | source_path = values.pop() 124 | target_path = values.pop() 125 | 126 | if ':' in source_path: 127 | # We ignore these paths BECAUSE if you delete an xattr from a file, we don't want to delete the 128 | # whole file from the index. 129 | # This should not happen for a renameat, but oh well...
130 | continue 131 | 132 | changes += self.indexer.rename_path( 133 | source_path, 134 | target_path, 135 | ) 136 | 137 | elif operation == 'mkdirat': 138 | changes += self.indexer.import_path(values.pop()) 139 | elif operation == 'unlinkat': 140 | changes += self.indexer.delete_path(values.pop()) 141 | else: 142 | self.logger.debug('*- not interested: unrecognized operation: %s' % operation) 143 | continue 144 | 145 | return changes 146 | -------------------------------------------------------------------------------- /lib/ChangesWatcher/ChangesWatcher.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | 4 | class ChangesWatcher(object): 5 | """ A watcher for filesystem changes """ 6 | 7 | def __init__(self, indexer): 8 | self.indexer = indexer 9 | self.logger = self.indexer.logger 10 | 11 | def start(self) -> bool: 12 | """ Starts the changes watcher """ 13 | pass 14 | 15 | def watch(self, timeout: float) -> int: 16 | """ Watches for changes until the timeout is reached. """ 17 | pass 18 | -------------------------------------------------------------------------------- /lib/ChangesWatcher/FanotifyChangesWatcher.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | import pyfanotify as fan 4 | import select 5 | import time 6 | 7 | from lib.ChangesWatcher.ChangesWatcher import * 8 | 9 | 10 | class FanotifyChangesWatcher(ChangesWatcher): 11 | """ Uses fanotify to watch for changes """ 12 | 13 | def __init__(self, indexer): 14 | super().__init__(indexer) 15 | self.fanotify = None 16 | self.fanotify_client = None 17 | self.poller = None 18 | 19 | def start(self) -> bool: 20 | """ Starts the changes watcher """ 21 | self.fanotify = fan.Fanotify(init_fid=True, log=self.indexer.logger.getChild('pyfanotify')) 22 | 23 | # See https://man7.org/linux/man-pages/man2/fanotify_mark.2.html 24 | event_types = (fan.FAN_CREATE | fan.FAN_DELETE | fan.FAN_DELETE_SELF | fan.FAN_RENAME | fan.FAN_ONDIR) 25 | 26 | for directory in self.indexer.directories: 27 | self.fanotify.mark( 28 | directory, 29 | is_type='fs', 30 | ev_types=event_types 31 | ) 32 | 33 | self.fanotify.start() 34 | 35 | self.fanotify_client = fan.FanotifyClient(self.fanotify, path_pattern='*') 36 | self.poller = select.poll() 37 | self.poller.register(self.fanotify_client.sock.fileno(), select.POLLIN) 38 | 39 | return True 40 | 41 | def watch(self, timeout: float) -> int: 42 | """ Watches for changes via fanotify until the timeout is reached. """ 43 | 44 | stop_at = time.time() + timeout 45 | self.logger.info('Monitoring changes via fanotify until next indexing run in %s seconds.' % timeout) 46 | 47 | changes = 0 48 | while time.time() <= stop_at: 49 | poll_timeout = stop_at - time.time() 50 | self.logger.debug('Polling for fanotify events with timeout %d seconds.'
% poll_timeout) 51 | # Wait for next event with a timeout (in ms) 52 | self.poller.poll(poll_timeout * 1000) 53 | for event in self.fanotify_client.get_events(): 54 | if fan.FAN_CREATE & event.ev_types: 55 | changes += self.indexer.import_path(event.path[0].decode('utf-8')) 56 | elif fan.FAN_DELETE & event.ev_types | fan.FAN_DELETE_SELF & event.ev_types: 57 | changes += self.indexer.delete_path(event.path[0].decode('utf-8')) 58 | elif fan.FAN_RENAME & event.ev_types: 59 | changes += self.indexer.rename_path( 60 | event.path[0].decode('utf-8'), 61 | event.path[1].decode('utf-8'), 62 | ) 63 | 64 | return changes 65 | -------------------------------------------------------------------------------- /lib/ChangesWatcher/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ellerhold/fs2es-indexer/c3725f11b068d340c9137fdde7b9d762c92f42dd/lib/ChangesWatcher/__init__.py -------------------------------------------------------------------------------- /lib/Fs2EsIndexer.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | import datetime 4 | import elasticsearch 5 | import elasticsearch.helpers 6 | import hashlib 7 | import itertools 8 | import json 9 | import logging 10 | import os 11 | import re 12 | import time 13 | import typing 14 | 15 | from lib.ChangesWatcher.AuditLogChangesWatcher import * 16 | try: 17 | from lib.ChangesWatcher.FanotifyChangesWatcher import * 18 | except ImportError: 19 | # Fanotify is not available! 20 | # This will lead to an error in __init__() if use_fanotify is set to true 21 | pass 22 | 23 | 24 | class Fs2EsIndexer(object): 25 | """ Indexes filenames and directory names into an ElasticSearch index ready for spotlight search via Samba 4 """ 26 | 27 | def __init__(self, config: dict[str, typing.Any], logger): 28 | """ Constructor """ 29 | 30 | self.logger = logger 31 | 32 | self.directories = config.get('directories', []) 33 | self.dump_documents_on_error = config.get('dump_documents_on_error', False) 34 | 35 | self.daemon_wait_time = config.get('wait_time', '30m') 36 | re_match = re.match(r'^(\d+)(\w)$', self.daemon_wait_time) 37 | if re_match: 38 | suffix = re_match.group(2) 39 | if suffix == 's': 40 | self.daemon_wait_seconds = int(re_match.group(1)) 41 | elif suffix == 'm': 42 | self.daemon_wait_seconds = int(re_match.group(1)) * 60 43 | elif suffix == 'h': 44 | self.daemon_wait_seconds = int(re_match.group(1)) * 60 * 60 45 | elif suffix == 'd': 46 | self.daemon_wait_seconds = int(re_match.group(1)) * 60 * 60 * 24 47 | else: 48 | self.logger.error('Unknown time unit in "wait_time": %s, expected "s", "m", "h" or "d"' % suffix) 49 | exit(1) 50 | else: 51 | self.logger.error('Unknown "wait_time": %s' % self.daemon_wait_time) 52 | exit(1) 53 | 54 | exclusions = config.get('exclusions', {}) 55 | self.exclusion_strings = exclusions.get('partial_paths', []) 56 | self.exclusion_reg_exps = exclusions.get('regular_expressions', []) 57 | 58 | if config.get('use_fanotify', False): 59 | try: 60 | self.changes_watcher = FanotifyChangesWatcher(self) 61 | except: 62 | self.logger.error('Can\'t use fanotify to watch for filesystem changes. 
Did you install "pyfanotify"?') 63 | exit(1) 64 | else: 65 | self.changes_watcher = AuditLogChangesWatcher(self, config.get('samba', {})) 66 | 67 | elasticsearch_config = config.get('elasticsearch', {}) 68 | self.elasticsearch_url = elasticsearch_config.get('url', 'http://localhost:9200') 69 | self.elasticsearch_index = elasticsearch_config.get('index', 'files') 70 | self.elasticsearch_bulk_size = elasticsearch_config.get('bulk_size', 10000) 71 | self.elasticsearch_index_mapping_file = elasticsearch_config.get('index_mapping', '/opt/fs2es-indexer/es-index-mapping.json') 72 | 73 | self.elasticsearch_lib_version = elasticsearch_config.get('library_version', 8) 74 | if self.elasticsearch_lib_version != 7 and self.elasticsearch_lib_version != 8: 75 | self.logger.error( 76 | 'This tool only works with the elasticsearch library v7 or v8. Your configured version "%s" is not supported currently.' % self.elasticsearch_lib_version 77 | ) 78 | exit(1) 79 | if 'user' in elasticsearch_config: 80 | elasticsearch_auth = (elasticsearch_config['user'], elasticsearch_config['password']) 81 | else: 82 | elasticsearch_auth = None 83 | 84 | self.elasticsearch = elasticsearch.Elasticsearch( 85 | hosts = self.elasticsearch_url, 86 | http_auth = elasticsearch_auth, 87 | max_retries = 10, 88 | retry_on_timeout = True, 89 | verify_certs = elasticsearch_config.get('verify_certs', True), 90 | ssl_show_warn = elasticsearch_config.get('ssl_show_warn', True), 91 | ca_certs = elasticsearch_config.get('ca_certs', None) 92 | ) 93 | 94 | self.elasticsearch_document_ids = {} 95 | self.duration_elasticsearch = 0 96 | self.elasticsearch_tokenizer = 'fs2es-indexer-tokenizer' 97 | 98 | @staticmethod 99 | def format_count(count): 100 | return '{:,}'.format(count).replace(',', ' ') 101 | 102 | def elasticsearch_map_path_to_document(self, path: str, filename: str): 103 | """ Maps a file or directory path to an elasticsearch document """ 104 | 105 | return { 106 | "_op_type": "index", 107 | "_id": self.elasticsearch_map_path_to_id(path), 108 | "_source": { 109 | "path": { 110 | "real": path 111 | }, 112 | "file": { 113 | "filename": filename 114 | } 115 | } 116 | } 117 | 118 | @staticmethod 119 | def elasticsearch_map_path_to_id(path: str): 120 | """ Maps the path to a unique elasticsearch document ID """ 121 | return hashlib.sha256(path.encode('utf-8', 'surrogatepass')).hexdigest() 122 | 123 | def elasticsearch_bulk_action(self, documents): 124 | """ Imports documents into elasticsearch or deletes documents from there """ 125 | 126 | # See https://elasticsearch-py.readthedocs.io/en/v8.6.2/helpers.html#bulk-helpers 127 | 128 | start_time = time.time() 129 | try: 130 | elasticsearch.helpers.bulk(self.elasticsearch, documents, index=self.elasticsearch_index) 131 | except Exception as err: 132 | self.logger.error( 133 | 'Failed to bulk import/delete documents into elasticsearch "%s": %s' % (self.elasticsearch_url, str(err)) 134 | ) 135 | 136 | if self.dump_documents_on_error: 137 | filename = '/tmp/fs2es-indexer-failed-documents-%s.json' % datetime.datetime.now().strftime("%Y-%m-%d_%H_%M_%S") 138 | with open(filename, 'w') as f: 139 | json.dump(documents, f) 140 | 141 | self.logger.error( 142 | 'Dumped the failed documents to %s, please review it and report bugs upstream.'
% filename 143 | ) 144 | 145 | exit(1) 146 | 147 | self.duration_elasticsearch += time.time() - start_time 148 | 149 | def elasticsearch_analyze_index(self): 150 | """ 151 | Analyzes the elasticsearch index and reports back if it should be recreated 152 | 153 | See https://gitlab.com/samba-team/samba/-/blob/master/source3/rpc_server/mdssvc/elasticsearch_mappings.json 154 | for the fields expected by samba and their mappings to the expected Spotlight results 155 | """ 156 | 157 | if self.elasticsearch.indices.exists(index=self.elasticsearch_index): 158 | index_settings = self.elasticsearch.indices.get_settings(index=self.elasticsearch_index) 159 | 160 | self.logger.debug('Index settings: %s' % json.dumps(index_settings[self.elasticsearch_index])) 161 | 162 | try: 163 | tokenizer = index_settings[self.elasticsearch_index]['settings']['index']['analysis']['analyzer']['default']['tokenizer'] 164 | if tokenizer == self.elasticsearch_tokenizer: 165 | self.logger.info('Index "%s" has correct tokenizer "%s".' % (self.elasticsearch_index, tokenizer)) 166 | else: 167 | self.logger.info( 168 | 'Index "%s" has wrong tokenizer "%s", expected "%s" -> recreating the index is necessary' 169 | % (self.elasticsearch_index, tokenizer, self.elasticsearch_tokenizer) 170 | ) 171 | return True 172 | except KeyError: 173 | self.logger.info('Index "%s" has no tokenizer -> recreating the index is necessary.' % self.elasticsearch_index) 174 | return True 175 | 176 | try: 177 | analyzer_filter = index_settings[self.elasticsearch_index]['settings']['index']['analysis']['analyzer']['default']['filter'] 178 | self.logger.info('Index "%s" has analyzer filter(s) "%s".' % (self.elasticsearch_index, '", "'.join(analyzer_filter))) 179 | 180 | if 'lowercase' in analyzer_filter: 181 | self.logger.info('Index "%s" has analyzer filter "lowercase".' % self.elasticsearch_index) 182 | else: 183 | self.logger.info( 184 | 'Index "%s" misses the analyzer filter "lowercase" -> recreating the index is necessary.' 185 | % self.elasticsearch_index 186 | ) 187 | return True 188 | 189 | if 'asciifolding' in analyzer_filter: 190 | self.logger.info('Index "%s" has analyzer filter "asciifolding".' % self.elasticsearch_index) 191 | else: 192 | self.logger.info( 193 | 'Index "%s" misses the analyzer filter "asciifolding" -> recreating the index is necessary.' 194 | % self.elasticsearch_index 195 | ) 196 | return True 197 | except KeyError: 198 | self.logger.info('Index "%s" has no analyzer filter -> recreating the index is necessary.' % self.elasticsearch_index) 199 | return True 200 | 201 | return False 202 | 203 | def elasticsearch_prepare_index(self): 204 | """ 205 | Creates the elasticsearch index and sets the mapping 206 | 207 | See https://gitlab.com/samba-team/samba/-/blob/master/source3/rpc_server/mdssvc/elasticsearch_mappings.json 208 | for the fields expected by samba and their mappings to the expected Spotlight results 209 | """ 210 | 211 | with open(self.elasticsearch_index_mapping_file, 'r') as f: 212 | index_mapping = json.load(f) 213 | 214 | if self.elasticsearch.indices.exists(index=self.elasticsearch_index): 215 | recreate_necessary = self.elasticsearch_analyze_index() 216 | 217 | if recreate_necessary: 218 | self.logger.info('Deleting index "%s"...' % self.elasticsearch_index) 219 | self.elasticsearch.indices.delete(index=self.elasticsearch_index) 220 | 221 | self.logger.info('Recreating index "%s" ...' 
% self.elasticsearch_index) 222 | self.elasticsearch_create_index(index_mapping) 223 | else: 224 | try: 225 | self.logger.info('Updating mapping of index "%s" ...' % self.elasticsearch_index) 226 | if self.elasticsearch_lib_version == 7: 227 | self.elasticsearch.indices.put_mapping( 228 | index=self.elasticsearch_index, 229 | doc_type=None, 230 | body=index_mapping['mappings'] 231 | ) 232 | elif self.elasticsearch_lib_version == 8: 233 | self.elasticsearch.indices.put_mapping( 234 | index=self.elasticsearch_index, 235 | properties=index_mapping['mappings']['properties'] 236 | ) 237 | except elasticsearch.exceptions.ConnectionError as err: 238 | self.logger.error('Failed to connect to elasticsearch at "%s": %s' % (self.elasticsearch_url, str(err))) 239 | exit(1) 240 | except elasticsearch.exceptions.BadRequestError as err: 241 | self.logger.error('Failed to update index at elasticsearch "%s": %s' % (self.elasticsearch_url, str(err))) 242 | 243 | self.logger.info('Deleting index "%s"...' % self.elasticsearch_index) 244 | self.elasticsearch.indices.delete(index=self.elasticsearch_index) 245 | 246 | self.logger.info('Recreating index "%s" ...' % self.elasticsearch_index) 247 | self.elasticsearch_create_index(index_mapping) 248 | except Exception as err: 249 | self.logger.error('Failed to update index at elasticsearch "%s": %s' % (self.elasticsearch_url, str(err))) 250 | exit(1) 251 | else: 252 | self.logger.info('Creating index "%s" ...' % self.elasticsearch_index) 253 | self.elasticsearch_create_index(index_mapping) 254 | 255 | def elasticsearch_create_index(self, index_mapping): 256 | index_settings = { 257 | "analysis": { 258 | "tokenizer": { 259 | self.elasticsearch_tokenizer: { 260 | "type": "simple_pattern", 261 | "pattern": "[a-zA-Z0-9]+" 262 | } 263 | }, 264 | "analyzer": { 265 | "default": { 266 | "tokenizer": self.elasticsearch_tokenizer, 267 | "filter": [ 268 | "lowercase", 269 | "asciifolding" 270 | ] 271 | } 272 | } 273 | } 274 | } 275 | try: 276 | if self.elasticsearch_lib_version == 7: 277 | self.elasticsearch.indices.create( 278 | index=self.elasticsearch_index, 279 | body=index_mapping, 280 | settings=index_settings 281 | ) 282 | elif self.elasticsearch_lib_version == 8: 283 | self.elasticsearch.indices.create( 284 | index=self.elasticsearch_index, 285 | mappings=index_mapping['mappings'], 286 | settings=index_settings 287 | ) 288 | except elasticsearch.exceptions.ConnectionError as err: 289 | self.logger.error('Failed to connect to elasticsearch at "%s": %s' % (self.elasticsearch_url, str(err))) 290 | exit(1) 291 | except Exception as err: 292 | self.logger.error('Failed to create index at elasticsearch "%s": %s' % (self.elasticsearch_url, str(err))) 293 | exit(1) 294 | 295 | def elasticsearch_refresh_index(self): 296 | """ Refresh the elasticsearch index """ 297 | 298 | self.logger.info('Refreshing index "%s" ...' 
299 |         start_time = time.time()
300 |         try:
301 |             self.elasticsearch.indices.refresh(index=self.elasticsearch_index)
302 |             self.duration_elasticsearch += time.time() - start_time
303 |         except elasticsearch.exceptions.ConnectionError as err:
304 |             self.logger.error('Failed to connect to elasticsearch at "%s": %s' % (self.elasticsearch_url, str(err)))
305 |             exit(1)
306 |         except Exception as err:
307 |             self.logger.error(
308 |                 'Failed to refresh index "%s" at elasticsearch "%s": %s' % (
309 |                     self.elasticsearch_index,
310 |                     self.elasticsearch_url,
311 |                     str(err)
312 |                 )
313 |             )
314 |             exit(1)
315 | 
316 |     def index_directories(self):
317 |         """ Imports the content of the directories and all of their subdirectories into the elasticsearch index """
318 | 
319 |         # Copy the document IDs to _old and start with a fresh dict
320 |         elasticsearch_document_ids_old = self.elasticsearch_document_ids
321 |         self.elasticsearch_document_ids = {}
322 | 
323 |         paths_total = 0
324 |         documents = []
325 |         documents_to_be_indexed = 0
326 |         documents_indexed = 0
327 |         self.duration_elasticsearch = 0
328 |         start_time = round(time.time())
329 | 
330 |         self.logger.info('Starting to index the files and directories ...')
331 | 
332 |         for directory in self.directories:
333 |             self.logger.info('- Starting to index directory "%s" ...' % directory)
334 | 
335 |             for root, dirs, files in os.walk(directory):
336 |                 for name in itertools.chain(files, dirs):
337 |                     full_path = os.path.join(root, name)
338 |                     if self.path_should_be_indexed(full_path, False):
339 |                         try:
340 |                             document = self.elasticsearch_map_path_to_document(
341 |                                 path=full_path,
342 |                                 filename=name
343 |                             )
344 |                         except FileNotFoundError:
345 |                             # File/Dir does not exist anymore? Don't index it!
346 |                             document = None
347 | 
348 |                         if document is not None:
349 |                             paths_total += 1
350 | 
351 |                             if document['_id'] not in elasticsearch_document_ids_old:
352 |                                 # Only add _new_ files and dirs to the index
353 |                                 documents.append(document)
354 |                                 documents_to_be_indexed += 1
355 | 
356 |                             if documents_to_be_indexed >= self.elasticsearch_bulk_size:
357 |                                 self.elasticsearch_bulk_action(documents)
358 | 
359 |                                 documents = []
360 |                                 documents_indexed += documents_to_be_indexed
361 |                                 documents_to_be_indexed = 0
362 |                                 self.logger.info(
363 |                                     '- %s paths indexed, elasticsearch import lasted %.2f / %.2f min(s)' % (
364 |                                         self.format_count(documents_indexed),
365 |                                         self.duration_elasticsearch / 60,
366 |                                         (time.time() - start_time) / 60
367 |                                     )
368 |                                 )
369 | 
370 |                             try:
371 |                                 del elasticsearch_document_ids_old[document['_id']]
372 |                             except KeyError:
373 |                                 pass  # The path was not indexed before - that's fine
374 | 
375 |                             self.elasticsearch_document_ids[document['_id']] = 1
376 | 
377 |             self.logger.info('- Indexing of directory "%s" done.' % directory)
378 | 
379 |         # Add the remaining documents ...
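        # The loop above only flushes full batches of "elasticsearch_bulk_size" documents,
        # so a last, smaller batch usually remains and has to be imported here.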
380 |         if documents_to_be_indexed > 0:
381 |             self.logger.info('- Importing the remaining documents ...')
382 | 
383 |             self.elasticsearch_bulk_action(documents)
384 |             documents_indexed += documents_to_be_indexed
385 | 
386 |             self.logger.info(
387 |                 '- %s paths indexed, elasticsearch import lasted %.2f / %.2f min(s)' % (
388 |                     self.format_count(documents_indexed),
389 |                     self.duration_elasticsearch / 60,
390 |                     (time.time() - start_time) / 60
391 |                 )
392 |             )
393 | 
394 |         old_document_count = len(elasticsearch_document_ids_old)
395 |         if old_document_count > 0:
396 |             # Refresh the index once before running the deletes
397 |             self.elasticsearch_refresh_index()
398 | 
399 |             # Delete every document in elasticsearch_document_ids_old,
400 |             # because the crawler didn't find them during the last run!
401 |             self.logger.info(
402 |                 'Deleting %s old document(s) from "%s" ...' % (
403 |                     self.format_count(old_document_count),
404 |                     self.elasticsearch_index
405 |                 )
406 |             )
407 | 
408 |             elasticsearch_document_ids_old_list = list(elasticsearch_document_ids_old.keys())
409 |             start_index = 0
410 |             end_index = self.elasticsearch_bulk_size
411 |             while start_index < old_document_count:
412 |                 temp_list = elasticsearch_document_ids_old_list[start_index:end_index]
413 | 
414 |                 delete_start_time = time.time()
415 |                 if self.elasticsearch_lib_version == 7:
416 |                     self.elasticsearch.delete_by_query(
417 |                         index=self.elasticsearch_index,
418 |                         body={
419 |                             "query": {
420 |                                 "terms": {
421 |                                     "_id": temp_list
422 |                                 }
423 |                             }
424 |                         }
425 |                     )
426 |                 elif self.elasticsearch_lib_version == 8:
427 |                     self.elasticsearch.delete_by_query(
428 |                         index=self.elasticsearch_index,
429 |                         query={
430 |                             "terms": {
431 |                                 "_id": temp_list
432 |                             }
433 |                         }
434 |                     )
435 | 
436 |                 self.duration_elasticsearch += time.time() - delete_start_time
437 | 
438 |                 self.logger.info(
439 |                     '- %s / %s documents deleted.' % (
440 |                         self.format_count(min(end_index, old_document_count)),
441 |                         self.format_count(old_document_count)
442 |                     )
443 |                 )
444 | 
445 |                 start_index += self.elasticsearch_bulk_size
446 |                 end_index += self.elasticsearch_bulk_size
447 | 
448 |         self.logger.info('Total paths crawled: %s' % self.format_count(paths_total))
449 |         self.logger.info('New paths indexed: %s' % self.format_count(documents_indexed))
450 |         self.logger.info('Old paths deleted: %s' % self.format_count(old_document_count))
451 |         self.logger.info('Indexing run done after %.2f minutes.' % (max(0, time.time() - start_time) / 60))
452 |         self.logger.info('Elasticsearch import lasted %.2f minutes.' % (max(0, self.duration_elasticsearch) / 60))
453 | 
454 |     def path_should_be_indexed(self, path: str, test_parent_directory: bool):
455 |         """ Tests if a specific path (dir or file) should be indexed """
456 | 
457 |         if test_parent_directory:
458 |             # For the audit log monitoring we need to test if the parent directory is in the list of directories
459 |             # that we should index
460 |             parent_dir_is_included = False
461 | 
462 |             for directory in self.directories:
463 |                 if path.startswith(directory):
464 |                     parent_dir_is_included = True
465 |                     break
466 | 
467 |             if not parent_dir_is_included:
468 |                 return False
469 | 
470 |         for search_string in self.exclusion_strings:
471 |             if search_string in path:
472 |                 return False
473 | 
474 |         for search_reg_exp in self.exclusion_reg_exps:
475 |             if re.match(search_reg_exp, path):
476 |                 return False
477 | 
478 |         return True
479 | 
480 |     def clear_index(self):
481 |         """ Deletes all documents in the elasticsearch index """
482 |         self.logger.info('Deleting all documents from index "%s" ...' % self.elasticsearch_index)
483 |         try:
484 |             if self.elasticsearch_lib_version == 7:
485 |                 resp = self.elasticsearch.delete_by_query(
486 |                     index=self.elasticsearch_index,
487 |                     body={"query": {"match_all": {}}}
488 |                 )
489 |             elif self.elasticsearch_lib_version == 8:
490 |                 resp = self.elasticsearch.delete_by_query(
491 |                     index=self.elasticsearch_index,
492 |                     query={"match_all": {}}
493 |                 )
494 | 
495 |             self.logger.info('Deleted %d documents.' % resp['deleted'])
496 |         except elasticsearch.exceptions.ConnectionError as err:
497 |             self.logger.error('Failed to connect to elasticsearch at "%s": %s' % (self.elasticsearch_url, str(err)))
498 |             exit(1)
499 |         except Exception as err:
500 |             self.logger.error(
501 |                 'Failed to delete all documents of index "%s" at elasticsearch "%s": %s' % (
502 |                     self.elasticsearch_index,
503 |                     self.elasticsearch_url,
504 |                     str(err)
505 |                 )
506 |             )
507 |             exit(1)
508 | 
509 |     def daemon(self):
510 |         """ Starts the daemon mode of the indexer """
511 |         self.logger.info('Starting indexing in daemon mode with a wait time of %s between indexing runs.' % self.daemon_wait_time)
512 | 
513 |         changes_watcher_active = self.changes_watcher.start()
514 | 
515 |         self.elasticsearch_prepare_index()
516 | 
517 |         # Get all document IDs from ES; index_directories() then adds new paths and removes stale ones
518 |         self.elasticsearch_get_all_ids()
519 |         self.index_directories()
520 | 
521 |         while True:
522 |             if changes_watcher_active:
523 |                 changes = self.changes_watcher.watch(self.daemon_wait_seconds)
524 |                 self.logger.info('Handled %d filesystem changes in this waiting period.' % changes)
525 |             else:
526 |                 self.logger.info('No changes-watcher is active, starting next indexing run in %s.' % self.daemon_wait_time)
527 |                 time.sleep(self.daemon_wait_seconds)
528 | 
529 |             self.index_directories()
530 | 
531 |     def search(self, search_path: str, search_term=None, search_filename=None):
532 |         """
533 |         Searches for a specific term in the ES index
534 | 
535 |         For the record, these are the exact queries Samba generates for filename (or directory name) searches:
536 |         1. for a search on the file or directory name (macOS Spotlight search on the kMDItemFSName attribute):
537 |         { "_source": ["path.real"], "query": { "query_string": { "query": "(file.filename:Molly*) AND path.real.fulltext:\"/srv/samba/spotlight\"" } } }
538 | 
539 |         2. for a search on all attributes:
540 |         { "from ": 0, "size": 100, "query": { "query_string ": { "query": "(coron* OR content:coron*) AND path.real.fulltext: \"/storage\" ", "fields": [] } }, "_source": { "includes": [ "path.real" ], "excludes":[] } }
541 | 
542 |         To log all queries as "slow queries", see enable_slowlog() and look into your slow-log files.
543 |         """
544 | 
545 |         if search_term is not None:
546 |             query = {
547 |                 "query_string": {
548 |                     "query": '(%s* OR content:%s*) AND path.real.fulltext:"%s"' % (search_term, search_term, search_path)
549 |                 }
550 |             }
551 |         elif search_filename is not None:
552 |             query = {
553 |                 "query_string": {
554 |                     "query": 'file.filename: %s* AND path.real.fulltext:"%s"' % (search_filename, search_path)
555 |                 }
556 |             }
557 |         else:
558 |             # This will return everything!
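            # Neither search_term nor search_filename is given: only the path filter applies,
            # so every document below search_path matches. rename_path() relies on this to
            # find all documents that have to be moved.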
559 |             query = {
560 |                 "query_string": {
561 |                     "query": 'path.real.fulltext: "%s"' % search_path
562 |                 }
563 |             }
564 | 
565 |         try:
566 |             if self.elasticsearch_lib_version == 7:
567 |                 return self.elasticsearch.search(
568 |                     index=self.elasticsearch_index,
569 |                     body={
570 |                         "query": query
571 |                     },
572 |                     from_=0,
573 |                     size=100
574 |                 )
575 |             elif self.elasticsearch_lib_version == 8:
576 |                 return self.elasticsearch.search(
577 |                     index=self.elasticsearch_index,
578 |                     query=query,
579 |                     from_=0,
580 |                     size=100
581 |                 )
582 |         except elasticsearch.exceptions.ConnectionError as err:
583 |             self.logger.error('Failed to connect to elasticsearch at "%s": %s' % (self.elasticsearch_url, str(err)))
584 |         except Exception as err:
585 |             self.logger.error(
586 |                 'Failed to search for documents of index "%s" at elasticsearch "%s": %s' % (
587 |                     self.elasticsearch_index,
588 |                     self.elasticsearch_url,
589 |                     str(err)
590 |                 )
591 |             )
592 | 
593 |     def elasticsearch_get_all_ids(self):
594 |         """ Reads all document IDs from elasticsearch """
595 |         self.logger.info('Loading all document IDs from elasticsearch...')
596 | 
597 |         resp = None
598 |         start_time = time.time()
599 | 
600 |         try:
601 |             if self.elasticsearch_lib_version == 7:
602 |                 resp = self.elasticsearch.search(
603 |                     index=self.elasticsearch_index,
604 |                     body={
605 |                         "query": {
606 |                             "match_all": {}
607 |                         }
608 |                     },
609 |                     stored_fields=[],
610 |                     size=self.elasticsearch_bulk_size,
611 |                     scroll='1m'
612 |                 )
613 |             elif self.elasticsearch_lib_version == 8:
614 |                 resp = self.elasticsearch.search(
615 |                     index=self.elasticsearch_index,
616 |                     query={
617 |                         "match_all": {}
618 |                     },
619 |                     stored_fields=[],
620 |                     size=self.elasticsearch_bulk_size,
621 |                     scroll='1m'
622 |                 )
623 |         except elasticsearch.exceptions.ConnectionError as err:
624 |             self.logger.error('Failed to connect to elasticsearch at "%s": %s' % (self.elasticsearch_url, str(err)))
625 |             return
626 |         except Exception as err:
627 |             self.logger.error(
628 |                 'Failed to search for documents of index "%s" at elasticsearch "%s": %s' % (
629 |                     self.elasticsearch_index,
630 |                     self.elasticsearch_url,
631 |                     str(err)
632 |                 )
633 |             )
634 |             return
635 | 
636 |         while len(resp['hits']['hits']) > 0:
637 |             for document in resp['hits']['hits']:
638 |                 self.elasticsearch_document_ids[document['_id']] = 1
639 | 
640 |             self.logger.debug('- Calling es.scroll() with ID "%s"' % resp['_scroll_id'])
641 | 
642 |             resp = self.elasticsearch.scroll(
643 |                 scroll_id=resp['_scroll_id'],
644 |                 scroll='1m'
645 |             )
646 | 
647 |         self.logger.info(
648 |             'Loaded %s ID(s) from elasticsearch in %.2f min' % (
649 |                 self.format_count(len(self.elasticsearch_document_ids)),
650 |                 (time.time() - start_time) / 60
651 |             )
652 |         )
653 | 
654 |     def enable_slowlog(self):
655 |         """ Enables the slow log """
656 |         self.logger.info('Setting the slowlog thresholds on index %s to "0"...' % self.elasticsearch_index)
657 | 
658 |         self.elasticsearch.indices.put_settings(
659 |             settings={
660 |                 "index": {
661 |                     "search": {
662 |                         "slowlog": {
663 |                             "threshold": {
664 |                                 "query": {
665 |                                     "warn": "0",
666 |                                     "info": "0",
667 |                                     "debug": "0",
668 |                                     "trace": "0"
669 |                                 },
670 |                                 "fetch": {
671 |                                     "warn": "0",
672 |                                     "info": "0",
673 |                                     "debug": "0",
674 |                                     "trace": "0"
675 |                                 }
676 |                             }
677 |                         }
678 |                     }
679 |                 }
680 |             },
681 |             index=self.elasticsearch_index
682 |         )
683 | 
684 |         self.logger.info('Slowlog for all queries enabled. Do a spotlight search and look into your elasticsearch logs.')
685 | 
686 |     def disable_slowlog(self):
687 |         """ Disables the slow log """
688 |         self.logger.info('Setting the slowlog thresholds on index %s back to defaults...' % self.elasticsearch_index)
689 | 
690 |         self.elasticsearch.indices.put_settings(
691 |             settings={
692 |                 "index": {
693 |                     "search": {
694 |                         "slowlog": {
695 |                             "threshold": {
696 |                                 "query": {
697 |                                     "warn": "-1",
698 |                                     "info": "-1",
699 |                                     "debug": "-1",
700 |                                     "trace": "-1"
701 |                                 },
702 |                                 "fetch": {
703 |                                     "warn": "-1",
704 |                                     "info": "-1",
705 |                                     "debug": "-1",
706 |                                     "trace": "-1"
707 |                                 }
708 |                             }
709 |                         }
710 |                     }
711 |                 }
712 |             },
713 |             index=self.elasticsearch_index
714 |         )
715 | 
716 |         self.logger.info('Slowlog thresholds reset to "-1": queries are no longer logged to the slowlog.')
717 | 
718 | 
719 |     def import_path(self, path: str) -> int:
720 |         # Paths containing ':' refer to extended attributes (xattrs) - ignore them completely
721 |         if ':' in path:
722 |             return 0
723 | 
724 |         if not self.path_should_be_indexed(path, True):
725 |             return 0
726 | 
727 |         self.logger.debug('*- Import ES doc for "%s"' % path)
728 | 
729 |         document = self.elasticsearch_map_path_to_document(
730 |             path=path,
731 |             filename=os.path.basename(path)
732 |         )
733 | 
734 |         self.elasticsearch_document_ids[document['_id']] = 1
735 | 
736 |         self.elasticsearch.index(
737 |             index=self.elasticsearch_index,
738 |             id=document['_id'],
739 |             document=document['_source']
740 |         )
741 |         return 1
742 | 
743 |     def delete_path(self, path: str) -> int:
744 |         if ':' in path:
745 |             # We ignore these paths BECAUSE if you delete a xattr from a file, we don't want to delete the
746 |             # whole file from the index.
747 |             return 0
748 | 
749 |         if not self.path_should_be_indexed(path, True):
750 |             return 0
751 | 
752 |         self.logger.debug('*- Delete ES doc for "%s"' % path)
753 | 
754 |         document_id_old = self.elasticsearch_map_path_to_id(path)
755 | 
756 |         try:
757 |             del self.elasticsearch_document_ids[document_id_old]
758 |         except KeyError:
759 |             # If the key was already deleted - that's OK!
760 |             pass
761 | 
762 |         try:
763 |             self.elasticsearch.delete(
764 |                 index=self.elasticsearch_index,
765 |                 id=document_id_old
766 |             )
767 |         except elasticsearch.NotFoundError:
768 |             # That's OK, we wanted to delete it anyway
769 |             return 0
770 | 
771 |         return 1
772 | 
773 |     def rename_path(self, source_path: str, target_path: str) -> int:
774 |         # If source_path WAS a directory, we have to move all files and subdirectories BELOW it too.
775 |         changes = 0
776 |         resp = self.search(source_path)
777 |         for hit in resp['hits']['hits']:
778 |             # Each of these documents got moved from source_path to target_path!
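            # e.g. renaming "/srv/share/dir" to "/srv/share/new" turns the indexed path
            # "/srv/share/dir/a/b.txt" into "/srv/share/new/a/b.txt" below.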
779 | 
780 |             hit_old_path = hit['_source']['path']['real']
781 |             changes += self.delete_path(hit_old_path)
782 | 
783 |             hit_new_path = hit_old_path.replace(source_path, target_path, 1)
784 |             changes += self.import_path(hit_new_path)
785 | 
786 |         return changes
787 | 
--------------------------------------------------------------------------------
/lib/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ellerhold/fs2es-indexer/c3725f11b068d340c9137fdde7b9d762c92f42dd/lib/__init__.py
--------------------------------------------------------------------------------
/role.yml:
--------------------------------------------------------------------------------
1 | # Add this role to your /etc/elasticsearch/roles.yml
2 | # Make sure the name of the index matches the one configured in your /etc/fs2es-indexer/config.yml !
3 | 
4 | # This role is for the administration of the index, e. g. creating, updating, ...
5 | fs2es-indexer:
6 |   indices:
7 |     - names: [ 'files' ]
8 |       privileges: [ 'all' ]
9 | 
10 | # This role is for reading the index, e. g. for Samba
11 | fs2es-indexer-ro:
12 |   indices:
13 |     - names: [ 'files' ]
14 |       privileges: [ 'read' ]
--------------------------------------------------------------------------------
/rsyslog-smbd_audit.conf:
--------------------------------------------------------------------------------
1 | # Redirects the messages from "smbd_audit" into its own file and removes them from the normal syslog
2 | # See https://www.rsyslog.com/doc/v8-stable/configuration/filters.html
3 | 
4 | if $programname == "smbd_audit" then {
5 |     # These messages are not interesting to us:
6 |     # - Adding / modifying / deleting xattrs is logged via ":" - we're not interested in those!
7 |     if $msg contains ":com.apple." or $msg contains ":AFP_AfpInfo" then stop
8 | 
9 |     # Everything else from smbd_audit: write it to its own file
10 |     *.* -/var/log/samba/audit.log
11 | 
12 |     # Don't write them to the normal syslog
13 |     *.* stop
14 | }
--------------------------------------------------------------------------------
/samba-audit-logrotate.conf:
--------------------------------------------------------------------------------
1 | /var/log/samba/audit.log {
2 |     daily
3 |     rotate 14
4 | 
5 |     create 0640 root adm
6 | 
7 |     missingok
8 | 
9 |     compress
10 |     copytruncate
11 | }
--------------------------------------------------------------------------------
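
A quick way to verify that the index behaves as described above is a small standalone script. This is a minimal sketch only, not part of the repository: it assumes the elasticsearch-py 8.x client, the index name 'files' from role.yml (yours is set in config.yml), an elasticsearch reachable at https://localhost:9200 and the credentials of the read-only role - adapt the URL, user, password and index name to your setup. It first checks how the custom simple_pattern tokenizer plus the lowercase/asciifolding filters split a filename, then runs the same query_string search Samba issues for a Spotlight filename search (see the search() docstring in lib/Fs2EsIndexer.py).

from elasticsearch import Elasticsearch

# Assumptions: local ES 8.x, index "files", user/password of the "fs2es-indexer-ro" role
es = Elasticsearch('https://localhost:9200', basic_auth=('fs2es-indexer-ro', 'secret'))

# 1. How does the "default" analyzer (simple_pattern tokenizer + lowercase/asciifolding)
#    tokenize a filename? Expected tokens: "my", "wonderful", "document", "pdf"
resp = es.indices.analyze(index='files', analyzer='default', text='My_wonderful_Document.pdf')
print([token['token'] for token in resp['tokens']])

# 2. The same query_string search Samba generates for a Spotlight filename search:
resp = es.search(
    index='files',
    query={
        'query_string': {
            'query': '(file.filename:document*) AND path.real.fulltext:"/srv/samba/spotlight"'
        }
    },
    source=['path.real'],
    from_=0,
    size=100
)
for hit in resp['hits']['hits']:
    print(hit['_source']['path']['real'])

If the analyze call returns the expected lowercase tokens and the search finds your test file, the tokenizer and analyzer filters from elasticsearch_create_index() are active and a full reindex is not necessary.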