├── .gitignore
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── ISSUE_TEMPLATE.md
├── LICENSE.md
├── PULL_REQUEST_TEMPLATE.md
├── README.md
├── UPGRADE.md
├── composer.json
├── docs
├── 00_Configuration_Values.md
├── 20_Categories.md
├── 29_Custom_Request_Header.md
├── 30_Restrictions.md
├── 40_Meta.md
├── 50_Crawler_Events.md
├── 60_Document_Modification.md
└── 90_Frontend_Implementation.md
└── src
└── LuceneSearchBundle
├── Command
├── CrawlCommand.php
└── DocumentModifierCommand.php
├── Configuration
├── Categories
│ └── CategoriesInterface.php
└── Configuration.php
├── Controller
├── Admin
│ └── SettingsController.php
├── AutoCompleteController.php
├── FrontendController.php
└── ListController.php
├── DependencyInjection
├── Compiler
│ ├── CategoriesPass.php
│ └── TaskPass.php
├── Configuration.php
└── LuceneSearchExtension.php
├── Doctrine
└── DBAL
│ └── ConnectionKeepAlive.php
├── Event
├── AssetResourceRestrictionEvent.php
├── CrawlerRequestHeaderEvent.php
├── DocumentModificationEvent.php
├── HtmlParserEvent.php
├── PdfParserEvent.php
└── RestrictionContextEvent.php
├── EventListener
├── DocumentMetaDataListener.php
├── MaintenanceListener.php
├── MaintenanceQueueListener.php
└── MaintenanceRunCrawlerListener.php
├── Helper
├── HighlighterHelper.php
├── LuceneHelper.php
└── StringHelper.php
├── Logger
├── AbstractLogger.php
├── ConsoleLogger.php
└── Logger.php
├── LuceneSearchBundle.php
├── LuceneSearchEvents.php
├── Modifier
├── DocumentModifier.php
└── QueuedDocumentModifier.php
├── Organizer
├── Dispatcher
│ └── HandlerDispatcher.php
└── Handler
│ ├── AbstractHandler.php
│ ├── StateHandler.php
│ └── StoreHandler.php
├── Resources
├── config
│ ├── pimcore
│ │ ├── config.yml
│ │ └── routing.yml
│ ├── services.yml
│ └── services
│ │ ├── commands.yml
│ │ ├── controller.yml
│ │ ├── event.yml
│ │ ├── helper.yml
│ │ ├── modifier.yml
│ │ ├── organizer.yml
│ │ ├── system.yml
│ │ ├── tasks.yml
│ │ └── twig.yml
├── install
│ └── config.yml
├── public
│ ├── css
│ │ └── admin.css
│ ├── img
│ │ ├── ajax-loader.gif
│ │ ├── lucene.png
│ │ ├── lucene_white.png
│ │ ├── plugin.png
│ │ ├── search-logo.png
│ │ └── services.svg
│ └── js
│ │ └── backend
│ │ ├── settings.js
│ │ └── startup.js
├── translations
│ └── admin.en.yml
└── views
│ └── List
│ ├── Partial
│ ├── Pagination
│ │ ├── default.html.twig
│ │ └── single.html.twig
│ └── suggestions.html.twig
│ ├── error.html.twig
│ └── result.html.twig
├── Task
├── AbstractTask.php
├── Crawler
│ ├── CrawlerTask.php
│ ├── Event
│ │ ├── Logger.php
│ │ └── Statistics.php
│ ├── Filter
│ │ ├── Discovery
│ │ │ ├── NegativeUriFilter.php
│ │ │ └── UriFilter.php
│ │ ├── FilterPersistor.php
│ │ ├── LogDispatcher.php
│ │ └── PostFetch
│ │ │ ├── MaxContentSizeFilter.php
│ │ │ └── MimeTypeFilter.php
│ ├── Listener
│ │ └── Abort.php
│ └── PersistenceHandler
│ │ └── FileSerializedResourcePersistenceHandler.php
├── Parser
│ └── ParserTask.php
├── System
│ ├── ShutDownTask.php
│ └── StartUpTask.php
├── TaskInterface.php
└── TaskManager.php
├── Tool
├── CrawlerState.php
└── Install.php
└── Twig
└── Extension
├── CategoriesExtension.php
├── CrawlerExtension.php
└── PaginationExtension.php
/.gitignore:
--------------------------------------------------------------------------------
1 | ######################
2 | # Compiled source #
3 | ######################
4 | *.com
5 | *.class
6 | *.dll
7 | *.exe
8 | *.o
9 | *.so
10 |
11 | ######################
12 | # Packages #
13 | ######################
14 | # it's better to unpack these files and commit the raw source
15 | # git has its own built in compression methods
16 | *.7z
17 | *.dmg
18 | *.gz
19 | *.iso
20 | *.jar
21 |
22 | ######################
23 | # Logs and databases #
24 | ######################
25 | *.log
26 |
27 | ######################
28 | # Global #
29 | ######################
30 | .DS_Store
31 | .DS_Store\?
32 | ._*
33 | .Spotlight-V100
34 | .Trashes
35 | Icon\?
36 | *.sublime-workspace
37 | *.sublime-project
38 | atlassian-ide-plugin.xml
39 | .idea/
40 | .project
41 | ehthumbs.db
42 | Thumbs.db
43 | Vagrantfile
44 | .vagrant
45 | php-cgi.core
46 | .sass-cache
47 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation.
5 |
6 | ## Our Standards
7 | Examples of behavior that contributes to creating a positive environment include:
8 |
9 | * Using welcoming and inclusive language
10 | * Being respectful of differing viewpoints and experiences
11 | * Gracefully accepting constructive criticism
12 | * Focusing on what is best for the community
13 | * Showing empathy towards other community members
14 |
15 | Examples of unacceptable behavior by participants include:
16 |
17 | * The use of sexualized language or imagery and unwelcome sexual attention or advances
18 | * Trolling, insulting/derogatory comments, and personal or political attacks
19 | * Public or private harassment
20 | * Publishing others' private information, such as a physical or electronic address, without explicit permission
21 | * Other conduct which could reasonably be considered inappropriate in a professional setting
22 |
23 | ## Our Responsibilities
24 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
25 |
26 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
27 |
28 | ## Scope
29 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
30 |
31 | ## Enforcement
32 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at support@dachcom.ch. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
33 |
34 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.
35 |
36 | ## Attribution
37 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version]
38 |
39 | [homepage]: http://contributor-covenant.org
40 | [version]: http://contributor-covenant.org/version/1/4/
41 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to Members
2 |
3 | ## Bug Reports & Feature Requests
4 | The Members team heavily uses (and loves!) GitHub for all of our software management.
5 | We use GitHub issues exclusively to track all bugs and features.
6 |
7 | * [Open an issue](https://github.com/dachcom-digital/pimcore-members/issues) here on GitHub.
8 | If you can, **please provide a fix and create a pull request (PR) instead**; this will automatically create an issue for you.
9 | * Report security issues **only** to support@dachcom.ch
10 | * Please be patient as not all items will be tested immediately - remember, Members is open source and free of charge.
11 | * Occasionally we'll close issues if they appear stale or are too vague - please don't take this personally!
12 | Please feel free to re-open issues we've closed if there's something we've missed and they still need to be addressed.
13 |
14 | ## Contributing Pull Requests
15 | PR's are even better than issues.
16 | We gladly accept community pull requests.
17 | There are a few necessary steps before we can accept a pull request:
18 |
19 | * [Open an issue](https://github.com/dachcom-digital/pimcore-members/issues) describing the problem that you are looking to solve in
20 | your PR (if one is not already open), and your approach to solving it (not necessary for bug fixes - only feature contributions).
21 | * [Fork us!](https://help.github.com/articles/fork-a-repo/) Code! Follow the coding standards PSR-1, PSR-2 and PSR-4.
22 | * [Send a pull request](https://help.github.com/articles/using-pull-requests/) from your fork’s branch to our `master` branch.
23 |
24 | ### Contributor License Agreement
25 | The following terms are used throughout this agreement:
26 |
27 | * **You** - the person or legal entity including its affiliates asked to accept this agreement. An affiliate is any
28 | entity that controls or is controlled by the legal entity, or is under common control with it.
29 |
30 | * **Project** - is an umbrella term that refers to any and all Members projects.
31 |
32 | * **Contribution** - any type of work that is submitted to a Project, including any modifications or additions to
33 | existing work.
34 |
35 | * **Submitted** - conveyed to a Project via a pull request, commit, issue, or any form of electronic, written, or
36 | verbal communication with Members, contributors or maintainers.
37 |
38 | #### 1. Grant of Copyright License.
39 | Subject to the terms and conditions of this agreement, You grant to the Projects’ maintainers, contributors, users and
40 | to Members a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce,
41 | prepare derivative works of, publicly display, publicly perform, sublicense, and distribute Your contributions and such
42 | derivative works. Except for this license, You reserve all rights, title, and interest in your contributions.
43 |
44 | #### 2. Grant of Patent License.
45 | Subject to the terms and conditions of this agreement, You grant to the Projects’ maintainers, contributors, users and
46 | to dachcom-digital/members a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section)
47 | patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer your contributions, where
48 | such license applies only to those patent claims licensable by you that are necessarily infringed by your contribution
49 | or by combination of your contribution with the project to which this contribution was submitted.
50 |
51 | If any entity institutes patent litigation - including cross-claim or counterclaim in a lawsuit - against You alleging
52 | that your contribution or any project it was submitted to constitutes or is responsible for direct or contributory
53 | patent infringement, then any patent licenses granted to that entity under this agreement shall terminate as of the
54 | date such litigation is filed.
55 |
56 | #### 3. Source of Contribution.
57 | Your contribution is either your original creation, based upon previous work that, to the best of your knowledge, is
58 | covered under an appropriate open source license and you have the right under that license to submit that work with
59 | modifications, whether created in whole or in part by you, or you have clearly identified the source of the contribution
60 | and any license or other restriction (like related patents, trademarks, and license agreements) of which you are
61 | personally aware.
62 |
--------------------------------------------------------------------------------
/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | | Q | A
2 | | ---------------- | -----
3 | | Bug report? | yes/no
4 | | Feature request? | yes/no
5 | | BC Break report? | yes/no
6 | | RFC? | yes/no
7 |
8 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | # License
2 | Copyright (C) 2017 DACHCOM.DIGITAL
3 |
4 | This software is available under the GNU General Public License version 3 (GPLv3).
5 |
6 | ### GNU General Public License version 3 (GPLv3)
7 | If you decide to choose the GPLv3 license, you must comply with the following terms:
8 |
9 | This program is free software: you can redistribute it and/or modify
10 | it under the terms of the GNU General Public License as published by
11 | the Free Software Foundation, either version 3 of the License, or
12 | (at your option) any later version.
13 |
14 | This program is distributed in the hope that it will be useful,
15 | but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 | GNU General Public License for more details.
18 |
19 | You should have received a copy of the GNU General Public License
20 | along with this program. If not, see <https://www.gnu.org/licenses/>.
21 |
22 | [GNU General Public License](https://www.gnu.org/licenses/gpl-3.0.en.html)
--------------------------------------------------------------------------------
/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | | Q | A
2 | | ------------- | ---
3 | | Bug fix? | yes/no
4 | | New feature? | yes/no
5 | | BC breaks? | no
6 | | Deprecations? | yes/no
7 | | Fixed tickets | #...
8 |
9 |
13 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Pimcore Lucene Search
2 | 
3 |
4 | ## Note
5 | The Pimcore Lucene Search Bundle will be marked as abandoned as soon as the [Dynamic Search Bundle](https://github.com/dachcom-digital/pimcore-dynamic-search) reaches a stable state.
6 | After that, bugfixing will be supported in some cases. However, PRs are always welcome.
7 |
8 | ### Requirements
9 | - Pimcore >= 5.8
10 | - Pimcore >= 6.0
11 |
12 | #### Pimcore 4
13 | Get the Pimcore4 Version [here](https://github.com/dachcom-digital/pimcore-lucene-search/tree/pimcore4).
14 |
15 | ### Installation
16 | 1. Add code below to your `composer.json`
17 | 2. Activate & install it through the ExtensionManager
18 |
19 | ```json
20 | "require" : {
21 | "dachcom-digital/lucene-search" : "~2.3.0"
22 | }
23 | ```
24 |
25 | ### Configuration
26 | To enable LuceneSearch, add those lines to your `AppBundle/Resources/config/pimcore/config.yml`:
27 |
28 | ```yaml
29 | lucene_search:
30 | enabled: true
31 | ```
32 |
33 | A complete setup could look like this:
34 |
35 | ```yaml
36 | lucene_search:
37 | enabled: true
38 | fuzzy_search_results: false
39 | search_suggestion: true
40 | seeds:
41 | - 'http://your-domain.dev'
42 | filter:
43 | valid_links:
44 | - '@^http://your-domain.dev.*@i'
45 | view:
46 | max_per_page: 10
47 | crawler:
48 | content_max_size: 4
49 | content_start_indicator: ''
50 | content_end_indicator: ''
51 | ```
52 |
53 | You need to add the config parameter to your config.yml to override the default values.
54 | Execute this command to get some information about all the config elements of LuceneSearch:
55 |
56 | ```bash
57 | # configuration about all config parameters
58 | $ bin/console config:dump-reference LuceneSearchBundle
59 |
60 | # configuration info about the "fuzzy_search_results" parameter
61 | $ bin/console config:dump-reference LuceneSearchBundle fuzzy_search_results
62 | ```
63 |
64 | We also added a [detailed documentation](docs/00_Configuration_Values.md) about all possible config values.
65 |
66 | ### Features
67 | * Maintenance driven indexing
68 | * Auto Complete
69 | * Restricted Documents & Usergroups ([member](https://github.com/dachcom-digital/pimcore-members) plugin recommended but not required)
70 |
71 | ### Usage
72 |
73 | **Default**
74 | The crawler Engine will start automatically every night by default. Please check that the pimcore default maintenance script is properly installed.
75 |
76 | **Command Line Command**
77 | If you want to start the crawler manually, use this command:
78 |
79 | ```
80 | $ php bin/console lucenesearch:crawl -f -v
81 | ```
82 |
83 | | command | short command | type | description |
84 | |:---|:---|:---|:---|
85 | | ```force``` | `-f` | force crawler start | sometimes the crawler gets stuck because of a critical error, mostly caused by a wrong configuration. use this command to force a restart |
86 | | ```verbose``` | `-v` | show some logs | good for debugging. you'll get some additional information about filtered and forbidden links while crawling. |
87 |
88 | ## Logs
89 | You'll find some logs from the last crawl in your backend (at the bottom on the LuceneSearch settings page). Of course you'll also find some logs in your `var/logs` folder.
90 | **Note:** please enable the debug mode in pimcore settings to get all types of logs.
91 |
92 | ## Further Information
93 |
94 | - [Categories](docs/20_Categories.md): Learn more about category based crawling / searching.
95 | - [Custom Header](docs/29_Custom_Request_Header.md): Learn how to add custom headers to the crawler request (like an auth token).
96 | - [Restrictions](docs/30_Restrictions.md): Learn more about restricted crawling / indexing.
97 | - [Custom Meta Content](docs/40_Meta.md): Learn more about crawling / searching custom meta.
98 | - [Crawler Events](docs/50_Crawler_Events.md): Hook into crawler process to add custom fields to index.
99 | - [Lucene Document Modification](docs/60_Document_Modification.md): Remove or change availability of lucene documents within a pimcore update/deletion event.
100 | - [Frontend Implementation](docs/90_Frontend_Implementation.md): Get a step by step walkthrough to implement lucene search into your website.
101 |
102 | ## Copyright and license
103 | Copyright: [DACHCOM.DIGITAL](http://dachcom-digital.ch)
104 | For licensing details please visit [LICENSE.md](LICENSE.md)
105 |
106 | ## Upgrade Info
107 | Before updating, please [check our upgrade notes!](UPGRADE.md)
--------------------------------------------------------------------------------
/UPGRADE.md:
--------------------------------------------------------------------------------
1 | # Upgrade Notes
2 |
3 | #### Update from Version 2.1.x to Version 2.2.0
4 | - Pimcore 6 Compatibility
5 | - Fix encoding in lucene url field
6 |
7 | #### Update from Version 2.1.1 to Version 2.1.2
8 | - Availability Flag implemented.
9 | - DocumentModifier implemented. See [Docs](https://github.com/dachcom-digital/pimcore-lucene-search/blob/master/docs/60_Document_Modification.md).
10 | - Various Clean-Ups and try/catch improvements.
11 | - Do not index pages with other status codes than 200.
12 | - [Milestone](https://github.com/dachcom-digital/pimcore-lucene-search/milestone/7?closed=1)
13 |
14 | #### Update from Version 2.1.0 to Version 2.1.1
15 | - Implemented [PackageVersionTrait](https://github.com/pimcore/pimcore/blob/master/lib/Extension/Bundle/Traits/PackageVersionTrait.php).
16 | - [Milestone](https://github.com/dachcom-digital/pimcore-lucene-search/milestone/5?closed=1)
17 |
18 | #### Update from Version 2.0.x to Version 2.1.0
19 | - **[REMOVED FEATURE]**: The SiteMap Feature has been removed. Please remove the `lucene_search.sitemap.render` config element **before** updating!
20 | - **[CRITICAL BUGFIX]**: There was a wrong path assignment for the tmp persistence manager. Please delete the `/var/tmp/ls-crawler-tmp` folder immediately.
21 |
22 | #### Update from Version 2.0.x to Version 2.0.2
23 | - **[NEW FEATURE]**: [Query/Hash Url Filter](docs/00_Configuration_Values.md) implemented.
24 |
25 | #### Update from Version 1.x to Version 2.0.0
26 | TBD
--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "dachcom-digital/lucene-search",
3 | "type": "pimcore-bundle",
4 | "license": "GPL-3.0+",
5 | "description": "Pimcore 5.x Website Indexer (powered by Zend Search Lucene)",
6 | "keywords": ["pimcore", "search", "lucene"],
7 | "homepage": "https://github.com/dachcom-digital/pimcore-lucene-search",
8 | "authors": [
9 | {
10 | "name": "DACHCOM.DIGITAL Stefan Hagspiel",
11 | "email": "shagspiel@dachcom.ch",
12 | "homepage": "http://www.dachcom.com/",
13 | "role": "Developer"
14 | }
15 | ],
16 | "autoload": {
17 | "psr-4": {
18 | "LuceneSearchBundle\\": "src/LuceneSearchBundle"
19 | }
20 | },
21 |
22 | "extra": {
23 | "pimcore": {
24 | "bundles": [
25 | "LuceneSearchBundle\\LuceneSearchBundle"
26 | ]
27 | }
28 | },
29 | "require": {
30 | "pimcore/pimcore": "^5.8.0 | ^6.0.0",
31 | "vdb/php-spider": "^0.3 | ^0.4",
32 | "zf1/zend-search-lucene": "~1.12"
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/docs/00_Configuration_Values.md:
--------------------------------------------------------------------------------
1 | # Configuration
2 |
3 | Here you'll find all the configuration possibilities, default values and also some description.
4 |
5 | | Name | Type | Default | Description |
6 | |------------------------------------------|------|----------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
7 | | enabled | bool | false | Enable and configure the search frontend if you want to include a full text search on your website. |
8 | | fuzzy_search_results | bool | false | Fuzzy search results: When enabled, a fuzzy search is performed. The search will automatically include related terms. |
9 | | search_suggestion | bool | true | Search suggestions: When enabled, a fuzzy search for similar search terms is performed. If no results could be found for the search term entered by the user, similar search terms are presented as suggestions. |
10 | | own_host_only | bool | true | Own Host only: Limit search (and crawling) results to results from the current seed (sub-)domain only. |
11 | | allow_subdomains | bool | false | Allow Subdomains: Limit search (and crawling) results to allow / disallow subdomains of current seed. |
12 | | seeds | array | [] | Start-Urls (Seeds): Specify start URLs for the crawler. Please enter with protocol! e.g. http://www.pimcore.org and enter a starting URL on your main domain first and any other domains next. |
13 | | categories | service | ~ | If search results should be displayed by categories, please enter all valid categories here. The crawler sorts a page into a category if it contains a html meta tag with the name "`lucene-search:categories`". |
14 | | filter:allow_query_in_url | bool | false | When true, LuceneSearch will crawl urls with query fragments. |
15 | | filter:allow_hash_in_url | bool | false | When true, LuceneSearch will crawl urls with hash fragments. |
16 | | filter:valid_links | array | [] | Regex for valid Uris: Specify PREG regexes with start and end delimiter to define allowed links. e.g. `@^http://www\.pimcore\.org*@i` |
17 | | filter:user_invalid_links | array | [] | Regex for forbidden Uris: Specify PREG regexes for links which should be ignored by the crawler. The crawler does not even follow these links e.g. `@^http://www\.pimcore\.org\/community*@i` |
18 | | allowed_mime_types | array | ['text/html', 'application/pdf'] | Supported: `text/html`, `application/pdf` |
19 | | allowed_schemes | array | ['http'] | Define which url Schemes are allowed. (eg. http and/or https). Default is http. |
20 | | crawler:max_link_depth | int | 15 | To avoid loops produced by relative links on a website, a maximum link depth needs to be set. Please choose the value suited to the website to crawl. |
21 | | crawler:max_download_limit | int | 0 | Maximum links to crawl: Constrain crawler to a specific limit of crawled links. Default is 0 which means no limit. |
22 | | crawler:content_max_size | int | 0 | Maximum content size (in MB): crawler ignores resources if its size exceeds limit (mostly useful for asset indexing). Default is 0 which means no limit. |
23 | | crawler:content_start_indicator | string | ~ | You can limit the page content relevant for searching by surrounding it with certain html comments. The crawler will still parse the entire page to find links, but only the specified area within your html comments is used when searching for a term. String specifying content start for search. |
24 | | crawler:content_end_indicator | string | ~ | String specifying content end for search. |
25 | | crawler:content_exclude_start_indicator | string | ~ | String specifying exclude content start for search. |
26 | | crawler:content_exclude_end_indicator | string | ~ | String specifying exclude content end for search. |
27 | | locale:ignore_language | bool | false | Receive search results from all languages, set to false to limit search results to the current language only. The current language is retrieved from the registry, the language of any page in the search result index is extracted by the crawler (Content-Language Http header, html tag lang attribute or html meta tag content-language) |
28 | | locale:ignore_country | bool | true | Receive search results from all countries, set to false to limit search results to country only. The current country is retrieved from the search result index. it is extracted by the crawler (html meta tag country) |
29 | | restriction:enabled | bool | false | Document Restriction: Ignore Document restrictions. Set to true if you're using the [Pimcore/MembersBundle](https://github.com/dachcom-digital/pimcore-members) |
30 | | boost:documents | int | 1 | Document Boost Factor |
31 | | boost:assets | int | 1 | Asset Boost Factor |
32 | | view:max_per_page | int | 10 | Max Results per Page |
33 | | view:max_suggestions | int | 10 | Max Suggestions |
--------------------------------------------------------------------------------
/docs/20_Categories.md:
--------------------------------------------------------------------------------
1 | # Categories
2 | It's possible to activate a category based indexing / searching.
3 |
4 | ### Configuration
5 |
6 | ```yaml
7 | lucene_search:
8 | enabled: true
9 | categories: AppBundle\LuceneSearch\Services\Categories
10 | ```
11 |
12 | You need a custom service for that which implements the `LuceneSearchBundle\Configuration\Categories\CategoriesInterface` interface.
13 | So your class may look like this:
14 |
15 | ```php
16 | 1, 'label' => 'Category 1'],
32 | [ 'id' => 2, 'label' => 'Category 2'],
33 | ];
34 | }
35 |
36 | }
37 | ```
38 |
39 | To inform the lucene search crawler about those categories we need to add another meta element.
40 | As you can see it's also possible to add multiple categories per document.
41 |
42 | ```html
43 | {% if lucene_search_crawler_active() %}
44 |
45 | {% endif %}
46 | ```
47 |
48 | Congratulations, you're done. From now on the categories get stored into the lucene index.
49 |
50 | ### Twig Extension
51 | If you need the categories in your template, you could use the following snippet:
52 |
53 | ```html
54 | {% for category in lucene_search_get_categories() %}
55 | Id: {{ category.id }}, Label: {{ category.label}}
56 | {% endfor %}
57 | ```
58 |
59 | ### Templating
60 | If you want to know how to implement the categories in frontend, check out our [frontend implementation advice](90_Frontend_Implementation.md).
61 |
--------------------------------------------------------------------------------
/docs/29_Custom_Request_Header.md:
--------------------------------------------------------------------------------
1 | # Custom Request Header
2 |
3 | Add some header information to the crawler request.
4 |
5 | > The [Members](https://github.com/dachcom-digital/pimcore-members) Bundle adds an auth header element by default.
6 |
7 | ## Event
8 |
9 | | Name | Class | Setter |
10 | |---------------------|-------------|-------------------------------|
11 | | `lucene_search.task.crawler.request_header` | Event\CrawlerRequestHeaderEvent | addHeader |
12 |
13 | ## Example: Auth
14 |
15 | ```yaml
16 | parameters:
17 | lucene_search_user_name: 'Crawler'
18 | lucene_search_password: 'crawler@universe.org'
19 |
20 |
21 | AppBundle\EventListener\CrawlerHeader:
22 | arguments:
23 | $userName: '%lucene_search_user_name%'
24 | $password: '%lucene_search_password%'
25 | tags:
26 | - { name: kernel.event_subscriber }
27 | ```
28 |
29 | ```php
30 | 'addHeaderToLuceneCrawler'
44 | ];
45 | }
46 |
47 | public function addHeaderToLuceneCrawler(CrawlerRequestHeaderEvent $event)
48 | {
49 | //example 1: token auth.
50 | $event->addHeader([
51 | 'name' => 'x-auth-token',
52 | 'value' => 'your-special-token',
53 | 'identifier' => 'lucene-search-token-auth'
54 | ]);
55 |
56 | //example 2: basic auth.
57 | $event->addHeader([
58 | 'name' => 'Authorization',
59 | 'value' => 'Basic ' . base64_encode('USERNAME:PASSWORD'),
60 | 'identifier' => 'lucene-search-basic-auth'
61 | ]);
62 | }
63 | }
64 | ```
--------------------------------------------------------------------------------
/docs/30_Restrictions.md:
--------------------------------------------------------------------------------
1 | # Restrictions
2 | If you want a seamless integration of protected document crawling, install our [Members](https://github.com/dachcom-digital/pimcore-members) bundle.
3 |
4 |
5 | ### Documents
6 | Each document needs a meta tag in the head section. The crawler extracts and stores the usergroup id(s) from that meta property.
7 |
8 | > If you're using the Members Bundle this meta property gets assigned automatically.
9 |
10 | **Meta Property Example**
11 | ```html
12 |
13 | ```
14 |
15 | If the document is restricted to a specific user group, the meta `content` contains its id. Otherwise, the meta property needs to be filled with a `default` value.
16 |
17 | ### Assets
18 | Since assets do not have an HTML view, you need to catch an event (`lucene_search.task.parser.asset_restriction`).
19 | > If you're using the Members Bundle this event is already implemented.
20 |
21 | ## Asset Language restriction
22 | Because assets do not have any language hierarchy, you need to add a property called `assigned_language`. This property will be installed during the install process of LuceneSearch.
23 | If you add additional languages afterwards, you need to add them to the property. If you do not set any information at all, the asset will be found in any language context.
24 |
25 | ## Asset Country restriction
26 | Because assets do not have any country hierarchy, you need to add a property called `assigned_country`. This property will be installed during the install process of LuceneSearch.
27 | If you add additional countries afterwards, you need to add them to the property. If you do not set any information at all, the asset will be found in any country context.
28 |
29 | ## Events
30 |
31 | | Name | Class | Setter |
32 | |---------------------|-------------|-------------------------------|
33 | | `lucene_search.task.parser.asset_restriction` | Event\AssetResourceRestrictionEvent | setRestrictions, setAsset |
34 | | `lucene_search.frontend.restriction_context` | Event\RestrictionContextEvent | setAllowedRestrictionGroups |
--------------------------------------------------------------------------------
/docs/40_Meta.md:
--------------------------------------------------------------------------------
1 | # Custom Meta Content
2 |
3 | In some cases you need to add some content or keywords to improve the search accuracy,
4 | but this content is not meant for public crawlers like Google. For this purpose LuceneSearch uses a custom meta property called `lucene-search:meta`.
5 | This element should only be visible while crawling.
6 |
7 | **Example:**
8 |
9 | ```html
10 | {% if lucene_search_crawler_active() %}
11 | <meta name="lucene-search:meta" content="additional content or keywords for the search index" />
12 | {% endif %}
13 | ```
14 |
15 | ## Custom Meta in Documents
16 | It's also possible to add the custom meta property in backend.
17 |
18 | Open *Document* => *Settings*, go to *Meta Data* and add a new field:
19 |
20 | ```html
21 | <meta name="lucene-search:meta" content="additional content or keywords for the search index" />
22 | ```
23 |
24 | > **Note:** Currently it's not possible to hide this meta tag if you're adding it via backend since pimcore provides no way to add/remove/modify those elements programmatically.
25 |
26 | ## Custom Meta in Objects
27 | Because objects may have some front-end capability (a news detail page for example), you have to integrate the custom meta field by yourself (see example above).
28 |
29 | ## Custom Meta in Assets
30 | TBD
--------------------------------------------------------------------------------
/docs/50_Crawler_Events.md:
--------------------------------------------------------------------------------
1 | # Crawler Events
2 |
3 | Hook into the crawler process to add custom fields to the current Lucene document.
4 |
5 | ## HtmlParserEvent params
6 |
7 | ### Document Id
8 | The crawler will always add the ID of the currently indexed pimcore document to the params array.
9 | You can access it using `$params['document_id']`.
10 |
11 | > **Note!** The document id is not available in the lucene index unless you're adding it via the parser event (see example below)
12 |
13 | ### Object Id
14 | The crawler will check for the presence of a meta tag called `lucene-search:objectId`.
15 | If the meta tag is present, the objectId will be passed to the event inside the params array.
16 | You can access it using `$params['object_id']`.
17 |
18 | Since it is not possible to automatically detect the current object id, you need to add it by yourself.
19 | This is an example of how you could implement the *lucene-search:objectId* meta tag:
20 |
21 | ```html
22 | {% if lucene_search_crawler_active() %}
23 | {% do pimcore_head_meta().appendName('lucene-search:objectId', product.id) %}
24 | {% endif %}
25 | ```
26 |
27 | ## Configuration
28 |
29 | ```yaml
30 | AppBundle\EventListener\LuceneSearchParserListener:
31 | autowire: true
32 | tags:
33 | - { name: kernel.event_subscriber }
34 | ```
35 |
36 | ## Services
37 |
38 | ```php
39 | 'parseHtml',
55 | LuceneSearchEvents::LUCENE_SEARCH_PARSER_PDF_DOCUMENT => 'parsePdf',
56 | ];
57 | }
58 |
59 | public function parseHtml(HtmlParserEvent $event)
60 | {
61 | $luceneDoc = $event->getDocument();
62 | $html = $event->getHtml();
63 | $params = $event->getParams();
64 |
65 | if (!empty($params['document_id'])) {
66 | $document = \Pimcore\Model\Document::getById($params['document_id']);
67 | $documentIdField = \Zend_Search_Lucene_Field::keyword('documentId', $document->getId());
68 | $luceneDoc->addField($documentIdField);
69 | }
70 |
71 | if (!empty($params['object_id'])) {
72 | $object = DataObject::getById($params['object_id']);
73 | $objectIdField = \Zend_Search_Lucene_Field::keyword('objectId', $object->getId());
74 | $luceneDoc->addField($objectIdField);
75 | }
76 |
77 | // additional fields
78 | $field = \Zend_Search_Lucene_Field::text('myCustomField', 'Custom field content', $params['encoding']);
79 | $field->boost = 5;
80 |
81 | $luceneDoc->addField($field);
82 |
83 | $event->setDocument($luceneDoc);
84 | }
85 |
86 | public function parsePdf(PdfParserEvent $event)
87 | {
88 | $luceneDoc = $event->getDocument();
89 | $content = $event->getContent();
90 | $assetMetaData = $event->getAssetMetaData();
91 | $params = $event->getParams();
92 |
93 | $field = \Zend_Search_Lucene_Field::text('myCustomField', 'Custom field content', $params['encoding']);
94 | $luceneDoc->addField($field);
95 |
96 | $event->setDocument($luceneDoc);
97 | }
98 | }
99 | ```
100 |
--------------------------------------------------------------------------------
/docs/60_Document_Modification.md:
--------------------------------------------------------------------------------
1 | # Lucene Document Modification
2 |
3 | It's possible to modify an indexed document.
4 |
5 | Use the `DocumentModifier` class to:
6 |
7 | - mark Lucene-Document as available
8 | - mark Lucene-Document as unavailable
9 | - mark Lucene-Document as deleted (remove from index until next crawl)
10 |
11 | **Note:** The availability check works within the maintenance cycle, so there is a dispatch delay of up to 5 minutes depending on your maintenance cron settings!
12 |
13 | ## Warning!
14 | There are some limitations while changing lucene documents.
15 | If we change the availability of documents, we can't just update an existing document
16 | since Zend Lucene does not allow us to modify existing documents. Instead, we need to add them as new documents.
17 | Read more about it [here](https://framework.zend.com/manual/1.12/en/zend.search.lucene.index-creation.html#zend.search.lucene.index-creation.document-updating).
18 |
19 | ### Boost
20 | Because of complex lucene indexing strategies, it's not possible to re-gather the boost factor of documents **and** fields.
21 | So you need to hook into the `lucene_search.modifier.document` event and add those boost values again (see example event below).
22 |
23 | ### UnStored Fields
24 | Currently it's not possible to re-add fields with type `\Zend_Search_Lucene_Field::unStored` since they are not available in the query document!
25 | If you're changing the availability of documents with `Unstored` fields, they're gone after updating!
26 | Read more about field types [here](https://framework.zend.com/manual/1.10/en/zend.search.lucene.overview.html#zend.search.lucene.index-creation.understanding-field-types).
27 |
28 | Solution: Hook into the `lucene_search.modifier.document` event and add them again (see example event below).
29 |
30 | ## Implementation
31 |
32 | ```yaml
33 | AppBundle\EventListener\IndexManipulator:
34 | autowire: true
35 | tags:
36 | - { name: kernel.event_subscriber }
37 | ```
38 |
39 | ```php
40 | documentModifier = $documentModifier;
59 | }
60 |
61 | public static function getSubscribedEvents()
62 | {
63 | return [
64 | DocumentEvents::PRE_UPDATE => 'onPreUpdate',
65 | DocumentEvents::PRE_DELETE => 'onPreDelete',
66 | LuceneSearchEvents::LUCENE_SEARCH_DOCUMENT_MODIFICATION => 'onModification',
67 | ];
68 | }
69 |
70 | public function onPreUpdate(DocumentEvent $event)
71 | {
72 | $document = $event->getDocument();
73 |
74 | try {
75 | // get current document from db (without changed values)
76 | $storedDocument = Document::getById($document->getId(), true);
77 | } catch (\Exception $e) {
78 | $storedDocument = null;
79 | }
80 |
81 | // check if untouched db entity has same status. if so = save resources and skip updating.
82 | if ($storedDocument instanceof Document) {
83 | if ($storedDocument->getPublished() === $document->getPublished()) {
84 | return;
85 | }
86 | }
87 |
88 | if ($document->isPublished() === true) {
89 | $marker = DocumentModifier::MARK_AVAILABLE;
90 | } else {
91 | $marker = DocumentModifier::MARK_UNAVAILABLE;
92 | }
93 |
94 | // way 1: use a custom lucene query (slower but could be a complex query)
95 | // yourCustomMetaIdentifier: you need to add custom Keyword via the lucene_search.task.parser.html_parser event
96 | $term = new \Zend_Search_Lucene_Index_Term($document->getProperty('yourCustomMetaIdentifierProperty'), 'yourIdentifier');
97 | $query = new \Zend_Search_Lucene_Search_Query_Term($term);
98 | $this->documentModifier->markDocumentsViaQuery($query, $marker);
99 |
100 | // way 2: use simple term index (faster but only one term possible)
101 | // yourCustomMetaIdentifier: you need to add custom Keyword via the lucene_search.task.parser.html_parser event
102 | $term = new \Zend_Search_Lucene_Index_Term($document->getProperty('yourCustomMetaIdentifierProperty'), 'yourIdentifier');
103 | $this->documentModifier->markDocumentsViaTerm($term, $marker);
104 |
105 | }
106 |
107 | public function onPreDelete(DocumentEvent $event)
108 | {
109 | $document = $event->getDocument();
110 |
111 | // yourCustomMetaIdentifier: you need to add custom Keyword via the lucene_search.task.parser.html_parser event
112 | $term = new \Zend_Search_Lucene_Index_Term($document->getProperty('yourCustomMetaIdentifierProperty'), 'yourIdentifier');
113 | $this->documentModifier->markDocumentsViaTerm($term, DocumentModifier::MARK_DELETED);
114 | }
115 |
116 | /**
117 | * You only need this method if you want to re-add boost values or unstored fields.
118 | *
119 | * @param DocumentModificationEvent $event
120 | */
121 | public function onModification(DocumentModificationEvent $event)
122 | {
123 | $document = $event->getDocument();
124 |
125 | $someConditionsAreTrue = false;
126 |
127 | // use this event to re-add boost values
128 | if ($someConditionsAreTrue === true) {
129 | $document->boost = 999;
130 | $event->setDocument($document);
131 | }
132 | }
133 | }
134 | ```
--------------------------------------------------------------------------------
/docs/90_Frontend_Implementation.md:
--------------------------------------------------------------------------------
1 | # Lucene Search FrontEnd
2 | This guide will help you to implement a search page into your website in seconds.
3 |
4 | ### Optional: Create a Layout/Controller
5 | > Note: This is only required if you're starting a project from scratch.
6 |
7 | - Create a layout in `app\Resources\views\layout.html.twig`
8 | - Add some markup to your layout:
9 |
10 | ```twig
11 |
12 |
13 |