├── LICENSE
├── README.md
├── dev-docs
└── API.rst
├── img
└── Sitehound-architecture-diagram.png
├── installation
├── README.md
└── sitehound-configs
│ ├── docker-compose.deep-deep.kafka-host.yml
│ ├── docker-compose.deep-deep.yml
│ ├── docker-compose.yml
│ ├── download-deep-deep-models.sh
│ ├── models
│ └── .keep
│ └── volumes
│ ├── elasticsearch
│ ├── config
│ │ └── elasticsearch.yml
│ └── data
│ │ └── .keep
│ ├── mongodb
│ └── data
│ │ └── db
│ │ └── .keep
│ └── sitehound-backend
│ └── config
│ └── properties-override
│ └── application-override.properties
└── user-docs
├── README.pdf
├── eisvogel.latex
├── sitehound-walkthrough-guide.pdf
└── sitehound-walkthrough-guide_October-2017_v2.pdf
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Site Hound
2 | Site Hound (previously THH) is a Domain Discovery Tool that extends the capabilities of commercial search engines using automation and human-in-the-loop (HITL) machine learning, allowing the user to efficiently expand the set of relevant web pages within their domains or topics of interest.
3 | Site Hound is the UI to a more complex set of tools described below.
4 | Site Hound was developed under the Memex Program by HyperionGray LLC in partnership with Scrapinghub, Ltd. (2015/2017)
5 |
6 | ### Main Features
7 |
8 | 1. Role Based Access Control (RBAC).
9 | 2. Multiple __workspaces__ for keeping things tidy.
10 | 3. Input of __keywords__, to be included or excluded to the search.
11 | 4. Input of __seeds URLs__, an initial list of websites that you already know are on-topic.
12 | 5. Expand the list of sites by fetching the keywords on multiple commercial search engines.
13 | 6. Displays __screenshots__ (powered by Splash), title, text, html, relevant terms in the text
14 | 7. Allows the user to __iteratively train__ a topic model based on these results by assigning them into defined values (Relevant/Irrelevant/Neutral), as well as re-scoring the associated keywords.
15 | 8. Allows an unbounded training module based on __user-defined categories__.
16 | 9. __Language detection__ (powered by Apache Tika) and __page-type classification__
17 | 10. Allows the user to view the trained topic model through a human-interpretable explanation of the model powered by our machine learning explanation toolkit [ELI5](https://github.com/TeamHG-Memex/eli5)
18 | 11. Performs a __broad crawl__ of thousands of sites, using __Machine Learning__ provided by [DeepDeep-crawler](https://github.com/TeamHG-Memex/hh-deep-deep), filtering the ones matching the defined domain.
19 | 12. Displays the results in an interface similar to __Pinterest__ for easy scrolling of the findings.
20 | 13. Provides __summarized__ data about the broad crawl and __exporting__ of the broad-crawl results in CSV format.
21 | 14. Provides real time information about the __progress__ of the crawlers.
22 | 15. Allows search of the Dark web via integration with an __onion index__
23 |
24 |
25 |
26 | ### Infrastructure Components
27 |
28 | When the app starts up, it will first try to connect with all of these components:
29 | - Mongo (>3.0.*) stores the data about users, workspace and metadata about the crawlings
30 | - Elasticsearch (2.0) stores the results of the crawling (screenshots, html, extracted text)
31 | - Kafka (10.1.*) handles the communication between the backend components regarding the crawlings.
32 |
33 | Custom Docker versions of these components are provided with their extra args to set up the stack correctly, in the Containers section below.
34 |
35 |
36 | ### Service Components:
37 |
38 | These components offer a suite of capabilities to Site Hound. Only the first three components are required.
39 |
40 | - [Sitehound-Frontend](https://github.com/TeamHG-Memex/sitehound-frontend): The user interface web application that handles auth, metadata and the labeled data.
41 | - [Sitehound-Backend](https://github.com/TeamHG-Memex/sitehound-backend): Performs queries on the Search engines, follows the relevant links and orchestrates the screenshots, text extraction,
42 | language identification, page-classification, naive scoring using the cosine difference of TF*IDF, and stores the results sets.
43 | - [Splash](https://github.com/scrapinghub/splash): Splash is used for screenshot and HTML capturing.
44 | - [HH-DeepDeep](https://github.com/TeamHG-Memex/hh-deep-deep): Allows the user to train a page model to perform on-topic crawls
45 | - [ExcavaTor]: Our own tor index. This is currently a private db. Ask us about it!
46 |
47 |
48 | Here is the components diagram for reference
49 | 
50 |
51 | ### Install:
52 |
53 | Check the [installation guide](installation/README.md)
54 |
55 | ### How to use it:
56 |
57 | Check the [walkthrough guide](user-docs/sitehound-walkthrough-guide_October-2017_v2.pdf)
58 |
59 | ---
60 |
61 | [](https://www.hyperiongray.com/?pk_campaign=github&pk_kwd=sitehound "Hyperion Gray")
62 |
--------------------------------------------------------------------------------
/dev-docs/API.rst:
--------------------------------------------------------------------------------
1 | =============
2 | Sitehound API
3 | =============
4 |
5 | This is the API for Kafka messages. It is used for communication between
6 | Sitehound, hh-deep-deep and hh-page-classifier.
7 |
8 | .. contents::
9 |
10 |
11 | DD Deepcrawler
12 | ==============
13 |
14 | This is for deep crawling.
15 |
16 | Input
17 | -----
18 |
19 | Topic: ``dd-deepcrawler-input``::
20 |
21 | {
22 | "id": "crawl id",
23 | "workspace_id": "workspace id",
24 | "page_limit": 10000000,
25 | "urls": [
26 | "https://example1.com",
27 | "https://example2.com",
28 | "https://example3.com"
29 | ],
30 | "login_credentials": [
31 | {
32 | "id":"75ea86a9d11ff300022f", // the id of the credentials
33 | "domain":"example1.com",
34 | "url": "http://example1.com/login", // login page as provided
35 | "key_values": {"txtUser":"user1234", "txtPassword":"12345678"} // identifiers of the fields with the value entered by the user.
36 | }
37 | ]
38 | }
39 |
40 | where:
41 |
42 | - id: (String) The id of the job,
43 | - workspace_id: (String) The id of the workspace,
44 | - urls: (List) All URLs selected for deepcrawl,
45 | - page_limit: (Integer) (optional) (defaulting to 10M items). The maximum number of pages to fetch
46 | - login_credentials: (List) (optional) Login credentials already existing in this workspace.
47 |
48 | Stop the crawl::
49 |
50 | {
51 | "id": "the same crawl id",
52 | "stop": true
53 | }
54 |
55 | Progress
56 | --------
57 |
58 | Topic: ``dd-deepcrawler-output-progress``::
59 |
60 | {
61 | "id": "some crawl id",
62 | "progress": {
63 | "status": "running",
64 | "pages_fetched": 1468,
65 | "rpm": 24000,
66 | "domains": [
67 | {
68 | "url": "http://example1.com",
69 | "domain": "example1.com",
70 | "status": "running",
71 | "pages_fetched": 1234,
72 | "rpm": 12000
73 | },
74 | {
75 | "url": "http://example2.com",
76 | "domain": "example2.com",
77 | "status": "finished",
78 | "pages_fetched": 234,
79 | "rpm": 12000
80 | },
81 | {
82 | "url": "http://example3.com",
83 | "domain": "example3.com",
84 | "status": "failed",
85 | "pages_fetched": 0,
86 | "rpm": 0
87 | },
88 | {
89 | "url": "http://example4.com",
90 | "domain": "example4.com",
91 | "status": "running",
92 | "pages_fetched": 0,
93 | "rpm": 0
94 | },
95 | ]
96 | }
97 | }
98 |
99 | Output
100 | ------
101 |
102 | Topic: ``dd-deepcrawler-output-pages``::
103 |
104 | {
105 | "id": "some crawl id",
106 | "page_samples": [
107 | {"url": "http://example.com/pag1", "domain": "example.com"},
108 | {"url": "http://example.com/pag2", "domain": "example.com"}
109 | ]
110 | }
111 |
112 |
113 | Login workflow
114 | ==============
115 |
116 | Assumptions for the first iteration:
117 |
118 | 1) The login feature will be implemented only on the deep and broad crawl results (i.e. not on the trainer, the seeds or seeds-url)
119 | 2) The login will be only on-(dd's)-demand. (i.e. the user won't be able to load some url+usr+pwd as seeds or the like)
120 |
121 | Basic Flow:
122 |
123 | 1) While DD is crawling, it will be able to identify sites that require logging in for further crawling.
124 | 2) DD will report these sites to a ``dd-login-input`` topic.
125 | 3) Sitehound-backend will listen to the queue and it will:
126 |
127 | a) take a screenshot of the page (may be useful in case of a captcha, so we don't waste time, etc.)
128 | b) store this message
129 |
130 | 4) An option will be added in Sitehound to show the users this snapshot, along with the fields to be completed,
131 | as label + inputs, where each label is one of the keys from dd-login-input
132 | 5) When the user completes one message from the step above, the data is stored (without encryption for now),
133 | and sent to DD via the ``dd-login-output`` topic.
134 | 6) DD receives this message and performs the login for that domain.
135 | 7) DD will send a notification to ``dd-login-result`` once the login has succeeded or failed.
136 |
137 | dd-login-input
138 | --------------
139 |
140 | Topic: ``dd-login-input``. New login form found::
141 |
142 | {
143 | "workspace_id":"57ea86a9d11ff300054a3519",
144 | "job_id":"57ea86a9d11ff300054a3519",
145 | "domain":"example.com",
146 | "url": "http://example.com/login", // login page
147 | "keys": ["txtUser", "txtPassword"], // identifiers of the fields required to be completed by the user, whatever it makes sense to use them back by dd
148 | "screenshot":"57ea86a9d11ff300054a351.....afazzz9" // b64 representation of the bytes of the image. (PNG format)
149 | }
150 |
151 | dd-login-output
152 | ---------------
153 |
154 | Topic: ``dd-login-output``. Credentials provided by the user and sent for crawling::
155 |
156 | {
157 | "workspace_id":"57ea86a9d11ff300054a3519",
158 | "job_id":"57ea86a9d11ff300054a",
159 | "id":"75ea86a9d11ff300022f", // the id of the credentials
160 | "domain":"example.com",
161 | "url": "http://example.com/login", // login page as provided
162 | "key_values": {"txtUser":"user1234", "txtPassword":"12345678"} // identifiers of the fields with the value entered by the user.
163 | }
164 |
165 |
166 | dd-login-result
167 | ---------------
168 |
169 | Topic: ``dd-login-result``. Credentials result after trying to log in sent from the crawling::
170 |
171 | {
172 | "id":"75ea86a9d11ff300022f", // the id of the credentials
173 | "result": "success" | "failed"
174 | }
175 |
176 |
177 |
178 | DD Modeler
179 | ==========
180 |
181 | This is for page classifier training.
182 |
183 | dd-modeler-input
184 | ----------------
185 |
186 | Topic: ``dd-modeler-input``. Training page classifier model. All workspace annotations are sent,
187 | html is fetched based on ``html_location`` field::
188 |
189 | {
190 | "workspace_id": "workspace id",
191 | "pages": [
192 | {
193 | "url": "http://example.com",
194 | "html_location": "specifies-where-to-get-html",
195 | "relevant": true
196 | },
197 | {
198 | "url": "http://example.com/1",
199 | "html_location": "specifies-where-to-get-html",
200 | "relevant": false
201 | }
202 | ]
203 | }
204 |
205 | ``html_location`` looks like this: http://hh-elasticsearch:9200/crawled-open/analyzed/https%3A%2F%2Fen.wikipedia.org%2Fwiki%2Fmock_object?_source=result.crawlResultDto.html
206 |
207 | dd-modeler-progress
208 | -------------------
209 |
210 | Topic: ``dd-modeler-progress``. Progress report when training the model::
211 |
212 | {
213 | "workspace_id": "workspace id",
214 | "percentage_done": 98.123
215 | }
216 |
217 | dd-modeler-output
218 | -----------------
219 |
220 | Topic: ``dd-modeler-output``. Result of training the model::
221 |
222 | {
223 | "workspace_id": "workspace id",
224 | "quality": "json data",
225 | "model": "b64-encoded page classifier model"
226 | }
227 |
228 | JSON data format for the ``quality`` field::
229 |
230 | {
231 | "main_score": 89.2,
232 | "n_labeled": 20,
233 | "n_positive": 10,
234 | "advice": [{"kind":"INFO", "message": "keep annotating"}, ...],
235 | "description": ["item1", "item2"],
236 | "weights": {"pos": ..., "neg": ..., "pos_remaining": 0, "neg_remaining": 0},
237 | "tooltips": {"ROC AUC": "some description"}
238 | }
239 |
240 |
241 |
242 | DD Trainer
243 | ==========
244 |
245 | This is for training deep-deep link classifier model by crawling.
246 |
247 | dd-trainer-input
248 | ----------------
249 |
250 | Topic: ``dd-trainer-input``.
251 |
252 | This message is sent by the page classifier (DD Modeller).
253 | Start the crawl::
254 |
255 | {
256 | "workspace_id": "workspace id",
257 | "page_model": "b64-encoded page classifier",
258 | "urls": ["http://example.com", "http://example.com/2"],
259 | }
260 |
261 | dd-trainer-output-*
262 | -----------------------
263 |
264 | Topic ``dd-trainer-output-pages``.
265 |
266 | Sample of crawled pages::
267 |
268 | {
269 | "workspace_id": "workspace id",
270 | "page_samples": [
271 | {"url": "http://example1.com", "domain": "example1.com", "score": 80},
272 | {"url": "http://example2.com", "domain": "example2.com", "score": 90}
273 | ]
274 | }
275 |
276 | Topic: ``dd-trainer-output-progress``.
277 |
278 | Progress update (to be displayed in the UI, probably more fields will be added)::
279 |
280 | {
281 | "workspace_id": "workspace id",
282 | "progress": "Crawled N pages and M domains, average reward is 0.122",
283 | "percentage_done": 98.123
284 | }
285 |
286 |
287 | DD Crawler
288 | ==========
289 |
290 | This is the smart crawler.
291 |
292 |
293 | dd-crawler-input
294 | ----------------
295 |
296 | Topic ``dd-crawler-input``. Start the crawl::
297 |
298 | {
299 | "id": "crawl id",
300 | "workspace_id": "workspace id",
301 | "page_model": "b64-encoded page classifier",
302 | "urls": ["http://example.com", "http://example.com/2"],
303 | "broadness": "BROAD" // Valid codes are ["N10", "N100", "N1000", "N10000", "BROAD"],
304 | "page_limit": 100
305 | }
306 |
307 | ``page_limit`` is optional (defaults to 10000000).
308 |
309 | Stop the crawl::
310 |
311 | {
312 | "id": "the same crawl id",
313 | "stop": true
314 | }
315 |
316 | dd-crawler-output-*
317 | -------------------
318 |
319 | Crawler output.
320 |
321 | Topic: ``dd-crawler-output-pages``.
322 |
323 | Sample of crawled pages::
324 |
325 | {
326 | "id": "crawl id",
327 | "workspace_id": "workspace id",
328 | "page_samples": [
329 | {"url": "http://example1.com", "domain": "example1.com", "score": 80},
330 | {"url": "http://example2.com", "domain": "example2.com", "score": 90}
331 | ]
332 | }
333 |
334 | Topic: ``dd-crawler-output-progress``.
335 |
336 | Progress update (to be displayed in the UI, probably more fields will be added)::
337 |
338 | {
339 | "id": "crawl id",
340 | "workspace_id": "workspace id",
341 | "progress": "Crawled N pages and M domains, average reward is 0.122",
342 | "percentage_done": 98.123
343 | }
344 |
--------------------------------------------------------------------------------
/img/Sitehound-architecture-diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TeamHG-Memex/sitehound/9c7c986ba7b08b04327adeed028e8a347adf3e93/img/Sitehound-architecture-diagram.png
--------------------------------------------------------------------------------
/installation/README.md:
--------------------------------------------------------------------------------
1 | INSTALLATION GUIDE
2 | ==================
3 |
4 |
5 | # Overview
6 |
7 | 1. The application is fully dockerized for ease of deployment; nevertheless, it is a large system with more than a dozen containers.
8 | 2. There are two major subsystems, Sitehound-* and deep-*. You can read more [here](https://github.com/TeamHG-Memex/sitehound)
9 | 3. Ideally these two components are deployed separately, and they will communicate via Apache Kafka.
10 | 4. For reference, there is an Architecture Diagram shown below.
11 | 5. Once deployed you will only interact with the application via your browser.
12 |
13 |
14 | # Provided Hosted version
15 |
16 | We are aware that the hardware requirements are not easy to meet so we also provide a hosted version.
17 | Send us an email to [support@hyperiongray.com](mailto:support@hyperiongray.com)
18 | and we will set you up with an account.
19 |
20 |
21 | # Recommended Hardware
22 |
23 | Since the stack of applications contains several infrastructure containers (mongo, elasticsearch, redis) and is designed to take advantage of the multicore architecture, we would recommend:
24 |
25 | For a single host:
26 |
27 | - At least 100GB of storage (if you plan to do serious crawling, 1TB is better)
28 | - At least 16GB of dedicated RAM, 32GB is better
29 | - 8 CPU cores
30 |
31 |
32 | # Cloud Deployment on Amazon's EC2
33 |
34 | 1. For a single host installation, we recommend the ``m4.2xlarge`` instance type.
35 | 2. On the security groups panel, open the inbound traffic for ports:
36 | ``5081``, ``2181`` and ``9092`` on the Sitehound host EC2.
37 |
38 |
39 | # Prerequisites
40 |
41 | 1. Ubuntu 16.04 is the recommended OS but it should play well with other distros.
42 | It won't work on Windows nor Mac though so a Virtual Machine would be needed
43 | in these cases.
44 | You can get one [here](http://www.osboxes.org/ubuntu/#ubuntu-16-04-vbox).
45 |
46 | 2. Update the system:
47 |
48 | ```
49 | sudo apt update
50 | ```
51 |
52 | 3. Docker CE or better [installed](https://docs.docker.com/engine/installation/).
53 | Docker API version should be at least 1.24.
54 | For Ubuntu 16.04 run:
55 |
56 | ```
57 | sudo apt install docker.io
58 | ```
59 |
60 | 4. docker-compose [installed](https://docs.docker.com/compose/install/)
61 | on the Deep-deep server.
62 | Version should be at least 1.10. For Ubuntu 16.04 run:
63 |
64 | ```
65 | sudo apt install -y python-pip
66 | export LC_ALL="en_US.UTF-8"
67 | export LC_CTYPE="en_US.UTF-8"
68 | sudo -H pip install docker-compose
69 | ```
70 |
71 | 5. ``$USER`` (current user) [added](https://docs.docker.com/engine/installation/linux/linux-postinstall/)
72 | to the docker group. For Ubuntu 16.04 and user ``ubuntu``, run:
73 |
74 | ```
75 | sudo usermod -aG docker ubuntu
76 | ```
77 |
78 | and re-login.
79 |
80 |
81 | # Base installation
82 |
83 | 1. From the provided .zip file, copy the folder ./sitehound-configs to the home directory of the server, or servers if you choose the dual host installation.
84 |
85 | ```
86 | scp -r sitehound-configs ubuntu@{host-ip}:~
87 | ```
88 |
89 |
90 | 2. All further actions are executed in this ``sitehound-configs`` directory:
91 |
92 | ```
93 | cd sitehound-configs
94 | ```
95 |
96 | # Deep-deep installation
97 |
98 | Download deep-deep models on the host where the deep-deep will be running:
99 |
100 | ```
101 | ./download-deep-deep-models.sh
102 | ```
103 |
104 | # Running Sitehound + Deep-deep on the same host
105 |
106 | 1. Sitehound uses external services that are not part of this suite, such as: [Splash](https://github.com/scrapinghub/splash), excavaTor (currently private) for onion searches and [Crawlera](https://scrapinghub.com/crawlera) as a proxy.
107 | In order to have the app fully running, you need to specify hosts/credentials for them on the
108 | [config file](/installation/sitehound-configs/volumes/sitehound-backend/config/properties-override/application-override.properties) by replacing the placeholder values with the actual ones
109 |
110 |
111 | 2. Make sure port ``5081`` is open.
112 | Start all services with:
113 |
114 | ```
115 | docker-compose \
116 | -f docker-compose.yml \
117 | -f docker-compose.deep-deep.yml \
118 | up -d
119 | ```
120 |
121 | # Where is the data stored?
122 |
123 | Sitehound data is kept in mongodb and elasticsearch databases, outside the containers,
124 | using mounted volumes in ``volumes/`` folder,
125 | so if you do ``docker-compose down`` or remove containers with ``docker rm``,
126 | data will be persisted.
127 |
128 | Crawl results of deep-deep would be in ``deep-deep-jobs/``,
129 | and broad crawl results will be in ``./dd-jobs/``. Crawled data in CDRv3 format
130 | will be in ``./dd-jobs/*/out/*.jl.gz``,
131 | and downloaded media items in ``./dd-jobs/*/out/media/``.
132 |
133 |
134 | # What you would be using without installing
135 |
136 | The application will be using several external services:
137 |
138 | 1. Crawlera: a proxy rotator.
139 | 2. Our clustered hosted version of [Splash](https://github.com/scrapinghub/splash).
140 | 3. An onion index.
141 |
142 |
143 | # You are ready!
144 |
145 | 1. Navigate to [http://localhost:5081](http://localhost:5081)
146 | (or your Sitehound's IP address).
147 | 2. Log in with user ``admin@hyperiongray.com`` and password ``changeme!``.
148 | 3. Create a new workspace, and click on the row to select it.
149 | 4. Follow this [walk-through](/user-docs/sitehound-walkthrough-guide_October-2017_v2.pdf) for a better insight.
150 |
151 | # Help
152 |
153 | You can reach us at [support@hyperiongray.com](mailto:support@hyperiongray.com).
154 |
--------------------------------------------------------------------------------
/installation/sitehound-configs/docker-compose.deep-deep.kafka-host.yml:
--------------------------------------------------------------------------------
1 | version: '2'
2 |
3 |
4 | services:
5 |
6 | hh-trainer:
7 | external_links: []
8 | extra_hosts:
9 | - "hh-kafka:${KAFKA_HOST}"
10 |
11 | hh-crawler:
12 | external_links: []
13 | extra_hosts:
14 | - "hh-kafka:${KAFKA_HOST}"
15 |
16 | hh-modeler:
17 | external_links: []
18 | extra_hosts:
19 | - "hh-kafka:${KAFKA_HOST}"
20 |
--------------------------------------------------------------------------------
/installation/sitehound-configs/docker-compose.deep-deep.yml:
--------------------------------------------------------------------------------
1 | version: '2.1'
2 |
3 |
4 | services:
5 |
6 | hh-trainer:
7 | image: hyperiongray/hh-deep-deep:0.4.8 # update below too!
8 | restart: always
9 | depends_on:
10 | hh-kafka:
11 | condition: service_healthy
12 | volumes:
13 | - /var/run/docker.sock:/var/run/docker.sock
14 | - ./trainer-jobs:/opt/hh-deep-deep/trainer-jobs
15 | command:
16 | - hh-deep-deep-service
17 | - trainer
18 | - --kafka-host=hh-kafka
19 | - --docker-image=hyperiongray/deep-deep-hh:0.1.3 # update below too!
20 | - --host-root=${PWD}
21 | - --proxy-container=hh-deep-deep-tor-proxy
22 |
23 | hh-crawler-trainer:
24 | image: hyperiongray/hh-deep-deep:0.4.8 # update below too!
25 | restart: always
26 | depends_on:
27 | hh-kafka:
28 | condition: service_healthy
29 | volumes:
30 | - /var/run/docker.sock:/var/run/docker.sock
31 | - ./crawler-trainer-jobs:/opt/hh-deep-deep/crawler-trainer-jobs
32 | command:
33 | - hh-deep-deep-service
34 | - crawler-trainer
35 | - --kafka-host=hh-kafka
36 | - --docker-image=hyperiongray/deep-deep-hh:0.1.3
37 | - --host-root=${PWD}
38 | - --proxy-container=hh-deep-deep-tor-proxy
39 |
40 | hh-crawler:
41 | image: hyperiongray/hh-deep-deep:0.4.8 # update below too!
42 | restart: always
43 | depends_on:
44 | hh-kafka:
45 | condition: service_healthy
46 | volumes:
47 | - /var/run/docker.sock:/var/run/docker.sock
48 | - ./crawler-jobs:/opt/hh-deep-deep/crawler-jobs
49 | command:
50 | - hh-deep-deep-service
51 | - crawler
52 | - --kafka-host=hh-kafka
53 | - --docker-image=hyperiongray/dd-crawler-hh:0.3.5 # update below too!
54 | - --max-workers=8
55 | - --host-root=${PWD}
56 | - --proxy-container=hh-deep-deep-tor-proxy
57 |
58 | hh-deepcrawler:
59 | image: hyperiongray/hh-deep-deep:0.4.8
60 | restart: always
61 | depends_on:
62 | hh-kafka:
63 | condition: service_healthy
64 | volumes:
65 | - /var/run/docker.sock:/var/run/docker.sock
66 | - ./deepcrawler-jobs:/opt/hh-deep-deep/deepcrawler-jobs
67 | command:
68 | - hh-deep-deep-service
69 | - deepcrawler
70 | - --kafka-host=hh-kafka
71 | - --docker-image=hyperiongray/dd-crawler-hh:0.3.5
72 | - --max-workers=8
73 | - --host-root=${PWD}
74 | - --proxy-container=hh-deep-deep-tor-proxy
75 |
76 | hh-modeler:
77 | image: hyperiongray/hh-page-clf:0.5.3 # update git submodule too
78 | restart: always
79 | depends_on:
80 | hh-kafka:
81 | condition: service_healthy
82 | volumes:
83 | - ./models:/models
84 | command:
85 | - hh-page-clf-service
86 | - --kafka-host=hh-kafka
87 | - --random-pages=/models/random-pages.jl.gz
88 | - --debug
89 |
90 | tor-proxy:
91 | image: hyperiongray/tor-proxy:0.1.1
92 | network_mode: bridge
93 | expose:
94 | - "9050"
95 |
96 | proxy:
97 | image: hyperiongray/privoxy:0.1.0
98 | network_mode: bridge
99 | container_name: hh-deep-deep-tor-proxy
100 | expose:
101 | - "8118"
102 | links:
103 | - "tor-proxy:torproxy"
104 |
--------------------------------------------------------------------------------
/installation/sitehound-configs/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '2.1'
2 |
3 | services:
4 |
5 | hh-kafka:
6 | image: hyperiongray/kafka-2.11-0.10.1.1:2.5
7 | expose:
8 | - "9092"
9 | - "2181"
10 | healthcheck:
11 | test: ["CMD-SHELL", "nc -z 127.0.0.1 2181 && nc -z `hostname` 9092"]
12 | interval: 5s
13 | timeout: 5s
14 | retries: 10
15 |
16 | mongodb:
17 | image: hyperiongray/mongodb:1.0
18 | expose:
19 | - "27017"
20 | volumes:
21 | - ./volumes/mongodb/data/db:/data/db
22 | healthcheck:
23 | # test: ["CMD", "mongo", "--quiet", "http://localhost:27017/test", "--eval", "quit(db.runCommand({ ping: 1 }).ok ? 0 : 2)"]
24 | test: ["CMD", "nc", "-z", "127.0.0.1", "27017"]
25 | interval: 5s
26 | timeout: 5s
27 | retries: 10
28 |
29 | hh-elasticsearch:
30 | image: elasticsearch:2.0
31 | expose:
32 | - "9200"
33 | - "9300"
34 | volumes:
35 | - ./volumes/elasticsearch/config/elasticsearch.yml:/usr/share/elasticsearch/config/elasticsearch.yml
36 | - ./volumes/elasticsearch/data:/usr/share/elasticsearch/data
37 | healthcheck:
38 | test: ["CMD", "curl", "-f", "http://localhost:9200"]
39 | interval: 5s
40 | timeout: 5s
41 | retries: 10
42 |
43 | thh-classifier:
44 | image: hyperiongray/thh-classfier:1.0
45 | restart: always
46 | expose:
47 | - "8889"
48 |
49 | sitehound:
50 | image: hyperiongray/sitehound:5.3.14
51 | restart: always
52 | ports:
53 | - "5081:5081"
54 | - "80:5081"
55 | links:
56 | - hh-elasticsearch:elasticsearch
57 | depends_on:
58 | hh-kafka:
59 | condition: service_healthy
60 | hh-elasticsearch:
61 | condition: service_healthy
62 | mongodb:
63 | condition: service_healthy
64 |
65 | hh-joogle:
66 | image: hyperiongray/sitehound-backend:5.3.7
67 | restart: always
68 | volumes:
69 | - ./volumes/sitehound-backend/config/properties-override:/root/sitehound-backend/config/properties-override/
70 | links:
71 | - hh-elasticsearch:elasticsearch
72 | depends_on:
73 | hh-kafka:
74 | condition: service_healthy
75 | hh-elasticsearch:
76 | condition: service_healthy
77 | mongodb:
78 | condition: service_healthy
79 |
--------------------------------------------------------------------------------
/installation/sitehound-configs/download-deep-deep-models.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Download the deep-deep model files into the models/ directory that sits
4 | # next to this script. Safe to re-run: wget -nc (--no-clobber) skips files
5 | # that were already downloaded instead of creating *.1 duplicates.
6 | set -euo pipefail
7 |
8 | # Resolve models/ relative to the script itself, not the caller's CWD.
9 | cd "$(dirname "$0")/models"
10 | wget -nc https://s3-us-west-2.amazonaws.com/darpa-memex/thh/random-pages.jl.gz
11 | wget -nc https://s3-us-west-2.amazonaws.com/darpa-memex/thh/lda.pkl
--------------------------------------------------------------------------------
/installation/sitehound-configs/models/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TeamHG-Memex/sitehound/9c7c986ba7b08b04327adeed028e8a347adf3e93/installation/sitehound-configs/models/.keep
--------------------------------------------------------------------------------
/installation/sitehound-configs/volumes/elasticsearch/config/elasticsearch.yml:
--------------------------------------------------------------------------------
1 | network.host: 0.0.0.0
2 |
3 | script.engine.groovy.inline.update: on
4 | script.inline: true
5 | script.indexed: true
6 |
--------------------------------------------------------------------------------
/installation/sitehound-configs/volumes/elasticsearch/data/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TeamHG-Memex/sitehound/9c7c986ba7b08b04327adeed028e8a347adf3e93/installation/sitehound-configs/volumes/elasticsearch/data/.keep
--------------------------------------------------------------------------------
/installation/sitehound-configs/volumes/mongodb/data/db/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TeamHG-Memex/sitehound/9c7c986ba7b08b04327adeed028e8a347adf3e93/installation/sitehound-configs/volumes/mongodb/data/db/.keep
--------------------------------------------------------------------------------
/installation/sitehound-configs/volumes/sitehound-backend/config/properties-override/application-override.properties:
--------------------------------------------------------------------------------
1 | http.proxyHost=your-host.com
2 | http.proxyPort=8010
3 | http.proxyUser=your-user
4 | http.proxyPassword=your-password
5 | http.proxy.enabled=1
6 |
7 | aquarium.host=your-host.com
8 | aquarium.user=your-user
9 | aquarium.password=your-password
10 | aquarium.url.path=/render.json?png=1&width=375&height=280&html=1&resource_timeout=180&url=
11 | aquarium.threads=50
12 | aquarium.tasks.concurrent.load=40
13 |
14 | excavator.scheme=https
15 | excavator.host=your-host.com
16 | excavator.port=9200
17 | excavator.index=deeptexts
18 | excavator.user=your-user
19 | excavator.password=your-password
20 |
21 |
--------------------------------------------------------------------------------
/user-docs/README.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TeamHG-Memex/sitehound/9c7c986ba7b08b04327adeed028e8a347adf3e93/user-docs/README.pdf
--------------------------------------------------------------------------------
/user-docs/eisvogel.latex:
--------------------------------------------------------------------------------
1 | %%
2 | % Copyright (c) 2017, Pascal Wagler;
3 | % Copyright (c) 2014--2017, John MacFarlane
4 | %
5 | % All rights reserved.
6 | %
7 | % Redistribution and use in source and binary forms, with or without
8 | % modification, are permitted provided that the following conditions
9 | % are met:
10 | %
11 | % - Redistributions of source code must retain the above copyright
12 | % notice, this list of conditions and the following disclaimer.
13 | %
14 | % - Redistributions in binary form must reproduce the above copyright
15 | % notice, this list of conditions and the following disclaimer in the
16 | % documentation and/or other materials provided with the distribution.
17 | %
18 | % - Neither the name of John MacFarlane nor the names of other
19 | % contributors may be used to endorse or promote products derived
20 | % from this software without specific prior written permission.
21 | %
22 | % THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 | % "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 | % LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25 | % FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26 | % COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27 | % INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
28 | % BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29 | % LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
30 | % CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 | % LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
32 | % ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 | % POSSIBILITY OF SUCH DAMAGE.
34 | %%
35 |
36 | \documentclass[$if(fontsize)$$fontsize$,$endif$$if(lang)$$babel-lang$,$endif$$if(papersize)$$papersize$paper,$endif$$for(classoption)$$classoption$$sep$,$endfor$]{scrartcl}
37 | $if(beamerarticle)$
38 | \usepackage{beamerarticle} % needs to be loaded first
39 | $endif$
40 | $if(fontfamily)$
41 | \usepackage[$for(fontfamilyoptions)$$fontfamilyoptions$$sep$,$endfor$]{$fontfamily$}
42 | $else$
43 | \usepackage{lmodern}
44 | $endif$
45 | $if(linestretch)$
46 | \usepackage{setspace}
47 | \setstretch{$linestretch$}
48 | $endif$
49 | \usepackage{amssymb,amsmath}
50 | \usepackage{ifxetex,ifluatex}
51 | \usepackage{fixltx2e} % provides \textsubscript
52 | \ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex
53 | \usepackage[$if(fontenc)$$fontenc$$else$T1$endif$]{fontenc}
54 | \usepackage[utf8]{inputenc}
55 | $if(euro)$
56 | \usepackage{eurosym}
57 | $endif$
58 | \else % if luatex or xelatex
59 | $if(mathspec)$
60 | \ifxetex
61 | \usepackage{mathspec}
62 | \else
63 | \usepackage{unicode-math}
64 | \fi
65 | $else$
66 | \usepackage{unicode-math}
67 | $endif$
68 | \defaultfontfeatures{Ligatures=TeX,Scale=MatchLowercase}
69 | $for(fontfamilies)$
70 | \newfontfamily{$fontfamilies.name$}[$fontfamilies.options$]{$fontfamilies.font$}
71 | $endfor$
72 | $if(euro)$
73 | \newcommand{\euro}{€}
74 | $endif$
75 | $if(mainfont)$
76 | \setmainfont[$for(mainfontoptions)$$mainfontoptions$$sep$,$endfor$]{$mainfont$}
77 | $endif$
78 | $if(sansfont)$
79 | \setsansfont[$for(sansfontoptions)$$sansfontoptions$$sep$,$endfor$]{$sansfont$}
80 | $endif$
81 | $if(monofont)$
82 | \setmonofont[Mapping=tex-ansi$if(monofontoptions)$,$for(monofontoptions)$$monofontoptions$$sep$,$endfor$$endif$]{$monofont$}
83 | $endif$
84 | $if(mathfont)$
85 | $if(mathspec)$
86 | \ifxetex
87 | \setmathfont(Digits,Latin,Greek)[$for(mathfontoptions)$$mathfontoptions$$sep$,$endfor$]{$mathfont$}
88 | \else
89 | \setmathfont[$for(mathfontoptions)$$mathfontoptions$$sep$,$endfor$]{$mathfont$}
90 | \fi
91 | $else$
92 | \setmathfont[$for(mathfontoptions)$$mathfontoptions$$sep$,$endfor$]{$mathfont$}
93 | $endif$
94 | $endif$
95 | $if(CJKmainfont)$
96 | \usepackage{xeCJK}
97 | \setCJKmainfont[$for(CJKoptions)$$CJKoptions$$sep$,$endfor$]{$CJKmainfont$}
98 | $endif$
99 | \fi
100 | % use upquote if available, for straight quotes in verbatim environments
101 | \IfFileExists{upquote.sty}{\usepackage{upquote}}{}
102 | % use microtype if available
103 | \IfFileExists{microtype.sty}{%
104 | \usepackage[$for(microtypeoptions)$$microtypeoptions$$sep$,$endfor$]{microtype}
105 | \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
106 | }{}
107 | $if(indent)$
108 | $else$
109 | \IfFileExists{parskip.sty}{%
110 | \usepackage{parskip}
111 | }{% else
112 | \setlength{\parindent}{0pt}
113 | \setlength{\parskip}{6pt plus 2pt minus 1pt}
114 | }
115 | $endif$
116 | \PassOptionsToPackage{hyphens}{url} % url is loaded by hyperref
117 | $if(verbatim-in-note)$
118 | \usepackage{fancyvrb}
119 | $endif$
120 | \usepackage[unicode=true]{hyperref}
121 | $if(colorlinks)$
122 | \PassOptionsToPackage{usenames,dvipsnames}{color} % color is loaded by hyperref
123 | $endif$
124 | \hypersetup{
125 | $if(title-meta)$
126 | pdftitle={$title-meta$},
127 | $endif$
128 | $if(author-meta)$
129 | pdfauthor={$author-meta$},
130 | $endif$
131 | $if(subject)$
132 | pdfsubject={$subject$},
133 | $endif$
134 | $if(keywords)$
135 | pdfkeywords={$for(keywords)$$keywords$$sep$, $endfor$},
136 | $endif$
137 | $if(tags)$
138 | pdfkeywords={$for(tags)$$tags$$sep$, $endfor$},
139 | $endif$
140 | $if(colorlinks)$
141 | colorlinks=true,
142 | linkcolor=$if(linkcolor)$$linkcolor$$else$Maroon$endif$,
143 | citecolor=$if(citecolor)$$citecolor$$else$Blue$endif$,
144 | urlcolor=$if(urlcolor)$$urlcolor$$else$Blue$endif$,
145 | $else$
146 | pdfborder={0 0 0},
147 | $endif$
148 | breaklinks=true}
149 | \urlstyle{same} % don't use monospace font for urls
150 | $if(verbatim-in-note)$
151 | \VerbatimFootnotes % allows verbatim text in footnotes
152 | $endif$
153 | $if(geometry)$
154 | \usepackage[$for(geometry)$$geometry$$sep$,$endfor$]{geometry}
155 | $endif$
156 | $if(natbib)$
157 | \usepackage{natbib}
158 | \bibliographystyle{$if(biblio-style)$$biblio-style$$else$plainnat$endif$}
159 | $endif$
160 | $if(biblatex)$
161 | \usepackage[$if(biblio-style)$style=$biblio-style$,$endif$$for(biblatexoptions)$$biblatexoptions$$sep$,$endfor$]{biblatex}
162 | $for(bibliography)$
163 | \addbibresource{$bibliography$}
164 | $endfor$
165 | $endif$
166 | $if(listings)$
167 | \usepackage{listings}
168 | \newcommand{\passthrough}[1]{#1}
169 | $endif$
170 | $if(lhs)$
171 | \lstnewenvironment{code}{\lstset{language=Haskell,basicstyle=\small\ttfamily}}{}
172 | $endif$
173 | $if(highlighting-macros)$
174 | $highlighting-macros$
175 | $endif$
176 | $if(tables)$
177 | \usepackage{longtable,booktabs}
178 | % Fix footnotes in tables (requires footnote package)
179 | \IfFileExists{footnote.sty}{\usepackage{footnote}\makesavenoteenv{longtable}}{}
180 | $endif$
181 | $if(graphics)$
182 | \usepackage{graphicx,grffile}
183 | \makeatletter
184 | \def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
185 | \def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
186 | \makeatother
187 | % Scale images if necessary, so that they will not overflow the page
188 | % margins by default, and it is still possible to overwrite the defaults
189 | % using explicit options in \includegraphics[width, height, ...]{}
190 | \setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
191 | $endif$
192 | $if(links-as-notes)$
193 | % Make links footnotes instead of hotlinks:
194 | \DeclareRobustCommand{\href}[2]{#2\footnote{\url{#1}}}
195 | $endif$
196 | $if(strikeout)$
197 | \usepackage[normalem]{ulem}
198 | % avoid problems with \sout in headers with hyperref:
199 | \pdfstringdefDisableCommands{\renewcommand{\sout}{}}
200 | $endif$
201 | \setlength{\emergencystretch}{3em} % prevent overfull lines
202 | \providecommand{\tightlist}{%
203 | \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
204 | $if(numbersections)$
205 | \setcounter{secnumdepth}{$if(secnumdepth)$$secnumdepth$$else$5$endif$}
206 | $else$
207 | \setcounter{secnumdepth}{0}
208 | $endif$
209 | $if(subparagraph)$
210 | $else$
211 | % Redefines (sub)paragraphs to behave more like sections
212 | \ifx\paragraph\undefined\else
213 | \let\oldparagraph\paragraph
214 | \renewcommand{\paragraph}[1]{\oldparagraph{#1}\mbox{}}
215 | \fi
216 | \ifx\subparagraph\undefined\else
217 | \let\oldsubparagraph\subparagraph
218 | \renewcommand{\subparagraph}[1]{\oldsubparagraph{#1}\mbox{}}
219 | \fi
220 | $endif$
221 | $if(lang)$
222 | \ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex
223 | \usepackage[shorthands=off,$for(babel-otherlangs)$$babel-otherlangs$,$endfor$main=$babel-lang$]{babel}
224 | $if(babel-newcommands)$
225 | $babel-newcommands$
226 | $endif$
227 | \else
228 | % load polyglossia as late as possible as it *could* call bidi if RTL lang (e.g. Hebrew or Arabic)
229 | \usepackage{polyglossia}
230 | \setmainlanguage[$polyglossia-lang.options$]{$polyglossia-lang.name$}
231 | $for(polyglossia-otherlangs)$
232 | \setotherlanguage[$polyglossia-otherlangs.options$]{$polyglossia-otherlangs.name$}
233 | $endfor$
234 | \fi
235 | $endif$
236 | $if(dir)$
237 | \ifxetex
238 | % load bidi as late as possible as it modifies e.g. graphicx
239 | $if(latex-dir-rtl)$
240 | \usepackage[RTLdocument]{bidi}
241 | $else$
242 | \usepackage{bidi}
243 | $endif$
244 | \fi
245 | \ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex
246 | \TeXXeTstate=1
247 | \newcommand{\RL}[1]{\beginR #1\endR}
248 | \newcommand{\LR}[1]{\beginL #1\endL}
249 | \newenvironment{RTL}{\beginR}{\endR}
250 | \newenvironment{LTR}{\beginL}{\endL}
251 | \fi
252 | $endif$
253 |
254 | % set default figure placement to htbp
255 | \makeatletter
256 | \def\fps@figure{htbp}
257 | \makeatother
258 |
259 | $for(header-includes)$
260 | $header-includes$
261 | $endfor$
262 |
263 | $if(title)$
264 | \title{$title$$if(thanks)$\thanks{$thanks$}$endif$}
265 | $endif$
266 | $if(subtitle)$
267 | \providecommand{\subtitle}[1]{}
268 | \subtitle{$subtitle$}
269 | $endif$
270 | $if(author)$
271 | \author{$for(author)$$author$$sep$ \and $endfor$}
272 | $endif$
273 | $if(institute)$
274 | \providecommand{\institute}[1]{}
275 | \institute{$for(institute)$$institute$$sep$ \and $endfor$}
276 | $endif$
277 | \date{$date$}
278 |
279 |
280 |
281 | %%
282 | %% added
283 | %%
284 |
285 | %
286 | % No language specified? take American English.
287 | %
288 | $if(lang)$$else$
289 | \ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex
290 | \usepackage[shorthands=off,$for(babel-otherlangs)$$babel-otherlangs$,$endfor$main=english]{babel}
291 | $if(babel-newcommands)$
292 | $babel-newcommands$
293 | $endif$
294 | \else
295 | % load polyglossia as late as possible as it *could* call bidi if RTL lang (e.g. Hebrew or Arabic)
296 | \usepackage{polyglossia}
297 | \setmainlanguage[]{english}
298 | $for(polyglossia-otherlangs)$
299 | \setotherlanguage[$polyglossia-otherlangs.options$]{$polyglossia-otherlangs.name$}
300 | $endfor$
301 | \fi
302 | $endif$
303 |
304 | %
305 | % colors
306 | %
307 | \usepackage[usenames, dvipsnames, svgnames, table]{xcolor}
308 |
309 | %
310 | % listing colors
311 | %
312 | \definecolor{listing-background}{rgb}{0.97,0.97,0.97}
313 | \definecolor{listing-rule}{HTML}{B3B2B3}
314 | \definecolor{listing-numbers}{HTML}{B3B2B3}
315 | \definecolor{listing-text-color}{HTML}{000000}
316 | \definecolor{listing-keyword}{HTML}{435489}
317 | \definecolor{listing-identifier}{HTML}{435489}
318 | \definecolor{listing-string}{HTML}{00999a}
319 | \definecolor{listing-comment}{HTML}{8e8e8e}
320 | \definecolor{listing-javadoc-comment}{HTML}{006CA9}
321 |
322 | %\definecolor{listing-background}{rgb}{0.97,0.97,0.97}
323 | %\definecolor{listing-rule}{HTML}{B3B2B3}
324 | %\definecolor{listing-numbers}{HTML}{B3B2B3}
325 | %\definecolor{listing-text-color}{HTML}{000000}
326 | %\definecolor{listing-keyword}{HTML}{D8006B}
327 | %\definecolor{listing-identifier}{HTML}{000000}
328 | %\definecolor{listing-string}{HTML}{006CA9}
329 | %\definecolor{listing-comment}{rgb}{0.25,0.5,0.35}
330 | %\definecolor{listing-javadoc-comment}{HTML}{006CA9}
331 |
332 | %
333 | % for the background color of the title page
334 | %
335 | $if(titlepage)$
336 | \usepackage{pagecolor}
337 | \usepackage{afterpage}
338 | $endif$
339 |
340 | %
341 | % TOC depth and
342 | % section numbering depth
343 | %
344 | \setcounter{tocdepth}{3}
345 | $if(numbersections)$
346 | \setcounter{secnumdepth}{3}
347 | $endif$
348 |
349 | %
350 | % line spacing
351 | %
352 | \usepackage{setspace}
353 | \setstretch{1.2}
354 |
355 | %
356 | % break urls
357 | %
358 | \PassOptionsToPackage{hyphens}{url}
359 |
360 | %
361 | % When using babel or polyglossia with biblatex, loading csquotes is recommended
362 | % to ensure that quoted texts are typeset according to the rules of your main language.
363 | %
364 | \usepackage{csquotes}
365 |
366 | %
367 | % paper size and
368 | % margins
369 | %
370 | \usepackage[$if(papersize)$$papersize$paper,$else$a4paper,$endif$margin=2.5cm,includehead=true,includefoot=true,centering]{geometry}
371 |
372 | %
373 | % captions
374 | %
375 | \usepackage[font={small,it}]{caption}
376 | \newcommand{\imglabel}[1]{\textbf{\textit{(#1)}}}
377 |
378 | %
379 | % Source Sans Pro as the default font family
380 | % Source Code Pro for monospace text
381 | %
382 | % 'default' option sets the default
383 | % font family to Source Sans Pro, not \sfdefault.
384 | %
385 | \usepackage[default]{sourcesanspro}
386 | \usepackage{sourcecodepro}
387 |
388 | %
389 | % heading font
390 | %
391 | \newcommand*{\heading}{\fontfamily{\sfdefault}\selectfont}
392 |
393 | %
394 | % heading color
395 | %
396 | \usepackage{relsize}
397 | \usepackage{sectsty}
398 | \definecolor{almostblack}{RGB}{40,40,40}
399 | \allsectionsfont{\sffamily\color{almostblack}}
400 |
401 | %
402 | % variables for title and author
403 | %
404 | \usepackage{titling}
405 | \title{$title$}
406 | \author{$for(author)$$author$$sep$, $endfor$}
407 |
408 | %
409 | % environment for boxes
410 | %
411 | %\usepackage{framed}
412 |
413 | %
414 | % tables
415 | %
416 | \usepackage{booktabs} % needed for midrule
417 | \usepackage{tabularx}
418 | \renewcommand{\arraystretch}{1.6} % table spacing
419 |
420 |
421 | %
422 | % remove paragraph indention
423 | %
424 | \setlength{\parindent}{0pt}
425 | \setlength{\parskip}{6pt plus 2pt minus 1pt}
426 | \setlength{\emergencystretch}{3em} % prevent overfull lines
427 |
428 | %
429 | %
430 | % Listings
431 | %
432 | %
433 |
434 | $if(listings)$
435 | \lstdefinestyle{eisvogel_listing_style}{
436 | language=java,
437 | % numbers=left,
438 | backgroundcolor=\color{listing-background},
439 | basicstyle=\color{listing-text-color}\small\ttfamily{}, % print whole listing small
440 | xleftmargin=0.8em, % 2.8 with line numbers
441 | breaklines=true,
442 | frame=single,
443 | framesep=0.6mm,
444 | rulecolor=\color{listing-rule},
445 | frameround=ffff,
446 | framexleftmargin=0.4em, % 2.4 with line numbers | 0.4 without them
447 | tabsize=4, %width of tabs
448 | numberstyle=\color{listing-numbers},
449 | aboveskip=1.0em,
450 | keywordstyle=\color{listing-keyword}\bfseries, % underlined bold black keywords
451 | classoffset=0,
452 | sensitive=true,
453 | identifierstyle=\color{listing-identifier}, % nothing happens
454 | commentstyle=\color{listing-comment}, % white comments
455 | morecomment=[s][\color{listing-javadoc-comment}]{/**}{*/},
456 | stringstyle=\color{listing-string}, % typewriter type for strings
457 | showstringspaces=false, % no special string spaces
458 | escapeinside={/*@}{@*/}, % for comments
459 | literate=
460 | {á}{{\'a}}1 {é}{{\'e}}1 {í}{{\'i}}1 {ó}{{\'o}}1 {ú}{{\'u}}1
461 | {Á}{{\'A}}1 {É}{{\'E}}1 {Í}{{\'I}}1 {Ó}{{\'O}}1 {Ú}{{\'U}}1
462 | {à}{{\`a}}1 {è}{{\'e}}1 {ì}{{\`i}}1 {ò}{{\`o}}1 {ù}{{\`u}}1
463 | {À}{{\`A}}1 {È}{{\'E}}1 {Ì}{{\`I}}1 {Ò}{{\`O}}1 {Ù}{{\`U}}1
464 | {ä}{{\"a}}1 {ë}{{\"e}}1 {ï}{{\"i}}1 {ö}{{\"o}}1 {ü}{{\"u}}1
465 | {Ä}{{\"A}}1 {Ë}{{\"E}}1 {Ï}{{\"I}}1 {Ö}{{\"O}}1 {Ü}{{\"U}}1
466 | {â}{{\^a}}1 {ê}{{\^e}}1 {î}{{\^i}}1 {ô}{{\^o}}1 {û}{{\^u}}1
467 | {Â}{{\^A}}1 {Ê}{{\^E}}1 {Î}{{\^I}}1 {Ô}{{\^O}}1 {Û}{{\^U}}1
468 | {œ}{{\oe}}1 {Œ}{{\OE}}1 {æ}{{\ae}}1 {Æ}{{\AE}}1 {ß}{{\ss}}1
469 | {ç}{{\c c}}1 {Ç}{{\c C}}1 {ø}{{\o}}1 {å}{{\r a}}1 {Å}{{\r A}}1
470 | {€}{{\EUR}}1 {£}{{\pounds}}1 {«}{{\guillemotleft}}1
471 | {»}{{\guillemotright}}1 {ñ}{{\~n}}1 {Ñ}{{\~N}}1 {¿}{{?`}}1
472 | }
473 | \lstset{style=eisvogel_listing_style}
474 |
475 | \lstdefinelanguage{XML}
476 | {
477 | morestring=[b]",
478 | moredelim=[s][\bfseries\color{listing-keyword}]{<}{\ },
479 | moredelim=[s][\bfseries\color{listing-keyword}]{}{>},
480 | moredelim=[l][\bfseries\color{listing-keyword}]{/>},
481 | moredelim=[l][\bfseries\color{listing-keyword}]{>},
482 | morecomment=[s]{}{?>},
483 | morecomment=[s]{},
484 | commentstyle=\color{listing-comment},
485 | stringstyle=\color{listing-string},
486 | identifierstyle=\color{listing-identifier}
487 | }
488 | $endif$
489 |
490 | %
491 | % header and footer
492 | %
493 | \usepackage{fancyhdr}
494 | \pagestyle{fancy}
495 | \fancyhead{}
496 | \fancyfoot{}
497 | \lhead{$title$}
498 | \chead{}
499 | \rhead{$date$}
500 | \lfoot{$for(author)$$author$$sep$, $endfor$}
501 | \cfoot{}
502 | \rfoot{\thepage}
503 | \renewcommand{\headrulewidth}{0.4pt}
504 | \renewcommand{\footrulewidth}{0.4pt}
505 |
506 | %%
507 | %% end added
508 | %%
509 |
510 | \begin{document}
511 |
512 | %%
513 | %% begin titlepage
514 | %%
515 |
516 | $if(titlepage)$
517 | \begin{titlepage}
518 | \newgeometry{left=6cm}
519 | $if(titlepage-color)$
520 | \definecolor{titlepage-color}{HTML}{$titlepage-color$}
521 | \newpagecolor{titlepage-color}\afterpage{\restorepagecolor}
522 | $endif$
523 | \newcommand{\colorRule}[3][black]{\textcolor[HTML]{#1}{\rule{#2}{#3}}}
524 | \begin{flushleft}
525 | \noindent
526 | \\[-1em]
527 | \color[HTML]{$if(titlepage-text-color)$$titlepage-text-color$$else$5F5F5F$endif$}
528 | \makebox[0pt][l]{\colorRule[$if(titlepage-rule-color)$$titlepage-rule-color$$else$435488$endif$]{1.3\textwidth}{$if(titlepage-rule-height)$$titlepage-rule-height$$else$4$endif$pt}}
529 | \par
530 | \noindent
531 |
532 | { \setstretch{1.4}
533 | \vfill
534 | \noindent {\huge \textbf{\textsf{$title$}}}
535 | $if(subtitle)$
536 | \vskip 1em
537 | {\Large \textsf{$subtitle$}}
538 | $endif$
539 | \vskip 2em
540 | \noindent
541 | {\Large \textsf{\uppercase{$for(author)$$author$$sep$, $endfor$}}
542 | \vfill
543 | }
544 |
545 | \textsf{$date$}}
546 | \end{flushleft}
547 | \end{titlepage}
548 | \restoregeometry
549 | $endif$
550 |
551 | %%
552 | %% end titlepage
553 | %%
554 |
555 | $if(abstract)$
556 | \begin{abstract}
557 | $abstract$
558 | \end{abstract}
559 | $endif$
560 |
561 | $for(include-before)$
562 | $include-before$
563 |
564 | $endfor$
565 | $if(toc)$
566 | {
567 | $if(colorlinks)$
568 | \hypersetup{linkcolor=$if(toccolor)$$toccolor$$else$black$endif$}
569 | $endif$
570 | \setcounter{tocdepth}{$toc-depth$}
571 | \tableofcontents
572 | }
573 | $endif$
574 | $if(lot)$
575 | \listoftables
576 | $endif$
577 | $if(lof)$
578 | \listoffigures
579 | $endif$
580 | $body$
581 |
582 | $if(natbib)$
583 | $if(bibliography)$
584 | $if(biblio-title)$
585 | $if(book-class)$
586 | \renewcommand\bibname{$biblio-title$}
587 | $else$
588 | \renewcommand\refname{$biblio-title$}
589 | $endif$
590 | $endif$
591 | \bibliography{$for(bibliography)$$bibliography$$sep$,$endfor$}
592 |
593 | $endif$
594 | $endif$
595 | $if(biblatex)$
596 | \printbibliography$if(biblio-title)$[title=$biblio-title$]$endif$
597 |
598 | $endif$
599 | $for(include-after)$
600 | $include-after$
601 |
602 | $endfor$
603 | \end{document}
604 |
--------------------------------------------------------------------------------
/user-docs/sitehound-walkthrough-guide.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TeamHG-Memex/sitehound/9c7c986ba7b08b04327adeed028e8a347adf3e93/user-docs/sitehound-walkthrough-guide.pdf
--------------------------------------------------------------------------------
/user-docs/sitehound-walkthrough-guide_October-2017_v2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TeamHG-Memex/sitehound/9c7c986ba7b08b04327adeed028e8a347adf3e93/user-docs/sitehound-walkthrough-guide_October-2017_v2.pdf
--------------------------------------------------------------------------------