`;
120 |
121 | // If the comment exists, we update it
122 | if (comment !== undefined) {
123 | core.info('Existing comment found.');
124 | await octokit.rest.issues.updateComment({
125 | ...context.repo,
126 | comment_id: comment.id,
127 | body: message,
128 | });
129 | core.info(`Updated comment id '${comment.id}'.`);
130 | return;
131 | }
132 |
133 | await octokit.rest.issues.createComment({
134 | ...context.repo,
135 | issue_number: prNumber,
136 | body: message,
137 | });
138 | } catch (error) {
139 | let errorMessage = 'An unexpected error happened.';
140 |
141 | if (error instanceof Error) {
142 | errorMessage = error.message;
143 | } else {
144 | // eslint-disable-next-line no-console
145 | console.log(error);
146 | }
147 |
148 | core.setFailed(errorMessage);
149 | }
150 | }
151 |
152 | export async function getCrawlerId(
153 | {
154 | client,
155 | name,
156 | override,
157 | }: {
158 | client: CrawlerApiClient;
159 | name: string;
160 | override: boolean;
161 | },
162 | config: Pick<Config, 'appId' | 'apiKey'> & {
163 | siteUrl: string;
164 | indexName: string;
165 | }
166 | ): Promise<string> {
167 | // Searching for the crawler, based on the name and application ID
168 | const crawlers = await client.getCrawlers({
169 | name,
170 | appId: config.appId,
171 | });
172 |
173 | if (crawlers.items.length > 0) {
174 | // If the crawler exists: update it
175 | const crawlerId = crawlers.items[0].id;
176 | if (override) {
177 | const configJson = getConfig(config);
178 | await client.updateConfig(crawlerId, configJson);
179 | }
180 | return crawlerId;
181 | }
182 |
183 | // If it doesn't exist yet: create it
184 | const crawler = await client.createCrawler(name, getConfig(config));
185 | return crawler.id;
186 | }
187 |
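Note: this excerpt starts mid-function, so the code that resolves `comment` and `prNumber` is not shown. A minimal sketch of how an existing bot comment is typically located with Octokit, purely hypothetical and not necessarily the author's implementation (`findExistingComment` and the `marker` convention are assumptions):

```ts
import * as github from '@actions/github';

// Hypothetical helper: find a previously posted bot comment on the PR so it
// can be updated instead of duplicated. `marker` would be some string the
// action embeds in its own comment bodies.
async function findExistingComment(
  octokit: ReturnType<typeof github.getOctokit>,
  prNumber: number,
  marker: string
): Promise<{ id: number; body?: string } | undefined> {
  const { data: comments } = await octokit.rest.issues.listComments({
    ...github.context.repo,
    issue_number: prNumber,
  });
  return comments.find((c) => c.body?.includes(marker));
}
```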
--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
1 | /* eslint-disable no-console */
2 | import * as core from '@actions/core';
3 | import * as github from '@actions/github';
4 |
5 | import { CrawlerApiClient } from './crawler-api-client';
6 | import { addComment, getCrawlerId } from './helpers';
7 |
8 | // CREDENTIALS
9 | const CRAWLER_USER_ID = core.getInput('crawler-user-id');
10 | const CRAWLER_API_KEY = core.getInput('crawler-api-key');
11 | const CRAWLER_API_BASE_URL = core.getInput('crawler-api-base-url');
12 | const GITHUB_TOKEN = core.getInput('github-token');
13 |
14 | // CRAWLER CONFIGURATION
15 | const CRAWLER_NAME = core.getInput('crawler-name');
16 | const INDEX_NAME = CRAWLER_NAME.replace(/[ /]/g, '-').replace(
17 | /[/~,[\]`&|;$*\\]/g,
18 | ''
19 | );
20 | const ALGOLIA_APP_ID = core.getInput('algolia-app-id');
21 | const ALGOLIA_API_KEY = core.getInput('algolia-api-key');
22 | const SITE_URL = core.getInput('site-url');
23 | const OVERRIDE_CONFIG = core.getInput('override-config') === 'true';
24 |
25 | async function run(): Promise<void> {
26 | const crawlerApiBaseUrl = CRAWLER_API_BASE_URL;
27 | const appId = ALGOLIA_APP_ID;
28 | const name = CRAWLER_NAME;
29 | const siteUrl = SITE_URL;
30 | const indexName = INDEX_NAME;
31 |
32 | const client = new CrawlerApiClient({
33 | crawlerApiBaseUrl,
34 | crawlerUserId: CRAWLER_USER_ID,
35 | crawlerApiKey: CRAWLER_API_KEY,
36 | });
37 | const octokit = github.getOctokit(GITHUB_TOKEN);
38 |
39 | console.log('---------CRAWLER CONFIG---------');
40 | console.log('config', JSON.stringify({ name, appId, siteUrl, indexName }));
41 |
42 | let crawlerId: string;
43 | try {
44 | crawlerId = await getCrawlerId(
45 | {
46 | client,
47 | override: OVERRIDE_CONFIG,
48 | name,
49 | },
50 | {
51 | appId,
52 | apiKey: ALGOLIA_API_KEY,
53 | indexName,
54 | siteUrl,
55 | }
56 | );
57 | } catch (err) {
58 | core.error(new Error('Can not upsert crawler'), {
59 | title: err instanceof Error ? err.message : '',
60 | });
61 | core.setFailed('Can not upsert crawler');
62 | return;
63 | }
64 |
65 | console.log(`---------- Reindexing crawler ${crawlerId} ----------`);
66 | await client.reindex(crawlerId);
67 |
68 | await addComment({ octokit, crawlerApiBaseUrl, crawlerId, appId, name });
69 | }
70 |
71 | run().catch((error) => {
72 | core.setFailed(error);
73 | });
74 |
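For reference, the two chained `replace` calls that derive `INDEX_NAME` from `CRAWLER_NAME` first turn spaces and slashes into dashes, then strip the remaining special characters. A quick illustration (the crawler name is made up):

```ts
// Hypothetical crawler name, for illustration only.
const crawlerName = 'Algolia Docs [beta]';

const indexName = crawlerName
  .replace(/[ /]/g, '-') // 'Algolia-Docs-[beta]': spaces and slashes become dashes
  .replace(/[/~,[\]`&|;$*\\]/g, ''); // 'Algolia-Docs-beta': special characters are stripped

console.log(indexName); // 'Algolia-Docs-beta'
```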
--------------------------------------------------------------------------------
/src/types/algoliaSettings.ts:
--------------------------------------------------------------------------------
1 | // Copied from the algoliasearch JS client
2 | // Explicitly copied for the Monaco editor; direct import did not work (but should)
3 | // If you find a solution, you can remove this file
4 |
5 | export interface AlgoliaSettings {
6 | /**
7 | * The complete list of attributes that will be used for searching.
8 | */
9 | searchableAttributes?: string[];
10 | /**
11 | * @deprecated Use `searchableAttributes` instead.
12 | */
13 | attributesToIndex?: string[];
14 | /**
15 | * The complete list of attributes that will be used for faceting.
16 | */
17 | attributesForFaceting?: string[];
18 | /**
19 | * List of attributes that cannot be retrieved at query time.
20 | */
21 | unretrievableAttributes?: string[];
22 | /**
23 | * Gives control over which attributes to retrieve and which not to retrieve.
24 | */
25 | attributesToRetrieve?: string[];
26 | /**
27 | * Controls the way results are sorted.
28 | */
29 | ranking?: string[];
30 | /**
31 | * Specifies the custom ranking criterion.
32 | */
33 | customRanking?: string[];
34 | /**
35 | * Creates replicas, exact copies of an index.
36 | */
37 | replicas?: string[];
38 | /**
39 | * @deprecated Use `replicas` instead.
40 | */
41 | slaves?: string[];
42 | /**
43 | * The primary parameter is automatically added to a replica's settings when the replica is created and cannot be modified.
44 | *
45 | * Cannot be set.
46 | */
47 | primary?: string;
48 | /**
49 | * Maximum number of facet values to return for each facet during a regular search.
50 | */
51 | maxValuesPerFacet?: number;
52 | /**
53 | * Controls how facet values are sorted.
54 | */
55 | sortFacetValuesBy?: 'alpha' | 'count';
56 | /**
57 | * List of attributes to highlight.
58 | */
59 | attributesToHighlight?: string[];
60 | /**
61 | * List of attributes to snippet, with an optional maximum number of words to snippet.
62 | */
63 | attributesToSnippet?: string[];
64 | /**
65 | * The HTML string to insert before the highlighted parts in all highlight and snippet results.
66 | */
67 | highlightPreTag?: string;
68 | /**
69 | * The HTML string to insert after the highlighted parts in all highlight and snippet results.
70 | */
71 | highlightPostTag?: string;
72 | /**
73 | * String used as an ellipsis indicator when a snippet is truncated.
74 | */
75 | snippetEllipsisText?: string;
76 | /**
77 | * Restrict highlighting and snippeting to items that matched the query.
78 | */
79 | restrictHighlightAndSnippetArrays?: boolean;
80 | /**
81 | * Set the number of hits per page.
82 | */
83 | hitsPerPage?: number;
84 | /**
85 | * Set the maximum number of hits accessible via pagination.
86 | */
87 | paginationLimitedTo?: number;
88 | /**
89 | * Minimum number of characters a word in the query string must contain to accept matches with 1 typo.
90 | */
91 | minWordSizefor1Typo?: number;
92 | /**
93 | * Minimum number of characters a word in the query string must contain to accept matches with 2 typos.
94 | */
95 | minWordSizefor2Typos?: number;
96 | /**
97 | * Controls whether typo tolerance is enabled and how it is applied.
98 | */
99 | typoTolerance?: boolean | string;
100 | /**
101 | * Whether to allow typos on numbers (“numeric tokens”) in the query string.
102 | */
103 | allowTyposOnNumericTokens?: boolean;
104 | /**
105 | * List of attributes on which you want to disable typo tolerance.
106 | */
107 | disableTypoToleranceOnAttributes?: string[];
108 | /**
109 | * List of words on which you want to disable typo tolerance.
110 | */
111 | disableTypoToleranceOnWords?: string[];
112 | /**
113 | * Control which separators are indexed.
114 | */
115 | separatorsToIndex?: string;
116 | /**
117 | * Treats singular, plurals, and other forms of declensions as matching terms.
118 | */
119 | ignorePlurals?: string[] | boolean;
120 | /**
121 | * Sets the languages to be used by language-specific settings and functionalities such as ignorePlurals, removeStopWords, and CJK word-detection.
122 | */
123 | queryLanguages?: string[];
124 | /**
125 | * A list of language ISO codes.
126 | */
127 | indexLanguages?: string[];
128 | /**
129 | * Whether rules should be globally enabled.
130 | */
131 | enableRules?: boolean;
132 | /**
133 | * Controls if and how query words are interpreted as prefixes.
134 | */
135 | queryType?: 'prefixAll' | 'prefixLast' | 'prefixNone';
136 | /**
137 | * Selects a strategy to remove words from the query when it doesn’t match any hits.
138 | */
139 | removeWordsIfNoResults?: 'allOptional' | 'firstWords' | 'lastWords' | 'none';
140 | /**
141 | * Enables the advanced query syntax.
142 | */
143 | advancedSyntax?: boolean;
144 | /**
145 | * AdvancedSyntaxFeatures can be exactPhrase or excludeWords.
146 | */
147 | advancedSyntaxFeatures?: Array<'exactPhrase' | 'excludeWords'>;
148 | /**
149 | * A list of words that should be considered as optional when found in the query.
150 | */
151 | optionalWords?: string[];
152 | /**
153 | * List of attributes on which you want to disable prefix matching.
154 | */
155 | disablePrefixOnAttributes?: string[];
156 | /**
157 | * List of attributes on which you want to disable the exact ranking criterion.
158 | */
159 | disableExactOnAttributes?: string[];
160 | /**
161 | * Controls how the exact ranking criterion is computed when the query contains only one word.
162 | */
163 | exactOnSingleWordQuery?: 'attribute' | 'none' | 'word';
164 | /**
165 | * List of alternatives that should be considered an exact match by the exact ranking criterion.
166 | */
167 | alternativesAsExact?: Array<
168 | 'ignorePlurals' | 'multiWordsSynonym' | 'singleWordSynonym'
169 | >;
170 | /**
171 | * Removes stop (common) words from the query before executing it.
172 | */
173 | removeStopWords?: string[] | boolean;
174 | /**
175 | * List of numeric attributes that can be used as numerical filters.
176 | */
177 | numericAttributesForFiltering?: string[];
178 | /**
179 | * Enables compression of large integer arrays.
180 | */
181 | allowCompressionOfIntegerArray?: boolean;
182 | /**
183 | * Name of the de-duplication attribute to be used with the distinct feature.
184 | */
185 | attributeForDistinct?: string;
186 | /**
187 | * Enables de-duplication or grouping of results.
188 | */
189 | distinct?: boolean | number;
190 | /**
191 | * Whether to highlight and snippet the original word that matches the synonym or the synonym itself.
192 | */
193 | replaceSynonymsInHighlight?: boolean;
194 | /**
195 | * Allows proximity to impact which searchable attribute is matched in the attribute ranking stage.
196 | */
197 | attributeCriteriaComputedByMinProximity?: boolean;
198 | /**
199 | * Precision of the proximity ranking criterion.
200 | */
201 | minProximity?: number;
202 | /**
203 | * Choose which fields the response will contain. Applies to search and browse queries.
204 | */
205 | responseFields?: string[];
206 | /**
207 | * Maximum number of facet hits to return during a search for facet values.
208 | */
209 | maxFacetHits?: number;
210 | /**
211 | * List of attributes on which to do a decomposition of camel case words.
212 | */
213 | camelCaseAttributes?: string[];
214 | /**
215 | * Specify on which attributes in your index Algolia should apply word-splitting (“decompounding”).
216 | */
217 | decompoundedAttributes?: Record<string, string[]>;
218 | /**
219 | * Characters that should not be automatically normalized by the search engine.
220 | */
221 | keepDiacriticsOnCharacters?: string;
222 | /**
223 | * Overrides Algolia's default normalization.
224 | */
225 | customNormalization?: Record<string, Record<string, string>>;
226 | /**
227 | * Custom userData that could be added to the Settings.
228 | */
229 | userData?: any;
230 | }
231 |
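Since this interface mirrors the algoliasearch settings object, here is a small sketch of how it is typically consumed. The values are placeholders and the import path assumes a file living under `src/`:

```ts
import type { AlgoliaSettings } from './types/algoliaSettings';

// Placeholder settings: any subset of the interface is valid.
const settings: AlgoliaSettings = {
  searchableAttributes: ['unordered(title)', 'unordered(text)'],
  attributesForFaceting: ['lang'],
  customRanking: ['desc(weight.pageRank)'],
  attributeForDistinct: 'url',
  distinct: true,
  hitsPerPage: 20,
};
```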
--------------------------------------------------------------------------------
/src/types/config.ts:
--------------------------------------------------------------------------------
1 | /// <reference types="cheerio" />
2 |
3 | import type { AlgoliaSettings } from './algoliaSettings';
4 | import type { FileTypes } from './fileTypes';
5 |
6 | /**
7 | * Specification of a CrawlerConfig object, i.e. the unserialized UserConfig.config field.
8 | */
9 | export interface ExternalDataSourceGoogleAnalytics {
10 | dataSourceId: string;
11 | type: 'googleanalytics';
12 | metrics: string[];
13 | startDate?: string;
14 | endDate?: string;
15 | samplingLevel?: 'DEFAULT' | 'LARGE' | 'SMALL';
16 | credentials: {
17 | type: 'service_account';
18 | client_email: string;
19 | private_key: string;
20 | viewIds?: string[];
21 | };
22 | }
23 |
24 | export interface ExternalDataSourceCSV {
25 | dataSourceId: string;
26 | type: 'csv';
27 | url: string;
28 | }
29 |
30 | export interface ExtractionHelpers {
31 | splitContentIntoRecords: (params?: {
32 | /**
33 | * A [Cheerio instance](https://cheerio.js.org/) that determines from which element(s) textual content will be extracted and turned into records.
34 | *
35 | * @default `$('body')`
36 | */
37 | $elements?: cheerio.Cheerio;
38 |
39 | /**
40 | * Attributes (and their values) to add to all resulting records.
41 | *
42 | * @default `{}`
43 | */
44 | baseRecord?: Record<string, any>;
45 |
46 | /**
47 | * Maximum number of bytes allowed per record, on the resulting Algolia index.
48 | *
49 | * @default `10000`
50 | */
51 | maxRecordBytes?: number;
52 |
53 | /**
54 | * Name of the attribute in which to store the text of each record.
55 | *
56 | * @default `'text'`
57 | */
58 | textAttributeName?: string;
59 |
60 | /**
61 | * Name of the attribute in which to store the number of each record.
62 | */
63 | orderingAttributeName?: string;
64 | }) => Array<Record<string, any>>;
65 |
66 | docsearch: (params: {
67 | selectors: {
68 | lvl0?: string;
69 | lvl1: string;
70 | lvl2?: string;
71 | lvl3?: string;
72 | lvl4?: string;
73 | lvl5?: string;
74 | lvl6?: string;
75 | content: string;
76 | };
77 |
78 | /**
79 | * Should we indexHeadings
80 | * - true = yes
81 | * - false = no
82 | * - { from, to } = from lvl to lvl only.
83 | */
84 | indexHeadings?:
85 | | false
86 | | {
87 | from: number;
88 | to: number;
89 | };
90 | }) => Array<{
91 | objectID: string;
92 | [key: string]: any;
93 | }>;
94 | }
95 |
96 | export type RecordExtractor = (params: {
97 | /** A [Cheerio instance](https://cheerio.js.org/) that contains the HTML for the crawled page. */
98 | $: cheerio.Root;
99 |
100 | /** A [Location object](https://developer.mozilla.org/en-US/docs/Web/API/Location) containing the URL and metadata for the crawled page. */
101 | url: URL;
102 |
103 | /** The fileType of the crawled page (e.g.: html, pdf, ...). */
104 | fileType: keyof typeof FileTypes;
105 |
106 | /** The number of bytes in the crawled page. */
107 | contentLength: number;
108 |
109 | /** Array of external data sources. */
110 | dataSources: { [dataSourceName: string]: { [key: string]: any } };
111 |
112 | /** A set of functions to help you extract content. */
113 | helpers: ExtractionHelpers;
114 | }) => Array<{
115 | objectID?: string;
116 | [key: string]: any;
117 | }>;
118 |
119 | export interface ExtractorCustom {
120 | type: 'custom';
121 | params: {
122 | method: RecordExtractor;
123 | };
124 | }
125 |
126 | export interface Action {
127 | /** Unique name of the action. */
128 | name?: string;
129 |
130 | indexName: string;
131 |
132 | partialUpdate?: boolean;
133 |
134 | /** How often this specific action will run.
135 | * See root level schedule for more details.
136 | */
137 | schedule?: string;
138 |
139 | /** Will determine which webpages will match for this action. This list is checked against the url of webpages using [micromatch](https://github.com/micromatch/micromatch). Negation, wildcards and more can be used. Check the full documentation. */
140 | pathsToMatch?: string[];
141 |
142 | /** Will check for the presence or absence of DOM nodes. */
143 | selectorsToMatch?: string[];
144 |
145 | /** Override if you want to index documents. Chosen file types will be converted to HTML using [Tika](https://wiki.apache.org/tika/TikaJAXRS), then treated as a normal HTML page. See the [documents guide](https://www.algolia.com/doc/tools/crawler/guides/extracting-data/how-to/index-documents/) for a list of available `fileTypes`. */
146 | fileTypesToMatch?: Array<keyof typeof FileTypes>;
147 |
148 | /** Generate an `objectID` for records that don't have one. See the [`objectID` definition](#). Setting this parameter to `false` means we'll raise an error in case an extracted record doesn't have an `objectID`. Note, this parameter is not compatible with `partialUpdate = true`. */
149 | autoGenerateObjectIDs?: boolean;
150 |
151 | /** A recordExtractor is a custom JavaScript function that lets you execute your own code and extract what you want from a page. */
152 | recordExtractor?: RecordExtractor;
153 | extractors?: ExtractorCustom[];
154 | }
155 |
156 | /**
157 | * Typed Schema used for autocompletion in the Editor of the Admin Console.
158 | * Note: please keep in sync with crawler-common/src/config/validation.
159 | */
160 | export interface Config {
161 | /** @required Application ID that specifies which of your Algolia applications you want to save your crawler extractions to. */
162 | appId: string;
163 |
164 | /**
165 | * @required Algolia API key for your targeted Algolia application. Using the Admin API key is not allowed, and it must:
166 | * - Have the following rights: `search`, `addObject`, `deleteObject`, `deleteIndex`, `settings`, `editSettings`, `listIndexes`, `browse`
167 | * - Have access to the correct set of indexes, according to the `indexPrefix` (e.g. have access to `crawler_*` if the indexPrefix is `crawler_`)
168 | *
169 | * This key will be generated for you by the Admin Console when you create a configuration, if you provide the Admin API Key. We will never store the Admin API Key.
170 | */
171 | apiKey: string;
172 |
173 | /**
174 | * @default 8 seconds
175 | *
176 | * @required Number of concurrent tasks (per second) that can run for this configuration. Higher means more crawls per second.
177 | * This number works with the following formula:
178 | * ```
179 | * MAX ( urls_added_in_the_last_second, urls_currently_being_processed ) <= rateLimit
180 | * ```
181 | * If fetching, processing, and uploading take less than a second, your crawler processes `rateLimit` URLs per second.
182 | *
183 | * However, if each page takes on average 4 seconds to be processed, your crawler processes `rateLimit / 4` pages per second.
184 | *
185 | * It's recommended to start with a low value (e.g. 2) and increase it if you need faster crawling: a high `rateLimit` can have a huge impact on bandwidth cost and server resource consumption.
186 | */
187 | rateLimit: number;
188 |
189 | /**
190 | * How often you want to execute a complete recrawl. Expressed using [Later.js' syntax](https://bunkat.github.io/later/).
191 | *
192 | * If omitted, you will need to manually launch a reindex operation in order to update the crawled records.
193 | *
194 | * Important notes:
195 | * 1. The interval between two scheduled crawls must be at least 24 hours.
196 | * 2. Times will be interpreted as UTC (GMT+0 timezone).
197 | */
198 | schedule?: string;
199 |
200 | /**
201 | * When `true`, all web pages are rendered with a chrome headless browser. You get the rendered HTML result.
202 | *
203 | * Because rendering JavaScript-based web pages is much slower than crawling regular HTML pages, you can apply this setting to a specified list of [micromatch](https://github.com/micromatch/micromatch) URL patterns. These patterns can include negations and wildcards.
204 | *
205 | * With this setting enabled, JavaScript is executed on the webpage. Because a lot of websites have infinite refreshes and updates, this Chrome headless browser is configured with a timeout (set to a few seconds).
206 | *
207 | * This can lead to inconsistent records across recrawls, depending on the browser load and the website speed.
208 | *
209 | * Make sure your crawler manages to load the data you're interested in from JavaScript-based pages fast enough.
210 | */
211 | renderJavaScript?: string[] | boolean;
212 |
213 | /** Saves a backup of your production index before it is overwritten by the index generated during a recrawl. */
214 | saveBackup?: boolean;
215 |
216 | /**
217 | * When set to `true`, this tells the Crawler to ignore rules set in the robots.txt.
218 | */
219 | ignoreRobotsTxtRules?: boolean;
220 |
221 | /**
222 | * Whether the Crawler should extract records from a page whose `robots` meta tag contains `noindex` or `none`.
223 | *
224 | * When `true`, the crawler will ignore the `noindex` directive of the [robots meta tag](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta/name#Other_metadata_names).
225 | *
226 | * Its default value is currently `true`, but it will change to `false` in the near future. If you'd like the crawler not to respect the `noindex` directive, you should set it explicitly.
227 | */
228 | ignoreNoIndex?: boolean;
229 |
230 | /**
231 | * Whether the Crawler should follow links marked as `nofollow`.
232 | *
233 | * This setting applies to both:
234 | * - links which should be ignored because the [`robots` meta tag](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta/name#Other_metadata_names) contains `nofollow`;
235 | * - links whose [rel attribute](https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/rel) contains the `nofollow` directive.
236 | *
237 | * When `true`, the crawler will consider those links as if they weren't marked to be ignored.
238 | *
239 | * The crawler might still ignore links that don't match the patterns of your configuration.
240 | *
241 | * Its default value is currently `true`, but it will change to `false` in the near future. If you'd like the crawler to never respect `nofollow` directives, you should set it explicitly.
242 | *
243 | * Note: The "To" suffix is here for consistency with `ignoreCanonicalTo`. While it only accepts a boolean for now, we plan for it to accept an array of patterns eventually. Please contact us if you need such fine-grained control.
244 | */
245 | ignoreNoFollowTo?: boolean;
246 |
247 | /**
248 | * This tells the Crawler to process a page even if there is a meta canonical URL specified.
249 | *
250 | * When set to `true`, it will ignore all canonical URLs.
251 | * When set to `string[]`, it will ignore canonical URLs that match the specified patterns.
252 | */
253 | ignoreCanonicalTo?: string[] | boolean;
254 |
255 | /**
256 | * @required if no `sitemaps`
257 | *
258 | * Your crawler uses these URLs as a starting point for its crawl.
259 | */
260 | startUrls?: string[];
261 |
262 | /**
263 | * @required if no `startUrls`
264 | *
265 | * URLs found in `sitemaps` are treated as `startUrls` for your crawler: they are used as start points for the crawl.
266 | */
267 | sitemaps?: string[];
268 |
269 | /**
270 | * URLs found in `extraUrls` are treated as `startUrls` for your crawler: they are used as start points for the crawl.
271 | *
272 | * Crawler saves URLs added through the **Add a URL** field of the Admin's Configuration tab to the `extraUrls` array.
273 | *
274 | * Internally, `extraUrls` is treated like `startUrls`. The separate parameter serves to identify which URLs were added directly to the crawler's configuration file vs. those that were added through the Admin.
275 | */
276 | extraUrls?: string[];
277 |
278 | /**
279 | * Determines the webpage patterns ignored or excluded during a crawl.
280 | *
281 | * This list is checked against the url of webpages using [micromatch](https://github.com/micromatch/micromatch). You can use negation, wildcards, and more.
282 | */
283 | exclusionPatterns?: string[];
284 |
285 | /** Filters out specified query parameters from crawled urls. Useful for avoiding duplicate crawls of the same page. */
286 | ignoreQueryParams?: string[];
287 |
288 | /** Prefix added in front of all indices defined in the crawler's configuration. */
289 | indexPrefix?: string;
290 |
291 | /**
292 | * Defines the settings for the indices that are updated by your crawler.
293 | *
294 | * Index names should be provided as keys. Their values are objects that define Algolia index settings as properties (e.g. `searchableAttributes`, `attributesForFaceting`).
295 | *
296 | * Index settings will only be applied to your Algolia index during the first run (or if the index doesn't exist when launching the reindex). Once an index has been created, settings are never re-applied: this prevents overriding any manual changes you may have made.
297 | */
298 | initialIndexSettings?: {
299 | [indexName: string]: AlgoliaSettings;
300 | };
301 |
302 | /**
303 | * Limits the number of URLs your crawler processes.
304 | *
305 | * Useful for demoing and preventing infinite link holes in the website structure.
306 | *
307 | * `maxUrls` does not guarantee consistent indexing across recrawls. Because of parallel processing, discovered URLs can be processed in different orders for different recrawls.
308 | *
309 | * This parameter is capped at a maximum of `1,000,000`.
310 | */
311 | maxUrls?: number;
312 |
313 | /**
314 | * Limits the processing of URLs to a specified depth, inclusively.
315 | *
316 | * _Maximum_: `100`.
317 | *
318 | * URLs added manually (startUrls, sitemaps...) are not checked against this limit.
319 | *
320 | * **How we calculate depth:**.
321 | *
322 | * @example
323 | * ```javascript
324 | * http://example.com => 1
325 | * http://example.com/ => 1
326 | * http://example.com/foo => 1
327 | * http://example.com/foo/ => 2
328 | * http://example.com/foo/bar => 2
329 | * http://example.com/foo/bar/ => 3
330 | * ...
331 | * ```
332 | */
333 | maxDepth?: number;
334 |
335 | /**
336 | * Defines which webpages will be visited.
337 | * It is used in combination with the `pathsToMatch` of your actions.
338 | * The Crawler will visit all links that match at least one of those paths.
339 | */
340 | discoveryPatterns?: string[];
341 |
342 | /**
343 | * Defines hostnames (as keys) that will be replaced by the values specified.
344 | * The keys are exact matches only.
345 | *
346 | * Applied to:
347 | * - All URLs found
348 | * - Canonical
349 | * - Redirection.
350 | *
351 | * Not applied to:
352 | * - props: startUrls, extraUrls, pathsToMatch, etc...
353 | * - URLs in your code.
354 | *
355 | * @example
356 | * ```javascript
357 | * hostnameAliases: {
358 | * 'algolia.com': 'dev.algolia.com'
359 | * }
360 | * ```
361 | */
362 | hostnameAliases?: Record<string, string>;
363 |
364 | pathAliases?: Record<string, Record<string, string>>;
365 |
366 | /**
367 | * Determines the function used to extract URLs from pages.
368 | *
369 | * If provided, this function is called on a crawled page. Only the URLs it returns are enqueued for further crawling. By default, all the URLs found while crawling a page are enqueued given that they comply with `pathsToMatch`, `fileTypesToMatch` and `exclusions`.
370 | *
371 | * Expected return value: `array` of `strings` (URLs).
372 | */
373 | linkExtractor?: (params: {
374 | $: cheerio.Root;
375 | url: URL;
376 | defaultExtractor: () => string[];
377 | }) => string[];
378 |
379 | /**
380 | * Modifies the behavior of all requests.
381 | *
382 | * The Cookie header will be overridden by the cookie fetched in `login`.
383 | */
384 | requestOptions?: {
385 | proxy?: string;
386 | timeout?: number;
387 | retries?: number;
388 | headers?: {
389 | 'Accept-Language'?: string;
390 | Authorization?: string;
391 | Cookie?: string;
392 | };
393 | };
394 |
395 | /**
396 | * This property can be set in order to define how the Crawler should login to the website before crawling pages.
397 | *
398 | * The Crawler will then extract the `Set-Cookie` response header from the login page and send that Cookie when crawling all pages of the website defined in the configuration.
399 | */
400 | login?: {
401 | fetchRequest?: {
402 | url: string;
403 | requestOptions?: {
404 | method?: string;
405 | headers?: {
406 | 'Content-Type'?: string;
407 | Cookie?: string;
408 | Authorization?: string;
409 | };
410 | body?: string;
411 | timeout?: number;
412 | };
413 | };
414 | browserRequest?: {
415 | url: string;
416 | username: string;
417 | password: string;
418 | };
419 | };
420 |
421 | cache?: {
422 | enabled: boolean;
423 | };
424 |
425 | /**
426 | * Defines external data sources you want to retrieve during every recrawl and make available to your extractors.
427 | *
428 | * **There are two supported data sources: Google Analytics and CSV files.**.
429 | *
430 | * Once you set up an `externalDataSource`, it is exposed to your `extractors`.
431 | * You can have a maximum of 10 sources and 11 million URLs across all sources.
432 | * You can access it through the `dataSources` object, which has the following structure:
433 | *
434 | * @example
435 | * ```javascript
436 | * {
437 | * dataSourceId1: { data1: 'val1', data2: 'val2' },
438 | * dataSourceId2: { data1: 'val1', data2: 'val2' },
439 | * }
440 | * ```
441 | */
442 | externalDataSources?: Array<
443 | ExternalDataSourceCSV | ExternalDataSourceGoogleAnalytics
444 | >;
445 |
446 | /**
447 | * Determines which web pages are translated into Algolia records and in what way.
448 | *
449 | * A single action defines:
450 | * 1. The subset of your crawler's websites it targets,
451 | * 2. The extraction process for those websites,
452 | * 3. And the index(es) to which the extracted records are pushed.
453 | *
454 | * A single web page can match multiple actions. In this case, your crawler creates a record for each matched action.
455 | */
456 | actions: Action[];
457 |
458 | /**
459 | * A configurable collection of safety checks to make sure the crawl was successful.
460 | *
461 | * This configuration describes all the checks the Crawler can perform to ensure data is correct.
462 | * For example, the number of records from one crawl to another.
463 | */
464 | safetyChecks?: {
465 | /**
466 | * Checks triggered after the Crawler is done, and before the records
467 | * are pushed to Algolia into the final index.
468 | */
469 | beforeIndexPublishing?: {
470 | /**
471 | * Defines the maximum allowed difference in record count between the new crawl and the last one, as a percentage of total records (inclusive).
472 | *
473 | * _Default_: `10`.
474 | *
475 | * _Minimum_: `0`\
476 | * _Maximum_: `100`.
477 | *
478 | * If the new number of records is less than `last number of records * (1 - maxLostRecordsPercentage / 100)`,
479 | * the process throws a `SafeReindexingError`, blocking the Crawler until manual restart.
480 | */
481 | maxLostRecordsPercentage?: number;
482 | };
483 | };
484 | }
485 |
486 | export default Config;
487 |
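To make the shape above concrete, here is a minimal configuration sketch that type-checks against `Config`. The URLs, index name and selectors are placeholders, and `helpers.docsearch` is the helper declared in `ExtractionHelpers` above:

```ts
import type { Config } from './types/config';

// Placeholder values throughout; only `appId`, `apiKey`, `rateLimit` and
// `actions` are required by the interface.
const config: Config = {
  appId: 'YOUR_APP_ID',
  apiKey: 'YOUR_CRAWLER_API_KEY',
  rateLimit: 2,
  startUrls: ['https://example.com/docs'],
  actions: [
    {
      indexName: 'example-docs',
      pathsToMatch: ['https://example.com/docs/**'],
      recordExtractor: ({ helpers }) =>
        // Extract heading/content records using the docsearch helper.
        helpers.docsearch({
          selectors: {
            lvl0: 'header h1',
            lvl1: 'article h2',
            content: 'article p, article li',
          },
        }),
    },
  ],
};

export default config;
```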
--------------------------------------------------------------------------------
/src/types/configJson.ts:
--------------------------------------------------------------------------------
1 | import type { Config, Action, ExtractorCustom } from './config';
2 | import type { Modify } from './utils';
3 |
4 | export type FunctionAsString = {
5 | __type: 'function';
6 | source: string;
7 | };
8 |
9 | export type ExtractorCustomAsString = Modify<
10 | ExtractorCustom,
11 | {
12 | params: {
13 | method: FunctionAsString;
14 | };
15 | }
16 | >;
17 |
18 | export type ActionAsString = Modify<
19 | Action,
20 | {
21 | recordExtractor?: FunctionAsString;
22 | extractors?: ExtractorCustomAsString[];
23 | }
24 | >;
25 |
26 | export type ConfigJson = Modify<
27 | Config,
28 | {
29 | linkExtractor?: FunctionAsString;
30 | actions: ActionAsString[];
31 | }
32 | >;
33 |
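These `*AsString` types describe a serialized form of the configuration in which functions are carried as `{ __type: 'function', source }` objects rather than live closures. A hedged illustration (the index name and `source` string are placeholders):

```ts
import type { ActionAsString } from './types/configJson';

// Placeholder action as it would appear in a serialized ConfigJson payload.
const serializedAction: ActionAsString = {
  indexName: 'example-docs',
  pathsToMatch: ['https://example.com/docs/**'],
  recordExtractor: {
    __type: 'function',
    source: '({ url }) => [{ objectID: url.pathname }]',
  },
};
```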
--------------------------------------------------------------------------------
/src/types/fileTypes.ts:
--------------------------------------------------------------------------------
1 | export enum FileTypes {
2 | 'html' = 'html',
3 | 'xml' = 'xml',
4 | 'pdf' = 'pdf',
5 | 'doc' = 'doc',
6 | 'xls' = 'xls',
7 | 'ppt' = 'ppt',
8 | 'odt' = 'odt',
9 | 'ods' = 'ods',
10 | 'odp' = 'odp',
11 | }
12 |
13 | export type FileType = keyof typeof FileTypes;
14 |
--------------------------------------------------------------------------------
/src/types/github.ts:
--------------------------------------------------------------------------------
1 | export interface GithubComment {
2 | id: number;
3 | body?: string;
4 | user: {
5 | login: string;
6 | } | null;
7 | }
8 |
--------------------------------------------------------------------------------
/src/types/publicApiJsonResponses.ts:
--------------------------------------------------------------------------------
1 | import type { ConfigJson } from './configJson';
2 | import type { Optional } from './utils';
3 |
4 | enum JobStatusEnum {
5 | DONE = 'DONE',
6 | SKIPPED = 'SKIPPED',
7 | FAILED = 'FAILED',
8 | PENDING = 'PENDING',
9 | }
10 |
11 | export interface UserConfigReindexSummaryGroup {
12 | reason: string;
13 | status: keyof typeof JobStatusEnum;
14 | category?: string;
15 | readable?: string;
16 | nbUrls: number;
17 | previousNbUrls?: number;
18 | }
19 |
20 | export interface GetCrawlersResponseBody {
21 | items: Array<{ id: string; name: string }>;
22 | itemsPerPage: number;
23 | page: number;
24 | total: number;
25 | }
26 |
27 | export interface CreatedCrawlerResponseBody {
28 | id: string;
29 | }
30 |
31 | export interface UpdateConfigResponseBody {
32 | rateLimit: number;
33 | startUrls: string[];
34 | }
35 |
36 | export interface CrawlerStatusResponseBody {
37 | name: string;
38 | createdAt: string;
39 | updatedAt: string;
40 | running: boolean;
41 | reindexing: boolean;
42 | blocked: boolean;
43 | blockingError?: string;
44 | blockingTaskId?: string;
45 | lastReindexStartedAt: string | null;
46 | lastReindexEndedAt: string | null;
47 | config?: ConfigJson;
48 | }
49 |
50 | export interface GetUrlStatsResponseBody {
51 | count: number;
52 | data: UserConfigReindexSummaryGroup[];
53 | }
54 |
55 | export interface TaskResponseBody {
56 | taskId: string;
57 | }
58 |
59 | export interface AlgoliaRecord {
60 | objectID: string;
61 | [key: string]: any;
62 | }
63 |
64 | export interface RecordsPerExtractor {
65 | index: number;
66 | type: 'algoliaCache' | 'custom';
67 | records: Array<Record<string, any>>;
68 | }
69 |
70 | export interface ExtractedRecord {
71 | actionName: string;
72 | indexName: string;
73 | partialUpdate: boolean;
74 | records: AlgoliaRecord[];
75 | recordsPerExtractor: RecordsPerExtractor[];
76 | }
77 |
78 | export type UrlTesterRecord = Pick<
79 | ExtractedRecord,
80 | 'indexName' | 'records' | 'recordsPerExtractor'
81 | >;
82 |
83 | export interface ExternalDataOneUrl {
84 | url: string;
85 | dataSources: { [key: string]: any };
86 | }
87 |
88 | export interface LoginResponse {
89 | statusCode: number;
90 | cookie: string | null;
91 | httpHeaders: Headers;
92 | error?: string;
93 | }
94 |
95 | export interface UrlTestResponseBody {
96 | startDate: string;
97 | endDate: string;
98 | logs: string[][];
99 | records: UrlTesterRecord[];
100 | links: string[];
101 | externalData?: ExternalDataOneUrl['dataSources'];
102 | error?: { code?: string; message: string; details?: any };
103 | loginResponse?: LoginResponse;
104 | }
105 |
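For context, `GetCrawlersResponseBody` is the shape inspected by `getCrawlerId` in helpers.ts, which reuses `items[0].id` when a crawler with the same name already exists. A placeholder value:

```ts
import type { GetCrawlersResponseBody } from './types/publicApiJsonResponses';

// Placeholder response illustrating the paginated list consumed by getCrawlerId().
const response: GetCrawlersResponseBody = {
  items: [{ id: 'abc123', name: 'my-crawler' }],
  itemsPerPage: 20,
  page: 0,
  total: 1,
};
```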
--------------------------------------------------------------------------------
/src/types/utils.ts:
--------------------------------------------------------------------------------
1 | /* eslint-disable @typescript-eslint/naming-convention */
2 |
3 | /**
4 | * Take an interface and list the keys that are optional.
5 | *
6 | * @example
7 | * interface Hello {
8 | * foo?: string;
9 | * bar?: string;
10 | * baz: string;
11 | * }
12 | *
13 | * OptionalKeys<Hello>;
14 | *
15 | * Will result in:
16 | * 'foo' | 'bar'
17 | */
18 | export type OptionalKeys<T> = {
19 | [K in keyof T]: undefined extends T[K] ? K : never;
20 | }[keyof T];
21 |
22 | /**
23 | * Take an interface and choose which properties should be optional.
24 | *
25 | * @example
26 | * interface Hello {
27 | * foo: string;
28 | * bar: string;
29 | * baz?: string;
30 | * };
31 | *
32 | * Optional<Hello, 'bar'>;
33 | *
34 | * Will result in:
35 | * {
36 | * foo: string;
37 | * bar?: string;
38 | * baz?: string;
39 | * }
40 | *
41 | */
42 | export type Optional<T, K extends keyof T> = {
43 |   [P in Exclude<keyof T, Exclude<keyof T, K>>]?: T[P];
44 | } & {
45 |   [P in Exclude<keyof T, K>]: T[P];
46 | };
47 |
48 | /**
49 | * Take an interface and replace the specified properties. (By default, TypeScript merges but does not replace.)
50 | *
51 | * @example
52 | * interface Hello {
53 | * foo: string;
54 | * bar: string;
55 | * };
56 | *
57 | * Modify<Hello, { bar: number }>;
58 | *
59 | * Will result in:
60 | * {
61 | * foo: string;
62 | * bar: number;
63 | * }
64 | */
65 | export type Modify<T, R> = Omit<T, keyof R> & R;
66 |
67 | /**
68 | * Take an interface and choose which properties should not be undefined.
69 | *
70 | * @example
71 | * interface Hello {
72 | * foo?: string;
73 | * bar?: string;
74 | * };
75 | *
76 | * RequireSome<Hello, 'bar'>;
77 | *
78 | * Will result in:
79 | * {
80 | * foo?: string;
81 | * bar: string;
82 | * }
83 | */
84 | export type RequireSome<T, K extends keyof T> = Omit<T, K> & {
85 |   [P in Exclude<keyof T, Exclude<keyof T, K>>]: Exclude<T[P], undefined>;
86 | };
87 |
88 | // stackoverflow.com/questions/49285864/is-there-a-valueof-similar-to-keyof-in-typescript
89 | /**
90 | * Get the value types of an interface.
91 | *
92 | * @example
93 | * interface Foo { foo: string }
94 | * ValueOf<Foo>
95 | * => string.
96 | */
97 | export type ValueOf<T> = T[keyof T];
98 |
99 | /**
100 | * Get the value type of an array.
101 | *
102 | * @example
103 | * const arr = [ 'foobar' ];
104 | * type ArrType = ValuesOfArray<typeof arr>;
105 | */
106 | export type ValuesOfArray<T extends readonly any[]> = T[number];
107 |
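A few of these utilities in action, using the same `Hello` shape as the doc comments. This is a type-level sketch and the import path is illustrative:

```ts
import type { Modify, OptionalKeys, ValueOf } from './utils';

interface Hello {
  foo?: string;
  bar?: string;
  baz: string;
}

// Keys whose values may be undefined.
export type A = OptionalKeys<Hello>; // 'foo' | 'bar'

// `baz` is replaced with a number instead of being merged into a union.
export type B = Modify<Hello, { baz: number }>; // { foo?: string; bar?: string; baz: number }

// Union of the value types of an interface.
export type C = ValueOf<{ foo: string }>; // string
```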
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "module": "ES2020",
4 | "esModuleInterop": true,
5 | "target": "ES2020",
6 | "baseUrl": "./",
7 | "rootDir": "./src",
8 | "outDir": "./dist",
9 | "moduleResolution": "node",
10 | "downlevelIteration": true,
11 | "incremental": false,
12 | "sourceMap": false,
13 | "removeComments": true,
14 | "allowJs": true,
15 | "noImplicitUseStrict": true,
16 | "resolveJsonModule": true,
17 | "declaration": true,
18 | "declarationMap": true,
19 | "composite": false,
20 | "preserveConstEnums": true,
21 | "noEmitOnError": false,
22 | "strict": true,
23 | "noImplicitAny": true,
24 | "strictNullChecks": true,
25 | "noImplicitThis": true,
26 | "strictFunctionTypes": true,
27 | "strictPropertyInitialization": true,
28 | "alwaysStrict": false,
29 | "noUnusedLocals": true,
30 | "noUnusedParameters": false,
31 | "noImplicitReturns": true,
32 | "noFallthroughCasesInSwitch": true,
33 | "experimentalDecorators": true,
34 | "emitDecoratorMetadata": true,
35 | "skipLibCheck": true,
36 | "types": [
37 | "node",
38 | "jest"
39 | ],
40 | },
41 | "exclude": [
42 | "node_modules",
43 | "**/*.test.ts"
44 | ],
45 | "include": [
46 | "src/**/*",
47 | "package.json"
48 | ],
49 | "typeRoots": [
50 | "node_modules/@types"
51 | ]
52 | }
53 |
--------------------------------------------------------------------------------