├── .gitignore ├── neo4j ├── secrets │ └── credentials.json ├── driver.mjs ├── gds.mjs ├── util.mjs └── community.mjs ├── util ├── time.mjs ├── commander.mjs ├── path.mjs ├── shell.mjs ├── promises.mjs ├── string.mjs └── array.mjs ├── es ├── domain.mjs ├── query.mjs ├── dump.mjs ├── update.mjs ├── pipeline.mjs ├── bulk.mjs ├── document.mjs ├── entities.mjs ├── snapshot.mjs ├── search.mjs ├── requests.mjs └── index.mjs ├── bin ├── annotationsDataQuality │ ├── requests │ │ ├── flattenedConfidenceHistogram │ │ │ ├── request.json │ │ │ └── README.md │ │ ├── flattenedConfidenceExtendedStats │ │ │ ├── request.json │ │ │ └── README.md │ │ ├── confidenceHistograms │ │ │ ├── README.md │ │ │ └── request.json │ │ ├── flattenedSimilarityScoreHistogram │ │ │ ├── request.json │ │ │ └── README.md │ │ ├── flattenedURITerms │ │ │ ├── README.md │ │ │ └── request.json │ │ ├── docsWithMissingDBpediaEntitiesField │ │ │ ├── request.json │ │ │ └── README.md │ │ ├── flattenedSimilarityScoreExtendedStats │ │ │ ├── request.json │ │ │ └── README.md │ │ ├── entitiesCountAggregations │ │ │ ├── request.json │ │ │ └── README.md │ │ ├── flattenedURITermsByConfidence │ │ │ ├── README.md │ │ │ └── request.json │ │ ├── textBodyAbstractArticleTokensHistogram │ │ │ ├── request.json │ │ │ └── README.md │ │ ├── entitiesCountAggsByConfidenceOverEntitiesCount │ │ │ ├── README.md │ │ │ ├── request.mjs │ │ │ └── request.json │ │ ├── tokenCountOverEntitiesCountAggs │ │ │ ├── README.md │ │ │ └── request.json │ │ ├── entitiesCountOverTokenCountByConfidence │ │ │ ├── README.md │ │ │ ├── request.mjs │ │ │ └── request.json │ │ ├── entitiesCountAggsByConfidence │ │ │ ├── README.md │ │ │ └── request.json │ │ └── duplicateAggregations │ │ │ ├── README.md │ │ │ └── request.json │ └── annotationsDataQuality.js ├── geo │ ├── generatePmTiles.sh │ ├── downloadBoundaries.js │ └── README.md ├── README.md ├── jsonToEsIndex.js ├── annotate.js ├── annotateEsIndex.js └── entitiesDataQuality.js ├── dbpedia ├── util.mjs ├── ontology.mjs ├── requests.mjs └── spotlight.mjs ├── terraform ├── state.mjs ├── commands.mjs └── configuration.mjs ├── conf ├── config.mjs └── mappings.mjs ├── auth └── authentication.mjs ├── sparql └── query.mjs ├── aws ├── email.mjs └── s3.mjs ├── bing └── search.mjs ├── README.md ├── LICENSE ├── wiki └── page.mjs ├── .github └── workflows │ └── tag_new_versions.yml ├── logging └── logging.mjs ├── CHANGELOG.md ├── geo └── download.js └── package.json /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | logs -------------------------------------------------------------------------------- /neo4j/secrets/credentials.json: -------------------------------------------------------------------------------- 1 | { 2 | "user": "neo4j", 3 | "password": "datavis" 4 | } -------------------------------------------------------------------------------- /util/time.mjs: -------------------------------------------------------------------------------- 1 | export const sleep = ms => { 2 | return new Promise(resolve => { 3 | setTimeout(resolve, ms); 4 | }); 5 | }; 6 | -------------------------------------------------------------------------------- /es/domain.mjs: -------------------------------------------------------------------------------- 1 | import { buildRequest, makeRequest } from '../es/requests.mjs'; 2 | 3 | 4 | export const info = async domain => { 5 | const request = buildRequest(domain, '', 'GET'); 6 | const { body: response } = await makeRequest(request); 7 | return response; 8 | 
}; 9 | -------------------------------------------------------------------------------- /util/commander.mjs: -------------------------------------------------------------------------------- 1 | import { InvalidArgumentError } from 'commander'; 2 | 3 | export const commanderParseInt = (value, _) => { 4 | const parsedValue = parseInt(value, 10); 5 | if (isNaN(parsedValue)) { 6 | throw new InvalidArgumentError('Not an integer.'); 7 | } 8 | return parsedValue; 9 | }; 10 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/flattenedConfidenceHistogram/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "dbpedia": { 5 | "nested": { 6 | "path": "dbpedia_entities" 7 | }, 8 | "aggs": { 9 | "confidence": { 10 | "terms": { 11 | "field": "dbpedia_entities.confidence" 12 | } 13 | } 14 | } 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /util/path.mjs: -------------------------------------------------------------------------------- 1 | import { promises as fs } from 'fs'; 2 | import * as path from 'path'; 3 | 4 | import { stringify } from '@svizzle/utils'; 5 | 6 | export const createPathAndWriteObject = async (path_, data) => { 7 | const directory = path.dirname(path_); 8 | await fs.mkdir(directory, { recursive: true }); 9 | await fs.writeFile(path_, stringify(data)); 10 | }; 11 | -------------------------------------------------------------------------------- /dbpedia/util.mjs: -------------------------------------------------------------------------------- 1 | export const prefixes = 2 | ` 3 | PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> 4 | PREFIX dbo: <http://dbpedia.org/ontology/> 5 | PREFIX dbr: <http://dbpedia.org/resource/> 6 | PREFIX prov: <http://www.w3.org/ns/prov#>`; 7 | 8 | export const dbr = 'http://dbpedia.org/resource/'; 9 | export const dbo = 'http://dbpedia.org/ontology/'; 10 | 11 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/flattenedConfidenceExtendedStats/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "dbpedia_entities": { 5 | "nested": { 6 | "path": "dbpedia_entities" 7 | }, 8 | "aggs": { 9 | "confidence_stats": { 10 | "extended_stats": { 11 | "field": "dbpedia_entities.confidence" 12 | } 13 | } 14 | } 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /terraform/state.mjs: -------------------------------------------------------------------------------- 1 | import { promises as fs } from 'fs'; 2 | import * as path from 'path'; 3 | 4 | export const getCurrentState = async dir => { 5 | let state; 6 | try { 7 | state = JSON.parse(await fs.readFile(path.join(dir, 'terraform.tfstate'), { encoding: 'utf-8'})); 8 | } catch { 9 | state = null; 10 | } 11 | if (!state) { 12 | return false; 13 | } 14 | return state; 15 | }; 16 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/confidenceHistograms/README.md: -------------------------------------------------------------------------------- 1 | ## Histogram of Average `confidence` 2 | 3 | Produces a histogram for the average, max and min `confidence` values calculated as 4 | part of the annotation meta-data.
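A minimal sketch of running this aggregation with the `query` helper from `es/query.mjs` (the relative import path, the use of `request.json` from disk and the `arxiv_v6` index name are assumptions based on the rest of this repo, not part of any script):

```js
// Sketch only: assumes the arxiv_v6 index and the query helper exported by es/query.mjs.
import { promises as fs } from 'fs';

import { query } from '../../../../es/query.mjs';

const request = JSON.parse(await fs.readFile('./request.json', 'utf-8'));
const response = await query(request, 'arxiv_v6');

// Each bucket pairs an average confidence value with the number of documents in it.
console.log(response.aggregations.avg_confidence_histogram.buckets);
```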
5 | 6 | Endpoint: `arxiv_v6/_search` 7 | 8 | See: 9 | 10 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-bucket-histogram-aggregation.html 11 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/flattenedSimilarityScoreHistogram/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "dbpedia_entities": { 5 | "nested": { 6 | "path": "dbpedia_entities" 7 | }, 8 | "aggs": { 9 | "similarityScore": { 10 | "histogram": { 11 | "field": "dbpedia_entities.similarityScore", 12 | "interval": 0.1 13 | } 14 | } 15 | } 16 | } 17 | } 18 | } -------------------------------------------------------------------------------- /conf/config.mjs: -------------------------------------------------------------------------------- 1 | export const arxliveCopy = 2 | 'search-datavis-arxlive-copy-n6ltva3lqh7x7xb6ucpaqfb5a4.eu-west-2.es.amazonaws.com'; 3 | export const ec2 = 'http://ec2-3-8-167-48.eu-west-2.compute.amazonaws.com'; 4 | export const spotlightEndpoint = `${ec2}:2222/rest/annotate`; 5 | export const confidenceValues = [ 6 | 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 7 | ]; 8 | export const confidenceScores = [ 9 | 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100 10 | ]; 11 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/flattenedURITerms/README.md: -------------------------------------------------------------------------------- 1 | ## Term aggregation of flattened URI values 2 | 3 | Counts the occurrences of the `URI` fields in `dbpedia_entities` and returns 4 | the top 100 most frequent values. 5 | 6 | Endpoint: `POST arxiv_v6/_search` 7 | 8 | See: 9 | 10 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-bucket-terms-aggregation.html 11 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/nested.html -------------------------------------------------------------------------------- /util/shell.mjs: -------------------------------------------------------------------------------- 1 | export const displayCommandOutput = ( 2 | error, 3 | stdout, 4 | stderr, 5 | { warnings=false } = {} 6 | ) => { 7 | if (error) { 8 | console.log(`error: ${error.message}`); 9 | return; 10 | } 11 | if (stderr) { 12 | if (stderr.toLowerCase().startsWith('warning') && !warnings) { 13 | return; 14 | } 15 | console.log(`stderr: ${stderr}`); 16 | return; 17 | } 18 | if (stdout.length > 0) { 19 | console.log(`stdout: ${stdout}`); 20 | } 21 | }; 22 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/docsWithMissingDBpediaEntitiesField/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "bool": { 4 | "must_not": { 5 | "nested": { 6 | "path": "dbpedia_entities", 7 | "query": { 8 | "exists": { 9 | "field": "dbpedia_entities" 10 | } 11 | } 12 | } 13 | } 14 | } 15 | } 16 | } -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/flattenedURITerms/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "dbpedia": { 5 | "nested": { 6 | "path": "dbpedia_entities" 7 | }, 8 | "aggs": { 9 | "URIs": { 10 | "terms": { 11 | "field": "dbpedia_entities.URI", 12 | "size": 100 13 | } 14 | 
} 15 | } 16 | } 17 | } 18 | } -------------------------------------------------------------------------------- /util/promises.mjs: -------------------------------------------------------------------------------- 1 | import { getValue, stringify } from '@svizzle/utils'; 2 | 3 | import { logger } from '../logging/logging.mjs'; 4 | 5 | const logErrors = v => { 6 | if (v.status === 'rejected') { 7 | logger.error(stringify(v)); 8 | } 9 | return v; 10 | }; 11 | 12 | const removeErrors = v => v.status !== 'rejected'; 13 | 14 | export const promisesHandler = async promises => { 15 | return (await Promise.allSettled(promises)) 16 | .map(logErrors) 17 | .filter(removeErrors) 18 | .map(getValue); 19 | }; 20 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/flattenedSimilarityScoreExtendedStats/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "dbpedia_entities": { 5 | "nested": { 6 | "path": "dbpedia_entities" 7 | }, 8 | "aggs": { 9 | "similarityScore_stats": { 10 | "extended_stats": { 11 | "field": "dbpedia_entities.similarityScore" 12 | } 13 | } 14 | } 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /util/string.mjs: -------------------------------------------------------------------------------- 1 | import * as _ from 'lamb'; 2 | import { trim } from '@svizzle/utils'; 3 | 4 | export const toLowerString = v => v.toString().toLowerCase(); 5 | 6 | // tag function to dedent template literals 7 | export const dedent = _.pipe([ 8 | _.head, // first argument is strings 9 | _.splitBy('\n'), 10 | _.mapWith(trim), 11 | _.joinWith('\n'), 12 | trim 13 | ]); 14 | 15 | export const hasOnlyLatinCharacters = str => (/^[a-zA-Z:]+$/u).test(str); 16 | export const hasNonAsciiCharacters = str => (/^[\u0000-\u007f]*$/u).test(str); 17 | -------------------------------------------------------------------------------- /auth/authentication.mjs: -------------------------------------------------------------------------------- 1 | export const authenticate = async (endpoint, email, token) => { 2 | const url = `${endpoint}?email=${email}&token=${token}` 3 | const response = await fetch(url); 4 | const result = await response.json(); 5 | return result; 6 | }; 7 | 8 | export const parseBasicAuth = header => { 9 | const base64String = header.slice(6, -1); 10 | const buff = Buffer.from(base64String, 'base64'); 11 | const utfString = buff.toString('utf-8'); 12 | const [ email, token ] = utfString.split(':'); 13 | return { email, token }; 14 | }; 15 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/entitiesCountAggregations/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "entities_count_extended_stats": { 5 | "extended_stats": { 6 | "field": "dbpedia_entities_metadata.entities_count" 7 | } 8 | }, 9 | "entities_count_histogram": { 10 | "histogram": { 11 | "field": "dbpedia_entities_metadata.entities_count", 12 | "interval": 1, 13 | "min_doc_count": 1 14 | } 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /sparql/query.mjs: -------------------------------------------------------------------------------- 1 | import { fetch } from 'undici'; 2 | 3 | 4 | export const query = async( 5 | sparql, 6 | 
endpoint='https://dbpedia.org/sparql', 7 | { responseFormat='application/json' } = {} 8 | ) => { 9 | const headers = { 10 | Accept: responseFormat, 11 | 'Content-Type': 'application/sparql-query' 12 | }; 13 | const response = await fetch(endpoint, 14 | { 15 | method: 'POST', 16 | body: sparql, 17 | headers 18 | } 19 | ); 20 | if (responseFormat === 'application/json') { 21 | return response.json(); 22 | } 23 | return response; 24 | }; 25 | 26 | 27 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/flattenedURITermsByConfidence/README.md: -------------------------------------------------------------------------------- 1 | ## Term aggregation of flattened URI values 2 | 3 | Counts the occurrences of the `URI` fields in `dbpedia_entities` at different 4 | confidence levels and returns the top 100 most frequent values. 5 | 6 | Endpoint: `POST arxiv_v6/_search` 7 | 8 | See: 9 | 10 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/query-filter-context.html 11 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-bucket-terms-aggregation.html 12 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/nested.html -------------------------------------------------------------------------------- /bin/geo/generatePmTiles.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | if [ "$#" -ne 1 ]; then 4 | echo "Please supply path to the geojson files" 5 | exit 1 6 | fi 7 | 8 | if [ ! -d $1 ]; then 9 | echo "Plase supply a valid path" 10 | exit 1 11 | fi 12 | 13 | tiles="" 14 | files="" 15 | 16 | for file in `ls $1/*.geojson`; do 17 | files="$file $files" 18 | file=`basename -- $file` 19 | file="${file%.*}" 20 | tiles="$file"_"$tiles" 21 | done 22 | 23 | tiles=${tiles%?} 24 | mbtiles="$tiles.mbtiles" 25 | pmtiles="$tiles.pmtiles" 26 | 27 | tippecanoe -o $mbtiles -zg $files 28 | pmtiles convert $mbtiles $pmtiles -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/textBodyAbstractArticleTokensHistogram/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "textBody_abstract_article_token_count_histogram": { 5 | "histogram": { 6 | "field": "textBody_abstract_article.token_count", 7 | "interval": 10, 8 | "min_doc_count": 1 9 | } 10 | }, 11 | "textBody_abstract_article_token_count_extended_stats": { 12 | "extended_stats": { 13 | "field": "textBody_abstract_article.token_count" 14 | } 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/flattenedSimilarityScoreHistogram/README.md: -------------------------------------------------------------------------------- 1 | ## Histogram of flattened `similarityScore` values 2 | 3 | Aggregates all `similarityScore` values into a histogram, each bucket having an 4 | interval of 0.1. Flattened here denotes the fact that all annotated entities are 5 | treated as a flat list - no per document analysis is performed. 
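For orientation, a sketch of reshaping the nested response into `[similarityScore, docCount]` pairs; `response` is assumed to be the body returned by posting `request.json` to the endpoint below:

```js
// Sketch: flatten the nested aggregation response into [score, count] pairs;
// the aggregation names follow those used in request.json.
import * as _ from 'lamb';

const toPairs = response => _.map(
	response.aggregations.dbpedia_entities.similarityScore.buckets,
	bucket => [bucket.key, bucket.doc_count]
);
```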
6 | 7 | Endpoint: `POST arxiv_v6/_search` 8 | 9 | See: 10 | 11 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-metrics-extendedstats-aggregation.html 12 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/nested.html 13 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/flattenedConfidenceExtendedStats/README.md: -------------------------------------------------------------------------------- 1 | ## Flattened `confidence` Extended Stats 2 | 3 | Produces a number of different statistical measures such as average, STD, min, 4 | max, etc. for`confidence`values. Flattened here denotes the fact 5 | that all annotated entities are treated as a flat list - no per document 6 | analysis is performed. 7 | 8 | Endpoint: `POST arxiv_v6/_search` 9 | 10 | See: 11 | 12 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-metrics-extendedstats-aggregation.html 13 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/nested.html 14 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/entitiesCountAggregations/README.md: -------------------------------------------------------------------------------- 1 | ## Entities Count Aggregations 2 | 3 | Aggregations for the `entities_count` field, which is a simple count of the 4 | total number of entities found for that document. 5 | 6 | The aggregations use both `extended_stats` and `histogram`s for the 7 | `entities_count` metadata field. 8 | 9 | Endpoint: `POST arxiv_v6/_search` 10 | 11 | See: 12 | 13 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-bucket-histogram-aggregation.html 14 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-metrics-extendedstats-aggregation.html -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/flattenedSimilarityScoreExtendedStats/README.md: -------------------------------------------------------------------------------- 1 | ## Flattened `similarityScore` Extended Stats 2 | 3 | Produces a number of different statistical measures such as average, STD, min, 4 | max, etc. for `similarityScore` fields. Flattened here denotes the fact 5 | that all annotated entities are treated as a flat list - no per document 6 | analysis is performed. 7 | 8 | Endpoint: `POST arxiv_v6/_search` 9 | 10 | See: 11 | 12 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-metrics-extendedstats-aggregation.html 13 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/nested.html 14 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/entitiesCountAggsByConfidenceOverEntitiesCount/README.md: -------------------------------------------------------------------------------- 1 | ## Entities Count by Confidence over Entities Count 2 | 3 | Counts number of entities found at different confidence levels, then normalises 4 | that count using the total count of entities found at all confidence levels. 5 | 6 | Uses `extended_stats` and `histogram` aggs. 
7 | 8 | Endpoint: `POST arxiv_v6/_search` 9 | 10 | See: 11 | 12 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-bucket-histogram-aggregation.html 13 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-metrics-extendedstats-aggregation.html -------------------------------------------------------------------------------- /es/query.mjs: -------------------------------------------------------------------------------- 1 | import { stringify } from '@svizzle/utils'; 2 | 3 | import { arxliveCopy } from '../conf/config.mjs'; 4 | import { buildRequest, makeRequest } from '../es/requests.mjs'; 5 | 6 | export const query = async (query_, index, domain=arxliveCopy) => { 7 | const path = `${index}/_search`; 8 | const payload = query_; 9 | const request = buildRequest( 10 | domain, 11 | path, 12 | 'POST', 13 | { payload } 14 | ); 15 | const { body: response, code } = await makeRequest(request, { verbose: true }); 16 | 17 | if (code !== 200) { 18 | throw new Error(`Query failed with response ${stringify(response)}`); 19 | } 20 | 21 | return response; 22 | }; 23 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/confidenceHistograms/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "avg_confidence_histogram": { 5 | "histogram": { 6 | "field": "dbpedia_entities_metadata.confidence_avg", 7 | "interval": 1, 8 | "min_doc_count": 1 9 | } 10 | }, 11 | "max_confidence_histogram": { 12 | "histogram": { 13 | "field": "dbpedia_entities_metadata.confidence_max", 14 | "interval": 10, 15 | "min_doc_count": 1 16 | } 17 | }, 18 | "min_confidence_histogram": { 19 | "histogram": { 20 | "field": "dbpedia_entities_metadata.confidence_min", 21 | "interval": 10, 22 | "min_doc_count": 1 23 | } 24 | } 25 | } 26 | } -------------------------------------------------------------------------------- /neo4j/driver.mjs: -------------------------------------------------------------------------------- 1 | 2 | import { promises as fs } from 'fs'; 3 | import { fileURLToPath } from 'url'; 4 | import { dirname } from 'path'; 5 | 6 | import * as neo4j from '../neo4j-driver'; 7 | 8 | const __filename = fileURLToPath(import.meta.url); 9 | const __dirname = dirname(__filename); 10 | 11 | export const getSession = async () => { 12 | const credentials = `${__dirname}/secrets/credentials.json`; 13 | const { user, password } = JSON.parse(await fs.readFile(credentials)); 14 | const uri = 'bolt://3.8.167.48:7687'; 15 | const driver = neo4j.driver(uri, neo4j.auth.basic(user, password)); 16 | 17 | const session = driver.session(); 18 | return [session, driver]; 19 | }; 20 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/tokenCountOverEntitiesCountAggs/README.md: -------------------------------------------------------------------------------- 1 | ## `token_count` normalised by `entities_count` 2 | 3 | Provides descriptive statistics on the number of tokens found in the annotated 4 | field, divided by the total number of `dbpedia_entities` produced when annotated. 5 | 6 | The aggregations use both `extended_stats` and `histogram`s for the 7 | `entities_count` metadata field. 
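The painless scripts in `request.json` boil down to the following (a JavaScript paraphrase for readability, not code used anywhere in the repo):

```js
// Paraphrase of the painless script: documents with no entities_count map to 0,
// otherwise the token count is divided by the number of entities found.
const tokenCountOverEntitiesCount = ({ token_count, entities_count }) =>
	entities_count === undefined ? 0 : token_count / entities_count;
```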
8 | 9 | Endpoint: `POST arxiv_v6/_search` 10 | 11 | See: 12 | 13 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-bucket-histogram-aggregation.html 14 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-metrics-extendedstats-aggregation.html -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/entitiesCountOverTokenCountByConfidence/README.md: -------------------------------------------------------------------------------- 1 | ## Entities Count by Confidence over `token_count` 2 | 3 | Counts the number of entities found at different confidence levels, then normalises 4 | that count using the `token_count`, which is a count of the number of tokens 5 | for the field that was used as input for the annotation process. 6 | 7 | Uses `extended_stats` and `histogram` aggs. 8 | 9 | Endpoint: `POST arxiv_v6/_search` 10 | 11 | See: 12 | 13 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-bucket-histogram-aggregation.html 14 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-metrics-extendedstats-aggregation.html -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/entitiesCountAggsByConfidence/README.md: -------------------------------------------------------------------------------- 1 | ## `entities_count` filtered by confidence 2 | 3 | These aggregations provide statistics for the 4 | `dbpedia_entities_metadata.confidence_counts.X` fields. Specifically, these 5 | fields count the number of entities found at the varying confidence 6 | levels. The request is a multi-aggregation request which provides `extended_stats` and 7 | `histograms` for all 11 possible confidence levels. 8 | 9 | Endpoint: `POST arxiv_v6/_search` 10 | 11 | See: 12 | 13 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-bucket-histogram-aggregation.html 14 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-metrics-extendedstats-aggregation.html -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/textBodyAbstractArticleTokensHistogram/README.md: -------------------------------------------------------------------------------- 1 | ## Histogram for `textBody_abstract_article` tokens 2 | 3 | Aggregates all `textBody_abstract_article` values into a histogram, each bucket 4 | having an interval of 10 tokens. Tokens are generated upon indexing using 5 | ElasticSearch's standard tokenizer. Also performs an `extended_stats` 6 | aggregation for the `token_count` field.
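The `token_count` sub-field referenced here is assumed to be defined in the index mapping roughly as below; the actual mapping for `textBody_abstract_article` is not part of this repository, so treat this purely as an illustration:

```js
// Assumed shape of the mapping that yields textBody_abstract_article.token_count;
// the real index mapping is not included in this repo.
const assumedTextBodyMapping = {
	type: 'text',
	fields: {
		token_count: {
			type: 'token_count',
			analyzer: 'standard'
		}
	}
};
```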
7 | 8 | See: 9 | 10 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-metrics-extendedstats-aggregation.html 11 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/analysis-standard-tokenizer.html 12 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-bucket-histogram-aggregation.html -------------------------------------------------------------------------------- /dbpedia/ontology.mjs: -------------------------------------------------------------------------------- 1 | import { promises as fs } from 'fs'; 2 | 3 | import { stringify } from '@svizzle/utils'; 4 | import * as _ from 'lamb'; 5 | 6 | import { dbo } from '../dbpedia/util.mjs'; 7 | 8 | const FILE_ONTOLOGY_JSON = 'data/dbpedia/ontology.json'; 9 | 10 | export const loadOntology = async (depth, { squash=false, fullURI=true }={}) => { 11 | const data = await fs.readFile(FILE_ONTOLOGY_JSON, { encoding: 'utf-8'}); 12 | const changedURIs = fullURI 13 | ? JSON.parse(data) 14 | : JSON.parse(data.replaceAll(dbo, '')); 15 | 16 | const selectAtDepth = _.pickIf(value => _.getIn(value, 'depth') <= depth); 17 | const ontology = squash 18 | ? _.values(_.mapValues(selectAtDepth(changedURIs), _.getKey('class_'))) 19 | : selectAtDepth(changedURIs); 20 | 21 | return ontology; 22 | }; 23 | -------------------------------------------------------------------------------- /aws/email.mjs: -------------------------------------------------------------------------------- 1 | import { defaultProvider } from '@aws-sdk/credential-provider-node'; 2 | import { SESClient, SendEmailCommand } from '@aws-sdk/client-ses'; 3 | 4 | const client = new SESClient({ 5 | credentials: defaultProvider(), 6 | region: 'eu-west-2', 7 | }); 8 | 9 | export const sendEmail = async (email, source, message, subject) => { 10 | const input = { 11 | Source: source, 12 | Destination: { 13 | ToAddresses: [email], 14 | }, 15 | Message: { 16 | Body: { 17 | Html: { 18 | Charset: 'UTF-8', 19 | Data: message, 20 | }, 21 | }, 22 | Subject: { 23 | Charset: 'UTF-8', 24 | Data: subject, 25 | }, 26 | }, 27 | }; 28 | const command = new SendEmailCommand(input); 29 | const response = await client.send(command); 30 | return response; 31 | }; 32 | -------------------------------------------------------------------------------- /bing/search.mjs: -------------------------------------------------------------------------------- 1 | import { promises as fs } from 'fs'; 2 | 3 | import { stringify } from '@svizzle/utils'; 4 | import { fetch } from 'undici'; 5 | 6 | const SUBSCRIPTION_KEY = process.env.AZURE_SUBSCRIPTION_KEY; 7 | if (!SUBSCRIPTION_KEY) { 8 | throw new Error('AZURE_SUBSCRIPTION_KEY is not set.'); 9 | } 10 | 11 | export const search = async (query, { mkt='en-GB' } = {}) => { 12 | const host = 'https://api.bing.microsoft.com'; 13 | const path = `v7.0/search?q=${encodeURIComponent(query)}&mkt=${mkt}`; 14 | const headers = { 'Ocp-Apim-Subscription-Key': SUBSCRIPTION_KEY }; 15 | const response = await fetch(`${host}/${path}`, { headers }); 16 | if (response.status !== 200) { 17 | throw new Error(`Bing search failed.\nResponse:\n${stringify(response)}`); 18 | } 19 | const data = await response.json(); 20 | return data; 21 | }; 22 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/docsWithMissingDBpediaEntitiesField/README.md: -------------------------------------------------------------------------------- 1 | ## Documents missing the `dbpedia_entities` 
Field 2 | 3 | This query uses a combination of the nested, bool, must_not and exists API 4 | parameters to determine which documents are missing the `dbpedia_entities` 5 | field. 6 | 7 | Endpoint: `POST arxiv_v6/_count` 8 | 9 | See: 10 | 11 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/nested.html 12 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/query-dsl-exists-query.html 13 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/query-dsl-bool-query.html 14 | 15 | ### Notes 16 | 17 | We are using the query API because we can't use the `missing` Aggregation API as 18 | it does not support `nested` type fields. 19 | 20 | See: 21 | 22 | - https://github.com/elastic/elasticsearch/issues/9571 -------------------------------------------------------------------------------- /terraform/commands.mjs: -------------------------------------------------------------------------------- 1 | import { promisify } from 'node:util'; 2 | import { exec } from 'child_process'; 3 | 4 | import { displayCommandOutput } from '../util/shell.mjs'; 5 | 6 | const execAwait = promisify(exec); 7 | 8 | export const init = async dir => { 9 | const initCommand = 10 | `terraform -chdir=${dir} init`; 11 | console.log("[+] Terraform - Initialising..."); 12 | await execAwait(initCommand); 13 | }; 14 | 15 | export const apply = async dir => { 16 | const applyCommand = 17 | `terraform -chdir=${dir} apply -auto-approve`; 18 | console.log("[+] Terraform - Applying..."); 19 | await execAwait(applyCommand); 20 | }; 21 | 22 | 23 | export const destroy = async dir => { 24 | const destroyCommand = 25 | `terraform -chdir=${dir} destroy -auto-approve`; 26 | console.log("[+] Terraform - Destroying..."); 27 | await execAwait(destroyCommand); 28 | }; 29 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/tokenCountOverEntitiesCountAggs/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "token_count_over_entity_count_extended_stats": { 5 | "extended_stats": { 6 | "script": "if (doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['textBody_abstract_article.token_count'].value) / doc['dbpedia_entities_metadata.entities_count'].value;" 7 | } 8 | }, 9 | "token_count_over_entity_count_histogram": { 10 | "histogram": { 11 | "script": "if (doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['textBody_abstract_article.token_count'].value) / doc['dbpedia_entities_metadata.entities_count'].value;", 12 | "interval": 0.1, 13 | "min_doc_count": 1 14 | } 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dap_dv_backends_utils 2 | 3 | Utility library and utility scripts for the DAP data visualisation team's 4 | backend services and general code. 
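Once installed (see below), individual modules can be imported by file path; for example (a sketch, relying on the package not restricting subpath imports):

```js
// Sketch: import one of the utilities directly by its file path.
import { batch } from 'dap_dv_backends_utils/util/array.mjs';

console.log(batch([1, 2, 3, 4, 5], 2)); // [[1, 2], [3, 4], [5]]
```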
5 | 6 | ## Installing the latest version 7 | 8 | To install the latest version: 9 | 10 | `npm install nestauk/dap_dv_backends_utils#release` 11 | 12 | ## Installing a specific version 13 | 14 | To install a specific version (check the tagged version on the release branch): 15 | 16 | `npm install nestauk/dap_dv_backends_utils#<version>` 17 | 18 | e.g.: 19 | 20 | `npm install nestauk/dap_dv_backends_utils#v0.0.2` 21 | 22 | ## Installing a specific branch 23 | 24 | To install a specific branch (useful for development): 25 | 26 | `npm install nestauk/dap_dv_backends_utils#my_branch` 27 | 28 | ## Updating the installation 29 | 30 | After installing from a branch (`release` or any other), the content of that branch can change, so if we need to update the installation with newer commits in that branch, we can use: 31 | 32 | `npm update` 33 | -------------------------------------------------------------------------------- /bin/README.md: -------------------------------------------------------------------------------- 1 | ## `annotate` 2 | 3 | Running this script requires that you have data hosted on an ElasticSearch 4 | domain and that you have a running Spotlight API endpoint. 5 | 6 | ## `annotationsDataQuality` 7 | 8 | This script will provide a number of aggregations relating to the data quality 9 | of the results provided by the annotation process. The output directory will 10 | have names relating to the kind of aggregation that was run. For further details 11 | on the kinds of aggregations and what they do, refer to the README.md files 12 | in each aggregation requests directory in `bin/annotationsDataQuality/requests/`. 13 | 14 | ## `entitiesDataQuality` 15 | 16 | This script will provide data quality for the actual DBpedia entities produced 17 | by the `annotate` script. It collects the set of all DBpedia URIs and uses 18 | a number of SPARQL queries to determine the quality of data provided by DBpedia, 19 | such as "how many entities have images?" and "of those images, what file type 20 | are they?" etc. The aggregations produced have self descriptive names.
21 | -------------------------------------------------------------------------------- /neo4j/gds.mjs: -------------------------------------------------------------------------------- 1 | import * as _ from 'lamb'; 2 | 3 | import { getSession } from '../neo4j/driver.mjs'; 4 | 5 | import { promisify } from '../neo4j/util.mjs'; 6 | 7 | export const project = async(graphName, threshold) => { 8 | const [session, driver] = await getSession(); 9 | const command = ` 10 | CALL gds.graph.project.cypher( 11 | '${graphName}', 12 | 'MATCH (n:Entity)-[r:APPEARS_IN_ABSTRACT]-(m:Entity) WHERE r.confidence >= ${threshold} RETURN id(n) AS id', 13 | 'MATCH (n:Entity)-[r:APPEARS_IN_ABSTRACT]-(m:Entity) WHERE r.confidence >= ${threshold} RETURN id(n) AS source, id(m) AS target') 14 | YIELD 15 | graphName AS graph, nodeQuery, nodeCount AS nodes, relationshipQuery, relationshipCount AS rels 16 | `; 17 | const result = session.run(command); 18 | return promisify(result, session, driver); 19 | }; 20 | 21 | export const drop = async graphName => { 22 | const [session, driver] = await getSession(); 23 | const command = `CALL gds.graph.drop('${graphName}')`; 24 | const result = session.run(command); 25 | return promisify(result, session, driver); 26 | }; 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Nesta 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /wiki/page.mjs: -------------------------------------------------------------------------------- 1 | import { fetch } from 'undici'; 2 | 3 | 4 | // eslint-disable-next-line no-process-env 5 | const ACCESS_TOKEN = process.env.WIKIMEDIA_ACCESS_TOKEN; 6 | if (!ACCESS_TOKEN) { 7 | throw new Error('WIKIMEDIA_ACCESS_TOKEN is not set.'); 8 | } 9 | 10 | const domain = 'https://api.wikimedia.org/core/v1/wikipedia'; 11 | 12 | /** 13 | * 14 | * @param {string} title - Title of the Wikipedia page to fetch 15 | * @param {Object} options 16 | * @param {string} [options.language='en] - Language of source Wikipedia page 17 | * @param {boolean} [options.bare=true] - Whether to fetch just the page's metadata or the entire contents of the page. 
18 | * @returns {Object} - response object 19 | */ 20 | export const getPage = async (title, { language='en', bare=true } = {}) => { 21 | const path = `${language}/page/${encodeURIComponent(title)}${bare ? '/bare' : ''}`; 22 | const url = `${domain}/${path}`; 23 | const response = await fetch(url, { 24 | headers: { 25 | 'Authorization': `Bearer ${ACCESS_TOKEN}`, 26 | 'Api-User-Agent': 'ai_map' 27 | } 28 | }); 29 | 30 | return { 31 | code: response.status, 32 | body: await response.json() 33 | }; 34 | }; 35 | -------------------------------------------------------------------------------- /.github/workflows/tag_new_versions.yml: -------------------------------------------------------------------------------- 1 | name: Tag new version 2 | 3 | on: 4 | push: 5 | branches: 6 | - 'dev' 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v3 14 | with: 15 | fetch-depth: 0 16 | - name: Get possible new version 17 | id: newVersionStep 18 | run: | 19 | new=$(echo $(cat ./package.json | grep version | head -1 | awk -F: '{ print $2 }' | sed 's/[",]//g' | tr -d '[[:space:]]')) 20 | echo "new=$new" >> $GITHUB_OUTPUT 21 | - name: Check current version 22 | id: currentVersionStep 23 | run: | 24 | current=$(git tag | grep -E '^v[0-9]' | sort -V | tail -1 | cut -c2-) 25 | echo "current=$current" >> $GITHUB_OUTPUT 26 | - name: Tag new version and push to release 27 | if: ${{ steps.currentVersionStep.outputs.current != steps.newVersionStep.outputs.new }} 28 | run: | 29 | tag=${{ format('v{0}', steps.newVersionStep.outputs.new) }} 30 | git tag $tag 31 | git push origin $tag 32 | git switch release 33 | git merge dev 34 | git push -------------------------------------------------------------------------------- /es/dump.mjs: -------------------------------------------------------------------------------- 1 | import { SingleBar, Presets } from 'cli-progress'; 2 | import * as _ from 'lamb'; 3 | 4 | import { count } from '../es/index.mjs'; 5 | import { scroll, clearScroll } from '../es/search.mjs'; 6 | 7 | /** 8 | * @param {string} domain - domain on from which to dump data 9 | * @param {string} index - index from which to dump data 10 | * @param {number} size size of scroll object - how many documents to fetch in a single reqeust. Maximum value is 10k 11 | * @returns {Object} list of all documents on that index. 
12 | */ 13 | export const dump = async(domain, index, size) => { 14 | const bar = new SingleBar( 15 | { etaBuffer: size * 10 }, 16 | Presets.rect 17 | ); 18 | const totalDocuments = await count(domain, index); 19 | 20 | bar.start(totalDocuments, 0); 21 | 22 | const scroller = scroll(domain, index, { 23 | size, 24 | pages: 'all' 25 | }); 26 | 27 | // mutation required due to await 28 | let documents = []; 29 | for await (let page of scroller) { 30 | documents.push( 31 | ..._.map(page.hits.hits, doc => { 32 | bar.increment(); 33 | return doc._source; 34 | }) 35 | ); 36 | } 37 | 38 | bar.stop(); 39 | 40 | clearScroll(domain); 41 | 42 | return documents; 43 | }; 44 | -------------------------------------------------------------------------------- /neo4j/util.mjs: -------------------------------------------------------------------------------- 1 | import * as _ from 'lamb'; 2 | 3 | const resolveValue = value => { 4 | if (!value || typeof value === 'String') { 5 | return value; 6 | } 7 | 8 | const className = value.constructor.name; 9 | switch (className) { 10 | case 'Integer': 11 | return value.toInt(); 12 | case 'Object': 13 | return _.mapValues(value, resolveValue); 14 | case 'Array': 15 | return _.map(value, resolveValue); 16 | default: 17 | return value; 18 | } 19 | }; 20 | 21 | const parseRecord = record => { 22 | const fields = _.reduce( 23 | _.range(0, record.length), 24 | (acc, idx) => { 25 | const value = record.get(idx); 26 | return { 27 | ...acc, 28 | [record.keys[idx]]: resolveValue(value) 29 | }; 30 | }, 31 | {} 32 | ); 33 | return fields; 34 | }; 35 | 36 | export const promisify = (result, session, driver) => { 37 | const data = []; 38 | return new Promise((resolve, reject) => { 39 | result.subscribe({ 40 | onNext: record => { 41 | data.push(parseRecord(record)); 42 | }, 43 | onCompleted: () => { 44 | session.close().then(driver.close()); 45 | resolve(data); 46 | }, 47 | onError: error => { 48 | reject(error); 49 | } 50 | }); 51 | }); 52 | }; 53 | -------------------------------------------------------------------------------- /terraform/configuration.mjs: -------------------------------------------------------------------------------- 1 | import * as _ from 'lamb'; 2 | 3 | import { createPathAndWriteObject } from '../util/path.mjs'; 4 | import { ami, scaffold, spotlightInstanceType } from '../conf/infrastructure.mjs'; 5 | 6 | 7 | export const generateConfiguration = async(workers, path=null) => { 8 | const identifiers = [...Array(workers).keys()]; 9 | const resource = _.map(identifiers, id => ( 10 | { 11 | aws_instance: [ 12 | { 13 | [`spotlight-node-${id}`]: [ 14 | { 15 | ami, 16 | instance_type: spotlightInstanceType, 17 | key_name: 'spotlight', 18 | vpc_security_group_ids: ['sg-026313a646e2d8470'], 19 | tags: { 20 | Name: `spotlight-node-${id}`, 21 | }, 22 | }, 23 | ], 24 | }, 25 | ], 26 | } 27 | )); 28 | const output = _.map(identifiers, id => ( 29 | { 30 | [`spotlight-node-${id}-public_ip`]: [ 31 | { 32 | "value": `\${aws_instance.spotlight-node-${id}.public_ip}` 33 | } 34 | ] 35 | } 36 | )); 37 | const configuration = { 38 | ...scaffold, 39 | output, 40 | resource 41 | }; 42 | 43 | if (path) { 44 | await createPathAndWriteObject(path, configuration); 45 | } 46 | return configuration; 47 | }; 48 | 49 | -------------------------------------------------------------------------------- /logging/logging.mjs: -------------------------------------------------------------------------------- 1 | import * as fs from 'fs/promises'; 2 | import { createLogger, format, transports } from 
'winston'; 3 | 4 | await fs.mkdir('logs', { recursive: true }); 5 | 6 | export const logger = createLogger({ 7 | level: 'info', 8 | format: format.combine( 9 | format.timestamp({ 10 | format: 'YYYY-MM-DD HH:mm:ss', 11 | }), 12 | format.errors({ stack: true }), 13 | format.splat(), 14 | format.json() 15 | ), 16 | defaultMeta: { service: 'arxlive-spotlight-annotator' }, 17 | transports: [ 18 | 19 | // 20 | // - Write all logs with level `info` and below to `logs/all.log`. 21 | // - Write all error-level logs to `logs/error.log`. 22 | // 23 | new transports.File({ 24 | filename: 'logs/error.log', 25 | level: 'error', 26 | }), 27 | new transports.File({ filename: 'logs/all.log' }), 28 | ], 29 | }); 30 | 31 | // 32 | // If we're not in production then **ALSO** log to the `console` 33 | // with the colorized simple format. 34 | // 35 | // eslint-disable-next-line no-process-env 36 | if (process.env.NODE_ENV !== 'production') { 37 | logger.add( 38 | new transports.Console({ 39 | format: format.combine(format.colorize(), format.simple()), 40 | }) 41 | ); 42 | } 43 | -------------------------------------------------------------------------------- /bin/geo/downloadBoundaries.js: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env node 2 | 3 | import fs from 'fs'; 4 | 5 | import json from '@discoveryjs/json-ext'; 6 | import { readJson } from '@svizzle/file'; 7 | import { Command } from 'commander'; 8 | import * as _ from 'lamb'; 9 | 10 | import { collectAllFeatures } from '../../geo/download.js'; 11 | 12 | 13 | const { stringifyStream } = json; 14 | 15 | const program = new Command(); 16 | program.requiredOption('-i, --config <path>', 'Configuration file. More on this in the README'); 17 | program.requiredOption('-o, --output <path>', 'Path in which to save the output data'); 18 | 19 | program.parse(); 20 | const options = program.opts(); 21 | 22 | const downloadBoundaries = async inputs => { 23 | for await (const { boundary, endpoint } of inputs) { 24 | console.log(`Collecting ${boundary}...`); 25 | const writeStream = fs.createWriteStream(`${options.output}/${boundary}.geojson`); 26 | const collection = await collectAllFeatures(endpoint); 27 | stringifyStream(collection).pipe(writeStream); 28 | } 29 | }; 30 | 31 | const main = async () => { 32 | readJson(options.config) 33 | .then(downloadBoundaries) 34 | .catch(() => { throw new Error('Unable to parse configuration'); }); 35 | }; 36 | 37 | main(); -------------------------------------------------------------------------------- /es/update.mjs: -------------------------------------------------------------------------------- 1 | import { stringify } from '@svizzle/utils'; 2 | import * as _ from 'lamb'; 3 | 4 | import { buildRequest, makeRequest } from '../es/requests.mjs'; 5 | 6 | /** 7 | * @function update 8 | * @description update a document on an ES index. 9 | * @param {string} domain - domain on which to update. 10 | * @param {string} index - index on which to update. 11 | * @param {string} id - id of document to update. 12 | * @param {Object} doc - an object containing the new fields and properties that constitute the update. 13 | * @returns {HttpResponse} response of the update request.
14 | */ 15 | export const update = async ( 16 | domain, 17 | index, 18 | id, 19 | doc, 20 | payloadOptions={}, 21 | query={}, 22 | { checkStatus=true } = {} 23 | ) => { 24 | const path = `${index}/_update/${encodeURIComponent(id)}`; 25 | const payload = { ...payloadOptions, doc }; 26 | const request = buildRequest(domain, path, 'POST', { payload, query }); 27 | const { body: response, code } = await makeRequest(request); 28 | 29 | if (!checkStatus) { 30 | return { response, code }; 31 | } 32 | 33 | if (code !== 200) { 34 | throw Error( 35 | `Update failed at ${domain}/${index} for document with ID: ${id}. 36 | Response:\n${stringify(response)}` 37 | ); 38 | } 39 | 40 | return response; 41 | }; 42 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/duplicateAggregations/README.md: -------------------------------------------------------------------------------- 1 | ## Duplicate Aggregations 2 | 3 | These aggregations include a mixture of different descriptive statistics 4 | relating to metadata which describes the duplicates found for entities provided 5 | by the Spotlight Tool. In particular, `dupes_10` and `dupes_60` are measures of 6 | how many duplicates were found **at that confidence level**. So if 7 | `dupes_10_count`'s value is 6, then there were a total of 6 duplicates found at 8 | confidence level 10. One entity having duplicates counts as a single occurrence 9 | of a duplicate, e.g. if `Photon` has 3 duplicates found at confidence level 10, 10 | it will contribute 1 occurrence to the total `dupes_10_count`. 11 | 12 | We also provide aggregations on the `dupes_ratio_X` metadata value, which is 13 | simply the `dupes_count_X` value divided by the total number of entities 14 | annotated for that piece of text. 
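As a made-up illustration of how the two metadata values relate (the numbers are invented, not taken from the index):

```js
// Invented example: 6 entities had duplicates at confidence 10,
// out of 30 entities annotated for the document.
const dupes_10_count = 6;
const entities_count = 30;
const dupes_10_ratio = dupes_10_count / entities_count; // 0.2
```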
15 | 16 | The aggregations use both `extended_stats` and `histogram`s for each metadata 17 | value (`dupes_count_X` and `dupes_ratio_X`) 18 | 19 | Endpoint: `POST arxiv_v6/_search` 20 | 21 | See: 22 | 23 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-bucket-histogram-aggregation.html 24 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-metrics-extendedstats-aggregation.html -------------------------------------------------------------------------------- /util/array.mjs: -------------------------------------------------------------------------------- 1 | import { isNotNil, mergeObjects } from '@svizzle/utils'; 2 | import * as cliProgress from 'cli-progress'; 3 | import * as _ from 'lamb'; 4 | 5 | const _batch = (arr, batchSize) => { 6 | return arr.map((val, i) => { 7 | if (i % batchSize === 0) { 8 | return arr.slice(i, i + batchSize); 9 | } 10 | return null; 11 | }); 12 | }; 13 | 14 | export const batch = _.pipe([_batch, _.filterWith(isNotNil)]); 15 | 16 | export const batchIterate = async(iterable, func, options={}) => { 17 | const { batchSize=100 } = options; 18 | 19 | const bar = new cliProgress.Bar(null, cliProgress.Presets.rect); 20 | bar.start(iterable.length, 0); 21 | const batches = batch(iterable, batchSize); 22 | let results = []; 23 | for (const batch_ of batches) { 24 | // eslint-disable-next-line no-await-in-loop 25 | const result = await func(batch_); 26 | results = [...results, result]; 27 | bar.increment(batch_.length); 28 | } 29 | 30 | bar.stop(); 31 | return results; 32 | }; 33 | 34 | export const batchIterateFlatten = async(iterable, func, options) => { 35 | const results = await batchIterate(iterable, func, options); 36 | return _.shallowFlatten(results); 37 | }; 38 | 39 | export const batchIterateMerge = async(iterable, func, options) => { 40 | const results = await batchIterate(iterable, func, options); 41 | return mergeObjects(results); 42 | }; 43 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # v0.0.15 2 | 3 | Create `annotateEsIndex` script 4 | 5 | # v0.0.14 6 | 7 | Fix `#getEntities` function 8 | Fix `entitiesDataQuality` bug 9 | 10 | # v0.0.13 11 | 12 | Fix Arcgis FeatureServer bug 13 | 14 | # v0.0.12 15 | 16 | Update package dependencies 17 | 18 | # v0.0.11 19 | 20 | Due to uknown bug, the changes below were never pushed to GH 21 | 22 | # v0.0.10 23 | 24 | Fix Arcgis FeatureServer bug where features weren't returning all properties. 25 | 26 | # v0.0.9 27 | 28 | Fix ES query bug 29 | 30 | # v0.0.8 31 | 32 | Add scripts for downloading geographic boundaries from an arcGis FeatureServer, 33 | and for converting these boundaries to mbtiles/pmtiles, and uploading the 34 | pmtiles file to an s3 bucket. 35 | 36 | # v0.0.7 37 | 38 | Fix authentication bug. Authentication endpoint expects a GET request with 39 | email and token provided in the URLSearchParams, but instead was being passed 40 | as the body of a POST request. This change fixes that bug. 41 | 42 | # v0.0.6 43 | 44 | Add authentication logic formerly contained in the 45 | annotation service 46 | 47 | # v0.0.5 48 | 49 | Patch import errors in `jsonToEsIndex.js` script 50 | and patch a bulk request bug. 
51 | 52 | # v0.0.4 53 | 54 | Add the jsonToEsIndex script to `bin/` 55 | 56 | # v0.0.3 57 | 58 | Port Terraform configuration to nestauk/dap_dv_backends 59 | 60 | # v0.0.2 61 | 62 | Added some executable scripts in `bin/` 63 | 64 | # v0.0.1 65 | 66 | Copy of utilities from nestauk/dap_dv_backends@ce64d0c 67 | -------------------------------------------------------------------------------- /es/pipeline.mjs: -------------------------------------------------------------------------------- 1 | import { arxliveCopy } from '../conf/config.mjs'; 2 | import { buildRequest, makeRequest } from '../es/requests.mjs'; 3 | 4 | /** 5 | * 6 | * @param {string} name - name of pipeline in url parsable form. 7 | * @param {string} description - description of pipeline. 8 | * @param {Array} processors - list of processors for pipeline. 9 | * @param {string} domain - domain on which to put pipeline. 10 | * @returns {Object} response object. 11 | */ 12 | const generic = (name, description, processors, domain) => { 13 | const path = `_ingest/pipeline/${name}`; 14 | const payload = { description, processors }; 15 | 16 | const request = buildRequest(domain, path, 'PUT', { payload }); 17 | return makeRequest(request); 18 | }; 19 | 20 | /** 21 | * 22 | * @param {Array} fields - list of fields to remove upon ingestion. 23 | * @param {string} domain - domain on which to put pipeline. 24 | * @returns {string} name of created pipeline 25 | */ 26 | export const remove = async (fields, domain = arxliveCopy) => { 27 | const description = `Remove ${fields.join(' ')}`; 28 | const name = `remove-${fields.join('-')}`; 29 | const processors = [ 30 | { 31 | remove: { 32 | field: fields, 33 | ignore_failure: true, 34 | }, 35 | }, 36 | ]; 37 | const response = await generic(name, description, processors, domain); 38 | if (response.code === 200) { 39 | return name; 40 | } 41 | throw new Error( 42 | `Failed to create remove pipeline. Response:\n${response}` 43 | ); 44 | 45 | }; 46 | -------------------------------------------------------------------------------- /neo4j/community.mjs: -------------------------------------------------------------------------------- 1 | import * as _ from 'lamb'; 2 | 3 | import { getSession } from '../neo4j/driver.mjs'; 4 | import { promisify } from '../neo4j/util.mjs'; 5 | 6 | const getMetadata = data => { 7 | const intermediateCommunities = data[0].intermediateCommunityIds.length; 8 | const communityCount = _.keys(_.group(data, _.getKey('community'))).length; 9 | const intermediateCounts = _.map( 10 | _.range(0, intermediateCommunities), 11 | idx => { 12 | const communities = _.group(data, r => r.intermediateCommunityIds[idx]); 13 | const counts = _.keys(communities).length; 14 | return counts; 15 | } 16 | ); 17 | const metadata = { 18 | intermediateCommunities, 19 | communityCount, 20 | intermediateCounts 21 | }; 22 | return metadata; 23 | }; 24 | 25 | const objectToString = object => _.reduce( 26 | _.pairs(object), 27 | (acc, [key, value]) => `${acc.length ? 
`${acc},` : ''} ${key}: ${value}`, 28 | '' 29 | ); 30 | 31 | const generateCommand = (graph, options) => ` 32 | CALL gds.louvain.stream('${graph}', { ${objectToString(options)} }) 33 | YIELD nodeId, communityId, intermediateCommunityIds 34 | RETURN gds.util.asNode(nodeId).URI AS URI, communityId, intermediateCommunityIds 35 | ORDER BY communityId ASC 36 | `; 37 | 38 | export const stream = async (graph, options) => { 39 | const [session, driver] = await getSession(); 40 | const command = generateCommand(graph, options); 41 | const result = session.run(command); 42 | const data = await promisify(result, session, driver); 43 | const metadata = getMetadata(data); 44 | 45 | return { data, metadata }; 46 | }; 47 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/entitiesCountOverTokenCountByConfidence/request.mjs: -------------------------------------------------------------------------------- 1 | import { promises as fs } from 'fs'; 2 | import { fileURLToPath } from 'url'; 3 | import { dirname } from 'path'; 4 | 5 | const main = async () => { 6 | const confidences = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]; 7 | const aggs = confidences.reduce((acc, conf) => { 8 | return { 9 | ...acc, 10 | [`entities_count_over_token_count_at_${conf}_extended_stats`]: { 11 | extended_stats: { 12 | script: `if (doc['dbpedia_entities_metadata.confidence_counts.${conf}'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.${conf}'].value) / doc['textBody_abstract_article.token_count'].value;` 13 | } 14 | }, 15 | [`entities_count_over_token_count_at_${conf}_histogram`]: { 16 | histogram: { 17 | script: `if (doc['dbpedia_entities_metadata.confidence_counts.${conf}'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.${conf}'].value) / doc['textBody_abstract_article.token_count'].value;`, 18 | interval: 0.01, 19 | min_doc_count: 1 20 | } 21 | } 22 | }; 23 | }, {}); 24 | const payload = { 25 | size: 0, 26 | aggs 27 | }; 28 | const requestString = JSON.stringify(payload, null, 4); 29 | const __filename = fileURLToPath(import.meta.url); 30 | const __dirname = dirname(__filename); 31 | await fs.writeFile(`${__dirname}/request.json`, requestString); 32 | }; 33 | 34 | main(); 35 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/entitiesCountAggsByConfidenceOverEntitiesCount/request.mjs: -------------------------------------------------------------------------------- 1 | import { promises as fs } from 'fs'; 2 | import { fileURLToPath } from 'url'; 3 | import { dirname } from 'path'; 4 | 5 | const main = async () => { 6 | const confidences = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]; 7 | const aggs = confidences.reduce((acc, conf) => { 8 | return { 9 | ...acc, 10 | [`confidence_${conf}_normalised_with_entities_count_extended_stats`]: { 11 | extended_stats: { 12 | script: `if (doc['dbpedia_entities_metadata.confidence_counts.${conf}'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.${conf}'].value) / doc['dbpedia_entities_metadata.entities_count'].value;` 13 | } 14 | }, 15 | [`confidence_${conf}_normalised_with_entities_count_histogram`]: { 16 | histogram: { 17 | script: `if 
(doc['dbpedia_entities_metadata.confidence_counts.${conf}'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.${conf}'].value) / doc['dbpedia_entities_metadata.entities_count'].value;`, 18 | interval: 0.01, 19 | min_doc_count: 1 20 | } 21 | } 22 | }; 23 | }, {}); 24 | const payload = { 25 | size: 0, 26 | aggs 27 | }; 28 | const requestString = JSON.stringify(payload, null, 4); 29 | const __filename = fileURLToPath(import.meta.url); 30 | const __dirname = dirname(__filename); 31 | await fs.writeFile(`${__dirname}/request.json`, requestString); 32 | }; 33 | 34 | main(); 35 | -------------------------------------------------------------------------------- /conf/mappings.mjs: -------------------------------------------------------------------------------- 1 | export const defaultMapping = { 2 | type: 'nested', 3 | properties: { 4 | URI: { 5 | type: 'keyword', 6 | }, 7 | confidence: { 8 | type: 'integer', 9 | }, 10 | percentageOfSecondRank: { 11 | type: 'float', 12 | }, 13 | similarityScore: { 14 | type: 'float', 15 | }, 16 | surfaceForm: { 17 | type: 'text', 18 | }, 19 | duplicates_60: { 20 | type: 'integer', 21 | }, 22 | duplicates_10: { 23 | type: 'integer', 24 | } 25 | }, 26 | }; 27 | 28 | export const metaDataMapping = { 29 | properties: { 30 | confidence_avg: { 31 | type: 'float' 32 | }, 33 | confidence_max: { 34 | type: 'integer' 35 | }, 36 | confidence_min: { 37 | type: 'integer' 38 | }, 39 | entities_count: { 40 | type: 'integer' 41 | }, 42 | dupes_10_ratio: { 43 | type: 'float' 44 | }, 45 | dupes_60_ratio: { 46 | type: 'float' 47 | }, 48 | dupes_10_count: { 49 | type: 'integer' 50 | }, 51 | dupes_60_count: { 52 | type: 'integer' 53 | }, 54 | confidence_counts: { 55 | properties: { 56 | "0": { 57 | type: 'integer' 58 | }, 59 | "10": { 60 | type: 'integer' 61 | }, 62 | "20": { 63 | type: 'integer' 64 | }, 65 | "30": { 66 | type: 'integer' 67 | }, 68 | "40": { 69 | type: 'integer' 70 | }, 71 | "50": { 72 | type: 'integer' 73 | }, 74 | "60": { 75 | type: 'integer' 76 | }, 77 | "70": { 78 | type: 'integer' 79 | }, 80 | "80": { 81 | type: 'integer' 82 | }, 83 | "90": { 84 | type: 'integer' 85 | }, 86 | "100": { 87 | type: 'integer' 88 | } 89 | } 90 | } 91 | } 92 | 93 | }; 94 | -------------------------------------------------------------------------------- /geo/download.js: -------------------------------------------------------------------------------- 1 | import * as _ from 'lamb' 2 | 3 | import { batchIterateFlatten } from '../util/array.mjs' 4 | 5 | // API docs: https://developers.arcgis.com/rest/services-reference/enterprise/query-feature-service-layer-.htm 6 | 7 | export const getAllFeatureIds = async featureServerEndpoint => { 8 | 9 | const url = new URL(`${featureServerEndpoint}/0/query`) 10 | url.search = new URLSearchParams({ 11 | f: 'json', 12 | returnIdsOnly: true, 13 | where: '1=1', 14 | outFields: '*' 15 | }).toString() 16 | 17 | const response = await fetch(url, { method: 'POST' }); 18 | const result = await response.json(); 19 | return result.objectIds; 20 | } 21 | 22 | export const collectAllFeatures = async featureServerEndpoint => { 23 | 24 | const ids = await getAllFeatureIds(featureServerEndpoint) 25 | const url = new URL(`${featureServerEndpoint}/0/query`) 26 | 27 | const downloadFeatures = async batch => { 28 | url.search = new URLSearchParams({ 29 | f: 'geoJSON', 30 | where: '1=1', 31 | objectIds: batch, 32 | outFields: '*', 33 | }) 34 | const response = await 
fetch(url, { method: 'POST' }); 35 | const result = await response.json(); 36 | return result; 37 | } 38 | const results = await batchIterateFlatten(ids, downloadFeatures, { batchSize: 100 }); 39 | const collection = _.reduce( 40 | results, 41 | (acc, curr) => { 42 | acc.features.push(...curr.features); 43 | return acc 44 | } 45 | ) 46 | return { 47 | type: "FeatureCollection", 48 | ...collection 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "bin": { 3 | "annotate": "bin/annotate.js", 4 | "annotateEsIndex": "bin/annotateEsIndex.js", 5 | "jsonToEsIndex": "bin/jsonToEsIndex.js", 6 | "annotationsDataQuality": "bin/annotationsDataQuality/annotationsDataQuality.js", 7 | "entitiesDataQuality": "bin/entitiesDataQuality.js", 8 | "downloadBoundaries": "bin/geo/downloadBoundaries.js", 9 | "generatePmTiles": "bin/geo/generatePmTiles.sh" 10 | }, 11 | "bugs": { 12 | "url": "https://github.com/nestauk/dap_dv_backends_utils/issues" 13 | }, 14 | "dependencies": { 15 | "@aws-crypto/sha256-browser": "^2.0.1", 16 | "@aws-sdk/client-s3": "^3.121.0", 17 | "@aws-sdk/client-ses": "^3.128.0", 18 | "@aws-sdk/credential-provider-node": "^3.49.0", 19 | "@aws-sdk/node-http-handler": "^3.49.0", 20 | "@aws-sdk/protocol-http": "^3.49.0", 21 | "@aws-sdk/signature-v4": "^3.49.0", 22 | "@discoveryjs/json-ext": "^0.5.7", 23 | "@svizzle/file": "^0.12.0", 24 | "@svizzle/utils": "^0.16.0", 25 | "cli-progress": "^3.10.0", 26 | "commander": "^9.0.0", 27 | "lamb": "^0.60.0", 28 | "mkdirp": "^2.1.3", 29 | "neo4j-driver": "^5.0.1", 30 | "undici": "^5.22.1", 31 | "winston": "^3.5.1" 32 | }, 33 | "description": "Utilties for the DAP data visualisation team's backend services", 34 | "devDependencies": { 35 | "eslint": "^8.14.0", 36 | "mocha": "^9.1.3" 37 | }, 38 | "homepage": "https://github.com/nestauk/dap_dv_backends_utils#readme", 39 | "keywords": [ 40 | "utilities", 41 | "utils", 42 | "dap", 43 | "backend", 44 | "data-visualisation" 45 | ], 46 | "license": "MIT", 47 | "name": "dap_dv_backends_utils", 48 | "repository": { 49 | "type": "git", 50 | "url": "git+https://github.com/nestauk/dap_dv_backends_utils.git" 51 | }, 52 | "type": "module", 53 | "version": "0.0.16" 54 | } -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/duplicateAggregations/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "dupes_10_count_extended_stats": { 5 | "extended_stats": { 6 | "field": "dbpedia_entities_metadata.dupes_10_count" 7 | } 8 | }, 9 | "dupes_10_count_histogram": { 10 | "histogram": { 11 | "field": "dbpedia_entities_metadata.dupes_10_count", 12 | "interval": 1, 13 | "min_doc_count": 1 14 | } 15 | }, 16 | "dupes_10_ratio_extended_stats": { 17 | "extended_stats": { 18 | "field": "dbpedia_entities_metadata.dupes_10_ratio" 19 | } 20 | }, 21 | "dupes_10_ratio_histogram": { 22 | "histogram": { 23 | "field": "dbpedia_entities_metadata.dupes_10_ratio", 24 | "interval": 0.01, 25 | "min_doc_count": 1 26 | } 27 | }, 28 | "dupes_60_count_extended_stats": { 29 | "extended_stats": { 30 | "field": "dbpedia_entities_metadata.dupes_60_count" 31 | } 32 | }, 33 | "dupes_60_count_histogram": { 34 | "histogram": { 35 | "field": "dbpedia_entities_metadata.dupes_60_count", 36 | "interval": 1, 37 | "min_doc_count": 1 38 | } 39 | }, 40 | 
"dupes_60_ratio_extended_stats": { 41 | "extended_stats": { 42 | "field": "dbpedia_entities_metadata.dupes_60_ratio" 43 | } 44 | }, 45 | "dupes_60_ratio_histogram": { 46 | "histogram": { 47 | "field": "dbpedia_entities_metadata.dupes_60_ratio", 48 | "interval": 0.01, 49 | "min_doc_count": 1 50 | } 51 | } 52 | } 53 | } -------------------------------------------------------------------------------- /es/bulk.mjs: -------------------------------------------------------------------------------- 1 | import { stringify } from '@svizzle/utils'; 2 | import * as _ from 'lamb'; 3 | 4 | import { buildRequest, makeRequest } from '../es/requests.mjs'; 5 | import { logger } from '../logging/logging.mjs'; 6 | 7 | const generateBulkPayload = (method, index) => _.pipe([ 8 | _.flatMapWith(doc => 9 | [ 10 | { [method]: { 11 | ...('_id' in doc && { "_id": doc._id }), 12 | "_index": index 13 | } }, 14 | method === 'update' ? { doc: doc.data } : doc.data 15 | ] 16 | ), 17 | _.reduceWith((acc, curr) => `${acc}\n${JSON.stringify(curr)}`, ''), 18 | json => `${json}\n` 19 | ]); 20 | 21 | /** 22 | * @function bulkRequest 23 | * @description creates multiple documents on an ES index in one request. 24 | * @param {string} domain - domain on which to update. 25 | * @param {string} index - index on which to update. 26 | * @param {Object[]} documents - list of documents, where each object has an id 27 | * key and a data key. The data key is the document intended to be created. 28 | * @param {string} method - the method to use (create, update, delete, etc.) 29 | * @returns {HttpResponse} response of the update reqeuest. 30 | */ 31 | export const bulkRequest = async ( 32 | domain, 33 | index, 34 | documents, 35 | method, 36 | { error=true, refresh="false" }={} 37 | ) => { 38 | const path = `${index}/_bulk`; 39 | const generate = generateBulkPayload(method, index); 40 | const payload = generate(documents); 41 | 42 | // if payload is empty, no docs were supplied to the function 43 | if (!payload.trim()) { 44 | console.log("Payload empty"); 45 | return { response: "Payload empty", code: 204 }; 46 | } 47 | 48 | const request = buildRequest( 49 | domain, path, 'POST', 50 | { payload, contentType: 'application/x-ndjson', query: { refresh } } 51 | ); 52 | const { body: response, code } = await makeRequest(request); 53 | if (response.error) { 54 | if (error) { 55 | throw new Error(stringify(response)); 56 | } else { 57 | logger.error(stringify(response)); 58 | } 59 | } 60 | return { response, code }; 61 | }; 62 | -------------------------------------------------------------------------------- /es/document.mjs: -------------------------------------------------------------------------------- 1 | import { stringify } from '@svizzle/utils'; 2 | 3 | import { buildRequest, makeRequest } from '../es/requests.mjs'; 4 | 5 | /** 6 | * @function create 7 | * @description creates a document on an ES index. 8 | * @param {string} domain - domain on which to create the document. 9 | * @param {string} index - index on which to create the document. 10 | * @param {Object} doc - an object containing the new fields and properties that constitute the update. 11 | * @param {Object} [options={}] 12 | * @param {string} [options.id=''] - id of document (if empty, ElasticSearch creates one for you). 13 | * @returns {HttpResponse} response of the update reqeuest. 
14 | */ 15 | export const create = async (domain, index, doc, { id = '', checkStatus=true} = {}) => { 16 | const path = `${index}/_doc/${encodeURIComponent(id)}`; 17 | const payload = doc; 18 | const request = buildRequest(domain, path, 'POST', { payload }); 19 | const { body: response, code } = await makeRequest(request); 20 | if (!checkStatus) { 21 | return { response, code }; 22 | } 23 | if (parseInt(code / 200, 10) !== 1) { 24 | console.log(response); 25 | throw Error( 26 | `Creating document failed at ${domain}/${index} for document\n${JSON.stringify(doc, null, 2)}` 27 | ); 28 | } 29 | return response; 30 | }; 31 | 32 | /** 33 | * 34 | * @param {string} domain - domain on which to retrieve document 35 | * @param {string} index - index on which to retrieve document 36 | * @param {string} id - id of document to retrieve 37 | * @param {Object} [options={}] 38 | * @param {boolean} [options.source=false] - whether to return just the source of the document 39 | * @returns {Object} an ElasticSearch document 40 | */ 41 | export const get = async (domain, index, id, { source=false } = {}) => { 42 | const path = `${index}/_doc/${id}`; 43 | const request = buildRequest(domain, path, 'GET'); 44 | const { body: response, code } = await makeRequest(request); 45 | if (code !== 200) { 46 | console.log(response); 47 | throw Error(`Getting document for ${id} failed with response \n${stringify(response)}`); 48 | } 49 | if (source) { 50 | return response._source; 51 | } 52 | return response; 53 | }; 54 | -------------------------------------------------------------------------------- /bin/jsonToEsIndex.js: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env node 2 | 3 | import { promises as fs } from 'fs'; 4 | 5 | import { Command } from 'commander'; 6 | import * as _ from 'lamb'; 7 | 8 | import { arxliveCopy } from '../conf/config.mjs'; 9 | import { bulkRequest } from '../es/bulk.mjs'; 10 | import { createIndex } from '../es/index.mjs'; 11 | import { logger } from '../logging/logging.mjs'; 12 | import { batch } from '../util/array.mjs'; 13 | import { commanderParseInt } from '../util/commander.mjs'; 14 | 15 | const program = new Command(); 16 | program.option( 17 | '-d, --domain ', 18 | 'ES domain on which to ingest documents', 19 | arxliveCopy 20 | ); 21 | program.requiredOption('-i, --index ', 'Index on which to ingest'); 22 | program.requiredOption('-p, --path ', 'Path to JSON data'); 23 | program.option('--batch-size ', 'Size of batch of docs to upload', commanderParseInt, 100); 24 | program.option( 25 | '--key ', 26 | 'Top level key in JSON object to use as key. If not supplied, keys will be generated automatically', 27 | null 28 | ); 29 | program.option( 30 | '--list-key ', 31 | 'Key for the documents if documents are stored as a value at the root level of the json file. Not recommended', 32 | null 33 | ); 34 | 35 | program.parse(); 36 | const options = program.opts(); 37 | 38 | const main = async () => { 39 | 40 | await createIndex(options.domain, options.index); 41 | 42 | const json = JSON.parse( 43 | await fs.readFile(options.path, { encoding: 'utf-8' }) 44 | ); 45 | const data = options.listKey ? json[options.listKey] : json; 46 | 47 | const documents = options.key 48 | ? 
_.map(data, object => { 49 | const { [options.key]: _id, ...contents } = object; 50 | return { _id, data: contents }; 51 | }) 52 | : _.map(data, (contents, _id) => ({ _id, data: contents })); 53 | 54 | const docsWithId = _.filter(documents, doc => '_id' in doc); 55 | 56 | for (const docs of batch(docsWithId, options.batchSize)) { 57 | // eslint-disable-next-line no-await-in-loop 58 | const response = await bulkRequest(options.domain, options.index, docs, 'create'); 59 | if (response.code !== 200) { 60 | logger.error(response); 61 | } 62 | }; 63 | }; 64 | 65 | main(); 66 | -------------------------------------------------------------------------------- /es/entities.mjs: -------------------------------------------------------------------------------- 1 | import * as _ from 'lamb'; 2 | 3 | import { arxliveCopy } from '../conf/config.mjs'; 4 | import { dbr } from '../dbpedia/util.mjs'; 5 | import { scroll, clearScroll } from '../es/search.mjs'; 6 | 7 | // titles are the Wiki pages with whitepace replaced with underscores, so 8 | // World War 1 => World_War_1 9 | // We use this terminology to stay consistent with Wikimedia's API, where the 10 | // this parameter is also named title. 11 | // https://api.wikimedia.org/wiki/API_reference/Core/Pages/Get_page 12 | export const getEntities = async( 13 | index, 14 | domain=arxliveCopy, 15 | { asTitle=true } = {} 16 | ) => { 17 | 18 | const scroller = scroll(domain, index, { size: 10000, }); 19 | const uriCounts = {}; 20 | let page; 21 | for await (page of scroller) { 22 | _.forEach(page.hits.hits, doc => { 23 | if ('dbpedia_entities' in doc._source) { 24 | _.forEach(doc._source.dbpedia_entities, ({ URI }) => { 25 | const key = asTitle 26 | ? URI.replace(dbr, '') 27 | : URI; 28 | uriCounts[key] = uriCounts[key] ? uriCounts[key] + 1 : 1; 29 | }); 30 | } 31 | }); 32 | } 33 | if (page) { 34 | clearScroll(domain, page._scroll_id); 35 | } 36 | const entities = _.keys(uriCounts); 37 | return entities; 38 | }; 39 | 40 | /** 41 | * @function getAllConfidenceLevels 42 | * @description counts the different confidence values found for every unique entity on a given ES index. 43 | * @param {string} index Index on which to count confidence levels 44 | * @param {string} domain Domain on which the index sits 45 | * @returns { Object. } an object where keys are the unique 46 | * entity URIs and values are an array of confidence values found for that entity. 47 | */ 48 | export const getAllConfidenceLevels = async( 49 | index, 50 | domain=arxliveCopy 51 | ) => { 52 | const scroller = scroll(domain, index, { size: 10000, }); 53 | const confidenceCounts = {}; 54 | let page; 55 | for await (page of scroller) { 56 | const entities = _.flatMap( 57 | page.hits.hits, 58 | _.getPath('_source.dbpedia_entities') 59 | ); 60 | _.forEach( 61 | entities, 62 | ({ URI, confidence }) => { 63 | confidenceCounts[URI] = URI in confidenceCounts 64 | ? [ ...confidenceCounts[URI], confidence ] 65 | : [ confidence ]; 66 | } 67 | ); 68 | } 69 | if (page) { 70 | clearScroll(domain, page._scroll_id); 71 | } 72 | return confidenceCounts; 73 | }; 74 | -------------------------------------------------------------------------------- /bin/geo/README.md: -------------------------------------------------------------------------------- 1 | ## downloadBoundaries 2 | 3 | This script downloads boundaries from an arcGis server and saves them as geoJSON 4 | to a specified output directory. 
In order to know which boundaries to download, 5 | you must supply a configuration file with a list of objects, where each object 6 | has a `boundary` and `endpoint` key. The `boundary` key will be used to name the 7 | resulting geoJSON file for that boundary, and the `endpoint` should point to the 8 | arcGis FeatureServer related to that boundary. Here is an example for the three 9 | different levels of International Territorial boundaries: 10 | 11 | ```json 12 | [ 13 | { 14 | "boundary": "itl1", 15 | "endpoint": "https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/International_Territorial_Level_1_January_2021_UK_BFC_2022/FeatureServer" 16 | }, 17 | { 18 | "boundary": "itl2", 19 | "endpoint": "https://services1.arcgis.com/ESMARspQHYMw9BZ9/ArcGIS/rest/services/International_Territorial_Level_2_January_2021_UK_BFC_V2_2022/FeatureServer" 20 | }, 21 | { 22 | "boundary": "itl3", 23 | "endpoint": "https://services1.arcgis.com/ESMARspQHYMw9BZ9/ArcGIS/rest/services/International_Territorial_Level_3_January_2021_UK_BFC_V3_2022/FeatureServer" 24 | } 25 | ] 26 | ``` 27 | 28 | ## generatePmTiles 29 | 30 | ### Requirements 31 | 32 | [tippecanoe](https://github.com/mapbox/tippecanoe), which can be installed with 33 | brew `brew install tippecanoe` 34 | [pmtiles](https://github.com/protomaps/go-pmtiles/releases), download the 35 | relevant binary at this link, and add it to your system's path. 36 | 37 | The script was written and tested using the following versions of the software 38 | above: 39 | 40 | `tippecanoe v2.23.0` 41 | `pmtiles v1.70` 42 | 43 | ### Running the script 44 | 45 | Script for generating a pmtiles file and uploading it to s3. You specify the 46 | directory in which the boundaries are kept (all in geojson) as the first 47 | argument, and the s3 URI of your desired bucket as the second argument: 48 | 49 | ```sh 50 | npx generatePmTiles boundaries/ s3://path-to-bucket 51 | ``` 52 | 53 | ### Uploading to s3 54 | 55 | You must have `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment 56 | variables set in order to upload the resulting pmtiles file to s3. 57 | 58 | To upload, simply run the following: 59 | 60 | pmtiles upload $pmtiles --bucket=$2 $pmtiles 61 | 62 | ```sh 63 | pmtiles upload --bucket= 64 | ``` 65 | -------------------------------------------------------------------------------- /bin/annotate.js: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env node 2 | 3 | import { Command } from 'commander'; 4 | import * as _ from 'lamb'; 5 | import { performance } from 'perf_hooks'; 6 | 7 | import { arxliveCopy } from '../conf/config.mjs'; 8 | import { getMappings } from '../es/index.mjs'; 9 | import { annotateIndex } from '../dbpedia/spotlight.mjs'; 10 | import { commanderParseInt } from '../util/commander.mjs'; 11 | import { dedent } from '../util/string.mjs'; 12 | 13 | const program = new Command(); 14 | program.option( 15 | '-d, --domain ', 16 | 'ES domain on which to annotate', 17 | arxliveCopy 18 | ); 19 | program.requiredOption( 20 | '-i, --index ', 21 | 'Index on which to annotate', 22 | ); 23 | program.requiredOption( 24 | '-s, --spotlight ', 25 | 'Endpoint for spotlight annotator', 26 | ); 27 | program.requiredOption( 28 | '-f, --field-name ', 29 | 'Field of doc to be used as input text for annotation' 30 | ); 31 | program.option( 32 | '-n, --new-field-name ', 33 | 'Name of new field to be created', 34 | 'dbpedia_entities' 35 | ); 36 | program.option( 37 | '-p, --page-size ', 38 | 'Size of page to scroll with', 39 | commanderParseInt, 40 | 10000 41 | ); 42 | program.option( 43 | '-b, --batch-size ', 44 | 'Size of batch to annotate over', 45 | commanderParseInt, 46 | 10 47 | ); 48 | program.option( 49 | '-g, --group-size ', 50 | 'Size of group of batches, usually corresponds to the number of worker nodes', 51 | commanderParseInt, 52 | 4 53 | ); 54 | program.option( 55 | '-z, --pages ', 56 | 'Number of pages to iterate over', 57 | 'all' 58 | ); 59 | program.option( 60 | '--force', 61 | 'Force the annotation process, even if no snapshots can be created' 62 | ); 63 | program.option( 64 | '--include-metadata', 65 | 'Include metadata fields on the index', 66 | true 67 | ); 68 | 69 | program.showHelpAfterError(); 70 | program.parse(); 71 | const options = program.opts(); 72 | 73 | const main = async () => { 74 | 75 | const currentMapping = await getMappings(options.domain, options.index); 76 | if ( 77 | options.newFieldName in currentMapping[options.index].mappings.properties && 78 | !options.force 79 | ) { 80 | throw new Error( 81 | dedent`Field already exists at index mapping, and force 82 | flag or continue flag not supplied` 83 | ); 84 | } 85 | 86 | const startTime = performance.now(); 87 | 88 | await annotateIndex( 89 | options.domain, 90 | options.index, 91 | options.spotlight, 92 | options.fieldName, 93 | options 94 | ); 95 | 96 | const endTime = performance.now(); 97 | console.log(`Total time taken (in ms): ${endTime - startTime}`); 98 | }; 99 | 100 | main(); 101 | 102 | 103 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/annotationsDataQuality.js: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env node 2 | 3 | import { exec } from 'child_process'; 4 | import fs from 'fs'; 5 | import path from 'path'; 6 | import { fileURLToPath } from 'url'; 7 | 8 | import { Command } from 'commander'; 9 | import * as _ from 'lamb'; 10 | 11 | import { buildRequest, makeRequest } from '../../es/requests.mjs'; 12 | 13 | const __filename = fileURLToPath(import.meta.url); 14 | const __dirname = path.dirname(__filename); 15 | 16 | const program = new Command(); 17 | program.requiredOption( 18 | '-d, --domain ', 19 | 'ES domain on which to aggregate', 20 | ); 21 | program.requiredOption('-i, --index ', 'ES index on which to aggregate'); 22 | program.option( 23 | '-p, --path ', 24 | 'Path to directory containing requests', 25 | `${__dirname}/requests` 26 | ); 27 | program.requiredOption( 28 | '-o, --out ', 29 | 'Path to directory in which to save results.', 30 | ); 31 | 32 | program.showHelpAfterError(); 33 | program.parse(); 34 | const options = program.opts(); 35 | 36 | const filterDirectory = predicate => _.pipe([ 37 | dirPath => fs.readdirSync(dirPath, { withFileTypes: true }), 38 | _.filterWith(predicate), 39 | _.mapWith(_.getKey('name')) 40 | ]); 41 | 42 | const getSubDirectories = filterDirectory(dirEnt => dirEnt.isDirectory()); 43 | 44 | const main = async () => { 45 | const aggregationDirectories = getSubDirectories(options.path); 46 | 47 | const payloads = await Promise.all( 48 | _.map(aggregationDirectories, dir => { 49 | const subPath = path.join(options.path, dir); 50 | 51 | // if file is generated using script, regenerate 52 | if (fs.existsSync(path.join(subPath, 'request.mjs'))) { 53 | exec(`node ${path.join(subPath, 'request.mjs')}`); 54 | } 55 | const payload = fs.readFileSync( 56 | path.join(subPath, 'request.json'), { encoding: 'utf-8' }); 57 | return { name: dir, payload }; 58 | })); 59 | 60 | const responses = await Promise.all( 61 | _.map(payloads, async ({ name, payload }) => { 62 | const requestPath = `${options.index}/_search`; 63 | const request = buildRequest( 64 | options.domain, 65 | requestPath, 66 | 'POST', 67 | { payload } 68 | ); 69 | const { body: response } = await makeRequest(request); 70 | return { name, payload, response }; 71 | })); 72 | 73 | if (options.out) { 74 | if (!fs.existsSync(options.out)) { 75 | fs.mkdirSync(options.out, { recursive: true }); 76 | } 77 | } 78 | await Promise.all( 79 | _.map(responses, response => { 80 | const outputPath = options.out 81 | ? path.join(options.out, `${response.name}.json`) 82 | : path.join(options.path, response.name, 'response.json'); 83 | fs.writeFileSync( 84 | outputPath, 85 | JSON.stringify(response.response, null, 4) 86 | ); 87 | })); 88 | }; 89 | 90 | main(); 91 | -------------------------------------------------------------------------------- /bin/annotateEsIndex.js: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env node 2 | 3 | import { Command } from 'commander'; 4 | import * as _ from 'lamb'; 5 | 6 | import { arxliveCopy } from '../conf/config.mjs'; 7 | import { sleep } from '../util/time.mjs'; 8 | import { commanderParseInt } from '../util/commander.mjs'; 9 | 10 | 11 | const { NESTA_EMAIL, NESTA_TOKEN } = process.env; 12 | 13 | if (!NESTA_EMAIL || !NESTA_TOKEN) { 14 | throw new Error(` 15 | Please export your NESTA_EMAIL and NESTA_TOKEN as environment variables. 
16 | More information on how to retrieve these can be found here: 17 | https://github.com/nestauk/dap_dv_backends/tree/dev/src/services/authentication` 18 | ) 19 | } 20 | 21 | const program = new Command(); 22 | program.requiredOption( 23 | '-d, --domain ', 24 | 'ES domain on which to annotate', 25 | arxliveCopy 26 | ); 27 | program.requiredOption( 28 | '-e, --endpoint ', 29 | 'Endpoint to be used for annotation' 30 | ); 31 | program.requiredOption( 32 | '-i, --index ', 33 | 'Index on which to annotate', 34 | ); 35 | program.requiredOption( 36 | '-f, --field-name ', 37 | 'Field of doc to be used as input text for annotation' 38 | ); 39 | program.option( 40 | '-n, --new-field-name ', 41 | 'Name of new field to be created', 42 | 'dbpedia_entities' 43 | ); 44 | program.option( 45 | '--include-metadata', 46 | 'Include metadata fields on the index', 47 | true 48 | ); 49 | program.option( 50 | '--workers ', 51 | 'Number of workers to use', 52 | commanderParseInt, 53 | 2 54 | ); 55 | 56 | program.showHelpAfterError(); 57 | program.parse(); 58 | const options = program.opts(); 59 | 60 | const main = async () => { 61 | 62 | const authHeader = `Basic ${Buffer.from(NESTA_EMAIL + ':' + NESTA_TOKEN).toString('base64')}`; 63 | 64 | const query = { 65 | domain: options.domain, 66 | index: options.index, 67 | field: options.fieldName, 68 | newField: options.newFieldName, 69 | includeMetaData: options.includeMetadata, 70 | workers: options.workers 71 | } 72 | 73 | const queryString = new URLSearchParams(query); 74 | const url = `${options.endpoint}/es?${queryString.toString()}`; 75 | 76 | let requestOptions = { 77 | method: 'GET', 78 | headers: { 79 | Authorization: authHeader 80 | } 81 | }; 82 | 83 | let response = await fetch(url, requestOptions); 84 | const { id } = await response.json(); 85 | 86 | console.log(id); 87 | 88 | const progressEndpoint = `${options.endpoint}/progress/` 89 | response = await fetch(`${progressEndpoint}/${id}`) 90 | let progress = await response.json(); 91 | 92 | while (progress.status !== 'finished') { 93 | response = await fetch(`${progressEndpoint}/${id}`) 94 | progress = await response.json(); 95 | console.log(progress); 96 | 97 | await sleep(1000 * 10); 98 | } 99 | }; 100 | 101 | main(); -------------------------------------------------------------------------------- /es/snapshot.mjs: -------------------------------------------------------------------------------- 1 | import { settings as globalSettings } from '../conf/config.mjs'; 2 | import { buildRequest, makeRequest } from '../es/requests.mjs'; 3 | 4 | const settings = globalSettings.snapshotSettings; 5 | 6 | /** 7 | * @function register 8 | * @description Registers a snapshot repository on the specified domain. This 9 | * repository is essentially a directory to contain snapshots, and 10 | * a default one for ES snapshots is typically created when the 11 | * user has specified the correct AWS configurations. 12 | * @param {string} domain - domain on which to register snapshot. 13 | * @param {string} repository - name of repository to register. 14 | * @returns {Object} reponse of request. 
15 | */ 16 | export const register = (domain, repository) => { 17 | const path = `_snapshot/${repository}`; 18 | const payload = { 19 | type: 's3', 20 | settings: { 21 | bucket: settings.bucketName, 22 | region: settings.region, 23 | role_arn: `arn:aws:iam::${settings.awsID}:role/${settings.snapshotRole}`, 24 | }, 25 | }; 26 | const request = buildRequest(domain, path, 'PUT', { payload }); 27 | return makeRequest(request, { verbose: true }); 28 | }; 29 | 30 | /** 31 | * @function trigger 32 | * @description this function triggers a snapshot for the specified domain and 33 | * saves it in the repository with the given snapshot name. 34 | * @param {string} domain - domain on which to trigger the snapshot. 35 | * @param {string} repository - repository in which to save the snapshot result. 36 | * @param {string} snapshot - name of the snapshot. 37 | * @returns {Object} response of request. 38 | */ 39 | export const trigger = (domain, repository, snapshot) => { 40 | const path = `_snapshot/${repository}/${snapshot}`; 41 | const request = buildRequest(domain, path, 'PUT'); 42 | return makeRequest(request, { verbose: true }); 43 | }; 44 | 45 | /** 46 | * @function list 47 | * @description lists the snapshots for the specified domain and repository. 48 | * @param {string} domain - domain on which to list the snapshots. 49 | * @param {string} repository - repository in which to list the snapshots. 50 | * @returns {Object} response object, containing the list of snapshots. 51 | */ 52 | export const list = (domain, repository) => { 53 | const path = repository ? `_snapshot/${repository}/_all` : '_snapshot'; 54 | const request = buildRequest(domain, path, 'GET'); 55 | return makeRequest(request, { verbose: true }); 56 | }; 57 | 58 | /** 59 | * @function restore 60 | * @description restores the domain to the specified snapshot located in the 61 | * specified repository. 62 | * @param {string} domain - domain on which to restore. 63 | * @param {string} repository - repository from which to get the snapshot needed to restore. 64 | * @param {string} snapshot - name of snapshot used to restore. 65 | * @returns {Object} response of request. 66 | */ 67 | export const restore = (domain, repository, snapshot) => { 68 | const payload = { indices: '-.kibana*,-.opendistro*' }; 69 | const path = `_snapshot/${repository}/${snapshot}/_restore`; 70 | const request = buildRequest(domain, path, 'POST', { payload }); 71 | return makeRequest(request, { verbose: true }); 72 | }; 73 | 74 | /** 75 | * @function status 76 | * @description retrieves snapshot status of specified domain. 77 | * @param {string} domain - domain on which to retrieve status. 78 | * @returns {Object} response object containing snapshot status of specified domain. 79 | */ 80 | export const status = domain => { 81 | const path = '_snapshot/_status'; 82 | const request = buildRequest(domain, path, 'GET'); 83 | return makeRequest(request, { verbose: true }); 84 | }; 85 | -------------------------------------------------------------------------------- /es/search.mjs: -------------------------------------------------------------------------------- 1 | import { buildRequest, makeRequest } from '../es/requests.mjs'; 2 | 3 | /** 4 | * @function clearScroll 5 | * @description clears scroll objects on an ElasticSearch domain 6 | * @param {string} domain - domain on which to clear the scroll object 7 | * @param {string} [id=null] - id of scroll object. If not supplied, the function will clear all scroll objects on the specified domain. 
8 | * @returns {Object} response to the request made to clear the scroll. 9 | */ 10 | export const clearScroll = async (domain, id = null) => { 11 | const payload = id ? { scroll_id: id } : undefined; 12 | const path = id ? '_search/scroll' : '_search/scroll/_all'; 13 | const request = buildRequest(domain, path, 'DELETE', { payload }); 14 | const { body: result } = await makeRequest(request); 15 | return result; 16 | }; 17 | 18 | /** 19 | * @function first 20 | * @description retrieves the first batch of documents (or first page) and the associated scroll id. 21 | * @param {string} domain - domain on which to scroll. 22 | * @param {string} index - index on which to scroll. 23 | * @param {number} size - size of pages, this will determine how many documents per page to return. 24 | * @returns {HttpResponse} the first response to the scroll API call. This response contains both the documents and the id for the scroll object which is needed for subsequent calls. 25 | */ 26 | const first = async (domain, index, size) => { 27 | const path = `${index}/_search`; 28 | const query = { scroll: '1h' }; 29 | const payload = { size, sort: ['_doc'] }; 30 | const firstRequest = buildRequest(domain, path, 'POST', { 31 | payload, 32 | query, 33 | }); 34 | const { body: result } = await makeRequest(firstRequest, { retry: 5000 }); 35 | return result; 36 | }; 37 | 38 | /** 39 | * @function subsequent 40 | * @description function for subsequent calls to the scroll API after having first called {@link first} 41 | * @param {string} domain - domain on which to scroll 42 | * @param {string} id - id for scroll object 43 | * @returns {HttpResponse} response object containing the documents for the current iteration of the scroll. 44 | */ 45 | const subsequent = async (domain, id) => { 46 | const path = `_search/scroll`; 47 | const payload = { scroll: '1h', scroll_id: id }; 48 | const subsequentRequest = buildRequest(domain, path, 'POST', { 49 | payload, 50 | }); 51 | const { body: result } = await makeRequest(subsequentRequest, { retry: 5000 }); 52 | return result; 53 | }; 54 | 55 | /** 56 | * @function scroll 57 | * @description Generator function, returns an iterator which uses the Scroll API to iterate over huge numbers of documents (potentially all documents) on a given index. 58 | * @param {string} domain - domain on which to scroll. 59 | * @param {string} index - index on which to scroll. 60 | * @param {Object} [options] 61 | * @param {number} [options.size=1000] - size of page - this determines how many documents are returned per iteration. 62 | * @param {string|number} [options.pages='all'] - number of pages to return. If not specified, the generator will iterate exhaustively until all documents are returned. 63 | * @returns {Generator} a generator which yields HttpResponses. Each response has {@link options.size} number of documents. 64 | */ 65 | export async function *scroll( 66 | domain, 67 | index, 68 | { size = 1000, pages = 'all' } = {} 69 | ) { 70 | 71 | // set limit to infinity if all to iterate all results 72 | const limit = pages === 'all' ? 
Infinity : pages; 73 | let next = await first(domain, index, size); 74 | for ( 75 | let i = 0; 76 | i < limit && next.hits && next.hits.hits.length !== 0; 77 | i++ 78 | ) { 79 | yield next; 80 | // eslint-disable-next-line no-await-in-loop 81 | next = await subsequent(domain, next._scroll_id); 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- /es/requests.mjs: -------------------------------------------------------------------------------- 1 | import sha256 from '@aws-crypto/sha256-browser'; 2 | import { defaultProvider } from '@aws-sdk/credential-provider-node'; 3 | import { NodeHttpHandler } from '@aws-sdk/node-http-handler'; 4 | import { HttpRequest } from '@aws-sdk/protocol-http'; 5 | import { SignatureV4 } from '@aws-sdk/signature-v4'; 6 | 7 | import { sleep } from '../util/time.mjs'; 8 | 9 | const { Sha256 } = sha256; 10 | 11 | const signer = new SignatureV4({ 12 | credentials: defaultProvider(), 13 | region: 'eu-west-2', 14 | service: 'es', 15 | sha256: Sha256, 16 | }); 17 | 18 | /** 19 | * @function buildRequest 20 | * @description builds a HttpRequest object using the AWS sdk. Needed for signing the request using Environment variables. 21 | * @param {string} domain ElasticSearch domain on which to make request. 22 | * @param {string} path - additional path, appended after the domain in the request URL. 23 | * @param {string} method - HTTP request method (GET, POST, etc.). 24 | * @param {Object} [options] 25 | * @param {Object|string} [options.payload] - optional payload for the request. Can be passed as object and subsequently stringifyed. 26 | * @param {Object} [options.query={}] - optionaly query object for using the search API. 27 | * @returns {HttpRequest} the AWS HttpRequest object, signed using AWS credentials. 28 | */ 29 | export const buildRequest = ( 30 | domain, 31 | path, 32 | method, 33 | { payload, contentType = 'application/json', query = {} } = {} 34 | ) => { 35 | const body = 36 | payload && typeof payload !== 'string' 37 | ? JSON.stringify(payload) 38 | : payload; 39 | return new HttpRequest({ 40 | body, 41 | method, 42 | path, 43 | query, 44 | headers: { 45 | 'Content-Type': contentType, 46 | host: domain, 47 | }, 48 | hostname: domain, 49 | }); 50 | }; 51 | 52 | /** 53 | * @function parseResponseBody 54 | * @description helper function which returns promise of signed response object's body 55 | * @param {Object} response - response object obtained from {@link makeRequest} 56 | * @returns {Promise} promise which resolves to the response's body 57 | */ 58 | const parseResponseBody = response => { 59 | let responseBody = ''; 60 | return new Promise((resolve, reject) => { 61 | response.body.on('data', chunk => { 62 | responseBody += chunk; 63 | }); 64 | response.body.on('end', () => { 65 | try { 66 | resolve(JSON.parse(responseBody)); 67 | } catch (e) { 68 | reject(e); 69 | } 70 | }); 71 | }); 72 | }; 73 | 74 | /** 75 | * @function _makeRequest 76 | * @description makes a request using a HttpRequest object. 77 | * @param {HttpRequest} request - the HttpRequest object built using {@link buildRequest} 78 | * @param {Object} [options={}] 79 | * @param {boolean} [options.verbose] - whether to log the output of the request and response. 
80 | * @returns {Object} the HttpResponse object 81 | */ 82 | const _makeRequest = async request => { 83 | 84 | // Sign the request 85 | const signedRequest = await signer.sign(request); 86 | 87 | // Send the request 88 | const client = new NodeHttpHandler(); 89 | const { response } = await client.handle(signedRequest); 90 | const responseBody = await parseResponseBody(response); 91 | 92 | return { 93 | code: response.statusCode, 94 | message: response.body.statusMessage, 95 | body: responseBody, 96 | }; 97 | }; 98 | 99 | /** 100 | * @function makeRequest 101 | * @description wraps the makeRequest function with try/catch and retry logic. 102 | * @param {HttpRequest} request - the HttpRequest object built using {@link buildRequest} 103 | * @param {Object} [options={}] 104 | * @param {boolean} [options.retry] - how long to wait between trys. 105 | * @param {boolean} [options.limit] - how many times to retry. 106 | * @returns {Object} the HttpResponse object 107 | */ 108 | export const makeRequest = async (request, { retry=null, limit=10 }={}) => { 109 | const promise = _makeRequest(request); 110 | const result = promise 111 | .then(value => value) 112 | .catch(async err => { 113 | if (retry && limit !== 0) { 114 | await sleep(retry); 115 | return makeRequest(request, { retry, limit: limit-1 }); 116 | } 117 | throw err; 118 | 119 | }); 120 | return result; 121 | }; 122 | -------------------------------------------------------------------------------- /dbpedia/requests.mjs: -------------------------------------------------------------------------------- 1 | import * as _ from 'lamb'; 2 | 3 | import { getValue, isIterableLongerThan1 } from '@svizzle/utils'; 4 | 5 | import { loadOntology } from '../dbpedia/ontology.mjs'; 6 | import { dbr, prefixes } from '../dbpedia/util.mjs'; 7 | import { query } from '../sparql/query.mjs'; 8 | 9 | const sanitizeInput = input => { 10 | const URIs = typeof input === 'string' ? [input] : input; 11 | const sanitizedURIs = _.map(URIs, URI => 12 | URI.charAt(0) !== '<' 13 | ? URI.startsWith(dbr) ? `<${URI}>` : `<${dbr}${URI}>` 14 | : URI 15 | ); 16 | return sanitizedURIs; 17 | }; 18 | 19 | const buildIndividualQueries = (inputs, template) => _.map( 20 | inputs, 21 | input => _.replace(/\$\$URI\$\$/gu, input)(template) 22 | ); 23 | 24 | const buildQuery = queries => { 25 | const body = _.join(queries, '\nUNION\n'); 26 | const sparql = ` 27 | ${prefixes} 28 | SELECT * WHERE { 29 | ${body} 30 | }`; 31 | return sparql; 32 | }; 33 | 34 | const makeRequest = async sparql => { 35 | const { results } = await query(sparql); 36 | const values = _.map(results.bindings, _.mapValuesWith(getValue)); 37 | return values; 38 | }; 39 | 40 | const genericRequest = async (input, template) => { 41 | const sanitizedInput = sanitizeInput(input); 42 | const queries = buildIndividualQueries(sanitizedInput, template); 43 | const sparql = buildQuery(queries); 44 | const values = await makeRequest(sparql); 45 | return values; 46 | }; 47 | 48 | /** 49 | * @function getEntityDetails 50 | * @description provides details such as imageURL and abstract for supplied DBpedia URIs 51 | * @param {String|String[]} input - a single DBpedia URI or a list of URIs. 52 | * @returns a list of entities for the supplied DBPedia URIs. 53 | */ 54 | export const getEntityDetails = async input => { 55 | const template = 56 | `{ 57 | BIND ($$URI$$ as ?URI) 58 | OPTIONAL { 59 | $$URI$$ dbo:abstract ?abstract . 
60 | FILTER (langMatches(lang(?abstract),"en")) 61 | } 62 | OPTIONAL { $$URI$$ prov:wasDerivedFrom ?derivedFrom . } 63 | OPTIONAL { $$URI$$ dbo:thumbnail ?imageURL . } 64 | }`; 65 | 66 | const values = await genericRequest(input, template); 67 | 68 | // filter out bad encodings 69 | const filteredValues = _.map(values, entity => { 70 | if ('imageURL' in entity) { 71 | if (entity.imageURL.includes('�')) { 72 | const { imageURL, ...rest } = entity; 73 | return rest; 74 | } 75 | } 76 | return entity; 77 | }); 78 | 79 | return filteredValues; 80 | }; 81 | 82 | export const getEntityAbstract = input => { 83 | const template = `{ 84 | BIND ($$URI$$ as ?URI) 85 | OPTIONAL { 86 | $$URI$$ dbo:abstract ?abstract . 87 | FILTER (langMatches(lang(?abstract),"en")) 88 | } 89 | }`; 90 | return genericRequest(input, template); 91 | }; 92 | 93 | export const isDisambiguation = async input => { 94 | const template = `{ 95 | BIND ($$URI$$ as ?title) 96 | OPTIONAL { $$URI$$ dbo:wikiPageDisambiguates ?resource . } 97 | }`; 98 | const values = await genericRequest(input, template); 99 | const groups = _.group(values, _.getKey('title')); 100 | 101 | // if the dbo:wikiPageDisambiguates predicate returns at least one value 102 | // for the URI, then it's a disambiguation page. As the title binding 103 | // will always be found, we check for length > 1 104 | const disambiguations = _.mapValues(groups, isIterableLongerThan1); 105 | return disambiguations; 106 | }; 107 | 108 | export const getClasses = async ( 109 | input, 110 | { 111 | depth=Infinity, 112 | squash=true, 113 | fullURI=true 114 | } = {} 115 | ) => { 116 | 117 | const template = `{ 118 | BIND ($$URI$$ as ?title) 119 | OPTIONAL { $$URI$$ rdf:type ?type . } 120 | }`; 121 | const values = await genericRequest(input, template); 122 | const groups = _.group(values, _.getKey('title')); 123 | const types = _.mapValues(groups, group => _.map(group, _.getKey('type'))); 124 | const classFilter = await loadOntology(depth); 125 | const filteredTypes = _.mapValues( 126 | types, 127 | typeList => { 128 | const filtered = _.filter(typeList, t => t in classFilter); 129 | const squashed = squash 130 | ? filtered 131 | : _.map(filtered, key => _.getIn(classFilter, key)); 132 | const URIs = fullURI 133 | ? squashed 134 | : JSON.parse(stringify(squashed).replaceAll(dbo, '')); 135 | return URIs; 136 | } 137 | ); 138 | return filteredTypes; 139 | }; 140 | 141 | export const hasInfoBoxTemplate = async input => { 142 | const template = `{ 143 | BIND ($$URI$$ as ?URI) 144 | OPTIONAL { $$URI$$ dbp:wikiPageUsesTemplate ?template . 
} 145 | }`; 146 | const values = await genericRequest(input, template); 147 | const groups = _.group(values, _.getKey('URI')); 148 | const wikiTemplates = _.mapValues(groups, _.pluck('template')); 149 | const infobox = 'http://dbpedia.org/resource/Template:Infobox'; 150 | 151 | const results = _.mapValues( 152 | wikiTemplates, 153 | _.some(t => (t || '').startsWith(infobox)) 154 | ); 155 | return results; 156 | }; 157 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/entitiesCountAggsByConfidence/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "confidence_0_extended_stats": { 5 | "extended_stats": { 6 | "field": "dbpedia_entities_metadata.confidence_counts.0" 7 | } 8 | }, 9 | "confidence_10_extended_stats": { 10 | "extended_stats": { 11 | "field": "dbpedia_entities_metadata.confidence_counts.10" 12 | } 13 | }, 14 | "confidence_20_extended_stats": { 15 | "extended_stats": { 16 | "field": "dbpedia_entities_metadata.confidence_counts.20" 17 | } 18 | }, 19 | "confidence_30_extended_stats": { 20 | "extended_stats": { 21 | "field": "dbpedia_entities_metadata.confidence_counts.30" 22 | } 23 | }, 24 | "confidence_40_extended_stats": { 25 | "extended_stats": { 26 | "field": "dbpedia_entities_metadata.confidence_counts.40" 27 | } 28 | }, 29 | "confidence_50_extended_stats": { 30 | "extended_stats": { 31 | "field": "dbpedia_entities_metadata.confidence_counts.50" 32 | } 33 | }, 34 | "confidence_60_extended_stats": { 35 | "extended_stats": { 36 | "field": "dbpedia_entities_metadata.confidence_counts.60" 37 | } 38 | }, 39 | "confidence_70_extended_stats": { 40 | "extended_stats": { 41 | "field": "dbpedia_entities_metadata.confidence_counts.70" 42 | } 43 | }, 44 | "confidence_80_extended_stats": { 45 | "extended_stats": { 46 | "field": "dbpedia_entities_metadata.confidence_counts.80" 47 | } 48 | }, 49 | "confidence_90_extended_stats": { 50 | "extended_stats": { 51 | "field": "dbpedia_entities_metadata.confidence_counts.90" 52 | } 53 | }, 54 | "confidence_100_extended_stats": { 55 | "extended_stats": { 56 | "field": "dbpedia_entities_metadata.confidence_counts.100" 57 | } 58 | }, 59 | "confidence_0_histogram": { 60 | "histogram": { 61 | "field": "dbpedia_entities_metadata.confidence_counts.0", 62 | "interval": 1, 63 | "min_doc_count": 1 64 | } 65 | }, 66 | "confidence_10_histogram": { 67 | "histogram": { 68 | "field": "dbpedia_entities_metadata.confidence_counts.10", 69 | "interval": 1, 70 | "min_doc_count": 1 71 | } 72 | }, 73 | "confidence_20_histogram": { 74 | "histogram": { 75 | "field": "dbpedia_entities_metadata.confidence_counts.20", 76 | "interval": 1, 77 | "min_doc_count": 1 78 | } 79 | }, 80 | "confidence_30_histogram": { 81 | "histogram": { 82 | "field": "dbpedia_entities_metadata.confidence_counts.30", 83 | "interval": 1, 84 | "min_doc_count": 1 85 | } 86 | }, 87 | "confidence_40_histogram": { 88 | "histogram": { 89 | "field": "dbpedia_entities_metadata.confidence_counts.40", 90 | "interval": 1, 91 | "min_doc_count": 1 92 | } 93 | }, 94 | "confidence_50_histogram": { 95 | "histogram": { 96 | "field": "dbpedia_entities_metadata.confidence_counts.50", 97 | "interval": 1, 98 | "min_doc_count": 1 99 | } 100 | }, 101 | "confidence_60_histogram": { 102 | "histogram": { 103 | "field": "dbpedia_entities_metadata.confidence_counts.60", 104 | "interval": 1, 105 | "min_doc_count": 1 106 | } 107 | }, 108 | "confidence_70_histogram": { 109 | 
"histogram": { 110 | "field": "dbpedia_entities_metadata.confidence_counts.70", 111 | "interval": 1, 112 | "min_doc_count": 1 113 | } 114 | }, 115 | "confidence_80_histogram": { 116 | "histogram": { 117 | "field": "dbpedia_entities_metadata.confidence_counts.80", 118 | "interval": 1, 119 | "min_doc_count": 1 120 | } 121 | }, 122 | "confidence_90_histogram": { 123 | "histogram": { 124 | "field": "dbpedia_entities_metadata.confidence_counts.90", 125 | "interval": 1, 126 | "min_doc_count": 1 127 | } 128 | }, 129 | "confidence_100_histogram": { 130 | "histogram": { 131 | "field": "dbpedia_entities_metadata.confidence_counts.100", 132 | "interval": 1, 133 | "min_doc_count": 1 134 | } 135 | } 136 | } 137 | } -------------------------------------------------------------------------------- /bin/entitiesDataQuality.js: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env node 2 | 3 | import { saveObj } from '@svizzle/file'; 4 | import { mergeWithSum, getTruthyValuesKeys } from '@svizzle/utils'; 5 | import { Command } from 'commander'; 6 | import * as _ from 'lamb'; 7 | import mkdirp from 'mkdirp'; 8 | import { fetch } from 'undici'; 9 | 10 | import { getEntityDetails, isDisambiguation } from '../dbpedia/requests.mjs'; 11 | import { getEntities } from '../es/entities.mjs'; 12 | import { batchIterateFlatten } from '../util/array.mjs'; 13 | 14 | const program = new Command(); 15 | 16 | program.requiredOption( 17 | '-d, --domain ', 18 | 'ES domain on which the entities are stored', 19 | ); 20 | program.requiredOption( 21 | '-i, --index ', 22 | 'ES index on which the entities are stored' 23 | ); 24 | program.option( 25 | '-o, --output ', 26 | 'Output directory for the data quality results.', 27 | 'data' 28 | ) 29 | 30 | program.showHelpAfterError(); 31 | program.parse(); 32 | const options = program.opts(); 33 | 34 | await mkdirp(`${options.output}/outputs`) 35 | await mkdirp(`${options.output}/quality/entities`) 36 | 37 | const FILE_ENTITY_TITLES = `${options.output}/outputs/entity_titles.json`; 38 | const FILE_ENTITY_DETAILS = `${options.output}/outputs/entity_details.json`; 39 | const FILE_ENTITY_COUNTS = `${options.output}/quality/entities/entity_counts.json`; 40 | const FILE_MISSING_ABSTRACTS = `${options.output}/quality/entities/missing_abstracts.json`; 41 | const FILE_MISSING_DERIVED_FROM = `${options.output}/quality/entities/missing_derived_from.json`; 42 | const FILE_MISSING_THUMBNAIL = `${options.output}/quality/entities/missing_image.json`; 43 | const FILE_IMAGE_STATUS = `${options.output}/quality/entities/image_status.json`; 44 | const FILE_IMAGE_404s = `${options.output}/quality/entities/image_404s.json`; 45 | const FILE_IMAGE_EXTENSION_COUNTS = `${options.output}/quality/entities/image_extension_counts.json`; 46 | const FILE_DISAMBIGUATION_ENTITIES = `${options.output}/quality/entities/disambiguation_entities.json`; 47 | 48 | const save = (path, object) => saveObj(path, 4)(object); 49 | const addStats = (entities, all) => { 50 | const stats = { 51 | count: entities.length, 52 | proportion: entities.length / all.length 53 | }; 54 | return { 55 | stats, 56 | entities 57 | }; 58 | }; 59 | 60 | const main = async () => { 61 | 62 | // Get Titles for all entities annotated on the ai_map index 63 | console.log('[+] Getting Entity Titles'); 64 | const titles = await getEntities(options.index, options.domain); 65 | save(FILE_ENTITY_TITLES, titles); 66 | 67 | // Get details for all DBpedia entities using DBpedia SPARQL endpoint 68 | console.log('[+] 
Getting Entity Details'); 69 | const details = await batchIterateFlatten(titles, getEntityDetails); 70 | save(FILE_ENTITY_DETAILS, details); 71 | 72 | // Get the count statistics for the details 73 | console.log('[+] Calculating count statistics'); 74 | const counts = _.reduce(details, (acc, curr) => { 75 | const ones = _.mapValues(curr, _.always(1)); 76 | return mergeWithSum(acc, ones); 77 | }, {}); 78 | const normalisedCounts = _.mapValues(counts, count => count / details.length); 79 | save(FILE_ENTITY_COUNTS, normalisedCounts); 80 | 81 | // Get the count statistics for missing details 82 | console.log('[+] Calculating missing statistics'); 83 | const filterToTitles = predicate => 84 | _.map(_.filter(details, predicate), _.getKey('URI')); 85 | save( 86 | FILE_MISSING_ABSTRACTS, 87 | addStats(filterToTitles(d => !d.abstract), titles) 88 | ); 89 | save( 90 | FILE_MISSING_DERIVED_FROM, 91 | addStats(filterToTitles(d => !d.derivedFrom), titles) 92 | ); 93 | save( 94 | FILE_MISSING_THUMBNAIL, 95 | addStats(filterToTitles(d => !d.imageURL), titles) 96 | ); 97 | 98 | const imageURLs = _.map( 99 | _.filter(details, d => d.imageURL), 100 | d => new URL(d.imageURL) 101 | ); 102 | 103 | // Count image extensions 104 | console.log('[+] Counting image file types by extension'); 105 | const extensions = _.map(imageURLs, t => t.pathname.split('.').slice(-1)[0]); 106 | const extensionCounts = _.count(extensions, _.identity); 107 | save(FILE_IMAGE_EXTENSION_COUNTS, extensionCounts); 108 | 109 | // Get the image status by fetching using imageURL 110 | console.log('[+] Fetching images and saving response status'); 111 | const imageURLStatus = await batchIterateFlatten( 112 | imageURLs, 113 | async batch_ => { 114 | const responses = await Promise.allSettled( 115 | _.map(batch_, t => fetch(t)) 116 | ); 117 | return _.map( 118 | _.zip(batch_, responses), 119 | ([u, r]) => ({ url: u.href, status: r.status === 'fulfilled' ? r.value.status : r.status }) 120 | ); 121 | } 122 | ); 123 | 124 | const imageURLStatusCounts = _.count(imageURLStatus, _.getKey('status')); 125 | const notFounds = _.filter(imageURLStatus, r => r.status === 404); 126 | 127 | save(FILE_IMAGE_404s, addStats(_.map(notFounds, r => r.url), titles)); 128 | save(FILE_IMAGE_STATUS, imageURLStatusCounts); 129 | 130 | const disambiguationStatus = await batchIterateFlatten( 131 | titles, 132 | isDisambiguation, 133 | { concat: false} 134 | ); 135 | const flattened = _.reduce( 136 | disambiguationStatus, 137 | (acc, curr) => ({ ...acc, ...curr }) 138 | ); 139 | const disambiguations = getTruthyValuesKeys(flattened); 140 | save(FILE_DISAMBIGUATION_ENTITIES, addStats(disambiguations, details)); 141 | }; 142 | 143 | await main(); 144 | 145 | -------------------------------------------------------------------------------- /es/index.mjs: -------------------------------------------------------------------------------- 1 | import { stringify } from '@svizzle/utils'; 2 | import * as _ from 'lamb'; 3 | 4 | import { arxliveCopy } from '../conf/config.mjs'; 5 | import { buildRequest, makeRequest } from '../es/requests.mjs'; 6 | 7 | export const list = async domain => { 8 | const path = '_mappings'; 9 | const request = buildRequest(domain, path, 'GET'); 10 | const { body: response } = await makeRequest(request); 11 | return _.sort(_.keys(response)); 12 | }; 13 | 14 | /** 15 | * @function count 16 | * @description counts the number of documents for the specified domain and index. 17 | * @param {string} domain - the ElasticSearch domain. 18 | * @param {string} index - index on which to count. 
19 | * @param {Object} [options] 20 | * @param {boolean} [options.returnFullObject=false] - whether to return the full respose or just the count as a number. 21 | * @returns {Object|number} returns either the count of the number of documents or the full response for the API call. 22 | */ 23 | export const count = async ( 24 | domain, 25 | index, 26 | { returnFullObject = false } = {} 27 | ) => { 28 | const path = `${index}/_count`; 29 | const request = buildRequest(domain, path, 'GET'); 30 | const { body: response } = await makeRequest(request); 31 | if (returnFullObject) { 32 | return response; 33 | } 34 | return response.count; 35 | 36 | }; 37 | 38 | /** 39 | * @function createIndex 40 | * @description creates an index using the specified name and domain. 41 | * @param {string} name - name of index to create. 42 | * @param {string} domain - domain on which to create index. 43 | * @param {Object} [options] 44 | * @param {Object} [options.payload={}] - payload for request to index endpoint. 45 | * @returns {Object} response to the request 46 | */ 47 | export const createIndex = async ( 48 | domain, 49 | index, 50 | { payload = {} } = {} 51 | ) => { 52 | const path = index; 53 | const parsedPayload = typeof payload !== 'string' ? JSON.stringify(payload) : payload; 54 | const request = buildRequest(domain, path, 'PUT', { payload: parsedPayload }); 55 | const { body: response, code } = await makeRequest(request); 56 | if (code !== 200) { 57 | if (response.error.type === 'resource_already_exists_exception') { 58 | console.warn('Index already exists, so was not created'); 59 | } else { 60 | throw new Error(stringify(response)); 61 | } 62 | } 63 | return response; 64 | }; 65 | 66 | /** 67 | * @function deleteIndex 68 | * @description deletes an index using the specified name and domain. If no 69 | * index with specified name exists, function exits gracefully but 70 | * logs this to the user. 71 | * @param {string} name - name of index to delete. 72 | * @param {string} domain - domain on which to delete index. 73 | * @returns {Object} response to the request 74 | */ 75 | export const deleteIndex = async (domain, index) => { 76 | const path = index; 77 | const request = buildRequest(domain, path, 'DELETE'); 78 | const { code } = await makeRequest(request); 79 | if (code === 404) { 80 | console.log(`index '${index}' not found, so was not deleted`); 81 | } 82 | }; 83 | 84 | /** 85 | * @function reindex 86 | * @description copies data from source index to dest index on specified domain. 87 | * @param {string} source - name of source index from which to copy data. 88 | * @param {string} dest - name of destination index on which to copy data. 89 | * @param {string} domain - domain on which to perform reindex. 90 | * @param {Object} [options] 91 | * @param {Object} [options.payload={}] - payload for request to index endpoint. 92 | * @param {string} [options.pipeline=null] - name of the ingestion pipeline to include upon reindex. 93 | * @returns {Object} response to the request 94 | */ 95 | export const reindex = async ( 96 | source, 97 | dest, 98 | domain = arxliveCopy, 99 | { payload = {}, pipeline = null } = {} 100 | ) => { 101 | const path = '_reindex'; 102 | const parsedPayload = typeof payload === 'string' ? 
JSON.parse(payload) : payload;
103 | const expandedPayload = {
104 | ...parsedPayload,
105 | source: {
106 | index: source
107 | },
108 | dest: {
109 | index: dest,
110 | pipeline,
111 | }
112 | };
113 | const request = buildRequest(domain, path, 'POST', { payload: JSON.stringify(expandedPayload) });
114 | const { code, body: response } = await makeRequest(request);
115 | if (code !== 200) {
116 | throw new Error(
117 | `Reindex from ${source} to ${dest} failed. Response:\n${stringify(response)}`
118 | );
119 | }
120 | return response;
121 | };
122 | 
123 | /**
124 | * @function getMappings
125 | * @description gets the mappings for the specified index on the specified domain.
126 | * @param {string} domain - Domain from which to get mappings.
127 | * @param {string} index - Index from which to get mappings.
128 | * @returns {Object} the mappings.
129 | */
130 | export const getMappings = async (domain, index) => {
131 | const path = `${index}/_mappings`;
132 | const request = buildRequest(domain, path);
133 | const { body: response } = await makeRequest(request);
134 | return response;
135 | };
136 | 
137 | /**
138 | * @function updateMapping
139 | * @description updates the mapping on the specified domain and index.
140 | * @param {string} domain - domain on which to update the mappings.
141 | * @param {string} index - index on which to update the mappings.
142 | * @param {Object} [options]
143 | * @param {Object} [options.payload={}] - payload for request.
144 | * @returns {Object} response object.
145 | */
146 | export const updateMapping = async (
147 | domain,
148 | index,
149 | { payload } = {}
150 | ) => {
151 | const path = `${index}/_mappings`;
152 | const request = buildRequest(domain, path, 'PUT', { payload });
153 | const { body: response } = await makeRequest(request);
154 | return response;
155 | };
156 | 
--------------------------------------------------------------------------------
/bin/annotationsDataQuality/requests/flattenedConfidenceHistogram/README.md:
--------------------------------------------------------------------------------
1 | ## Flattened `confidence` Histogram
2 | 
3 | Aggregates all `confidence` values into a histogram, each bucket indicating one of
4 | the 10 possible `confidence` levels annotated. Flattened here denotes the fact
5 | that all annotated entities are treated as a flat list - no per-document
6 | analysis is performed.
7 | 
8 | Endpoint: `POST arxiv_v6/_search`
9 | 
10 | See:
11 | 
12 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-bucket-terms-aggregation.html
13 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/nested.html
14 | 
15 | ## Notes
16 | 
17 | We have decided not to use the `histogram` API here due to errors produced by
18 | rounding of floating point values. When using a histogram with
19 | interval 0.1, the values for the buckets turn out to be incorrect. In
20 | particular, there are no entities found with `confidence` 0.7, which is obviously
21 | wrong.
Instead, it seems like all entities tagged at `confidence` 0.7 are
22 | erroneously counted in the 0.6 bucket, meaning that bucket contains all entities
23 | for `confidence` 0.6 and 0.7:
24 | 
25 | Request:
26 | ```json
27 | {
28 | "size": 0,
29 | "aggs": {
30 | "dbpedia": {
31 | "nested": {
32 | "path": "dbpedia_entities"
33 | },
34 | "aggs": {
35 | "confidence": {
36 | "histogram": {
37 | "field": "dbpedia_entities.confidence",
38 | "interval": 0.1
39 | }
40 | }
41 | }
42 | }
43 | }
44 | }
45 | ```
46 | Truncated response:
47 | ```json
48 | ...
49 | "aggregations": {
50 | "dbpedia": {
51 | "doc_count": 75296846,
52 | "confidence": {
53 | "buckets": [
54 | {
55 | "key": 0.1,
56 | "doc_count": 411055
57 | },
58 | {
59 | "key": 0.2,
60 | "doc_count": 652848
61 | },
62 | {
63 | "key": 0.30000000000000004,
64 | "doc_count": 53424468
65 | },
66 | {
67 | "key": 0.4,
68 | "doc_count": 6007261
69 | },
70 | {
71 | "key": 0.5,
72 | "doc_count": 3751608
73 | },
74 | {
75 | "key": 0.6000000000000001,
76 | "doc_count": 3500601
77 | },
78 | {
79 | "key": 0.7000000000000001,
80 | "doc_count": 0
81 | },
82 | {
83 | "key": 0.8,
84 | "doc_count": 7549005
85 | }
86 | ]
87 | }
88 | }
89 | }
90 | ```
91 | The `terms` aggregation has difficulty creating buckets whose keys are of type
92 | float or double, due to floating point precision errors. As a result, the keys
93 | found in the `response.json` can look bizarre. In actual fact, the keys are
94 | indistinguishable (in the Java Runtime) due to the rounding errors. Example
95 | (using key for `confidence` bucket 0.4):
96 | 
97 | ```java
98 | class Main {
99 | public static void main(String args[]) {
100 | float example = 0.4000000059604645f;
101 | System.out.println(example); // 0.4
102 | }
103 | }
104 | ```
105 | 
106 | You can find a replit for the example
107 | [here](https://replit.com/@doogyb/Floating-Point-Precision-Errors#Main.java).
108 | 
109 | We've decided to document this behavior for now and move on. However, there
110 | exist two possible solutions to the problem. The first involves changing the
111 | schema so that `confidence` values are encoded as integers. The current values
112 | would be mapped using a factor of 10, so that entities tagged at confidence
113 | level 0.3 would have an integer `confidence` value of 3, those tagged at 0.7 an
114 | integer value of 7, and so on. The advantage of this approach is that we
115 | guarantee the correct term bucket keys due to no risk of floating point
116 | precision errors. However, we deviate from the accepted inputs of the Spotlight
117 | API, which only accepts values for `confidence` within the range 0 to 1.
118 | 
119 | The second solution is to use the `histogram` API, with interval set to 0.1 and
120 | an offset set to a value very slightly below zero. The following request is
121 | included for reference:
122 | 
123 | ```json
124 | {
125 | "size": 0,
126 | "aggs": {
127 | "dbpedia": {
128 | "nested": {
129 | "path": "dbpedia_entities"
130 | },
131 | "aggs": {
132 | "confidence": {
133 | "histogram": {
134 | "field": "dbpedia_entities.confidence",
135 | "interval": 0.1,
136 | "offset": -0.0000001
137 | }
138 | }
139 | }
140 | }
141 | }
142 | }
143 | ```
144 | 
145 | Truncated response:
146 | ```json
147 | ...
148 | "aggregations": { 149 | "dbpedia": { 150 | "doc_count": 75296846, 151 | "confidence": { 152 | "buckets": [ 153 | { 154 | "key": 0.0999999, 155 | "doc_count": 411055 156 | }, 157 | { 158 | "key": 0.1999999, 159 | "doc_count": 652848 160 | }, 161 | { 162 | "key": 0.29999990000000004, 163 | "doc_count": 53424468 164 | }, 165 | { 166 | "key": 0.3999999, 167 | "doc_count": 6007261 168 | }, 169 | { 170 | "key": 0.4999999, 171 | "doc_count": 3751608 172 | }, 173 | { 174 | "key": 0.5999999000000001, 175 | "doc_count": 1970197 176 | }, 177 | { 178 | "key": 0.6999999000000001, 179 | "doc_count": 1530404 180 | }, 181 | { 182 | "key": 0.7999999000000001, 183 | "doc_count": 1346700 184 | }, 185 | { 186 | "key": 0.8999999000000001, 187 | "doc_count": 6202305 188 | } 189 | ] 190 | } 191 | } 192 | } 193 | ``` 194 | 195 | However the values still do not strictly match up with the encoded confidence 196 | levels. This approach is however more in line with what is suggested according 197 | to this Github [issue](https://github.com/elastic/elasticsearch/issues/30529) 198 | for Elastic Search, due to the issue surrounding encoding floating point values 199 | accurately when using a base 2 system vs base 10 system. -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/flattenedURITermsByConfidence/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "dbpedia": { 5 | "nested": { 6 | "path": "dbpedia_entities" 7 | }, 8 | "aggs": { 9 | "top_URI_100": { 10 | "filter": { 11 | "term": { 12 | "dbpedia_entities.confidence": 100 13 | } 14 | }, 15 | "aggs": { 16 | "URIs_100": { 17 | "terms": { 18 | "field": "dbpedia_entities.URI", 19 | "size": 100 20 | } 21 | } 22 | } 23 | }, 24 | "top_URI_90": { 25 | "filter": { 26 | "term": { 27 | "dbpedia_entities.confidence": 90 28 | } 29 | }, 30 | "aggs": { 31 | "URI": { 32 | "terms": { 33 | "field": "dbpedia_entities.URI", 34 | "size": 100 35 | } 36 | } 37 | } 38 | }, 39 | "top_URI_80": { 40 | "filter": { 41 | "term": { 42 | "dbpedia_entities.confidence": 80 43 | } 44 | }, 45 | "aggs": { 46 | "URI": { 47 | "terms": { 48 | "field": "dbpedia_entities.URI", 49 | "size": 100 50 | } 51 | } 52 | } 53 | }, 54 | "top_URI_70": { 55 | "filter": { 56 | "term": { 57 | "dbpedia_entities.confidence": 70 58 | } 59 | }, 60 | "aggs": { 61 | "URI": { 62 | "terms": { 63 | "field": "dbpedia_entities.URI", 64 | "size": 100 65 | } 66 | } 67 | } 68 | }, 69 | "top_URI_60": { 70 | "filter": { 71 | "term": { 72 | "dbpedia_entities.confidence": 60 73 | } 74 | }, 75 | "aggs": { 76 | "URI": { 77 | "terms": { 78 | "field": "dbpedia_entities.URI", 79 | "size": 100 80 | } 81 | } 82 | } 83 | }, 84 | "top_URI_50": { 85 | "filter": { 86 | "term": { 87 | "dbpedia_entities.confidence": 50 88 | } 89 | }, 90 | "aggs": { 91 | "URI": { 92 | "terms": { 93 | "field": "dbpedia_entities.URI", 94 | "size": 100 95 | } 96 | } 97 | } 98 | }, 99 | "top_URI_40": { 100 | "filter": { 101 | "term": { 102 | "dbpedia_entities.confidence": 40 103 | } 104 | }, 105 | "aggs": { 106 | "URI": { 107 | "terms": { 108 | "field": "dbpedia_entities.URI", 109 | "size": 100 110 | } 111 | } 112 | } 113 | }, 114 | "top_URI_30": { 115 | "filter": { 116 | "term": { 117 | "dbpedia_entities.confidence": 30 118 | } 119 | }, 120 | "aggs": { 121 | "URI": { 122 | "terms": { 123 | "field": "dbpedia_entities.URI", 124 | "size": 100 125 | } 126 | } 127 | } 128 | }, 129 | "top_URI_20": { 130 | "filter": { 131 | 
"term": { 132 | "dbpedia_entities.confidence": 20 133 | } 134 | }, 135 | "aggs": { 136 | "URI": { 137 | "terms": { 138 | "field": "dbpedia_entities.URI", 139 | "size": 100 140 | } 141 | } 142 | } 143 | }, 144 | "top_URI_10": { 145 | "filter": { 146 | "term": { 147 | "dbpedia_entities.confidence": 10 148 | } 149 | }, 150 | "aggs": { 151 | "URI": { 152 | "terms": { 153 | "field": "dbpedia_entities.URI", 154 | "size": 100 155 | } 156 | } 157 | } 158 | }, 159 | "top_URI_0": { 160 | "filter": { 161 | "term": { 162 | "dbpedia_entities.confidence": 0 163 | } 164 | }, 165 | "aggs": { 166 | "URI": { 167 | "terms": { 168 | "field": "dbpedia_entities.URI", 169 | "size": 100 170 | } 171 | } 172 | } 173 | } 174 | } 175 | } 176 | } 177 | } -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/entitiesCountOverTokenCountByConfidence/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "entities_count_over_token_count_at_0_extended_stats": { 5 | "extended_stats": { 6 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.0'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.0'].value) / doc['textBody_abstract_article.token_count'].value;" 7 | } 8 | }, 9 | "entities_count_over_token_count_at_0_histogram": { 10 | "histogram": { 11 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.0'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.0'].value) / doc['textBody_abstract_article.token_count'].value;", 12 | "interval": 0.01, 13 | "min_doc_count": 1 14 | } 15 | }, 16 | "entities_count_over_token_count_at_10_extended_stats": { 17 | "extended_stats": { 18 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.10'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.10'].value) / doc['textBody_abstract_article.token_count'].value;" 19 | } 20 | }, 21 | "entities_count_over_token_count_at_10_histogram": { 22 | "histogram": { 23 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.10'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.10'].value) / doc['textBody_abstract_article.token_count'].value;", 24 | "interval": 0.01, 25 | "min_doc_count": 1 26 | } 27 | }, 28 | "entities_count_over_token_count_at_20_extended_stats": { 29 | "extended_stats": { 30 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.20'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.20'].value) / doc['textBody_abstract_article.token_count'].value;" 31 | } 32 | }, 33 | "entities_count_over_token_count_at_20_histogram": { 34 | "histogram": { 35 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.20'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.20'].value) / doc['textBody_abstract_article.token_count'].value;", 36 | "interval": 0.01, 37 | "min_doc_count": 1 38 | } 39 | }, 40 | 
"entities_count_over_token_count_at_30_extended_stats": { 41 | "extended_stats": { 42 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.30'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.30'].value) / doc['textBody_abstract_article.token_count'].value;" 43 | } 44 | }, 45 | "entities_count_over_token_count_at_30_histogram": { 46 | "histogram": { 47 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.30'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.30'].value) / doc['textBody_abstract_article.token_count'].value;", 48 | "interval": 0.01, 49 | "min_doc_count": 1 50 | } 51 | }, 52 | "entities_count_over_token_count_at_40_extended_stats": { 53 | "extended_stats": { 54 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.40'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.40'].value) / doc['textBody_abstract_article.token_count'].value;" 55 | } 56 | }, 57 | "entities_count_over_token_count_at_40_histogram": { 58 | "histogram": { 59 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.40'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.40'].value) / doc['textBody_abstract_article.token_count'].value;", 60 | "interval": 0.01, 61 | "min_doc_count": 1 62 | } 63 | }, 64 | "entities_count_over_token_count_at_50_extended_stats": { 65 | "extended_stats": { 66 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.50'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.50'].value) / doc['textBody_abstract_article.token_count'].value;" 67 | } 68 | }, 69 | "entities_count_over_token_count_at_50_histogram": { 70 | "histogram": { 71 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.50'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.50'].value) / doc['textBody_abstract_article.token_count'].value;", 72 | "interval": 0.01, 73 | "min_doc_count": 1 74 | } 75 | }, 76 | "entities_count_over_token_count_at_60_extended_stats": { 77 | "extended_stats": { 78 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.60'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.60'].value) / doc['textBody_abstract_article.token_count'].value;" 79 | } 80 | }, 81 | "entities_count_over_token_count_at_60_histogram": { 82 | "histogram": { 83 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.60'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.60'].value) / doc['textBody_abstract_article.token_count'].value;", 84 | "interval": 0.01, 85 | "min_doc_count": 1 86 | } 87 | }, 88 | "entities_count_over_token_count_at_70_extended_stats": { 89 | "extended_stats": { 90 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.70'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 
0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.70'].value) / doc['textBody_abstract_article.token_count'].value;" 91 | } 92 | }, 93 | "entities_count_over_token_count_at_70_histogram": { 94 | "histogram": { 95 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.70'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.70'].value) / doc['textBody_abstract_article.token_count'].value;", 96 | "interval": 0.01, 97 | "min_doc_count": 1 98 | } 99 | }, 100 | "entities_count_over_token_count_at_80_extended_stats": { 101 | "extended_stats": { 102 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.80'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.80'].value) / doc['textBody_abstract_article.token_count'].value;" 103 | } 104 | }, 105 | "entities_count_over_token_count_at_80_histogram": { 106 | "histogram": { 107 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.80'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.80'].value) / doc['textBody_abstract_article.token_count'].value;", 108 | "interval": 0.01, 109 | "min_doc_count": 1 110 | } 111 | }, 112 | "entities_count_over_token_count_at_90_extended_stats": { 113 | "extended_stats": { 114 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.90'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.90'].value) / doc['textBody_abstract_article.token_count'].value;" 115 | } 116 | }, 117 | "entities_count_over_token_count_at_90_histogram": { 118 | "histogram": { 119 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.90'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.90'].value) / doc['textBody_abstract_article.token_count'].value;", 120 | "interval": 0.01, 121 | "min_doc_count": 1 122 | } 123 | }, 124 | "entities_count_over_token_count_at_100_extended_stats": { 125 | "extended_stats": { 126 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.100'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.100'].value) / doc['textBody_abstract_article.token_count'].value;" 127 | } 128 | }, 129 | "entities_count_over_token_count_at_100_histogram": { 130 | "histogram": { 131 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.100'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.100'].value) / doc['textBody_abstract_article.token_count'].value;", 132 | "interval": 0.01, 133 | "min_doc_count": 1 134 | } 135 | } 136 | } 137 | } -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/entitiesCountAggsByConfidenceOverEntitiesCount/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "confidence_0_normalised_with_entities_count_extended_stats": { 5 | "extended_stats": { 6 | "script": 
"if (doc['dbpedia_entities_metadata.confidence_counts.0'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.0'].value) / doc['dbpedia_entities_metadata.entities_count'].value;" 7 | } 8 | }, 9 | "confidence_0_normalised_with_entities_count_histogram": { 10 | "histogram": { 11 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.0'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.0'].value) / doc['dbpedia_entities_metadata.entities_count'].value;", 12 | "interval": 0.01, 13 | "min_doc_count": 1 14 | } 15 | }, 16 | "confidence_10_normalised_with_entities_count_extended_stats": { 17 | "extended_stats": { 18 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.10'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.10'].value) / doc['dbpedia_entities_metadata.entities_count'].value;" 19 | } 20 | }, 21 | "confidence_10_normalised_with_entities_count_histogram": { 22 | "histogram": { 23 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.10'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.10'].value) / doc['dbpedia_entities_metadata.entities_count'].value;", 24 | "interval": 0.01, 25 | "min_doc_count": 1 26 | } 27 | }, 28 | "confidence_20_normalised_with_entities_count_extended_stats": { 29 | "extended_stats": { 30 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.20'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.20'].value) / doc['dbpedia_entities_metadata.entities_count'].value;" 31 | } 32 | }, 33 | "confidence_20_normalised_with_entities_count_histogram": { 34 | "histogram": { 35 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.20'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.20'].value) / doc['dbpedia_entities_metadata.entities_count'].value;", 36 | "interval": 0.01, 37 | "min_doc_count": 1 38 | } 39 | }, 40 | "confidence_30_normalised_with_entities_count_extended_stats": { 41 | "extended_stats": { 42 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.30'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.30'].value) / doc['dbpedia_entities_metadata.entities_count'].value;" 43 | } 44 | }, 45 | "confidence_30_normalised_with_entities_count_histogram": { 46 | "histogram": { 47 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.30'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.30'].value) / doc['dbpedia_entities_metadata.entities_count'].value;", 48 | "interval": 0.01, 49 | "min_doc_count": 1 50 | } 51 | }, 52 | "confidence_40_normalised_with_entities_count_extended_stats": { 53 | "extended_stats": { 54 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.40'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() 
== 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.40'].value) / doc['dbpedia_entities_metadata.entities_count'].value;" 55 | } 56 | }, 57 | "confidence_40_normalised_with_entities_count_histogram": { 58 | "histogram": { 59 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.40'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.40'].value) / doc['dbpedia_entities_metadata.entities_count'].value;", 60 | "interval": 0.01, 61 | "min_doc_count": 1 62 | } 63 | }, 64 | "confidence_50_normalised_with_entities_count_extended_stats": { 65 | "extended_stats": { 66 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.50'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.50'].value) / doc['dbpedia_entities_metadata.entities_count'].value;" 67 | } 68 | }, 69 | "confidence_50_normalised_with_entities_count_histogram": { 70 | "histogram": { 71 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.50'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.50'].value) / doc['dbpedia_entities_metadata.entities_count'].value;", 72 | "interval": 0.01, 73 | "min_doc_count": 1 74 | } 75 | }, 76 | "confidence_60_normalised_with_entities_count_extended_stats": { 77 | "extended_stats": { 78 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.60'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.60'].value) / doc['dbpedia_entities_metadata.entities_count'].value;" 79 | } 80 | }, 81 | "confidence_60_normalised_with_entities_count_histogram": { 82 | "histogram": { 83 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.60'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.60'].value) / doc['dbpedia_entities_metadata.entities_count'].value;", 84 | "interval": 0.01, 85 | "min_doc_count": 1 86 | } 87 | }, 88 | "confidence_70_normalised_with_entities_count_extended_stats": { 89 | "extended_stats": { 90 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.70'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.70'].value) / doc['dbpedia_entities_metadata.entities_count'].value;" 91 | } 92 | }, 93 | "confidence_70_normalised_with_entities_count_histogram": { 94 | "histogram": { 95 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.70'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.70'].value) / doc['dbpedia_entities_metadata.entities_count'].value;", 96 | "interval": 0.01, 97 | "min_doc_count": 1 98 | } 99 | }, 100 | "confidence_80_normalised_with_entities_count_extended_stats": { 101 | "extended_stats": { 102 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.80'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.80'].value) / 
doc['dbpedia_entities_metadata.entities_count'].value;" 103 | } 104 | }, 105 | "confidence_80_normalised_with_entities_count_histogram": { 106 | "histogram": { 107 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.80'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.80'].value) / doc['dbpedia_entities_metadata.entities_count'].value;", 108 | "interval": 0.01, 109 | "min_doc_count": 1 110 | } 111 | }, 112 | "confidence_90_normalised_with_entities_count_extended_stats": { 113 | "extended_stats": { 114 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.90'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.90'].value) / doc['dbpedia_entities_metadata.entities_count'].value;" 115 | } 116 | }, 117 | "confidence_90_normalised_with_entities_count_histogram": { 118 | "histogram": { 119 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.90'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.90'].value) / doc['dbpedia_entities_metadata.entities_count'].value;", 120 | "interval": 0.01, 121 | "min_doc_count": 1 122 | } 123 | }, 124 | "confidence_100_normalised_with_entities_count_extended_stats": { 125 | "extended_stats": { 126 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.100'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.100'].value) / doc['dbpedia_entities_metadata.entities_count'].value;" 127 | } 128 | }, 129 | "confidence_100_normalised_with_entities_count_histogram": { 130 | "histogram": { 131 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.100'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.100'].value) / doc['dbpedia_entities_metadata.entities_count'].value;", 132 | "interval": 0.01, 133 | "min_doc_count": 1 134 | } 135 | } 136 | } 137 | } -------------------------------------------------------------------------------- /aws/s3.mjs: -------------------------------------------------------------------------------- 1 | import { 2 | S3Client, 3 | GetObjectCommand, 4 | GetObjectAttributesCommand, 5 | CreateMultipartUploadCommand, 6 | UploadPartCommand, 7 | CompleteMultipartUploadCommand, 8 | HeadBucketCommand, 9 | HeadObjectCommand, 10 | GetBucketAclCommand, 11 | PutObjectCommand 12 | } from '@aws-sdk/client-s3'; 13 | import { defaultProvider } from "@aws-sdk/credential-provider-node"; 14 | import * as cliProgress from 'cli-progress'; 15 | 16 | import * as _ from 'lamb'; 17 | 18 | import { bulkRequest } from '../es/bulk.mjs'; 19 | import { scroll } from '../es/search.mjs'; 20 | import { count, createIndex } from '../es/index.mjs'; 21 | 22 | 23 | // https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html 24 | export const MIN_PART_SIZE = 5242880; 25 | 26 | const config = { 27 | credentials: defaultProvider(), 28 | region: 'eu-west-2', 29 | }; 30 | const client = new S3Client(config); 31 | 32 | const parseMost = (chunk, type) => { 33 | const [ start, end ] = type === 'object' ? 
['{', '}'] : ['[', ']'];
34 | for (let i = chunk.length - 1; i >= 0; i--) {
35 | if (chunk[i] === ',' || chunk[i] === end) {
36 | const test = `${start}${_.slice(chunk, 0, i).join('')}${end}`;
37 | try {
38 | const documents = JSON.parse(test);
39 | const leftover = _.slice(chunk, i+1, chunk.length).join('');
40 | return { documents, leftover };
41 | } catch {}
42 | }
43 | }
44 | return { documents: null, leftover: chunk };
45 | };
46 | 
47 | const getObject = (bucket, key, { start=0, end=-1 }={}) => {
48 | return new Promise(async (resolve, error) => {
49 | const get = new GetObjectCommand({
50 | Bucket: bucket,
51 | Key: key,
52 | Range: `bytes=${start}-${end}`
53 | });
54 | const { Body, ContentLength } = await client.send(get);
55 | const finished = end === -1 || ContentLength < end - start;
56 | const data = [];
57 | Body.on('error', err => error(err));
58 | Body.on('data', chunk => data.push(chunk));
59 | Body.on('end', () => resolve({data: data.join(''), finished}));
60 | });
61 | };
62 | 
63 | const getObjectAttributes = async(
64 | bucket,
65 | key,
66 | attributeList=['ETag', 'Checksum', 'ObjectParts', 'StorageClass', 'ObjectSize']
67 | ) => {
68 | const get = new GetObjectAttributesCommand({
69 | Bucket: bucket,
70 | Key: key,
71 | ObjectAttributes: attributeList
72 | });
73 | const attributes = await client.send(get);
74 | return attributes;
75 | };
76 | 
77 | async function *stream(
78 | bucket,
79 | key,
80 | type,
81 | { increment=512_000 }={}
82 | ) {
83 | let current = 0;
84 | let chunk, finished;
85 | let data = '';
86 | 
87 | // check at very beginning that types match up
88 | const { data: first } = await getObject(
89 | bucket, key, { start: 0, end: 0 }
90 | );
91 | if (
92 | first === '{' && type !== 'object' ||
93 | first === '[' && type !== 'array' ||
94 | first !== '{' && first !== '[') {
95 | throw new Error(
96 | `Type error. 
Are you sure the bucket object\'s type is correct?` 97 | ); 98 | } 99 | 100 | const { ObjectSize: size } = await getObjectAttributes(bucket, key); 101 | const bar = new cliProgress.SingleBar(cliProgress.Presets.shades_classic); 102 | bar.start(size, 0); 103 | do { 104 | 105 | // always omit first byte, as we know it's either '{' or '[' 106 | // eslint-disable-next-line no-await-in-loop 107 | ({ data: chunk, finished } = await getObject( 108 | bucket, key, { start: current+1, end: current + increment } 109 | )); 110 | data += chunk; 111 | const { documents, leftover } = parseMost(data, type); 112 | if (documents) { 113 | yield documents; 114 | data = leftover; 115 | } 116 | current += increment; 117 | bar.update(current); 118 | } while (!finished); 119 | bar.update(size); 120 | bar.stop(); 121 | } 122 | 123 | export const streamObject = ( 124 | bucket, 125 | key, 126 | { increment=64_000 }={} 127 | ) => stream(bucket, key, 'object', { increment }); 128 | 129 | export const streamArray = ( 130 | bucket, 131 | key, 132 | { increment=64_000 }={} 133 | ) => stream(bucket, key, 'array', { increment }); 134 | 135 | export const initialiseMultiPartUpload = async (bucket, key) => { 136 | const create = new CreateMultipartUploadCommand({ 137 | Bucket: bucket, 138 | Key: key 139 | }); 140 | const { UploadId: uploadId } = await client.send(create); 141 | return uploadId; 142 | }; 143 | 144 | export const uploadPart = async ( 145 | data, 146 | bucket, 147 | key, 148 | uploadId, 149 | partNumber 150 | ) => { 151 | const upload = new UploadPartCommand({ 152 | Body: data, 153 | Bucket: bucket, 154 | Key: key, 155 | UploadId: uploadId, 156 | PartNumber: partNumber 157 | }); 158 | 159 | const { ETag } = await client.send(upload); 160 | return ETag; 161 | }; 162 | 163 | export const completeMultiPartUpload = async ( 164 | bucket, 165 | key, 166 | parts, 167 | uploadId 168 | ) => { 169 | const complete = new CompleteMultipartUploadCommand({ 170 | Bucket: bucket, 171 | Key: key, 172 | MultipartUpload: { Parts: parts }, 173 | UploadId: uploadId 174 | }); 175 | const completeResponse = await client.send(complete); 176 | return completeResponse; 177 | }; 178 | 179 | 180 | export const bucketToIndex = async ( 181 | index, 182 | domain, 183 | bucket, 184 | key, 185 | { 186 | idField=null, 187 | format='array', 188 | chunkSize=8_388_608, // 8MB, 189 | refresh=false 190 | }={} 191 | ) => { 192 | 193 | let count_ = 0; 194 | const method = idField ? 
'create' : 'index'; 195 | const formatObject = _.pipe([ 196 | _.pairs, 197 | _.mapWith(([k, value]) => ({ _id: k, data: { value } })) 198 | ]); 199 | const formatArray = _.mapWith( 200 | ({ [idField]: id, ...rest }) => ({ 201 | ...id && {_id: id}, 202 | data: rest 203 | }) 204 | ); 205 | const funcs = { 206 | object: [streamObject, formatObject], 207 | array: [streamArray, formatArray] 208 | }; 209 | 210 | await createIndex(domain, index); 211 | const [stream_, formatter] = funcs[format]; 212 | const streamer = stream_( 213 | bucket, 214 | key, 215 | { increment: chunkSize } 216 | ); 217 | 218 | for await (let docs of streamer) { 219 | const bulkFormat = formatter(docs); 220 | await bulkRequest( 221 | domain, 222 | index, 223 | bulkFormat, 224 | method, 225 | { refresh } 226 | ); 227 | count_ += docs.length; 228 | } 229 | return count_; 230 | }; 231 | 232 | /* Index to Bucket Specific Functions */ 233 | 234 | const separate = (start, stop, data, page, total) => { 235 | let raw = JSON.stringify(data).slice(1, -1); 236 | if (page === 1) { 237 | raw = `${start}${raw}`; 238 | } 239 | if (page === total) { 240 | raw = `${raw}${stop}`; 241 | } else { 242 | raw = `${raw},`; 243 | } 244 | return raw; 245 | }; 246 | 247 | const arrayFormatter = (data, page, total) => { 248 | return separate('[', ']', data, page, total); 249 | }; 250 | 251 | const objectFormatter = (data, page, total, { key=null }={}) => { 252 | 253 | const getter = key ? _.getPath(key) : _.identity; 254 | const documents = _.reduce( 255 | data, 256 | (acc, doc) => { 257 | acc[doc.id] = getter(doc); 258 | return acc; 259 | }, 260 | {} 261 | ); 262 | return separate('{', '}', documents, page, total); 263 | }; 264 | 265 | const entitiesFormatter = (data, page, total) => 266 | objectFormatter(data, page, total, { key: 'dbpedia_entities' }); 267 | 268 | const extractSource = _.mapWith(doc => ({ id: doc._id, ...doc._source })); 269 | 270 | const extractURIandConfidence = _.mapWith( 271 | doc => { 272 | doc.dbpedia_entities = _.map( 273 | doc.dbpedia_entities || [], 274 | entity => ({ URI: entity.URI, confidence: entity.confidence }) 275 | ); 276 | return doc; 277 | } 278 | ); 279 | 280 | const filterByConfidence = threshold => _.mapWith( 281 | doc => { 282 | if (doc._source.dbpedia_entities) { 283 | doc._source.dbpedia_entities = _.filter( 284 | doc._source.dbpedia_entities || [], 285 | entity => entity.confidence > threshold 286 | ); 287 | } 288 | return doc; 289 | } 290 | ); 291 | 292 | export const indexToBucket = async( 293 | index, 294 | domain, 295 | bucket, 296 | key, 297 | { 298 | threshold=0, 299 | pages='all', 300 | pageSize=10000, 301 | format='array', 302 | processor='default' 303 | }={} 304 | ) => { 305 | 306 | const formats = { 307 | array: arrayFormatter, 308 | object: objectFormatter, 309 | entities: entitiesFormatter 310 | }; 311 | 312 | const processors = { 313 | es: _.identity, 314 | default: extractSource, 315 | simple: _.pipe([extractSource, extractURIandConfidence]) 316 | }; 317 | 318 | const filter = filterByConfidence(threshold); 319 | const processor_ = processors[processor]; 320 | const etl = _.pipe([filter, processor_]); 321 | const formatter = formats[format]; 322 | 323 | const scroller = scroll(domain, index, { 324 | pages, 325 | size: pageSize, 326 | }); 327 | 328 | const totalDocuments = await count(domain, index); 329 | const totalWork = pages === 'all' 330 | ? 
totalDocuments
331 | : pages * pageSize;
332 | 
333 | const pagesNeeded = Math.floor(totalDocuments / pageSize) + 1;
334 | const pages_ = pages === 'all'
335 | ? pagesNeeded
336 | : Math.min(pagesNeeded, pages);
337 | 
338 | const bar = new cliProgress.SingleBar(
339 | cliProgress.Presets.shades_classic
340 | );
341 | 
342 | const uploadId = await initialiseMultiPartUpload(bucket, key);
343 | bar.start(totalWork, 0);
344 | 
345 | let partNumber = 1;
346 | let currentPage = 1;
347 | let parts = [];
348 | let chunk = '';
349 | 
350 | for await (let page of scroller) {
351 | 
352 | const data = etl(page.hits.hits);
353 | const raw = formatter(data, currentPage, pages_);
354 | chunk += raw;
355 | 
356 | // check if the chunk is large enough to upload as a part to s3
357 | if (Buffer.byteLength(chunk) >= MIN_PART_SIZE) {
358 | const ETag = await uploadPart(
359 | chunk, bucket, key, uploadId, partNumber
360 | );
361 | parts.push({ PartNumber: partNumber, ETag });
362 | partNumber++;
363 | chunk = '';
364 | }
365 | bar.increment(page.hits.hits.length);
366 | currentPage++;
367 | }
368 | 
369 | // if chunk has not been reset on last iteration, there's still one last
370 | // upload to perform
371 | if (chunk.length) {
372 | const ETag = await uploadPart(
373 | chunk, bucket, key, uploadId, partNumber
374 | );
375 | parts.push({ PartNumber: partNumber, ETag });
376 | partNumber++;
377 | }
378 | await completeMultiPartUpload(bucket, key, parts, uploadId);
379 | bar.stop();
380 | };
381 | 
382 | export const headBucket = async bucket => {
383 | const command = new HeadBucketCommand({ Bucket: bucket });
384 | const response = await client.send(command);
385 | return response;
386 | };
387 | 
388 | export const headObject = async (bucket, key) => {
389 | const command = new HeadObjectCommand({ Key: key, Bucket: bucket });
390 | const response = await client.send(command);
391 | return response;
392 | };
393 | 
394 | export const bucketACL = async bucket => {
395 | const command = new GetBucketAclCommand({ Bucket: bucket });
396 | const response = await client.send(command);
397 | return response;
398 | };
399 | 
400 | export const putObject = async (bucket, key, data) => {
401 | const command = new PutObjectCommand({
402 | Bucket: bucket,
403 | Key: key,
404 | Body: data
405 | });
406 | const response = await client.send(command);
407 | return response;
408 | };
409 | 
--------------------------------------------------------------------------------
/dbpedia/spotlight.mjs:
--------------------------------------------------------------------------------
1 | import * as cliProgress from 'cli-progress';
2 | import * as _ from 'lamb';
3 | 
4 | import { getLength, mergeWithMerge, stringify } from '@svizzle/utils';
5 | import { fetch } from 'undici';
6 | 
7 | import { defaultMapping, metaDataMapping } from '../conf/mappings.mjs';
8 | import { count, updateMapping } from '../es/index.mjs';
9 | import { update } from '../es/update.mjs';
10 | import { scroll, clearScroll } from '../es/search.mjs';
11 | import { bulkRequest } from '../es/bulk.mjs';
12 | import { batch } from '../util/array.mjs';
13 | import { logger } from '../logging/logging.mjs';
14 | import { promisesHandler } from '../util/promises.mjs';
15 | import { spotlightEndpoint, confidenceValues } from '../conf/config.mjs';
16 | 
17 | /**
18 | * The resource object that the spotlight tool responds with. Each resource corresponds to a DBpedia URI.
19 | * @typedef SpotlightResource
20 | * @type {Object}
21 | * @property {string} @URI - the Unique Resource Identifier for this resource.
22 | * @property {number} @support - the support for the annotated resource (see {@link SpotlightAnnotation})
23 | * @property {string} @types - the types the resource belongs to in the ontology.
24 | * @property {string} @surfaceForm - the original string used to produce this resource.
25 | * @property {number} @offset - the index at which the surface form was found in the provided text.
26 | * @property {number} @similarityScore - cosine similarity between the context vectors and the context surrounding the surface form.
27 | * @property {number} @percentageOfSecondRank - the relative difference in topic score between the first and the second ranked resource.
28 | */
29 | 
30 | /**
31 | * The annotation response. This object is the response to a call made when annotating a piece of text.
32 | * @typedef SpotlightAnnotation
33 | * @type {Object}
34 | * @property {string} text - text to be annotated.
35 | * @property {string} confidence - confidence score for disambiguation / linking.
36 | * @property {number} support - how prominent this entity is in the Lucene Model, i.e. number of inlinks in Wikipedia
37 | * @property {string} types - types filter (e.g. DBpedia:Place).
38 | * @property {string} sparql - SPARQL filtering
39 | * @property {string} policy - (whitelist) select all entities that have the same type; (blacklist) select all entities that do not have the same type.
40 | * @property {SpotlightResource[]} Resources - the resources found for the supplied text.
41 | */
42 | 
43 | /**
44 | * @function castAnnotation
45 | * @description the Spotlight API returns the annotations with certain values cast as strings.
46 | * This function recasts the values back to their appropriate types.
47 | * @param {SpotlightAnnotation} annotation
48 | * @returns the Spotlight annotation, correctly parsed and cast
49 | */
50 | const castAnnotation = annotation => {
51 | 
52 | // FIXME: Use mapping to determine which types to cast
53 | const Resources = annotation.Resources
54 | ? annotation.Resources.map(r => {
55 | return {
56 | ...r,
57 | '@support': parseInt(r['@support'], 10),
58 | '@offset': parseFloat(r['@offset'], 10),
59 | '@similarityScore': parseFloat(r['@similarityScore'], 10),
60 | '@percentageOfSecondRank': parseFloat(
61 | r['@percentageOfSecondRank'], 10
62 | ),
63 | };
64 | })
65 | : null;
66 | return {
67 | ...annotation,
68 | '@confidence': parseInt(100 * parseFloat(annotation['@confidence']), 10),
69 | '@support': parseInt(annotation['@support'], 10),
70 | Resources,
71 | };
72 | };
73 | 
74 | /**
75 | * @function annotate
76 | * @description Returns an annotation object for the specified inputs.
77 | * @param {string} text - Text to annotate
78 | * @param {float} confidence - Confidence with which to annotate
79 | * @param {Object} [options] - Options object for the annotation process
80 | * @param {string} [options.endpoint] - Endpoint url where the Spotlight process runs. Defaults to the Docker container running on Nesta's EC2 instance.
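 * @example
 * // minimal usage sketch - the text, confidence value and resource shown are illustrative only
 * const annotation = await annotate('Berlin is the capital of Germany.', 0.5);
 * // annotation['@confidence'] -> 50 (castAnnotation rescales the confidence to an integer)
 * // annotation.Resources[0]['@URI'] -> e.g. 'http://dbpedia.org/resource/Berlin'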
81 | * @returns {SpotlightAnnotation} Spotlight annotation for given input parameters
82 | */
83 | export const annotate = async (
84 | text,
85 | confidence,
86 | { endpoint = spotlightEndpoint } = {}
87 | ) => {
88 | const url = new URL(endpoint);
89 | const body = `text=${encodeURIComponent(text)}&confidence=${confidence}`;
90 | const response = await fetch(url, {
91 | method: 'POST',
92 | headers: {
93 | Accept: 'application/json',
94 | 'content-type': 'application/x-www-form-urlencoded',
95 | },
96 | body,
97 | });
98 | if (!response.ok) {
99 | throw new Error(`Annotation failed\nResponse: ${stringify(response)}`);
100 | }
101 | const annotation = await response.json();
102 | return castAnnotation(annotation);
103 | };
104 | 
105 | /**
106 | * For our purposes, we simplify the {@link SpotlightResource} object.
107 | * All properties below are the same as on {@link SpotlightResource}.
108 | * @typedef ReducedResource
109 | * @type {Object}
110 | * @property {string} URI - the Unique Resource Identifier for this resource.
111 | * @property {string} surfaceForm - the original string used to produce this resource.
112 | * @property {number} similarityScore - cosine similarity between the context vectors and the context surrounding the surface form.
113 | * @property {number} percentageOfSecondRank - the relative difference in topic score between the first and the second ranked resource.
114 | */
115 | 
116 | /**
117 | * 
118 | * @typedef ParsedAnnotation
119 | * @type {Object}
120 | * @property {ReducedResource[]} results - an array of simplified results.
121 | * @property {number} confidence - the confidence at which these results were annotated.
122 | */
123 | 
124 | /**
125 | * @function parseAnnotationResults
126 | * @description this function takes a single {@link SpotlightAnnotation} and simplifies its resources.
127 | * @param {SpotlightAnnotation} spotlightAnnotation - an object returned by {@link annotate}
128 | * @returns {Object[]} a flat array of simplified results, each carrying the annotation's confidence.
129 | */
130 | export const parseAnnotationResults = spotlightAnnotation =>
131 | spotlightAnnotation.Resources
132 | ? _.flatMap(spotlightAnnotation.Resources, result => ({
133 | confidence: spotlightAnnotation['@confidence'],
134 | URI: result['@URI'],
135 | surfaceForm: result['@surfaceForm'],
136 | similarityScore: result['@similarityScore'],
137 | percentageOfSecondRank: result['@percentageOfSecondRank'],
138 | }))
139 | : [];
140 | 
141 | /**
142 | * The final form of resource, this is the same as {@link ReducedResource}, however
143 | * the confidence property has been added to the object's values.
144 | * All properties below are the same as on {@link ReducedResource}.
145 | * @typedef DBpediaEntity
146 | * @type {Object}
147 | * @property {string} URI - the Unique Resource Identifier for this resource.
148 | * @property {string} surfaceForm - the original string used to produce this resource.
149 | * @property {number} similarityScore - cosine similarity between the context vectors and the context surrounding the surface form.
150 | * @property {number} percentageOfSecondRank - the relative difference in topic score between the first and the second ranked resource.
151 | * @property {number} confidence - the highest confidence at which this resource could be found. This means that all annotations performed at a lower confidence than the one given here will also produce this annotation.
152 | */
153 | 
154 | /**
155 | * @function reduceAnnotationResults
156 | * @description reduces the flat list of annotation results, keeping the highest-confidence entry per URI and counting duplicate annotations at the tracked confidence levels.
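 * @example
 * // illustrative input/output sketch (other resource properties omitted):
 * // reduceAnnotationResults([
 * //   { URI: 'http://dbpedia.org/resource/Physics', confidence: 10 },
 * //   { URI: 'http://dbpedia.org/resource/Physics', confidence: 60 }
 * // ])
 * // => [{ URI: 'http://dbpedia.org/resource/Physics', confidence: 60, duplicates_10: 1, duplicates_60: 1 }]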
157 | * @param {Object[]} spotlightTerms - a flat array of {@link ReducedResource} objects, each tagged with the confidence at which it was produced.
158 | * @returns {DBpediaEntity[]} - a list of annotated entities.
159 | */
160 | export const reduceAnnotationResults = spotlightTerms => {
161 | 
162 | const reduceTerms = _.mapValuesWith(
163 | _.reduceWith(
164 | (acc, curr) => curr.confidence > acc.confidence ? curr : acc,
165 | )
166 | );
167 | 
168 | const countDuplicatesOf = confidence => _.mapValuesWith(
169 | _.pipe([
170 | _.filterWith(_.hasKeyValue('confidence', confidence)),
171 | getLength,
172 | value => ({ [`duplicates_${confidence}`]: value })
173 | ]),
174 | );
175 | 
176 | const reduceAndCountDuplicates = confidences => _.pipe([
177 | _.groupBy(_.getKey('URI')),
178 | _.collect([
179 | reduceTerms,
180 | ..._.map(confidences, countDuplicatesOf)
181 | ]),
182 | _.reduceWith(mergeWithMerge),
183 | _.values
184 | ]);
185 | 
186 | const reduceAndCountDuplicatesOf = reduceAndCountDuplicates([10, 60]);
187 | const finalResults = reduceAndCountDuplicatesOf(spotlightTerms);
188 | return finalResults;
189 | };
190 | 
191 | export const generateMetaData = (reducedTerms, spotlightResults) => {
192 | 
193 | const metaReducer = (prev, curr) => {
194 | return {
195 | entities_count: prev.entities_count + 1,
196 | confidence_avg: prev.confidence_avg + curr.confidence,
197 | confidence_max: curr.confidence > prev.confidence_max ? curr.confidence : prev.confidence_max,
198 | confidence_min: curr.confidence < prev.confidence_min ? curr.confidence : prev.confidence_min,
199 | dupes_10_count: prev.dupes_10_count + (curr.duplicates_10 > 1 ? 1 : 0),
200 | dupes_60_count: prev.dupes_60_count + (curr.duplicates_60 > 1 ? 1 : 0),
201 | confidence_counts: {
202 | ...prev.confidence_counts,
203 | [curr.confidence]: prev.confidence_counts[curr.confidence]
204 | ?
prev.confidence_counts[curr.confidence] + 1
205 | : 1,
206 | },
207 | };
208 | };
209 | 
210 | const initialMetaData = {
211 | entities_count: 0,
212 | confidence_avg: 0,
213 | confidence_max: 0,
214 | confidence_min: 100,
215 | dupes_10_count: 0,
216 | dupes_60_count: 0,
217 | confidence_counts: {},
218 | };
219 | const reducedMetaData = reducedTerms.reduce(metaReducer, initialMetaData);
220 | const metadata = {
221 | ...reducedMetaData,
222 | confidence_avg: reducedMetaData.confidence_avg / reducedMetaData.entities_count,
223 | dupes_10_ratio: reducedMetaData.dupes_10_count / reducedMetaData.entities_count,
224 | dupes_60_ratio: reducedMetaData.dupes_60_count / reducedMetaData.entities_count
225 | };
226 | return metadata;
227 | };
228 | 
229 | export const annotateText = async (
230 | text,
231 | { endpoint = spotlightEndpoint, includeMetaData = null } = {}
232 | ) => {
233 | const spotLightPromises = _.map(
234 | confidenceValues,
235 | confidence => annotate(text, confidence, { endpoint, })
236 | );
237 | 
238 | /** @type {SpotlightAnnotation[]} */
239 | const spotlightResults = (await Promise.all(spotLightPromises)).filter(
240 | r => 'Resources' in r
241 | );
242 | 
243 | /** @type {ParsedAnnotation[]} */
244 | const reducedTerms = _.pipe([
245 | _.mapWith(parseAnnotationResults),
246 | _.flatten,
247 | reduceAnnotationResults
248 | ])(spotlightResults);
249 | 
250 | const metadata =
251 | includeMetaData && generateMetaData(reducedTerms, spotlightResults);
252 | 
253 | return {
254 | annotations: reducedTerms,
255 | ...metadata && { metadata },
256 | };
257 | };
258 | 
259 | export const annotateArray = async (texts, endpoint) => {
260 | const body = JSON.stringify({ texts });
261 | const headers = { 'Content-Type': 'application/json' };
262 | const result = await fetch(endpoint, { body, headers, method: 'POST' });
263 | const annotations = await result.json();
264 | return annotations;
265 | };
266 | 
267 | /**
268 | * Results for the higher level process of annotating an ElasticSearch document.
269 | * @typedef documentAnnotationResult
270 | * @type {Object}
271 | * @property {Object} document - the ElasticSearch document supplied for annotation.
272 | * @property {DBpediaEntity[]} annotations - a list of annotations for the supplied document.
273 | */
274 | 
275 | /**
276 | * @function annotateDocument
277 | * @description takes an ElasticSearch document from Arxlive and annotates the abstract_article field.
278 | * @param {Object} doc - an ElasticSearch document from the Arxlive domain.
279 | * @param {string} field - the field of the document to use as text for the annotation.
280 | * @param {string} [options.endpoint] - the endpoint pointing to the Spotlight REST API.
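 * @example
 * // illustrative sketch only - the document shape and field name here are hypothetical
 * // const result = await annotateDocument(
 * //   { _id: '1', _source: { textBody_abstract_article: 'We study quantum entanglement...' } },
 * //   'textBody_abstract_article',
 * //   { includeMetaData: true }
 * // );
 * // result -> { id: '1', annotations: [...], metadata: {...} }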
281 | * @return {documentAnnotationResult} - the annotations for this document 282 | */ 283 | export const annotateDocument = async ( 284 | doc, 285 | field, 286 | { includeMetaData = null, endpoint = spotlightEndpoint } = {} 287 | ) => { 288 | const annotationData = await annotateText(doc._source[field], { endpoint, includeMetaData }); 289 | return { id: doc._id, ...annotationData }; 290 | }; 291 | 292 | /** 293 | * @function uploadAnnotatedDocument 294 | * @description abstracts process of uploading document, to avoid uploading empty annotations 295 | * @param {Object} annotations - the dbpedia annotations provided by {@link annotatedDocument} 296 | * @param {string} id - id of document to update 297 | * @param {string} domain - domain on which to upload 298 | * @param {*} index - index on which to upload 299 | * @returns {Promise} a promise indicating status of upload process 300 | */ 301 | export const uploadAnnotatedDocument = ( 302 | { annotations, id, metadata }, 303 | fieldName, 304 | domain, 305 | index 306 | ) => { 307 | 308 | // no point in uploading if the doc/payload is empty 309 | if (Object.keys(annotations).length === 0) { 310 | return Promise.resolve(); 311 | } 312 | return update(domain, index, id, { 313 | [fieldName]: annotations, 314 | ...metadata && { [`${fieldName}_metadata`]: metadata }, 315 | }); 316 | }; 317 | 318 | const annotateBatch = async ( 319 | docs, 320 | fieldName, 321 | newFieldName, 322 | endpoint, 323 | includeMetaData 324 | ) => { 325 | 326 | const toBulkFormat = doc => ({ 327 | '_id': doc._id, 328 | data: { 329 | [newFieldName]: doc.annotations, 330 | ...doc.metadata && { [`${newFieldName}_metadata`]: doc.metadata } 331 | } 332 | }); 333 | 334 | // filter out docs with empty text 335 | const nonEmptyDocs = docs.filter(doc => doc._source[fieldName]); 336 | const emptyDocs = docs.filter(doc => !doc._source[fieldName]); 337 | _.forEach( 338 | emptyDocs, 339 | doc => logger.warn(`Empty field: ${JSON.stringify(doc)}`) 340 | ); 341 | const texts = _.map(nonEmptyDocs, _.getPath(`_source.${fieldName}`)); 342 | const results = await annotateArray(texts, endpoint); 343 | const inputs = _.map( 344 | _.zip(nonEmptyDocs, results), 345 | ([doc, data]) => ({ ...doc, ...data }) 346 | ); 347 | const [annotations, empties] = _.partition( 348 | inputs, 349 | doc => doc.annotations.length !== 0 350 | ); 351 | 352 | if (empties.length) { 353 | _.forEach( 354 | empties, 355 | doc => logger.warn(`Empty doc: ${JSON.stringify(doc)}`) 356 | ); 357 | } 358 | const bulkFormat = _.map(annotations, toBulkFormat); 359 | return bulkFormat; 360 | }; 361 | 362 | const initialiseIndexProgressBar = async (domain, index, batchSize) => { 363 | const bar = new cliProgress.SingleBar( 364 | { etaBuffer: batchSize * 10 }, 365 | cliProgress.Presets.shades_classic 366 | ); 367 | const totalDocuments = await count(domain, index); 368 | bar.start(totalDocuments, 0); 369 | return bar; 370 | }; 371 | 372 | const generateMappingPayload = (name, includeMetaData) => { 373 | const mappingPayload = { 374 | properties: { 375 | [name]: defaultMapping, 376 | ...includeMetaData && { 377 | [`${name}_metadata`]: metaDataMapping, 378 | }, 379 | }, 380 | }; 381 | return mappingPayload; 382 | }; 383 | 384 | export const annotateIndex = async ( 385 | domain, 386 | index, 387 | endpoint, 388 | field, 389 | { 390 | batchSize=50, 391 | groupSize=4, 392 | includeMetaData=true, 393 | newField='dbpedia_entities', 394 | pages='all', 395 | pageSize=10000, 396 | progress=null, 397 | }={} 398 | ) => { 399 | 400 | const 
mappingPayload = generateMappingPayload(newField, includeMetaData); 401 | await updateMapping(domain, index, { payload: mappingPayload }); 402 | 403 | const bar = progress 404 | ? progress 405 | : await initialiseIndexProgressBar(domain, index, batchSize); 406 | const scroller = scroll(domain, index, { size: pageSize, pages }); 407 | 408 | let page; 409 | for await (page of scroller) { 410 | const batches = batch(page.hits.hits, batchSize); 411 | const groups = batch(batches, groupSize); 412 | const updates = []; 413 | for await (const group of groups) { 414 | // eslint-disable-next-line no-await-in-loop 415 | const promises = _.map(group, docs => 416 | annotateBatch(docs, field, newField, endpoint, includeMetaData) 417 | ); 418 | const resolvedPromises = await promisesHandler(promises); 419 | const annotations = _.flatten(resolvedPromises); 420 | updates.push(annotations); 421 | bar.increment(_.flatten(group).length); 422 | }; 423 | const flattenedUpdates = _.flatten(updates); 424 | 425 | // this is likely to be too big, so separate by default size 426 | const batchedUpdates = batch(flattenedUpdates, 500); 427 | for await (const update_ of batchedUpdates) { 428 | await bulkRequest( 429 | domain, 430 | index, 431 | update_, 432 | 'update', 433 | { error: false, refresh: 'wait_for' } 434 | ); 435 | } 436 | } 437 | 438 | bar.stop(); 439 | 440 | if (page) { 441 | clearScroll(domain, page._scroll_id); 442 | } 443 | 444 | }; 445 | 446 | export const annotateRequest = async request => { 447 | 448 | await annotateIndex( 449 | request.domain, 450 | request.index, 451 | request.annotationEndpoint, 452 | request.field, 453 | request, 454 | ); 455 | return request; 456 | }; 457 | --------------------------------------------------------------------------------