├── .gitignore ├── neo4j ├── secrets │ └── credentials.json ├── driver.mjs ├── gds.mjs ├── util.mjs └── community.mjs ├── util ├── time.mjs ├── commander.mjs ├── path.mjs ├── shell.mjs ├── promises.mjs ├── string.mjs └── array.mjs ├── es ├── domain.mjs ├── query.mjs ├── dump.mjs ├── update.mjs ├── pipeline.mjs ├── bulk.mjs ├── document.mjs ├── entities.mjs ├── snapshot.mjs ├── search.mjs ├── requests.mjs └── index.mjs ├── bin ├── annotationsDataQuality │ ├── requests │ │ ├── flattenedConfidenceHistogram │ │ │ ├── request.json │ │ │ └── README.md │ │ ├── flattenedConfidenceExtendedStats │ │ │ ├── request.json │ │ │ └── README.md │ │ ├── confidenceHistograms │ │ │ ├── README.md │ │ │ └── request.json │ │ ├── flattenedSimilarityScoreHistogram │ │ │ ├── request.json │ │ │ └── README.md │ │ ├── flattenedURITerms │ │ │ ├── README.md │ │ │ └── request.json │ │ ├── docsWithMissingDBpediaEntitiesField │ │ │ ├── request.json │ │ │ └── README.md │ │ ├── flattenedSimilarityScoreExtendedStats │ │ │ ├── request.json │ │ │ └── README.md │ │ ├── entitiesCountAggregations │ │ │ ├── request.json │ │ │ └── README.md │ │ ├── flattenedURITermsByConfidence │ │ │ ├── README.md │ │ │ └── request.json │ │ ├── textBodyAbstractArticleTokensHistogram │ │ │ ├── request.json │ │ │ └── README.md │ │ ├── entitiesCountAggsByConfidenceOverEntitiesCount │ │ │ ├── README.md │ │ │ ├── request.mjs │ │ │ └── request.json │ │ ├── tokenCountOverEntitiesCountAggs │ │ │ ├── README.md │ │ │ └── request.json │ │ ├── entitiesCountOverTokenCountByConfidence │ │ │ ├── README.md │ │ │ ├── request.mjs │ │ │ └── request.json │ │ ├── entitiesCountAggsByConfidence │ │ │ ├── README.md │ │ │ └── request.json │ │ └── duplicateAggregations │ │ │ ├── README.md │ │ │ └── request.json │ └── annotationsDataQuality.js ├── geo │ ├── generatePmTiles.sh │ ├── downloadBoundaries.js │ └── README.md ├── README.md ├── jsonToEsIndex.js ├── annotate.js ├── annotateEsIndex.js └── entitiesDataQuality.js ├── dbpedia ├── util.mjs ├── ontology.mjs ├── requests.mjs └── spotlight.mjs ├── terraform ├── state.mjs ├── commands.mjs └── configuration.mjs ├── conf ├── config.mjs └── mappings.mjs ├── auth └── authentication.mjs ├── sparql └── query.mjs ├── aws ├── email.mjs └── s3.mjs ├── bing └── search.mjs ├── README.md ├── LICENSE ├── wiki └── page.mjs ├── .github └── workflows │ └── tag_new_versions.yml ├── logging └── logging.mjs ├── CHANGELOG.md ├── geo └── download.js └── package.json /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | logs -------------------------------------------------------------------------------- /neo4j/secrets/credentials.json: -------------------------------------------------------------------------------- 1 | { 2 | "user": "neo4j", 3 | "password": "datavis" 4 | } -------------------------------------------------------------------------------- /util/time.mjs: -------------------------------------------------------------------------------- 1 | export const sleep = ms => { 2 | return new Promise(resolve => { 3 | setTimeout(resolve, ms); 4 | }); 5 | }; 6 | -------------------------------------------------------------------------------- /es/domain.mjs: -------------------------------------------------------------------------------- 1 | import { buildRequest, makeRequest } from '../es/requests.mjs'; 2 | 3 | 4 | export const info = async domain => { 5 | const request = buildRequest(domain, '', 'GET'); 6 | const { body: response } = await makeRequest(request); 7 | return response; 8 | 
}; 9 | -------------------------------------------------------------------------------- /util/commander.mjs: -------------------------------------------------------------------------------- 1 | import { InvalidArgumentError } from 'commander'; 2 | 3 | export const commanderParseInt = (value, _) => { 4 | const parsedValue = parseInt(value, 10); 5 | if (isNaN(parsedValue)) { 6 | throw new InvalidArgumentError('Not an integer.'); 7 | } 8 | return parsedValue; 9 | }; 10 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/flattenedConfidenceHistogram/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "dbpedia": { 5 | "nested": { 6 | "path": "dbpedia_entities" 7 | }, 8 | "aggs": { 9 | "confidence": { 10 | "terms": { 11 | "field": "dbpedia_entities.confidence" 12 | } 13 | } 14 | } 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /util/path.mjs: -------------------------------------------------------------------------------- 1 | import { promises as fs } from 'fs'; 2 | import * as path from 'path'; 3 | 4 | import { stringify } from '@svizzle/utils'; 5 | 6 | export const createPathAndWriteObject = async (path_, data) => { 7 | const directory = path.dirname(path_); 8 | await fs.mkdir(directory, { recursive: true }); 9 | await fs.writeFile(path_, stringify(data)); 10 | }; 11 | -------------------------------------------------------------------------------- /dbpedia/util.mjs: -------------------------------------------------------------------------------- 1 | export const prefixes = 2 | ` 3 | PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> 4 | PREFIX dbo: <http://dbpedia.org/ontology/> 5 | PREFIX dbr: <http://dbpedia.org/resource/> 6 | PREFIX prov: <http://www.w3.org/ns/prov#>`; 7 | 8 | export const dbr = 'http://dbpedia.org/resource/'; 9 | export const dbo = 'http://dbpedia.org/ontology/'; 10 | 11 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/flattenedConfidenceExtendedStats/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "dbpedia_entities": { 5 | "nested": { 6 | "path": "dbpedia_entities" 7 | }, 8 | "aggs": { 9 | "confidence_stats": { 10 | "extended_stats": { 11 | "field": "dbpedia_entities.confidence" 12 | } 13 | } 14 | } 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /terraform/state.mjs: -------------------------------------------------------------------------------- 1 | import { promises as fs } from 'fs'; 2 | import * as path from 'path'; 3 | 4 | export const getCurrentState = async dir => { 5 | let state; 6 | try { 7 | state = JSON.parse(await fs.readFile(path.join(dir, 'terraform.tfstate'), { encoding: 'utf-8'})); 8 | } catch { 9 | state = null; 10 | } 11 | if (!state) { 12 | return false; 13 | } 14 | return state; 15 | }; 16 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/confidenceHistograms/README.md: -------------------------------------------------------------------------------- 1 | ## Histogram of Average `confidence` 2 | 3 | Produces a histogram for the average, max and min `confidence` values calculated as 4 | part of the annotation meta-data.
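A minimal sketch of running this aggregation with the `query` helper from `es/query.mjs` (the relative import path, the use of `request.json` from disk and the `arxiv_v6` index name are assumptions based on the rest of this repo, not part of any script):

```js
// Sketch only: assumes the arxiv_v6 index and the query helper exported by es/query.mjs.
import { promises as fs } from 'fs';

import { query } from '../../../../es/query.mjs';

const request = JSON.parse(await fs.readFile('./request.json', 'utf-8'));
const response = await query(request, 'arxiv_v6');

// Each bucket pairs an average confidence value with the number of documents in it.
console.log(response.aggregations.avg_confidence_histogram.buckets);
```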
5 | 6 | Endpoint: `arxiv_v6/_search` 7 | 8 | See: 9 | 10 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-bucket-histogram-aggregation.html 11 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/flattenedSimilarityScoreHistogram/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "dbpedia_entities": { 5 | "nested": { 6 | "path": "dbpedia_entities" 7 | }, 8 | "aggs": { 9 | "similarityScore": { 10 | "histogram": { 11 | "field": "dbpedia_entities.similarityScore", 12 | "interval": 0.1 13 | } 14 | } 15 | } 16 | } 17 | } 18 | } -------------------------------------------------------------------------------- /conf/config.mjs: -------------------------------------------------------------------------------- 1 | export const arxliveCopy = 2 | 'search-datavis-arxlive-copy-n6ltva3lqh7x7xb6ucpaqfb5a4.eu-west-2.es.amazonaws.com'; 3 | export const ec2 = 'http://ec2-3-8-167-48.eu-west-2.compute.amazonaws.com'; 4 | export const spotlightEndpoint = `${ec2}:2222/rest/annotate`; 5 | export const confidenceValues = [ 6 | 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 7 | ]; 8 | export const confidenceScores = [ 9 | 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100 10 | ]; 11 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/flattenedURITerms/README.md: -------------------------------------------------------------------------------- 1 | ## Term aggregation of flattened URI values 2 | 3 | Counts the occurrences of the `URI` fields in `dbpedia_entities` and returns 4 | the top 100 most frequent values. 5 | 6 | Endpoint: `POST arxiv_v6/_search` 7 | 8 | See: 9 | 10 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-bucket-terms-aggregation.html 11 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/nested.html -------------------------------------------------------------------------------- /util/shell.mjs: -------------------------------------------------------------------------------- 1 | export const displayCommandOutput = ( 2 | error, 3 | stdout, 4 | stderr, 5 | { warnings=false } = {} 6 | ) => { 7 | if (error) { 8 | console.log(`error: ${error.message}`); 9 | return; 10 | } 11 | if (stderr) { 12 | if (stderr.toLowerCase().startsWith('warning') && !warnings) { 13 | return; 14 | } 15 | console.log(`stderr: ${stderr}`); 16 | return; 17 | } 18 | if (stdout.length > 0) { 19 | console.log(`stdout: ${stdout}`); 20 | } 21 | }; 22 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/docsWithMissingDBpediaEntitiesField/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "bool": { 4 | "must_not": { 5 | "nested": { 6 | "path": "dbpedia_entities", 7 | "query": { 8 | "exists": { 9 | "field": "dbpedia_entities" 10 | } 11 | } 12 | } 13 | } 14 | } 15 | } 16 | } -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/flattenedURITerms/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "dbpedia": { 5 | "nested": { 6 | "path": "dbpedia_entities" 7 | }, 8 | "aggs": { 9 | "URIs": { 10 | "terms": { 11 | "field": "dbpedia_entities.URI", 12 | "size": 100 13 | } 14 | 
} 15 | } 16 | } 17 | } 18 | } -------------------------------------------------------------------------------- /util/promises.mjs: -------------------------------------------------------------------------------- 1 | import { getValue, stringify } from '@svizzle/utils'; 2 | 3 | import { logger } from '../logging/logging.mjs'; 4 | 5 | const logErrors = v => { 6 | if (v.status === 'rejected') { 7 | logger.error(stringify(v)); 8 | } 9 | return v; 10 | }; 11 | 12 | const removeErrors = v => v.status !== 'rejected'; 13 | 14 | export const promisesHandler = async promises => { 15 | return (await Promise.allSettled(promises)) 16 | .map(logErrors) 17 | .filter(removeErrors) 18 | .map(getValue); 19 | }; 20 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/flattenedSimilarityScoreExtendedStats/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "dbpedia_entities": { 5 | "nested": { 6 | "path": "dbpedia_entities" 7 | }, 8 | "aggs": { 9 | "similarityScore_stats": { 10 | "extended_stats": { 11 | "field": "dbpedia_entities.similarityScore" 12 | } 13 | } 14 | } 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /util/string.mjs: -------------------------------------------------------------------------------- 1 | import * as _ from 'lamb'; 2 | import { trim } from '@svizzle/utils'; 3 | 4 | export const toLowerString = v => v.toString().toLowerCase(); 5 | 6 | // tag function to dedent template literals 7 | export const dedent = _.pipe([ 8 | _.head, // first argument is strings 9 | _.splitBy('\n'), 10 | _.mapWith(trim), 11 | _.joinWith('\n'), 12 | trim 13 | ]); 14 | 15 | export const hasOnlyLatinCharacters = str => (/^[a-zA-Z:]+$/u).test(str); 16 | export const hasNonAsciiCharacters = str => (/^[\u0000-\u007f]*$/u).test(str); 17 | -------------------------------------------------------------------------------- /auth/authentication.mjs: -------------------------------------------------------------------------------- 1 | export const authenticate = async (endpoint, email, token) => { 2 | const url = `${endpoint}?email=${email}&token=${token}` 3 | const response = await fetch(url); 4 | const result = await response.json(); 5 | return result; 6 | }; 7 | 8 | export const parseBasicAuth = header => { 9 | const base64String = header.slice(6, -1); 10 | const buff = Buffer.from(base64String, 'base64'); 11 | const utfString = buff.toString('utf-8'); 12 | const [ email, token ] = utfString.split(':'); 13 | return { email, token }; 14 | }; 15 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/entitiesCountAggregations/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "entities_count_extended_stats": { 5 | "extended_stats": { 6 | "field": "dbpedia_entities_metadata.entities_count" 7 | } 8 | }, 9 | "entities_count_histogram": { 10 | "histogram": { 11 | "field": "dbpedia_entities_metadata.entities_count", 12 | "interval": 1, 13 | "min_doc_count": 1 14 | } 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /sparql/query.mjs: -------------------------------------------------------------------------------- 1 | import { fetch } from 'undici'; 2 | 3 | 4 | export const query = async( 5 | sparql, 6 | 
endpoint='https://dbpedia.org/sparql', 7 | { responseFormat='application/json' } = {} 8 | ) => { 9 | const headers = { 10 | Accept: responseFormat, 11 | 'Content-Type': 'application/sparql-query' 12 | }; 13 | const response = await fetch(endpoint, 14 | { 15 | method: 'POST', 16 | body: sparql, 17 | headers 18 | } 19 | ); 20 | if (responseFormat === 'application/json') { 21 | return response.json(); 22 | } 23 | return response; 24 | }; 25 | 26 | 27 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/flattenedURITermsByConfidence/README.md: -------------------------------------------------------------------------------- 1 | ## Term aggregation of flattened URI values 2 | 3 | Counts the occurrences of the `URI` fields in `dbpedia_entities` at different 4 | confidence levels and returns the top 100 most frequent values. 5 | 6 | Endpoint: `POST arxiv_v6/_search` 7 | 8 | See: 9 | 10 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/query-filter-context.html 11 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-bucket-terms-aggregation.html 12 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/nested.html -------------------------------------------------------------------------------- /bin/geo/generatePmTiles.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | if [ "$#" -ne 1 ]; then 4 | echo "Please supply path to the geojson files" 5 | exit 1 6 | fi 7 | 8 | if [ ! -d $1 ]; then 9 | echo "Plase supply a valid path" 10 | exit 1 11 | fi 12 | 13 | tiles="" 14 | files="" 15 | 16 | for file in `ls $1/*.geojson`; do 17 | files="$file $files" 18 | file=`basename -- $file` 19 | file="${file%.*}" 20 | tiles="$file"_"$tiles" 21 | done 22 | 23 | tiles=${tiles%?} 24 | mbtiles="$tiles.mbtiles" 25 | pmtiles="$tiles.pmtiles" 26 | 27 | tippecanoe -o $mbtiles -zg $files 28 | pmtiles convert $mbtiles $pmtiles -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/textBodyAbstractArticleTokensHistogram/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "textBody_abstract_article_token_count_histogram": { 5 | "histogram": { 6 | "field": "textBody_abstract_article.token_count", 7 | "interval": 10, 8 | "min_doc_count": 1 9 | } 10 | }, 11 | "textBody_abstract_article_token_count_extended_stats": { 12 | "extended_stats": { 13 | "field": "textBody_abstract_article.token_count" 14 | } 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/flattenedSimilarityScoreHistogram/README.md: -------------------------------------------------------------------------------- 1 | ## Histogram of flattened `similarityScore` values 2 | 3 | Aggregates all `similarityScore` values into a histogram, each bucket having an 4 | interval of 0.1. Flattened here denotes the fact that all annotated entities are 5 | treated as a flat list - no per document analysis is performed. 
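For orientation, a sketch of reshaping the nested response into `[similarityScore, docCount]` pairs; `response` is assumed to be the body returned by posting `request.json` to the endpoint below:

```js
// Sketch: flatten the nested aggregation response into [score, count] pairs;
// the aggregation names follow those used in request.json.
import * as _ from 'lamb';

const toPairs = response => _.map(
	response.aggregations.dbpedia_entities.similarityScore.buckets,
	bucket => [bucket.key, bucket.doc_count]
);
```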
6 | 7 | Endpoint: `POST arxiv_v6/_search` 8 | 9 | See: 10 | 11 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-metrics-extendedstats-aggregation.html 12 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/nested.html 13 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/flattenedConfidenceExtendedStats/README.md: -------------------------------------------------------------------------------- 1 | ## Flattened `confidence` Extended Stats 2 | 3 | Produces a number of different statistical measures such as average, STD, min, 4 | max, etc. for`confidence`values. Flattened here denotes the fact 5 | that all annotated entities are treated as a flat list - no per document 6 | analysis is performed. 7 | 8 | Endpoint: `POST arxiv_v6/_search` 9 | 10 | See: 11 | 12 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-metrics-extendedstats-aggregation.html 13 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/nested.html 14 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/entitiesCountAggregations/README.md: -------------------------------------------------------------------------------- 1 | ## Entities Count Aggregations 2 | 3 | Aggregations for the `entities_count` field, which is a simple count of the 4 | total number of entities found for that document. 5 | 6 | The aggregations use both `extended_stats` and `histogram`s for the 7 | `entities_count` metadata field. 8 | 9 | Endpoint: `POST arxiv_v6/_search` 10 | 11 | See: 12 | 13 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-bucket-histogram-aggregation.html 14 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-metrics-extendedstats-aggregation.html -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/flattenedSimilarityScoreExtendedStats/README.md: -------------------------------------------------------------------------------- 1 | ## Flattened `similarityScore` Extended Stats 2 | 3 | Produces a number of different statistical measures such as average, STD, min, 4 | max, etc. for `similarityScore` fields. Flattened here denotes the fact 5 | that all annotated entities are treated as a flat list - no per document 6 | analysis is performed. 7 | 8 | Endpoint: `POST arxiv_v6/_search` 9 | 10 | See: 11 | 12 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-metrics-extendedstats-aggregation.html 13 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/nested.html 14 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/entitiesCountAggsByConfidenceOverEntitiesCount/README.md: -------------------------------------------------------------------------------- 1 | ## Entities Count by Confidence over Entities Count 2 | 3 | Counts number of entities found at different confidence levels, then normalises 4 | that count using the total count of entities found at all confidence levels. 5 | 6 | Uses `extended_stats` and `histogram` aggs. 
7 | 8 | Endpoint: `POST arxiv_v6/_search` 9 | 10 | See: 11 | 12 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-bucket-histogram-aggregation.html 13 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-metrics-extendedstats-aggregation.html -------------------------------------------------------------------------------- /es/query.mjs: -------------------------------------------------------------------------------- 1 | import { stringify } from '@svizzle/utils'; 2 | 3 | import { arxliveCopy } from '../conf/config.mjs'; 4 | import { buildRequest, makeRequest } from '../es/requests.mjs'; 5 | 6 | export const query = async (query_, index, domain=arxliveCopy) => { 7 | const path = `${index}/_search`; 8 | const payload = query_; 9 | const request = buildRequest( 10 | domain, 11 | path, 12 | 'POST', 13 | { payload } 14 | ); 15 | const { body: response, code } = await makeRequest(request, { verbose: true }); 16 | 17 | if (code !== 200) { 18 | throw new Error(`Query failed with response ${stringify(response)}`); 19 | } 20 | 21 | return response; 22 | }; 23 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/confidenceHistograms/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "avg_confidence_histogram": { 5 | "histogram": { 6 | "field": "dbpedia_entities_metadata.confidence_avg", 7 | "interval": 1, 8 | "min_doc_count": 1 9 | } 10 | }, 11 | "max_confidence_histogram": { 12 | "histogram": { 13 | "field": "dbpedia_entities_metadata.confidence_max", 14 | "interval": 10, 15 | "min_doc_count": 1 16 | } 17 | }, 18 | "min_confidence_histogram": { 19 | "histogram": { 20 | "field": "dbpedia_entities_metadata.confidence_min", 21 | "interval": 10, 22 | "min_doc_count": 1 23 | } 24 | } 25 | } 26 | } -------------------------------------------------------------------------------- /neo4j/driver.mjs: -------------------------------------------------------------------------------- 1 | 2 | import { promises as fs } from 'fs'; 3 | import { fileURLToPath } from 'url'; 4 | import { dirname } from 'path'; 5 | 6 | import * as neo4j from '../neo4j-driver'; 7 | 8 | const __filename = fileURLToPath(import.meta.url); 9 | const __dirname = dirname(__filename); 10 | 11 | export const getSession = async () => { 12 | const credentials = `${__dirname}/secrets/credentials.json`; 13 | const { user, password } = JSON.parse(await fs.readFile(credentials)); 14 | const uri = 'bolt://3.8.167.48:7687'; 15 | const driver = neo4j.driver(uri, neo4j.auth.basic(user, password)); 16 | 17 | const session = driver.session(); 18 | return [session, driver]; 19 | }; 20 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/tokenCountOverEntitiesCountAggs/README.md: -------------------------------------------------------------------------------- 1 | ## `token_count` normalised by `entities_count` 2 | 3 | Provides descriptive statistics on the number of tokens found in the annotated 4 | field, divided by the total number of `dbpedia_entities` produced when annotated. 5 | 6 | The aggregations use both `extended_stats` and `histogram`s for the 7 | `entities_count` metadata field. 
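The painless scripts in `request.json` boil down to the following (a JavaScript paraphrase for readability, not code used anywhere in the repo):

```js
// Paraphrase of the painless script: documents with no entities_count map to 0,
// otherwise the token count is divided by the number of entities found.
const tokenCountOverEntitiesCount = ({ token_count, entities_count }) =>
	entities_count === undefined ? 0 : token_count / entities_count;
```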
8 | 9 | Endpoint: `POST arxiv_v6/_search` 10 | 11 | See: 12 | 13 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-bucket-histogram-aggregation.html 14 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-metrics-extendedstats-aggregation.html -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/entitiesCountOverTokenCountByConfidence/README.md: -------------------------------------------------------------------------------- 1 | ## Entities Count by Confidence over `token_count` 2 | 3 | Counts the number of entities found at different confidence levels, then normalises 4 | that count using the `token_count`, which is a count of the number of tokens 5 | for the field that was used as input for the annotation process. 6 | 7 | Uses `extended_stats` and `histogram` aggs. 8 | 9 | Endpoint: `POST arxiv_v6/_search` 10 | 11 | See: 12 | 13 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-bucket-histogram-aggregation.html 14 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-metrics-extendedstats-aggregation.html -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/entitiesCountAggsByConfidence/README.md: -------------------------------------------------------------------------------- 1 | ## `entities_count` filtered by confidence 2 | 3 | These aggregations provide statistics for the 4 | `dbpedia_entities_metadata.confidence_counts.X` fields. Specifically, these 5 | fields count the number of entities found at the varying confidence 6 | levels. The request is a multi-aggregation request which provides `extended_stats` and 7 | `histograms` for all 11 possible confidence levels. 8 | 9 | Endpoint: `POST arxiv_v6/_search` 10 | 11 | See: 12 | 13 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-bucket-histogram-aggregation.html 14 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-metrics-extendedstats-aggregation.html -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/textBodyAbstractArticleTokensHistogram/README.md: -------------------------------------------------------------------------------- 1 | ## Histogram for `textBody_abstract_article` tokens 2 | 3 | Aggregates all `textBody_abstract_article` values into a histogram, each bucket 4 | having an interval of 10 tokens. Tokens are generated upon indexing using 5 | ElasticSearch's standard tokenizer. Also performs an `extended_stats` 6 | aggregation for the `token_count` field.
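The `token_count` sub-field referenced here is assumed to be defined in the index mapping roughly as below; the actual mapping for `textBody_abstract_article` is not part of this repository, so treat this purely as an illustration:

```js
// Assumed shape of the mapping that yields textBody_abstract_article.token_count;
// the real index mapping is not included in this repo.
const assumedTextBodyMapping = {
	type: 'text',
	fields: {
		token_count: {
			type: 'token_count',
			analyzer: 'standard'
		}
	}
};
```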
7 | 8 | See: 9 | 10 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-metrics-extendedstats-aggregation.html 11 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/analysis-standard-tokenizer.html 12 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-bucket-histogram-aggregation.html -------------------------------------------------------------------------------- /dbpedia/ontology.mjs: -------------------------------------------------------------------------------- 1 | import { promises as fs } from 'fs'; 2 | 3 | import { stringify } from '@svizzle/utils'; 4 | import * as _ from 'lamb'; 5 | 6 | import { dbo } from '../dbpedia/util.mjs'; 7 | 8 | const FILE_ONTOLOGY_JSON = 'data/dbpedia/ontology.json'; 9 | 10 | export const loadOntology = async (depth, { squash=false, fullURI=true }={}) => { 11 | const data = await fs.readFile(FILE_ONTOLOGY_JSON, { encoding: 'utf-8'}); 12 | const changedURIs = fullURI 13 | ? JSON.parse(data) 14 | : JSON.parse(data.replaceAll(dbo, '')); 15 | 16 | const selectAtDepth = _.pickIf(value => _.getIn(value, 'depth') <= depth); 17 | const ontology = squash 18 | ? _.values(_.mapValues(selectAtDepth(changedURIs), _.getKey('class_'))) 19 | : selectAtDepth(changedURIs); 20 | 21 | return ontology; 22 | }; 23 | -------------------------------------------------------------------------------- /aws/email.mjs: -------------------------------------------------------------------------------- 1 | import { defaultProvider } from '@aws-sdk/credential-provider-node'; 2 | import { SESClient, SendEmailCommand } from '@aws-sdk/client-ses'; 3 | 4 | const client = new SESClient({ 5 | credentials: defaultProvider(), 6 | region: 'eu-west-2', 7 | }); 8 | 9 | export const sendEmail = async (email, source, message, subject) => { 10 | const input = { 11 | Source: source, 12 | Destination: { 13 | ToAddresses: [email], 14 | }, 15 | Message: { 16 | Body: { 17 | Html: { 18 | Charset: 'UTF-8', 19 | Data: message, 20 | }, 21 | }, 22 | Subject: { 23 | Charset: 'UTF-8', 24 | Data: subject, 25 | }, 26 | }, 27 | }; 28 | const command = new SendEmailCommand(input); 29 | const response = await client.send(command); 30 | return response; 31 | }; 32 | -------------------------------------------------------------------------------- /bing/search.mjs: -------------------------------------------------------------------------------- 1 | import { promises as fs } from 'fs'; 2 | 3 | import { stringify } from '@svizzle/utils'; 4 | import { fetch } from 'undici'; 5 | 6 | const SUBSCRIPTION_KEY = process.env.AZURE_SUBSCRIPTION_KEY; 7 | if (!SUBSCRIPTION_KEY) { 8 | throw new Error('AZURE_SUBSCRIPTION_KEY is not set.'); 9 | } 10 | 11 | export const search = async (query, { mkt='en-GB' } = {}) => { 12 | const host = 'https://api.bing.microsoft.com'; 13 | const path = `v7.0/search?q=${encodeURIComponent(query)}&mkt=${mkt}`; 14 | const headers = { 'Ocp-Apim-Subscription-Key': SUBSCRIPTION_KEY }; 15 | const response = await fetch(`${host}/${path}`, { headers }); 16 | if (response.status !== 200) { 17 | throw new Error(`Bing search failed.\nResponse:\n${stringify(response)}`); 18 | } 19 | const data = await response.json(); 20 | return data; 21 | }; 22 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/docsWithMissingDBpediaEntitiesField/README.md: -------------------------------------------------------------------------------- 1 | ## Documents missing the `dbpedia_entities` 
Field 2 | 3 | This query uses a combination of the nested, bool, must_not and exists API 4 | parameters to determine which documents are missing the `dbpedia_entities` 5 | field. 6 | 7 | Endpoint: `POST arxiv_v6/_count` 8 | 9 | See: 10 | 11 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/nested.html 12 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/query-dsl-exists-query.html 13 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/query-dsl-bool-query.html 14 | 15 | ### Notes 16 | 17 | We are using the query API because we can't use the `missing` Aggregation API as 18 | it does not support `nested` type fields. 19 | 20 | See: 21 | 22 | - https://github.com/elastic/elasticsearch/issues/9571 -------------------------------------------------------------------------------- /terraform/commands.mjs: -------------------------------------------------------------------------------- 1 | import { promisify } from 'node:util'; 2 | import { exec } from 'child_process'; 3 | 4 | import { displayCommandOutput } from '../util/shell.mjs'; 5 | 6 | const execAwait = promisify(exec); 7 | 8 | export const init = async dir => { 9 | const initCommand = 10 | `terraform -chdir=${dir} init`; 11 | console.log("[+] Terraform - Initialising..."); 12 | await execAwait(initCommand); 13 | }; 14 | 15 | export const apply = async dir => { 16 | const applyCommand = 17 | `terraform -chdir=${dir} apply -auto-approve`; 18 | console.log("[+] Terraform - Applying..."); 19 | await execAwait(applyCommand); 20 | }; 21 | 22 | 23 | export const destroy = async dir => { 24 | const destroyCommand = 25 | `terraform -chdir=${dir} destroy -auto-approve`; 26 | console.log("[+] Terraform - Destroying..."); 27 | await execAwait(destroyCommand); 28 | }; 29 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/tokenCountOverEntitiesCountAggs/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "token_count_over_entity_count_extended_stats": { 5 | "extended_stats": { 6 | "script": "if (doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['textBody_abstract_article.token_count'].value) / doc['dbpedia_entities_metadata.entities_count'].value;" 7 | } 8 | }, 9 | "token_count_over_entity_count_histogram": { 10 | "histogram": { 11 | "script": "if (doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['textBody_abstract_article.token_count'].value) / doc['dbpedia_entities_metadata.entities_count'].value;", 12 | "interval": 0.1, 13 | "min_doc_count": 1 14 | } 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dap_dv_backends_utils 2 | 3 | Utility library and utility scripts for the DAP data visualisation team's 4 | backend services and general code. 
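Once installed (see below), individual modules can be imported by file path; for example (a sketch, relying on the package not restricting subpath imports):

```js
// Sketch: import one of the utilities directly by its file path.
import { batch } from 'dap_dv_backends_utils/util/array.mjs';

console.log(batch([1, 2, 3, 4, 5], 2)); // [[1, 2], [3, 4], [5]]
```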
5 | 6 | ## Installing the latest version 7 | 8 | To install the latest version: 9 | 10 | `npm install nestauk/dap_dv_backends_utils#release` 11 | 12 | ## Installing a specific version 13 | 14 | To install a specific version (check the tagged version on the release branch): 15 | 16 | `npm install nestauk/dap_dv_backends_utils#<version>` 17 | 18 | e.g.: 19 | 20 | `npm install nestauk/dap_dv_backends_utils#v0.0.2` 21 | 22 | ## Installing a specific branch 23 | 24 | To install a specific branch (useful for development): 25 | 26 | `npm install nestauk/dap_dv_backends_utils#my_branch` 27 | 28 | ## Updating the installation 29 | 30 | After installing from a branch (`release` or any other), the content of that branch can change, so if we need to update the installation with newer commits in that branch, we can use: 31 | 32 | `npm update` 33 | -------------------------------------------------------------------------------- /bin/README.md: -------------------------------------------------------------------------------- 1 | ## `annotate` 2 | 3 | Running this script requires that you have data hosted on an ElasticSearch 4 | domain and that you have a running Spotlight API endpoint. 5 | 6 | ## `annotationsDataQuality` 7 | 8 | This script will provide a number of aggregations relating to the data quality 9 | of the results provided by the annotation process. The output directory will 10 | have names relating to the kind of aggregation that was run. For further details 11 | on the kinds of aggregations and what they do, refer to the README.md files 12 | in each aggregation requests directory in `bin/annotationsDataQuality/requests/`. 13 | 14 | ## `entitiesDataQuality` 15 | 16 | This script will provide data quality for the actual DBpedia entities produced 17 | by the `annotate` script. It collects the set of all DBpedia URIs and uses 18 | a number of SPARQL queries to determine the quality of data provided by DBpedia, 19 | such as "how many entities have images?" and "of those images, what file type 20 | are they?" etc. The aggregations produced have self descriptive names.
21 | -------------------------------------------------------------------------------- /neo4j/gds.mjs: -------------------------------------------------------------------------------- 1 | import * as _ from 'lamb'; 2 | 3 | import { getSession } from '../neo4j/driver.mjs'; 4 | 5 | import { promisify } from '../neo4j/util.mjs'; 6 | 7 | export const project = async(graphName, threshold) => { 8 | const [session, driver] = await getSession(); 9 | const command = ` 10 | CALL gds.graph.project.cypher( 11 | '${graphName}', 12 | 'MATCH (n:Entity)-[r:APPEARS_IN_ABSTRACT]-(m:Entity) WHERE r.confidence >= ${threshold} RETURN id(n) AS id', 13 | 'MATCH (n:Entity)-[r:APPEARS_IN_ABSTRACT]-(m:Entity) WHERE r.confidence >= ${threshold} RETURN id(n) AS source, id(m) AS target') 14 | YIELD 15 | graphName AS graph, nodeQuery, nodeCount AS nodes, relationshipQuery, relationshipCount AS rels 16 | `; 17 | const result = session.run(command); 18 | return promisify(result, session, driver); 19 | }; 20 | 21 | export const drop = async graphName => { 22 | const [session, driver] = await getSession(); 23 | const command = `CALL gds.graph.drop('${graphName}')`; 24 | const result = session.run(command); 25 | return promisify(result, session, driver); 26 | }; 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Nesta 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /wiki/page.mjs: -------------------------------------------------------------------------------- 1 | import { fetch } from 'undici'; 2 | 3 | 4 | // eslint-disable-next-line no-process-env 5 | const ACCESS_TOKEN = process.env.WIKIMEDIA_ACCESS_TOKEN; 6 | if (!ACCESS_TOKEN) { 7 | throw new Error('WIKIMEDIA_ACCESS_TOKEN is not set.'); 8 | } 9 | 10 | const domain = 'https://api.wikimedia.org/core/v1/wikipedia'; 11 | 12 | /** 13 | * 14 | * @param {string} title - Title of the Wikipedia page to fetch 15 | * @param {Object} options 16 | * @param {string} [options.language='en] - Language of source Wikipedia page 17 | * @param {boolean} [options.bare=true] - Whether to fetch just the page's metadata or the entire contents of the page. 
18 | * @returns {Object} - response object 19 | */ 20 | export const getPage = async (title, { language='en', bare=true } = {}) => { 21 | const path = `${language}/page/${encodeURIComponent(title)}${bare ? '/bare' : ''}`; 22 | const url = `${domain}/${path}`; 23 | const response = await fetch(url, { 24 | headers: { 25 | 'Authorization': `Bearer ${ACCESS_TOKEN}`, 26 | 'Api-User-Agent': 'ai_map' 27 | } 28 | }); 29 | 30 | return { 31 | code: response.status, 32 | body: await response.json() 33 | }; 34 | }; 35 | -------------------------------------------------------------------------------- /.github/workflows/tag_new_versions.yml: -------------------------------------------------------------------------------- 1 | name: Tag new version 2 | 3 | on: 4 | push: 5 | branches: 6 | - 'dev' 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v3 14 | with: 15 | fetch-depth: 0 16 | - name: Get possible new version 17 | id: newVersionStep 18 | run: | 19 | new=$(echo $(cat ./package.json | grep version | head -1 | awk -F: '{ print $2 }' | sed 's/[",]//g' | tr -d '[[:space:]]')) 20 | echo "new=$new" >> $GITHUB_OUTPUT 21 | - name: Check current version 22 | id: currentVersionStep 23 | run: | 24 | current=$(git tag | grep -E '^v[0-9]' | sort -V | tail -1 | cut -c2-) 25 | echo "current=$current" >> $GITHUB_OUTPUT 26 | - name: Tag new version and push to release 27 | if: ${{ steps.currentVersionStep.outputs.current != steps.newVersionStep.outputs.new }} 28 | run: | 29 | tag=${{ format('v{0}', steps.newVersionStep.outputs.new) }} 30 | git tag $tag 31 | git push origin $tag 32 | git switch release 33 | git merge dev 34 | git push -------------------------------------------------------------------------------- /es/dump.mjs: -------------------------------------------------------------------------------- 1 | import { SingleBar, Presets } from 'cli-progress'; 2 | import * as _ from 'lamb'; 3 | 4 | import { count } from '../es/index.mjs'; 5 | import { scroll, clearScroll } from '../es/search.mjs'; 6 | 7 | /** 8 | * @param {string} domain - domain on from which to dump data 9 | * @param {string} index - index from which to dump data 10 | * @param {number} size size of scroll object - how many documents to fetch in a single reqeust. Maximum value is 10k 11 | * @returns {Object} list of all documents on that index. 
12 | */ 13 | export const dump = async(domain, index, size) => { 14 | const bar = new SingleBar( 15 | { etaBuffer: size * 10 }, 16 | Presets.rect 17 | ); 18 | const totalDocuments = await count(domain, index); 19 | 20 | bar.start(totalDocuments, 0); 21 | 22 | const scroller = scroll(domain, index, { 23 | size, 24 | pages: 'all' 25 | }); 26 | 27 | // mutation required due to await 28 | let documents = []; 29 | for await (let page of scroller) { 30 | documents.push( 31 | ..._.map(page.hits.hits, doc => { 32 | bar.increment(); 33 | return doc._source; 34 | }) 35 | ); 36 | } 37 | 38 | bar.stop(); 39 | 40 | clearScroll(domain); 41 | 42 | return documents; 43 | }; 44 | -------------------------------------------------------------------------------- /neo4j/util.mjs: -------------------------------------------------------------------------------- 1 | import * as _ from 'lamb'; 2 | 3 | const resolveValue = value => { 4 | if (!value || typeof value === 'String') { 5 | return value; 6 | } 7 | 8 | const className = value.constructor.name; 9 | switch (className) { 10 | case 'Integer': 11 | return value.toInt(); 12 | case 'Object': 13 | return _.mapValues(value, resolveValue); 14 | case 'Array': 15 | return _.map(value, resolveValue); 16 | default: 17 | return value; 18 | } 19 | }; 20 | 21 | const parseRecord = record => { 22 | const fields = _.reduce( 23 | _.range(0, record.length), 24 | (acc, idx) => { 25 | const value = record.get(idx); 26 | return { 27 | ...acc, 28 | [record.keys[idx]]: resolveValue(value) 29 | }; 30 | }, 31 | {} 32 | ); 33 | return fields; 34 | }; 35 | 36 | export const promisify = (result, session, driver) => { 37 | const data = []; 38 | return new Promise((resolve, reject) => { 39 | result.subscribe({ 40 | onNext: record => { 41 | data.push(parseRecord(record)); 42 | }, 43 | onCompleted: () => { 44 | session.close().then(driver.close()); 45 | resolve(data); 46 | }, 47 | onError: error => { 48 | reject(error); 49 | } 50 | }); 51 | }); 52 | }; 53 | -------------------------------------------------------------------------------- /terraform/configuration.mjs: -------------------------------------------------------------------------------- 1 | import * as _ from 'lamb'; 2 | 3 | import { createPathAndWriteObject } from '../util/path.mjs'; 4 | import { ami, scaffold, spotlightInstanceType } from '../conf/infrastructure.mjs'; 5 | 6 | 7 | export const generateConfiguration = async(workers, path=null) => { 8 | const identifiers = [...Array(workers).keys()]; 9 | const resource = _.map(identifiers, id => ( 10 | { 11 | aws_instance: [ 12 | { 13 | [`spotlight-node-${id}`]: [ 14 | { 15 | ami, 16 | instance_type: spotlightInstanceType, 17 | key_name: 'spotlight', 18 | vpc_security_group_ids: ['sg-026313a646e2d8470'], 19 | tags: { 20 | Name: `spotlight-node-${id}`, 21 | }, 22 | }, 23 | ], 24 | }, 25 | ], 26 | } 27 | )); 28 | const output = _.map(identifiers, id => ( 29 | { 30 | [`spotlight-node-${id}-public_ip`]: [ 31 | { 32 | "value": `\${aws_instance.spotlight-node-${id}.public_ip}` 33 | } 34 | ] 35 | } 36 | )); 37 | const configuration = { 38 | ...scaffold, 39 | output, 40 | resource 41 | }; 42 | 43 | if (path) { 44 | await createPathAndWriteObject(path, configuration); 45 | } 46 | return configuration; 47 | }; 48 | 49 | -------------------------------------------------------------------------------- /logging/logging.mjs: -------------------------------------------------------------------------------- 1 | import * as fs from 'fs/promises'; 2 | import { createLogger, format, transports } from 
'winston'; 3 | 4 | await fs.mkdir('logs', { recursive: true }); 5 | 6 | export const logger = createLogger({ 7 | level: 'info', 8 | format: format.combine( 9 | format.timestamp({ 10 | format: 'YYYY-MM-DD HH:mm:ss', 11 | }), 12 | format.errors({ stack: true }), 13 | format.splat(), 14 | format.json() 15 | ), 16 | defaultMeta: { service: 'arxlive-spotlight-annotator' }, 17 | transports: [ 18 | 19 | // 20 | // - Write all logs with level `info` and below to `logs/all.log`. 21 | // - Write all error-level logs to `logs/error.log`. 22 | // 23 | new transports.File({ 24 | filename: 'logs/error.log', 25 | level: 'error', 26 | }), 27 | new transports.File({ filename: 'logs/all.log' }), 28 | ], 29 | }); 30 | 31 | // 32 | // If we're not in production then **ALSO** log to the `console` 33 | // with the colorized simple format. 34 | // 35 | // eslint-disable-next-line no-process-env 36 | if (process.env.NODE_ENV !== 'production') { 37 | logger.add( 38 | new transports.Console({ 39 | format: format.combine(format.colorize(), format.simple()), 40 | }) 41 | ); 42 | } 43 | -------------------------------------------------------------------------------- /bin/geo/downloadBoundaries.js: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env node 2 | 3 | import fs from 'fs'; 4 | 5 | import json from '@discoveryjs/json-ext'; 6 | import { readJson } from '@svizzle/file'; 7 | import { Command } from 'commander'; 8 | import * as _ from 'lamb'; 9 | 10 | import { collectAllFeatures } from '../../geo/download.js'; 11 | 12 | 13 | const { stringifyStream } = json; 14 | 15 | const program = new Command(); 16 | program.requiredOption('-i, --config <path>', 'Configuration file. More on this in the README'); 17 | program.requiredOption('-o, --output <path>', 'Path in which to save the output data'); 18 | 19 | program.parse(); 20 | const options = program.opts(); 21 | 22 | const downloadBoundaries = async inputs => { 23 | for await (const { boundary, endpoint } of inputs) { 24 | console.log(`Collecting ${boundary}...`); 25 | const writeStream = fs.createWriteStream(`${options.output}/${boundary}.geojson`); 26 | const collection = await collectAllFeatures(endpoint); 27 | stringifyStream(collection).pipe(writeStream); 28 | } 29 | }; 30 | 31 | const main = async () => { 32 | readJson(options.config) 33 | .then(downloadBoundaries) 34 | .catch(() => { throw new Error('Unable to parse configuration'); }); 35 | }; 36 | 37 | main(); -------------------------------------------------------------------------------- /es/update.mjs: -------------------------------------------------------------------------------- 1 | import { stringify } from '@svizzle/utils'; 2 | import * as _ from 'lamb'; 3 | 4 | import { buildRequest, makeRequest } from '../es/requests.mjs'; 5 | 6 | /** 7 | * @function update 8 | * @description update a document on an ES index. 9 | * @param {string} domain - domain on which to update. 10 | * @param {string} index - index on which to update. 11 | * @param {string} id - id of document to update. 12 | * @param {Object} doc - an object containing the new fields and properties that constitute the update. 13 | * @returns {HttpResponse} response of the update request.
14 | */ 15 | export const update = async ( 16 | domain, 17 | index, 18 | id, 19 | doc, 20 | payloadOptions={}, 21 | query={}, 22 | { checkStatus=true } = {} 23 | ) => { 24 | const path = `${index}/_update/${encodeURIComponent(id)}`; 25 | const payload = { ...payloadOptions, doc }; 26 | const request = buildRequest(domain, path, 'POST', { payload, query }); 27 | const { body: response, code } = await makeRequest(request); 28 | 29 | if (!checkStatus) { 30 | return { response, code }; 31 | } 32 | 33 | if (code !== 200) { 34 | throw Error( 35 | `Update failed at ${domain}/${index} for document with ID: ${id}. 36 | Response:\n${stringify(response)}` 37 | ); 38 | } 39 | 40 | return response; 41 | }; 42 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/duplicateAggregations/README.md: -------------------------------------------------------------------------------- 1 | ## Duplicate Aggregations 2 | 3 | These aggregations include a mixture of different descriptive statistics 4 | relating to metadata which describes the duplicates found for entities provided 5 | by the Spotlight Tool. In particular, `dupes_10` and `dupes_60` are measures of 6 | how many duplicates were found **at that confidence level**. So if 7 | `dupes_10_count`'s value is 6, then there were a total of 6 duplicates found at 8 | confidence level 10. One entity having duplicates counts as a single occurrence 9 | of a duplicate, e.g. if `Photon` has 3 duplicates found at confidence level 10, 10 | it will contribute 1 occurrence to the total `dupes_10_count`. 11 | 12 | We also provide aggregations on the `dupes_ratio_X` metadata value, which is 13 | simply the `dupes_count_X` value divided by the total number of entities 14 | annotated for that piece of text. 
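As a made-up illustration of how the two metadata values relate (the numbers are invented, not taken from the index):

```js
// Invented example: 6 entities had duplicates at confidence 10,
// out of 30 entities annotated for the document.
const dupes_10_count = 6;
const entities_count = 30;
const dupes_10_ratio = dupes_10_count / entities_count; // 0.2
```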
15 | 16 | The aggregations use both `extended_stats` and `histogram`s for each metadata 17 | value (`dupes_count_X` and `dupes_ratio_X`) 18 | 19 | Endpoint: `POST arxiv_v6/_search` 20 | 21 | See: 22 | 23 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-bucket-histogram-aggregation.html 24 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-metrics-extendedstats-aggregation.html -------------------------------------------------------------------------------- /util/array.mjs: -------------------------------------------------------------------------------- 1 | import { isNotNil, mergeObjects } from '@svizzle/utils'; 2 | import * as cliProgress from 'cli-progress'; 3 | import * as _ from 'lamb'; 4 | 5 | const _batch = (arr, batchSize) => { 6 | return arr.map((val, i) => { 7 | if (i % batchSize === 0) { 8 | return arr.slice(i, i + batchSize); 9 | } 10 | return null; 11 | }); 12 | }; 13 | 14 | export const batch = _.pipe([_batch, _.filterWith(isNotNil)]); 15 | 16 | export const batchIterate = async(iterable, func, options={}) => { 17 | const { batchSize=100 } = options; 18 | 19 | const bar = new cliProgress.Bar(null, cliProgress.Presets.rect); 20 | bar.start(iterable.length, 0); 21 | const batches = batch(iterable, batchSize); 22 | let results = []; 23 | for (const batch_ of batches) { 24 | // eslint-disable-next-line no-await-in-loop 25 | const result = await func(batch_); 26 | results = [...results, result]; 27 | bar.increment(batch_.length); 28 | } 29 | 30 | bar.stop(); 31 | return results; 32 | }; 33 | 34 | export const batchIterateFlatten = async(iterable, func, options) => { 35 | const results = await batchIterate(iterable, func, options); 36 | return _.shallowFlatten(results); 37 | }; 38 | 39 | export const batchIterateMerge = async(iterable, func, options) => { 40 | const results = await batchIterate(iterable, func, options); 41 | return mergeObjects(results); 42 | }; 43 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # v0.0.15 2 | 3 | Create `annotateEsIndex` script 4 | 5 | # v0.0.14 6 | 7 | Fix `#getEntities` function 8 | Fix `entitiesDataQuality` bug 9 | 10 | # v0.0.13 11 | 12 | Fix Arcgis FeatureServer bug 13 | 14 | # v0.0.12 15 | 16 | Update package dependencies 17 | 18 | # v0.0.11 19 | 20 | Due to uknown bug, the changes below were never pushed to GH 21 | 22 | # v0.0.10 23 | 24 | Fix Arcgis FeatureServer bug where features weren't returning all properties. 25 | 26 | # v0.0.9 27 | 28 | Fix ES query bug 29 | 30 | # v0.0.8 31 | 32 | Add scripts for downloading geographic boundaries from an arcGis FeatureServer, 33 | and for converting these boundaries to mbtiles/pmtiles, and uploading the 34 | pmtiles file to an s3 bucket. 35 | 36 | # v0.0.7 37 | 38 | Fix authentication bug. Authentication endpoint expects a GET request with 39 | email and token provided in the URLSearchParams, but instead was being passed 40 | as the body of a POST request. This change fixes that bug. 41 | 42 | # v0.0.6 43 | 44 | Add authentication logic formerly contained in the 45 | annotation service 46 | 47 | # v0.0.5 48 | 49 | Patch import errors in `jsonToEsIndex.js` script 50 | and patch a bulk request bug. 
51 | 52 | # v0.0.4 53 | 54 | Add the jsonToEsIndex script to `bin/` 55 | 56 | # v0.0.3 57 | 58 | Port Terraform configuration to nestauk/dap_dv_backends 59 | 60 | # v0.0.2 61 | 62 | Added some executable scripts in `bin/` 63 | 64 | # v0.0.1 65 | 66 | Copy of utilities from nestauk/dap_dv_backends@ce64d0c 67 | -------------------------------------------------------------------------------- /es/pipeline.mjs: -------------------------------------------------------------------------------- 1 | import { arxliveCopy } from '../conf/config.mjs'; 2 | import { buildRequest, makeRequest } from '../es/requests.mjs'; 3 | 4 | /** 5 | * 6 | * @param {string} name - name of pipeline in url parsable form. 7 | * @param {string} description - description of pipeline. 8 | * @param {Array} processors - list of processors for pipeline. 9 | * @param {string} domain - domain on which to put pipeline. 10 | * @returns {Object} response object. 11 | */ 12 | const generic = (name, description, processors, domain) => { 13 | const path = `_ingest/pipeline/${name}`; 14 | const payload = { description, processors }; 15 | 16 | const request = buildRequest(domain, path, 'PUT', { payload }); 17 | return makeRequest(request); 18 | }; 19 | 20 | /** 21 | * 22 | * @param {Array} fields - list of fields to remove upon ingestion. 23 | * @param {string} domain - domain on which to put pipeline. 24 | * @returns {string} name of created pipeline 25 | */ 26 | export const remove = async (fields, domain = arxliveCopy) => { 27 | const description = `Remove ${fields.join(' ')}`; 28 | const name = `remove-${fields.join('-')}`; 29 | const processors = [ 30 | { 31 | remove: { 32 | field: fields, 33 | ignore_failure: true, 34 | }, 35 | }, 36 | ]; 37 | const response = await generic(name, description, processors, domain); 38 | if (response.code === 200) { 39 | return name; 40 | } 41 | throw new Error( 42 | `Failed to create remove pipeline. Response:\n${response}` 43 | ); 44 | 45 | }; 46 | -------------------------------------------------------------------------------- /neo4j/community.mjs: -------------------------------------------------------------------------------- 1 | import * as _ from 'lamb'; 2 | 3 | import { getSession } from '../neo4j/driver.mjs'; 4 | import { promisify } from '../neo4j/util.mjs'; 5 | 6 | const getMetadata = data => { 7 | const intermediateCommunities = data[0].intermediateCommunityIds.length; 8 | const communityCount = _.keys(_.group(data, _.getKey('community'))).length; 9 | const intermediateCounts = _.map( 10 | _.range(0, intermediateCommunities), 11 | idx => { 12 | const communities = _.group(data, r => r.intermediateCommunityIds[idx]); 13 | const counts = _.keys(communities).length; 14 | return counts; 15 | } 16 | ); 17 | const metadata = { 18 | intermediateCommunities, 19 | communityCount, 20 | intermediateCounts 21 | }; 22 | return metadata; 23 | }; 24 | 25 | const objectToString = object => _.reduce( 26 | _.pairs(object), 27 | (acc, [key, value]) => `${acc.length ? 
`${acc},` : ''} ${key}: ${value}`, 28 | '' 29 | ); 30 | 31 | const generateCommand = (graph, options) => ` 32 | CALL gds.louvain.stream('${graph}', { ${objectToString(options)} }) 33 | YIELD nodeId, communityId, intermediateCommunityIds 34 | RETURN gds.util.asNode(nodeId).URI AS URI, communityId, intermediateCommunityIds 35 | ORDER BY communityId ASC 36 | `; 37 | 38 | export const stream = async (graph, options) => { 39 | const [session, driver] = await getSession(); 40 | const command = generateCommand(graph, options); 41 | const result = session.run(command); 42 | const data = await promisify(result, session, driver); 43 | const metadata = getMetadata(data); 44 | 45 | return { data, metadata }; 46 | }; 47 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/entitiesCountOverTokenCountByConfidence/request.mjs: -------------------------------------------------------------------------------- 1 | import { promises as fs } from 'fs'; 2 | import { fileURLToPath } from 'url'; 3 | import { dirname } from 'path'; 4 | 5 | const main = async () => { 6 | const confidences = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]; 7 | const aggs = confidences.reduce((acc, conf) => { 8 | return { 9 | ...acc, 10 | [`entities_count_over_token_count_at_${conf}_extended_stats`]: { 11 | extended_stats: { 12 | script: `if (doc['dbpedia_entities_metadata.confidence_counts.${conf}'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.${conf}'].value) / doc['textBody_abstract_article.token_count'].value;` 13 | } 14 | }, 15 | [`entities_count_over_token_count_at_${conf}_histogram`]: { 16 | histogram: { 17 | script: `if (doc['dbpedia_entities_metadata.confidence_counts.${conf}'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.${conf}'].value) / doc['textBody_abstract_article.token_count'].value;`, 18 | interval: 0.01, 19 | min_doc_count: 1 20 | } 21 | } 22 | }; 23 | }, {}); 24 | const payload = { 25 | size: 0, 26 | aggs 27 | }; 28 | const requestString = JSON.stringify(payload, null, 4); 29 | const __filename = fileURLToPath(import.meta.url); 30 | const __dirname = dirname(__filename); 31 | await fs.writeFile(`${__dirname}/request.json`, requestString); 32 | }; 33 | 34 | main(); 35 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/entitiesCountAggsByConfidenceOverEntitiesCount/request.mjs: -------------------------------------------------------------------------------- 1 | import { promises as fs } from 'fs'; 2 | import { fileURLToPath } from 'url'; 3 | import { dirname } from 'path'; 4 | 5 | const main = async () => { 6 | const confidences = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]; 7 | const aggs = confidences.reduce((acc, conf) => { 8 | return { 9 | ...acc, 10 | [`confidence_${conf}_normalised_with_entities_count_extended_stats`]: { 11 | extended_stats: { 12 | script: `if (doc['dbpedia_entities_metadata.confidence_counts.${conf}'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.${conf}'].value) / doc['dbpedia_entities_metadata.entities_count'].value;` 13 | } 14 | }, 15 | [`confidence_${conf}_normalised_with_entities_count_histogram`]: { 16 | histogram: { 17 | script: `if 
(doc['dbpedia_entities_metadata.confidence_counts.${conf}'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.${conf}'].value) / doc['dbpedia_entities_metadata.entities_count'].value;`, 18 | interval: 0.01, 19 | min_doc_count: 1 20 | } 21 | } 22 | }; 23 | }, {}); 24 | const payload = { 25 | size: 0, 26 | aggs 27 | }; 28 | const requestString = JSON.stringify(payload, null, 4); 29 | const __filename = fileURLToPath(import.meta.url); 30 | const __dirname = dirname(__filename); 31 | await fs.writeFile(`${__dirname}/request.json`, requestString); 32 | }; 33 | 34 | main(); 35 | -------------------------------------------------------------------------------- /conf/mappings.mjs: -------------------------------------------------------------------------------- 1 | export const defaultMapping = { 2 | type: 'nested', 3 | properties: { 4 | URI: { 5 | type: 'keyword', 6 | }, 7 | confidence: { 8 | type: 'integer', 9 | }, 10 | percentageOfSecondRank: { 11 | type: 'float', 12 | }, 13 | similarityScore: { 14 | type: 'float', 15 | }, 16 | surfaceForm: { 17 | type: 'text', 18 | }, 19 | duplicates_60: { 20 | type: 'integer', 21 | }, 22 | duplicates_10: { 23 | type: 'integer', 24 | } 25 | }, 26 | }; 27 | 28 | export const metaDataMapping = { 29 | properties: { 30 | confidence_avg: { 31 | type: 'float' 32 | }, 33 | confidence_max: { 34 | type: 'integer' 35 | }, 36 | confidence_min: { 37 | type: 'integer' 38 | }, 39 | entities_count: { 40 | type: 'integer' 41 | }, 42 | dupes_10_ratio: { 43 | type: 'float' 44 | }, 45 | dupes_60_ratio: { 46 | type: 'float' 47 | }, 48 | dupes_10_count: { 49 | type: 'integer' 50 | }, 51 | dupes_60_count: { 52 | type: 'integer' 53 | }, 54 | confidence_counts: { 55 | properties: { 56 | "0": { 57 | type: 'integer' 58 | }, 59 | "10": { 60 | type: 'integer' 61 | }, 62 | "20": { 63 | type: 'integer' 64 | }, 65 | "30": { 66 | type: 'integer' 67 | }, 68 | "40": { 69 | type: 'integer' 70 | }, 71 | "50": { 72 | type: 'integer' 73 | }, 74 | "60": { 75 | type: 'integer' 76 | }, 77 | "70": { 78 | type: 'integer' 79 | }, 80 | "80": { 81 | type: 'integer' 82 | }, 83 | "90": { 84 | type: 'integer' 85 | }, 86 | "100": { 87 | type: 'integer' 88 | } 89 | } 90 | } 91 | } 92 | 93 | }; 94 | -------------------------------------------------------------------------------- /geo/download.js: -------------------------------------------------------------------------------- 1 | import * as _ from 'lamb' 2 | 3 | import { batchIterateFlatten } from '../util/array.mjs' 4 | 5 | // API docs: https://developers.arcgis.com/rest/services-reference/enterprise/query-feature-service-layer-.htm 6 | 7 | export const getAllFeatureIds = async featureServerEndpoint => { 8 | 9 | const url = new URL(`${featureServerEndpoint}/0/query`) 10 | url.search = new URLSearchParams({ 11 | f: 'json', 12 | returnIdsOnly: true, 13 | where: '1=1', 14 | outFields: '*' 15 | }).toString() 16 | 17 | const response = await fetch(url, { method: 'POST' }); 18 | const result = await response.json(); 19 | return result.objectIds; 20 | } 21 | 22 | export const collectAllFeatures = async featureServerEndpoint => { 23 | 24 | const ids = await getAllFeatureIds(featureServerEndpoint) 25 | const url = new URL(`${featureServerEndpoint}/0/query`) 26 | 27 | const downloadFeatures = async batch => { 28 | url.search = new URLSearchParams({ 29 | f: 'geoJSON', 30 | where: '1=1', 31 | objectIds: batch, 32 | outFields: '*', 33 | }) 34 | const response = await 
fetch(url, { method: 'POST' }); 35 | const result = await response.json(); 36 | return result; 37 | } 38 | const results = await batchIterateFlatten(ids, downloadFeatures, { batchSize: 100 }); 39 | const collection = _.reduce( 40 | results, 41 | (acc, curr) => { 42 | acc.features.push(...curr.features); 43 | return acc 44 | } 45 | ) 46 | return { 47 | type: "FeatureCollection", 48 | ...collection 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "bin": { 3 | "annotate": "bin/annotate.js", 4 | "annotateEsIndex": "bin/annotateEsIndex.js", 5 | "jsonToEsIndex": "bin/jsonToEsIndex.js", 6 | "annotationsDataQuality": "bin/annotationsDataQuality/annotationsDataQuality.js", 7 | "entitiesDataQuality": "bin/entitiesDataQuality.js", 8 | "downloadBoundaries": "bin/geo/downloadBoundaries.js", 9 | "generatePmTiles": "bin/geo/generatePmTiles.sh" 10 | }, 11 | "bugs": { 12 | "url": "https://github.com/nestauk/dap_dv_backends_utils/issues" 13 | }, 14 | "dependencies": { 15 | "@aws-crypto/sha256-browser": "^2.0.1", 16 | "@aws-sdk/client-s3": "^3.121.0", 17 | "@aws-sdk/client-ses": "^3.128.0", 18 | "@aws-sdk/credential-provider-node": "^3.49.0", 19 | "@aws-sdk/node-http-handler": "^3.49.0", 20 | "@aws-sdk/protocol-http": "^3.49.0", 21 | "@aws-sdk/signature-v4": "^3.49.0", 22 | "@discoveryjs/json-ext": "^0.5.7", 23 | "@svizzle/file": "^0.12.0", 24 | "@svizzle/utils": "^0.16.0", 25 | "cli-progress": "^3.10.0", 26 | "commander": "^9.0.0", 27 | "lamb": "^0.60.0", 28 | "mkdirp": "^2.1.3", 29 | "neo4j-driver": "^5.0.1", 30 | "undici": "^5.22.1", 31 | "winston": "^3.5.1" 32 | }, 33 | "description": "Utilties for the DAP data visualisation team's backend services", 34 | "devDependencies": { 35 | "eslint": "^8.14.0", 36 | "mocha": "^9.1.3" 37 | }, 38 | "homepage": "https://github.com/nestauk/dap_dv_backends_utils#readme", 39 | "keywords": [ 40 | "utilities", 41 | "utils", 42 | "dap", 43 | "backend", 44 | "data-visualisation" 45 | ], 46 | "license": "MIT", 47 | "name": "dap_dv_backends_utils", 48 | "repository": { 49 | "type": "git", 50 | "url": "git+https://github.com/nestauk/dap_dv_backends_utils.git" 51 | }, 52 | "type": "module", 53 | "version": "0.0.16" 54 | } -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/duplicateAggregations/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "dupes_10_count_extended_stats": { 5 | "extended_stats": { 6 | "field": "dbpedia_entities_metadata.dupes_10_count" 7 | } 8 | }, 9 | "dupes_10_count_histogram": { 10 | "histogram": { 11 | "field": "dbpedia_entities_metadata.dupes_10_count", 12 | "interval": 1, 13 | "min_doc_count": 1 14 | } 15 | }, 16 | "dupes_10_ratio_extended_stats": { 17 | "extended_stats": { 18 | "field": "dbpedia_entities_metadata.dupes_10_ratio" 19 | } 20 | }, 21 | "dupes_10_ratio_histogram": { 22 | "histogram": { 23 | "field": "dbpedia_entities_metadata.dupes_10_ratio", 24 | "interval": 0.01, 25 | "min_doc_count": 1 26 | } 27 | }, 28 | "dupes_60_count_extended_stats": { 29 | "extended_stats": { 30 | "field": "dbpedia_entities_metadata.dupes_60_count" 31 | } 32 | }, 33 | "dupes_60_count_histogram": { 34 | "histogram": { 35 | "field": "dbpedia_entities_metadata.dupes_60_count", 36 | "interval": 1, 37 | "min_doc_count": 1 38 | } 39 | }, 40 | 
"dupes_60_ratio_extended_stats": { 41 | "extended_stats": { 42 | "field": "dbpedia_entities_metadata.dupes_60_ratio" 43 | } 44 | }, 45 | "dupes_60_ratio_histogram": { 46 | "histogram": { 47 | "field": "dbpedia_entities_metadata.dupes_60_ratio", 48 | "interval": 0.01, 49 | "min_doc_count": 1 50 | } 51 | } 52 | } 53 | } -------------------------------------------------------------------------------- /es/bulk.mjs: -------------------------------------------------------------------------------- 1 | import { stringify } from '@svizzle/utils'; 2 | import * as _ from 'lamb'; 3 | 4 | import { buildRequest, makeRequest } from '../es/requests.mjs'; 5 | import { logger } from '../logging/logging.mjs'; 6 | 7 | const generateBulkPayload = (method, index) => _.pipe([ 8 | _.flatMapWith(doc => 9 | [ 10 | { [method]: { 11 | ...('_id' in doc && { "_id": doc._id }), 12 | "_index": index 13 | } }, 14 | method === 'update' ? { doc: doc.data } : doc.data 15 | ] 16 | ), 17 | _.reduceWith((acc, curr) => `${acc}\n${JSON.stringify(curr)}`, ''), 18 | json => `${json}\n` 19 | ]); 20 | 21 | /** 22 | * @function bulkRequest 23 | * @description creates multiple documents on an ES index in one request. 24 | * @param {string} domain - domain on which to update. 25 | * @param {string} index - index on which to update. 26 | * @param {Object[]} documents - list of documents, where each object has an id 27 | * key and a data key. The data key is the document intended to be created. 28 | * @param {string} method - the method to use (create, update, delete, etc.) 29 | * @returns {HttpResponse} response of the update reqeuest. 30 | */ 31 | export const bulkRequest = async ( 32 | domain, 33 | index, 34 | documents, 35 | method, 36 | { error=true, refresh="false" }={} 37 | ) => { 38 | const path = `${index}/_bulk`; 39 | const generate = generateBulkPayload(method, index); 40 | const payload = generate(documents); 41 | 42 | // if payload is empty, no docs were supplied to the function 43 | if (!payload.trim()) { 44 | console.log("Payload empty"); 45 | return { response: "Payload empty", code: 204 }; 46 | } 47 | 48 | const request = buildRequest( 49 | domain, path, 'POST', 50 | { payload, contentType: 'application/x-ndjson', query: { refresh } } 51 | ); 52 | const { body: response, code } = await makeRequest(request); 53 | if (response.error) { 54 | if (error) { 55 | throw new Error(stringify(response)); 56 | } else { 57 | logger.error(stringify(response)); 58 | } 59 | } 60 | return { response, code }; 61 | }; 62 | -------------------------------------------------------------------------------- /es/document.mjs: -------------------------------------------------------------------------------- 1 | import { stringify } from '@svizzle/utils'; 2 | 3 | import { buildRequest, makeRequest } from '../es/requests.mjs'; 4 | 5 | /** 6 | * @function create 7 | * @description creates a document on an ES index. 8 | * @param {string} domain - domain on which to create the document. 9 | * @param {string} index - index on which to create the document. 10 | * @param {Object} doc - an object containing the new fields and properties that constitute the update. 11 | * @param {Object} [options={}] 12 | * @param {string} [options.id=''] - id of document (if empty, ElasticSearch creates one for you). 13 | * @returns {HttpResponse} response of the update reqeuest. 
14 | */ 15 | export const create = async (domain, index, doc, { id = '', checkStatus=true} = {}) => { 16 | const path = `${index}/_doc/${encodeURIComponent(id)}`; 17 | const payload = doc; 18 | const request = buildRequest(domain, path, 'POST', { payload }); 19 | const { body: response, code } = await makeRequest(request); 20 | if (!checkStatus) { 21 | return { response, code }; 22 | } 23 | if (parseInt(code / 200, 10) !== 1) { 24 | console.log(response); 25 | throw Error( 26 | `Creating document failed at ${domain}/${index} for document\n${JSON.stringify(doc, null, 2)}` 27 | ); 28 | } 29 | return response; 30 | }; 31 | 32 | /** 33 | * 34 | * @param {string} domain - domain on which to retrieve document 35 | * @param {string} index - index on which to retrieve document 36 | * @param {string} id - id of document to retrieve 37 | * @param {Object} [options={}] 38 | * @param {boolean} [options.source=false] - whether to return just the source of the document 39 | * @returns {Object} an ElasticSearch document 40 | */ 41 | export const get = async (domain, index, id, { source=false } = {}) => { 42 | const path = `${index}/_doc/${id}`; 43 | const request = buildRequest(domain, path, 'GET'); 44 | const { body: response, code } = await makeRequest(request); 45 | if (code !== 200) { 46 | console.log(response); 47 | throw Error(`Getting document for ${id} failed with response \n${stringify(response)}`); 48 | } 49 | if (source) { 50 | return response._source; 51 | } 52 | return response; 53 | }; 54 | -------------------------------------------------------------------------------- /bin/jsonToEsIndex.js: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env node 2 | 3 | import { promises as fs } from 'fs'; 4 | 5 | import { Command } from 'commander'; 6 | import * as _ from 'lamb'; 7 | 8 | import { arxliveCopy } from '../conf/config.mjs'; 9 | import { bulkRequest } from '../es/bulk.mjs'; 10 | import { createIndex } from '../es/index.mjs'; 11 | import { logger } from '../logging/logging.mjs'; 12 | import { batch } from '../util/array.mjs'; 13 | import { commanderParseInt } from '../util/commander.mjs'; 14 | 15 | const program = new Command(); 16 | program.option( 17 | '-d, --domain ', 18 | 'ES domain on which to ingest documents', 19 | arxliveCopy 20 | ); 21 | program.requiredOption('-i, --index ', 'Index on which to ingest'); 22 | program.requiredOption('-p, --path ', 'Path to JSON data'); 23 | program.option('--batch-size ', 'Size of batch of docs to upload', commanderParseInt, 100); 24 | program.option( 25 | '--key ', 26 | 'Top level key in JSON object to use as key. If not supplied, keys will be generated automatically', 27 | null 28 | ); 29 | program.option( 30 | '--list-key ', 31 | 'Key for the documents if documents are stored as a value at the root level of the json file. Not recommended', 32 | null 33 | ); 34 | 35 | program.parse(); 36 | const options = program.opts(); 37 | 38 | const main = async () => { 39 | 40 | await createIndex(options.domain, options.index); 41 | 42 | const json = JSON.parse( 43 | await fs.readFile(options.path, { encoding: 'utf-8' }) 44 | ); 45 | const data = options.listKey ? json[options.listKey] : json; 46 | 47 | const documents = options.key 48 | ? 
_.map(data, object => { 49 | const { [options.key]: _id, ...contents } = object; 50 | return { _id, data: contents }; 51 | }) 52 | : _.map(data, (contents, _id) => ({ _id, data: contents })); 53 | 54 | const docsWithId = _.filter(documents, doc => '_id' in doc); 55 | 56 | for (const docs of batch(docsWithId, options.batchSize)) { 57 | // eslint-disable-next-line no-await-in-loop 58 | const response = await bulkRequest(options.domain, options.index, docs, 'create'); 59 | if (response.code !== 200) { 60 | logger.error(response); 61 | } 62 | }; 63 | }; 64 | 65 | main(); 66 | -------------------------------------------------------------------------------- /es/entities.mjs: -------------------------------------------------------------------------------- 1 | import * as _ from 'lamb'; 2 | 3 | import { arxliveCopy } from '../conf/config.mjs'; 4 | import { dbr } from '../dbpedia/util.mjs'; 5 | import { scroll, clearScroll } from '../es/search.mjs'; 6 | 7 | // titles are the Wiki pages with whitepace replaced with underscores, so 8 | // World War 1 => World_War_1 9 | // We use this terminology to stay consistent with Wikimedia's API, where the 10 | // this parameter is also named title. 11 | // https://api.wikimedia.org/wiki/API_reference/Core/Pages/Get_page 12 | export const getEntities = async( 13 | index, 14 | domain=arxliveCopy, 15 | { asTitle=true } = {} 16 | ) => { 17 | 18 | const scroller = scroll(domain, index, { size: 10000, }); 19 | const uriCounts = {}; 20 | let page; 21 | for await (page of scroller) { 22 | _.forEach(page.hits.hits, doc => { 23 | if ('dbpedia_entities' in doc._source) { 24 | _.forEach(doc._source.dbpedia_entities, ({ URI }) => { 25 | const key = asTitle 26 | ? URI.replace(dbr, '') 27 | : URI; 28 | uriCounts[key] = uriCounts[key] ? uriCounts[key] + 1 : 1; 29 | }); 30 | } 31 | }); 32 | } 33 | if (page) { 34 | clearScroll(domain, page._scroll_id); 35 | } 36 | const entities = _.keys(uriCounts); 37 | return entities; 38 | }; 39 | 40 | /** 41 | * @function getAllConfidenceLevels 42 | * @description counts the different confidence values found for every unique entity on a given ES index. 43 | * @param {string} index Index on which to count confidence levels 44 | * @param {string} domain Domain on which the index sits 45 | * @returns { Object. } an object where keys are the unique 46 | * entity URIs and values are an array of confidence values found for that entity. 47 | */ 48 | export const getAllConfidenceLevels = async( 49 | index, 50 | domain=arxliveCopy 51 | ) => { 52 | const scroller = scroll(domain, index, { size: 10000, }); 53 | const confidenceCounts = {}; 54 | let page; 55 | for await (page of scroller) { 56 | const entities = _.flatMap( 57 | page.hits.hits, 58 | _.getPath('_source.dbpedia_entities') 59 | ); 60 | _.forEach( 61 | entities, 62 | ({ URI, confidence }) => { 63 | confidenceCounts[URI] = URI in confidenceCounts 64 | ? [ ...confidenceCounts[URI], confidence ] 65 | : [ confidence ]; 66 | } 67 | ); 68 | } 69 | if (page) { 70 | clearScroll(domain, page._scroll_id); 71 | } 72 | return confidenceCounts; 73 | }; 74 | -------------------------------------------------------------------------------- /bin/geo/README.md: -------------------------------------------------------------------------------- 1 | ## downloadBoundaries 2 | 3 | This script downloads boundaries from an arcGis server and saves them as geoJSON 4 | to a specified output directory. 
In order to know which boundaries to download, 5 | you must supply a configuration file with a list of objects, where each object 6 | has a `boundary` and `endpoint` key. The `boundary` key will be used to name the 7 | resulting geoJSON file for that boundary, and the `endpoint` should point to the 8 | arcGis FeatureServer related to that boundary. Here is an example for the three 9 | different levels of International Territorial boundaries: 10 | 11 | ```json 12 | [ 13 | { 14 | "boundary": "itl1", 15 | "endpoint": "https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/International_Territorial_Level_1_January_2021_UK_BFC_2022/FeatureServer" 16 | }, 17 | { 18 | "boundary": "itl2", 19 | "endpoint": "https://services1.arcgis.com/ESMARspQHYMw9BZ9/ArcGIS/rest/services/International_Territorial_Level_2_January_2021_UK_BFC_V2_2022/FeatureServer" 20 | }, 21 | { 22 | "boundary": "itl3", 23 | "endpoint": "https://services1.arcgis.com/ESMARspQHYMw9BZ9/ArcGIS/rest/services/International_Territorial_Level_3_January_2021_UK_BFC_V3_2022/FeatureServer" 24 | } 25 | ] 26 | ``` 27 | 28 | ## generatePmTiles 29 | 30 | ### Requirements 31 | 32 | [tippecanoe](https://github.com/mapbox/tippecanoe), which can be installed with 33 | brew `brew install tippecanoe` 34 | [pmtiles](https://github.com/protomaps/go-pmtiles/releases), download the 35 | relevant binary at this link, and add it to your system's path. 36 | 37 | The script was written and tested using the following versions of the software 38 | above: 39 | 40 | `tippecanoe v2.23.0` 41 | `pmtiles v1.70` 42 | 43 | ### Running the script 44 | 45 | Script for generating a pmtiles file and uploading it to s3. You specify the 46 | directory in which the boundaries are kept (all in geojson) as the first 47 | argument, and the s3 URI of your desired bucket as the second argument: 48 | 49 | ```sh 50 | npx generatePmTiles boundaries/ s3://path-to-bucket 51 | ``` 52 | 53 | ### Uploading to s3 54 | 55 | You must have `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment 56 | variables set in order to upload the resulting pmtiles file to s3. 57 | 58 | To upload, simply run the following: 59 | 60 | pmtiles upload $pmtiles --bucket=$2 $pmtiles 61 | 62 | ```sh 63 | pmtiles upload --bucket= 64 | ``` 65 | -------------------------------------------------------------------------------- /bin/annotate.js: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env node 2 | 3 | import { Command } from 'commander'; 4 | import * as _ from 'lamb'; 5 | import { performance } from 'perf_hooks'; 6 | 7 | import { arxliveCopy } from '../conf/config.mjs'; 8 | import { getMappings } from '../es/index.mjs'; 9 | import { annotateIndex } from '../dbpedia/spotlight.mjs'; 10 | import { commanderParseInt } from '../util/commander.mjs'; 11 | import { dedent } from '../util/string.mjs'; 12 | 13 | const program = new Command(); 14 | program.option( 15 | '-d, --domain ', 16 | 'ES domain on which to annotate', 17 | arxliveCopy 18 | ); 19 | program.requiredOption( 20 | '-i, --index ', 21 | 'Index on which to annotate', 22 | ); 23 | program.requiredOption( 24 | '-s, --spotlight ', 25 | 'Endpoint for spotlight annotator', 26 | ); 27 | program.requiredOption( 28 | '-f, --field-name ', 29 | 'Field of doc to be used as input text for annotation' 30 | ); 31 | program.option( 32 | '-n, --new-field-name ', 33 | 'Name of new field to be created', 34 | 'dbpedia_entities' 35 | ); 36 | program.option( 37 | '-p, --page-size ', 38 | 'Size of page to scroll with', 39 | commanderParseInt, 40 | 10000 41 | ); 42 | program.option( 43 | '-b, --batch-size ', 44 | 'Size of batch to annotate over', 45 | commanderParseInt, 46 | 10 47 | ); 48 | program.option( 49 | '-g, --group-size ', 50 | 'Size of group of batches, usually corresponds to the number of worker nodes', 51 | commanderParseInt, 52 | 4 53 | ); 54 | program.option( 55 | '-z, --pages ', 56 | 'Number of pages to iterate over', 57 | 'all' 58 | ); 59 | program.option( 60 | '--force', 61 | 'Force the annotation process, even if no snapshots can be created' 62 | ); 63 | program.option( 64 | '--include-metadata', 65 | 'Include metadata fields on the index', 66 | true 67 | ); 68 | 69 | program.showHelpAfterError(); 70 | program.parse(); 71 | const options = program.opts(); 72 | 73 | const main = async () => { 74 | 75 | const currentMapping = await getMappings(options.domain, options.index); 76 | if ( 77 | options.newFieldName in currentMapping[options.index].mappings.properties && 78 | !options.force 79 | ) { 80 | throw new Error( 81 | dedent`Field already exists at index mapping, and force 82 | flag or continue flag not supplied` 83 | ); 84 | } 85 | 86 | const startTime = performance.now(); 87 | 88 | await annotateIndex( 89 | options.domain, 90 | options.index, 91 | options.spotlight, 92 | options.fieldName, 93 | options 94 | ); 95 | 96 | const endTime = performance.now(); 97 | console.log(`Total time taken (in ms): ${endTime - startTime}`); 98 | }; 99 | 100 | main(); 101 | 102 | 103 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/annotationsDataQuality.js: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env node 2 | 3 | import { exec } from 'child_process'; 4 | import fs from 'fs'; 5 | import path from 'path'; 6 | import { fileURLToPath } from 'url'; 7 | 8 | import { Command } from 'commander'; 9 | import * as _ from 'lamb'; 10 | 11 | import { buildRequest, makeRequest } from '../../es/requests.mjs'; 12 | 13 | const __filename = fileURLToPath(import.meta.url); 14 | const __dirname = path.dirname(__filename); 15 | 16 | const program = new Command(); 17 | program.requiredOption( 18 | '-d, --domain ', 19 | 'ES domain on which to aggregate', 20 | ); 21 | program.requiredOption('-i, --index ', 'ES index on which to aggregate'); 22 | program.option( 23 | '-p, --path ', 24 | 'Path to directory containing requests', 25 | `${__dirname}/requests` 26 | ); 27 | program.requiredOption( 28 | '-o, --out ', 29 | 'Path to directory in which to save results.', 30 | ); 31 | 32 | program.showHelpAfterError(); 33 | program.parse(); 34 | const options = program.opts(); 35 | 36 | const filterDirectory = predicate => _.pipe([ 37 | dirPath => fs.readdirSync(dirPath, { withFileTypes: true }), 38 | _.filterWith(predicate), 39 | _.mapWith(_.getKey('name')) 40 | ]); 41 | 42 | const getSubDirectories = filterDirectory(dirEnt => dirEnt.isDirectory()); 43 | 44 | const main = async () => { 45 | const aggregationDirectories = getSubDirectories(options.path); 46 | 47 | const payloads = await Promise.all( 48 | _.map(aggregationDirectories, dir => { 49 | const subPath = path.join(options.path, dir); 50 | 51 | // if file is generated using script, regenerate 52 | if (fs.existsSync(path.join(subPath, 'request.mjs'))) { 53 | exec(`node ${path.join(subPath, 'request.mjs')}`); 54 | } 55 | const payload = fs.readFileSync( 56 | path.join(subPath, 'request.json'), { encoding: 'utf-8' }); 57 | return { name: dir, payload }; 58 | })); 59 | 60 | const responses = await Promise.all( 61 | _.map(payloads, async ({ name, payload }) => { 62 | const requestPath = `${options.index}/_search`; 63 | const request = buildRequest( 64 | options.domain, 65 | requestPath, 66 | 'POST', 67 | { payload } 68 | ); 69 | const { body: response } = await makeRequest(request); 70 | return { name, payload, response }; 71 | })); 72 | 73 | if (options.out) { 74 | if (!fs.existsSync(options.out)) { 75 | fs.mkdirSync(options.out, { recursive: true }); 76 | } 77 | } 78 | await Promise.all( 79 | _.map(responses, response => { 80 | const outputPath = options.out 81 | ? path.join(options.out, `${response.name}.json`) 82 | : path.join(options.path, response.name, 'response.json'); 83 | fs.writeFileSync( 84 | outputPath, 85 | JSON.stringify(response.response, null, 4) 86 | ); 87 | })); 88 | }; 89 | 90 | main(); 91 | -------------------------------------------------------------------------------- /bin/annotateEsIndex.js: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env node 2 | 3 | import { Command } from 'commander'; 4 | import * as _ from 'lamb'; 5 | 6 | import { arxliveCopy } from '../conf/config.mjs'; 7 | import { sleep } from '../util/time.mjs'; 8 | import { commanderParseInt } from '../util/commander.mjs'; 9 | 10 | 11 | const { NESTA_EMAIL, NESTA_TOKEN } = process.env; 12 | 13 | if (!NESTA_EMAIL || !NESTA_TOKEN) { 14 | throw new Error(` 15 | Please export your NESTA_EMAIL and NESTA_TOKEN as environment variables. 
16 | More information on how to retrieve these can be found here: 17 | https://github.com/nestauk/dap_dv_backends/tree/dev/src/services/authentication` 18 | ) 19 | } 20 | 21 | const program = new Command(); 22 | program.requiredOption( 23 | '-d, --domain ', 24 | 'ES domain on which to annotate', 25 | arxliveCopy 26 | ); 27 | program.requiredOption( 28 | '-e, --endpoint ', 29 | 'Endpoint to be used for annotation' 30 | ); 31 | program.requiredOption( 32 | '-i, --index ', 33 | 'Index on which to annotate', 34 | ); 35 | program.requiredOption( 36 | '-f, --field-name ', 37 | 'Field of doc to be used as input text for annotation' 38 | ); 39 | program.option( 40 | '-n, --new-field-name ', 41 | 'Name of new field to be created', 42 | 'dbpedia_entities' 43 | ); 44 | program.option( 45 | '--include-metadata', 46 | 'Include metadata fields on the index', 47 | true 48 | ); 49 | program.option( 50 | '--workers ', 51 | 'Number of workers to use', 52 | commanderParseInt, 53 | 2 54 | ); 55 | 56 | program.showHelpAfterError(); 57 | program.parse(); 58 | const options = program.opts(); 59 | 60 | const main = async () => { 61 | 62 | const authHeader = `Basic ${Buffer.from(NESTA_EMAIL + ':' + NESTA_TOKEN).toString('base64')}`; 63 | 64 | const query = { 65 | domain: options.domain, 66 | index: options.index, 67 | field: options.fieldName, 68 | newField: options.newFieldName, 69 | includeMetaData: options.includeMetadata, 70 | workers: options.workers 71 | } 72 | 73 | const queryString = new URLSearchParams(query); 74 | const url = `${options.endpoint}/es?${queryString.toString()}`; 75 | 76 | let requestOptions = { 77 | method: 'GET', 78 | headers: { 79 | Authorization: authHeader 80 | } 81 | }; 82 | 83 | let response = await fetch(url, requestOptions); 84 | const { id } = await response.json(); 85 | 86 | console.log(id); 87 | 88 | const progressEndpoint = `${options.endpoint}/progress/` 89 | response = await fetch(`${progressEndpoint}/${id}`) 90 | let progress = await response.json(); 91 | 92 | while (progress.status !== 'finished') { 93 | response = await fetch(`${progressEndpoint}/${id}`) 94 | progress = await response.json(); 95 | console.log(progress); 96 | 97 | await sleep(1000 * 10); 98 | } 99 | }; 100 | 101 | main(); -------------------------------------------------------------------------------- /es/snapshot.mjs: -------------------------------------------------------------------------------- 1 | import { settings as globalSettings } from '../conf/config.mjs'; 2 | import { buildRequest, makeRequest } from '../es/requests.mjs'; 3 | 4 | const settings = globalSettings.snapshotSettings; 5 | 6 | /** 7 | * @function register 8 | * @description Registers a snapshot repository on the specified domain. This 9 | * repository is essentially a directory to contain snapshots, and 10 | * a default one for ES snapshots is typically created when the 11 | * user has specified the correct AWS configurations. 12 | * @param {string} domain - domain on which to register snapshot. 13 | * @param {string} repository - name of repository to register. 14 | * @returns {Object} reponse of request. 
15 | */ 16 | export const register = (domain, repository) => { 17 | const path = `_snapshot/${repository}`; 18 | const payload = { 19 | type: 's3', 20 | settings: { 21 | bucket: settings.bucketName, 22 | region: settings.region, 23 | role_arn: `arn:aws:iam::${settings.awsID}:role/${settings.snapshotRole}`, 24 | }, 25 | }; 26 | const request = buildRequest(domain, path, 'PUT', { payload }); 27 | return makeRequest(request, { verbose: true }); 28 | }; 29 | 30 | /** 31 | * @function trigger 32 | * @description this function triggers a snapshot for the specified domain and 33 | * saves it in the repository with the given snapshot name. 34 | * @param {string} domain - domain on which to trigger the snapshot. 35 | * @param {string} repository - repository in which to save the snapshot result. 36 | * @param {string} snapshot - name of the snapshot. 37 | * @returns {Object} response of request. 38 | */ 39 | export const trigger = (domain, repository, snapshot) => { 40 | const path = `_snapshot/${repository}/${snapshot}`; 41 | const request = buildRequest(domain, path, 'PUT'); 42 | return makeRequest(request, { verbose: true }); 43 | }; 44 | 45 | /** 46 | * @function list 47 | * @description lists the snapshots for the specified domain and repository. 48 | * @param {string} domain - domain on which to list the snapshots. 49 | * @param {string} repository - repository in which to list the snapshots. 50 | * @returns {Object} response object, containing the list of snapshots. 51 | */ 52 | export const list = (domain, repository) => { 53 | const path = repository ? `_snapshot/${repository}/_all` : '_snapshot'; 54 | const request = buildRequest(domain, path, 'GET'); 55 | return makeRequest(request, { verbose: true }); 56 | }; 57 | 58 | /** 59 | * @function restore 60 | * @description restores the domain to the specified snapshot located in the 61 | * specified repository. 62 | * @param {string} domain - domain on which to restore. 63 | * @param {string} repository - repository from which to get the snapshot needed to restore. 64 | * @param {string} snapshot - name of snapshot used to restore. 65 | * @returns {Object} response of request. 66 | */ 67 | export const restore = (domain, repository, snapshot) => { 68 | const payload = { indices: '-.kibana*,-.opendistro*' }; 69 | const path = `_snapshot/${repository}/${snapshot}/_restore`; 70 | const request = buildRequest(domain, path, 'POST', { payload }); 71 | return makeRequest(request, { verbose: true }); 72 | }; 73 | 74 | /** 75 | * @function status 76 | * @description retrieves snapshot status of specified domain. 77 | * @param {string} domain - domain on which to retrieve status. 78 | * @returns {Object} response object containing snapshot status of specified domain. 79 | */ 80 | export const status = domain => { 81 | const path = '_snapshot/_status'; 82 | const request = buildRequest(domain, path, 'GET'); 83 | return makeRequest(request, { verbose: true }); 84 | }; 85 | -------------------------------------------------------------------------------- /es/search.mjs: -------------------------------------------------------------------------------- 1 | import { buildRequest, makeRequest } from '../es/requests.mjs'; 2 | 3 | /** 4 | * @function clearScroll 5 | * @description clears scroll objects on an ElasticSearch domain 6 | * @param {string} domain - domain on which to clear the scroll object 7 | * @param {string} [id=null] - id of scroll object. If not supplied, the function will clear all scroll objects on the specified domain. 
8 | * @returns {Object} response to the request made to clear the scroll. 9 | */ 10 | export const clearScroll = async (domain, id = null) => { 11 | const payload = id ? { scroll_id: id } : undefined; 12 | const path = id ? '_search/scroll' : '_search/scroll/_all'; 13 | const request = buildRequest(domain, path, 'DELETE', { payload }); 14 | const { body: result } = await makeRequest(request); 15 | return result; 16 | }; 17 | 18 | /** 19 | * @function first 20 | * @description retrieves the first batch of documents (or first page) and the associated scroll id. 21 | * @param {string} domain - domain on which to scroll. 22 | * @param {string} index - index on which to scroll. 23 | * @param {number} size - size of pages, this will determine how many documents per page to return. 24 | * @returns {HttpResponse} the first response to the scroll API call. This response contains both the documents and the id for the scroll object which is needed for subsequent calls. 25 | */ 26 | const first = async (domain, index, size) => { 27 | const path = `${index}/_search`; 28 | const query = { scroll: '1h' }; 29 | const payload = { size, sort: ['_doc'] }; 30 | const firstRequest = buildRequest(domain, path, 'POST', { 31 | payload, 32 | query, 33 | }); 34 | const { body: result } = await makeRequest(firstRequest, { retry: 5000 }); 35 | return result; 36 | }; 37 | 38 | /** 39 | * @function subsequent 40 | * @description function for subsequent calls to the scroll API after having first called {@link first} 41 | * @param {string} domain - domain on which to scroll 42 | * @param {string} id - id for scroll object 43 | * @returns {HttpResponse} response object containing the documents for the current iteration of the scroll. 44 | */ 45 | const subsequent = async (domain, id) => { 46 | const path = `_search/scroll`; 47 | const payload = { scroll: '1h', scroll_id: id }; 48 | const subsequentRequest = buildRequest(domain, path, 'POST', { 49 | payload, 50 | }); 51 | const { body: result } = await makeRequest(subsequentRequest, { retry: 5000 }); 52 | return result; 53 | }; 54 | 55 | /** 56 | * @function scroll 57 | * @description Generator function, returns an iterator which uses the Scroll API to iterate over huge numbers of documents (potentially all documents) on a given index. 58 | * @param {string} domain - domain on which to scroll. 59 | * @param {string} index - index on which to scroll. 60 | * @param {Object} [options] 61 | * @param {number} [options.size=1000] - size of page - this determines how many documents are returned per iteration. 62 | * @param {string|number} [options.pages='all'] - number of pages to return. If not specified, the generator will iterate exhaustively until all documents are returned. 63 | * @returns {Generator} a generator which yields HttpResponses. Each response has {@link options.size} number of documents. 64 | */ 65 | export async function *scroll( 66 | domain, 67 | index, 68 | { size = 1000, pages = 'all' } = {} 69 | ) { 70 | 71 | // set limit to infinity if all to iterate all results 72 | const limit = pages === 'all' ? 
Infinity : pages; 73 | let next = await first(domain, index, size); 74 | for ( 75 | let i = 0; 76 | i < limit && next.hits && next.hits.hits.length !== 0; 77 | i++ 78 | ) { 79 | yield next; 80 | // eslint-disable-next-line no-await-in-loop 81 | next = await subsequent(domain, next._scroll_id); 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- /es/requests.mjs: -------------------------------------------------------------------------------- 1 | import sha256 from '@aws-crypto/sha256-browser'; 2 | import { defaultProvider } from '@aws-sdk/credential-provider-node'; 3 | import { NodeHttpHandler } from '@aws-sdk/node-http-handler'; 4 | import { HttpRequest } from '@aws-sdk/protocol-http'; 5 | import { SignatureV4 } from '@aws-sdk/signature-v4'; 6 | 7 | import { sleep } from '../util/time.mjs'; 8 | 9 | const { Sha256 } = sha256; 10 | 11 | const signer = new SignatureV4({ 12 | credentials: defaultProvider(), 13 | region: 'eu-west-2', 14 | service: 'es', 15 | sha256: Sha256, 16 | }); 17 | 18 | /** 19 | * @function buildRequest 20 | * @description builds a HttpRequest object using the AWS sdk. Needed for signing the request using Environment variables. 21 | * @param {string} domain ElasticSearch domain on which to make request. 22 | * @param {string} path - additional path, appended after the domain in the request URL. 23 | * @param {string} method - HTTP request method (GET, POST, etc.). 24 | * @param {Object} [options] 25 | * @param {Object|string} [options.payload] - optional payload for the request. Can be passed as object and subsequently stringifyed. 26 | * @param {Object} [options.query={}] - optionaly query object for using the search API. 27 | * @returns {HttpRequest} the AWS HttpRequest object, signed using AWS credentials. 28 | */ 29 | export const buildRequest = ( 30 | domain, 31 | path, 32 | method, 33 | { payload, contentType = 'application/json', query = {} } = {} 34 | ) => { 35 | const body = 36 | payload && typeof payload !== 'string' 37 | ? JSON.stringify(payload) 38 | : payload; 39 | return new HttpRequest({ 40 | body, 41 | method, 42 | path, 43 | query, 44 | headers: { 45 | 'Content-Type': contentType, 46 | host: domain, 47 | }, 48 | hostname: domain, 49 | }); 50 | }; 51 | 52 | /** 53 | * @function parseResponseBody 54 | * @description helper function which returns promise of signed response object's body 55 | * @param {Object} response - response object obtained from {@link makeRequest} 56 | * @returns {Promise} promise which resolves to the response's body 57 | */ 58 | const parseResponseBody = response => { 59 | let responseBody = ''; 60 | return new Promise((resolve, reject) => { 61 | response.body.on('data', chunk => { 62 | responseBody += chunk; 63 | }); 64 | response.body.on('end', () => { 65 | try { 66 | resolve(JSON.parse(responseBody)); 67 | } catch (e) { 68 | reject(e); 69 | } 70 | }); 71 | }); 72 | }; 73 | 74 | /** 75 | * @function _makeRequest 76 | * @description makes a request using a HttpRequest object. 77 | * @param {HttpRequest} request - the HttpRequest object built using {@link buildRequest} 78 | * @param {Object} [options={}] 79 | * @param {boolean} [options.verbose] - whether to log the output of the request and response. 
80 | * @returns {Object} the HttpResponse object 81 | */ 82 | const _makeRequest = async request => { 83 | 84 | // Sign the request 85 | const signedRequest = await signer.sign(request); 86 | 87 | // Send the request 88 | const client = new NodeHttpHandler(); 89 | const { response } = await client.handle(signedRequest); 90 | const responseBody = await parseResponseBody(response); 91 | 92 | return { 93 | code: response.statusCode, 94 | message: response.body.statusMessage, 95 | body: responseBody, 96 | }; 97 | }; 98 | 99 | /** 100 | * @function makeRequest 101 | * @description wraps the makeRequest function with try/catch and retry logic. 102 | * @param {HttpRequest} request - the HttpRequest object built using {@link buildRequest} 103 | * @param {Object} [options={}] 104 | * @param {boolean} [options.retry] - how long to wait between trys. 105 | * @param {boolean} [options.limit] - how many times to retry. 106 | * @returns {Object} the HttpResponse object 107 | */ 108 | export const makeRequest = async (request, { retry=null, limit=10 }={}) => { 109 | const promise = _makeRequest(request); 110 | const result = promise 111 | .then(value => value) 112 | .catch(async err => { 113 | if (retry && limit !== 0) { 114 | await sleep(retry); 115 | return makeRequest(request, { retry, limit: limit-1 }); 116 | } 117 | throw err; 118 | 119 | }); 120 | return result; 121 | }; 122 | -------------------------------------------------------------------------------- /dbpedia/requests.mjs: -------------------------------------------------------------------------------- 1 | import * as _ from 'lamb'; 2 | 3 | import { getValue, isIterableLongerThan1 } from '@svizzle/utils'; 4 | 5 | import { loadOntology } from '../dbpedia/ontology.mjs'; 6 | import { dbr, prefixes } from '../dbpedia/util.mjs'; 7 | import { query } from '../sparql/query.mjs'; 8 | 9 | const sanitizeInput = input => { 10 | const URIs = typeof input === 'string' ? [input] : input; 11 | const sanitizedURIs = _.map(URIs, URI => 12 | URI.charAt(0) !== '<' 13 | ? URI.startsWith(dbr) ? `<${URI}>` : `<${dbr}${URI}>` 14 | : URI 15 | ); 16 | return sanitizedURIs; 17 | }; 18 | 19 | const buildIndividualQueries = (inputs, template) => _.map( 20 | inputs, 21 | input => _.replace(/\$\$URI\$\$/gu, input)(template) 22 | ); 23 | 24 | const buildQuery = queries => { 25 | const body = _.join(queries, '\nUNION\n'); 26 | const sparql = ` 27 | ${prefixes} 28 | SELECT * WHERE { 29 | ${body} 30 | }`; 31 | return sparql; 32 | }; 33 | 34 | const makeRequest = async sparql => { 35 | const { results } = await query(sparql); 36 | const values = _.map(results.bindings, _.mapValuesWith(getValue)); 37 | return values; 38 | }; 39 | 40 | const genericRequest = async (input, template) => { 41 | const sanitizedInput = sanitizeInput(input); 42 | const queries = buildIndividualQueries(sanitizedInput, template); 43 | const sparql = buildQuery(queries); 44 | const values = await makeRequest(sparql); 45 | return values; 46 | }; 47 | 48 | /** 49 | * @function getEntityDetails 50 | * @description provides details such as imageURL and abstract for supplied DBpedia URIs 51 | * @param {String|String[]} input - a single DBpedia URI or a list of URIs. 52 | * @returns a list of entities for the supplied DBPedia URIs. 53 | */ 54 | export const getEntityDetails = async input => { 55 | const template = 56 | `{ 57 | BIND ($$URI$$ as ?URI) 58 | OPTIONAL { 59 | $$URI$$ dbo:abstract ?abstract . 
60 | FILTER (langMatches(lang(?abstract),"en")) 61 | } 62 | OPTIONAL { $$URI$$ prov:wasDerivedFrom ?derivedFrom . } 63 | OPTIONAL { $$URI$$ dbo:thumbnail ?imageURL . } 64 | }`; 65 | 66 | const values = await genericRequest(input, template); 67 | 68 | // filter out bad encodings 69 | const filteredValues = _.map(values, entity => { 70 | if ('imageURL' in entity) { 71 | if (entity.imageURL.includes('�')) { 72 | const { imageURL, ...rest } = entity; 73 | return rest; 74 | } 75 | } 76 | return entity; 77 | }); 78 | 79 | return filteredValues; 80 | }; 81 | 82 | export const getEntityAbstract = input => { 83 | const template = `{ 84 | BIND ($$URI$$ as ?URI) 85 | OPTIONAL { 86 | $$URI$$ dbo:abstract ?abstract . 87 | FILTER (langMatches(lang(?abstract),"en")) 88 | } 89 | }`; 90 | return genericRequest(input, template); 91 | }; 92 | 93 | export const isDisambiguation = async input => { 94 | const template = `{ 95 | BIND ($$URI$$ as ?title) 96 | OPTIONAL { $$URI$$ dbo:wikiPageDisambiguates ?resource . } 97 | }`; 98 | const values = await genericRequest(input, template); 99 | const groups = _.group(values, _.getKey('title')); 100 | 101 | // if the dbo:wikiPageDisambiguates predicate returns at least one value 102 | // for the URI, then it's a disambiguation page. As the title binding 103 | // will always be found, we check for length > 1 104 | const disambiguations = _.mapValues(groups, isIterableLongerThan1); 105 | return disambiguations; 106 | }; 107 | 108 | export const getClasses = async ( 109 | input, 110 | { 111 | depth=Infinity, 112 | squash=true, 113 | fullURI=true 114 | } = {} 115 | ) => { 116 | 117 | const template = `{ 118 | BIND ($$URI$$ as ?title) 119 | OPTIONAL { $$URI$$ rdf:type ?type . } 120 | }`; 121 | const values = await genericRequest(input, template); 122 | const groups = _.group(values, _.getKey('title')); 123 | const types = _.mapValues(groups, group => _.map(group, _.getKey('type'))); 124 | const classFilter = await loadOntology(depth); 125 | const filteredTypes = _.mapValues( 126 | types, 127 | typeList => { 128 | const filtered = _.filter(typeList, t => t in classFilter); 129 | const squashed = squash 130 | ? filtered 131 | : _.map(filtered, key => _.getIn(classFilter, key)); 132 | const URIs = fullURI 133 | ? squashed 134 | : JSON.parse(stringify(squashed).replaceAll(dbo, '')); 135 | return URIs; 136 | } 137 | ); 138 | return filteredTypes; 139 | }; 140 | 141 | export const hasInfoBoxTemplate = async input => { 142 | const template = `{ 143 | BIND ($$URI$$ as ?URI) 144 | OPTIONAL { $$URI$$ dbp:wikiPageUsesTemplate ?template . 
} 145 | }`; 146 | const values = await genericRequest(input, template); 147 | const groups = _.group(values, _.getKey('URI')); 148 | const wikiTemplates = _.mapValues(groups, _.pluck('template')); 149 | const infobox = 'http://dbpedia.org/resource/Template:Infobox'; 150 | 151 | const results = _.mapValues( 152 | wikiTemplates, 153 | _.some(t => (t || '').startsWith(infobox)) 154 | ); 155 | return results; 156 | }; 157 | -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/entitiesCountAggsByConfidence/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "confidence_0_extended_stats": { 5 | "extended_stats": { 6 | "field": "dbpedia_entities_metadata.confidence_counts.0" 7 | } 8 | }, 9 | "confidence_10_extended_stats": { 10 | "extended_stats": { 11 | "field": "dbpedia_entities_metadata.confidence_counts.10" 12 | } 13 | }, 14 | "confidence_20_extended_stats": { 15 | "extended_stats": { 16 | "field": "dbpedia_entities_metadata.confidence_counts.20" 17 | } 18 | }, 19 | "confidence_30_extended_stats": { 20 | "extended_stats": { 21 | "field": "dbpedia_entities_metadata.confidence_counts.30" 22 | } 23 | }, 24 | "confidence_40_extended_stats": { 25 | "extended_stats": { 26 | "field": "dbpedia_entities_metadata.confidence_counts.40" 27 | } 28 | }, 29 | "confidence_50_extended_stats": { 30 | "extended_stats": { 31 | "field": "dbpedia_entities_metadata.confidence_counts.50" 32 | } 33 | }, 34 | "confidence_60_extended_stats": { 35 | "extended_stats": { 36 | "field": "dbpedia_entities_metadata.confidence_counts.60" 37 | } 38 | }, 39 | "confidence_70_extended_stats": { 40 | "extended_stats": { 41 | "field": "dbpedia_entities_metadata.confidence_counts.70" 42 | } 43 | }, 44 | "confidence_80_extended_stats": { 45 | "extended_stats": { 46 | "field": "dbpedia_entities_metadata.confidence_counts.80" 47 | } 48 | }, 49 | "confidence_90_extended_stats": { 50 | "extended_stats": { 51 | "field": "dbpedia_entities_metadata.confidence_counts.90" 52 | } 53 | }, 54 | "confidence_100_extended_stats": { 55 | "extended_stats": { 56 | "field": "dbpedia_entities_metadata.confidence_counts.100" 57 | } 58 | }, 59 | "confidence_0_histogram": { 60 | "histogram": { 61 | "field": "dbpedia_entities_metadata.confidence_counts.0", 62 | "interval": 1, 63 | "min_doc_count": 1 64 | } 65 | }, 66 | "confidence_10_histogram": { 67 | "histogram": { 68 | "field": "dbpedia_entities_metadata.confidence_counts.10", 69 | "interval": 1, 70 | "min_doc_count": 1 71 | } 72 | }, 73 | "confidence_20_histogram": { 74 | "histogram": { 75 | "field": "dbpedia_entities_metadata.confidence_counts.20", 76 | "interval": 1, 77 | "min_doc_count": 1 78 | } 79 | }, 80 | "confidence_30_histogram": { 81 | "histogram": { 82 | "field": "dbpedia_entities_metadata.confidence_counts.30", 83 | "interval": 1, 84 | "min_doc_count": 1 85 | } 86 | }, 87 | "confidence_40_histogram": { 88 | "histogram": { 89 | "field": "dbpedia_entities_metadata.confidence_counts.40", 90 | "interval": 1, 91 | "min_doc_count": 1 92 | } 93 | }, 94 | "confidence_50_histogram": { 95 | "histogram": { 96 | "field": "dbpedia_entities_metadata.confidence_counts.50", 97 | "interval": 1, 98 | "min_doc_count": 1 99 | } 100 | }, 101 | "confidence_60_histogram": { 102 | "histogram": { 103 | "field": "dbpedia_entities_metadata.confidence_counts.60", 104 | "interval": 1, 105 | "min_doc_count": 1 106 | } 107 | }, 108 | "confidence_70_histogram": { 109 | 
"histogram": { 110 | "field": "dbpedia_entities_metadata.confidence_counts.70", 111 | "interval": 1, 112 | "min_doc_count": 1 113 | } 114 | }, 115 | "confidence_80_histogram": { 116 | "histogram": { 117 | "field": "dbpedia_entities_metadata.confidence_counts.80", 118 | "interval": 1, 119 | "min_doc_count": 1 120 | } 121 | }, 122 | "confidence_90_histogram": { 123 | "histogram": { 124 | "field": "dbpedia_entities_metadata.confidence_counts.90", 125 | "interval": 1, 126 | "min_doc_count": 1 127 | } 128 | }, 129 | "confidence_100_histogram": { 130 | "histogram": { 131 | "field": "dbpedia_entities_metadata.confidence_counts.100", 132 | "interval": 1, 133 | "min_doc_count": 1 134 | } 135 | } 136 | } 137 | } -------------------------------------------------------------------------------- /bin/entitiesDataQuality.js: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env node 2 | 3 | import { saveObj } from '@svizzle/file'; 4 | import { mergeWithSum, getTruthyValuesKeys } from '@svizzle/utils'; 5 | import { Command } from 'commander'; 6 | import * as _ from 'lamb'; 7 | import mkdirp from 'mkdirp'; 8 | import { fetch } from 'undici'; 9 | 10 | import { getEntityDetails, isDisambiguation } from '../dbpedia/requests.mjs'; 11 | import { getEntities } from '../es/entities.mjs'; 12 | import { batchIterateFlatten } from '../util/array.mjs'; 13 | 14 | const program = new Command(); 15 | 16 | program.requiredOption( 17 | '-d, --domain ', 18 | 'ES domain on which the entities are stored', 19 | ); 20 | program.requiredOption( 21 | '-i, --index ', 22 | 'ES index on which the entities are stored' 23 | ); 24 | program.option( 25 | '-o, --output ', 26 | 'Output directory for the data quality results.', 27 | 'data' 28 | ) 29 | 30 | program.showHelpAfterError(); 31 | program.parse(); 32 | const options = program.opts(); 33 | 34 | await mkdirp(`${options.output}/outputs`) 35 | await mkdirp(`${options.output}/quality/entities`) 36 | 37 | const FILE_ENTITY_TITLES = `${options.output}/outputs/entity_titles.json`; 38 | const FILE_ENTITY_DETAILS = `${options.output}/outputs/entity_details.json`; 39 | const FILE_ENTITY_COUNTS = `${options.output}/quality/entities/entity_counts.json`; 40 | const FILE_MISSING_ABSTRACTS = `${options.output}/quality/entities/missing_abstracts.json`; 41 | const FILE_MISSING_DERIVED_FROM = `${options.output}/quality/entities/missing_derived_from.json`; 42 | const FILE_MISSING_THUMBNAIL = `${options.output}/quality/entities/missing_image.json`; 43 | const FILE_IMAGE_STATUS = `${options.output}/quality/entities/image_status.json`; 44 | const FILE_IMAGE_404s = `${options.output}/quality/entities/image_404s.json`; 45 | const FILE_IMAGE_EXTENSION_COUNTS = `${options.output}/quality/entities/image_extension_counts.json`; 46 | const FILE_DISAMBIGUATION_ENTITIES = `${options.output}/quality/entities/disambiguation_entities.json`; 47 | 48 | const save = (path, object) => saveObj(path, 4)(object); 49 | const addStats = (entities, all) => { 50 | const stats = { 51 | count: entities.length, 52 | proportion: entities.length / all.length 53 | }; 54 | return { 55 | stats, 56 | entities 57 | }; 58 | }; 59 | 60 | const main = async () => { 61 | 62 | // Get Titles for all entities annotated on the ai_map index 63 | console.log('[+] Getting Entity Titles'); 64 | const titles = await getEntities(options.index, options.domain); 65 | save(FILE_ENTITY_TITLES, titles); 66 | 67 | // Get details for all DBpedia entities using DBpedia SPARQL endpoint 68 | console.log('[+] 
Getting Entity Details'); 69 | const details = await batchIterateFlatten(titles, getEntityDetails); 70 | save(FILE_ENTITY_DETAILS, details); 71 | 72 | // Get the count statistics for the details 73 | console.log('[+] Calculating count statistics'); 74 | const counts = _.reduce(details, (acc, curr) => { 75 | const ones = _.mapValues(curr, _.always(1)); 76 | return mergeWithSum(acc, ones); 77 | }, {}); 78 | const normalisedCounts = _.mapValues(counts, count => count / details.length); 79 | save(FILE_ENTITY_COUNTS, normalisedCounts); 80 | 81 | // Get the count statistics for missing details 82 | console.log('[+] Calculating missing statistics'); 83 | const filterToTitles = predicate => 84 | _.map(_.filter(details, predicate), _.getKey('URI')); 85 | save( 86 | FILE_MISSING_ABSTRACTS, 87 | addStats(filterToTitles(d => !d.abstract), titles) 88 | ); 89 | save( 90 | FILE_MISSING_DERIVED_FROM, 91 | addStats(filterToTitles(d => !d.derivedFrom), titles) 92 | ); 93 | save( 94 | FILE_MISSING_THUMBNAIL, 95 | addStats(filterToTitles(d => !d.imageURL), titles) 96 | ); 97 | 98 | const imageURLs = _.map( 99 | _.filter(details, d => d.imageURL), 100 | d => new URL(d.imageURL) 101 | ); 102 | 103 | // Count image extensions 104 | console.log('[+] Counting image file types by extension'); 105 | const extensions = _.map(imageURLs, t => t.pathname.split('.').slice(-1)[0]); 106 | const extensionCounts = _.count(extensions, _.identity); 107 | save(FILE_IMAGE_EXTENSION_COUNTS, extensionCounts); 108 | 109 | // Get the image status by fetching using imageURL 110 | console.log('[+] Fetching images and saving response status'); 111 | const imageURLStatus = await batchIterateFlatten( 112 | imageURLs, 113 | async batch_ => { 114 | const responses = await Promise.allSettled( 115 | _.map(batch_, t => fetch(t)) 116 | ); 117 | return _.map( 118 | _.zip(batch_, responses), 119 | ([u, r]) => ({ url: u.href, status: r.status === 'fulfilled' ? r.value.status : r.status }) 120 | ); 121 | } 122 | ); 123 | 124 | const imageURLStatusCounts = _.count(imageURLStatus, _.getKey('status')); 125 | const notFounds = _.filter(imageURLStatus, r => r.status === 404); 126 | 127 | save(FILE_IMAGE_404s, addStats(_.map(notFounds, r => r.url), titles)); 128 | save(FILE_IMAGE_STATUS, imageURLStatusCounts); 129 | 130 | const disambiguationStatus = await batchIterateFlatten( 131 | titles, 132 | isDisambiguation, 133 | { concat: false} 134 | ); 135 | const flattened = _.reduce( 136 | disambiguationStatus, 137 | (acc, curr) => ({ ...acc, ...curr }) 138 | ); 139 | const disambiguations = getTruthyValuesKeys(flattened); 140 | save(FILE_DISAMBIGUATION_ENTITIES, addStats(disambiguations, details)); 141 | }; 142 | 143 | await main(); 144 | 145 | -------------------------------------------------------------------------------- /es/index.mjs: -------------------------------------------------------------------------------- 1 | import { stringify } from '@svizzle/utils'; 2 | import * as _ from 'lamb'; 3 | 4 | import { arxliveCopy } from '../conf/config.mjs'; 5 | import { buildRequest, makeRequest } from '../es/requests.mjs'; 6 | 7 | export const list = async domain => { 8 | const path = '_mappings'; 9 | const request = buildRequest(domain, path, 'GET'); 10 | const { body: response } = await makeRequest(request); 11 | return _.sort(_.keys(response)); 12 | }; 13 | 14 | /** 15 | * @function count 16 | * @description counts the number of documents for the specified domain and index. 17 | * @param {string} domain - the ElasticSearch domain. 18 | * @param {string} index - index on which to count. 
19 | * @param {Object} [options] 20 | * @param {boolean} [options.returnFullObject=false] - whether to return the full respose or just the count as a number. 21 | * @returns {Object|number} returns either the count of the number of documents or the full response for the API call. 22 | */ 23 | export const count = async ( 24 | domain, 25 | index, 26 | { returnFullObject = false } = {} 27 | ) => { 28 | const path = `${index}/_count`; 29 | const request = buildRequest(domain, path, 'GET'); 30 | const { body: response } = await makeRequest(request); 31 | if (returnFullObject) { 32 | return response; 33 | } 34 | return response.count; 35 | 36 | }; 37 | 38 | /** 39 | * @function createIndex 40 | * @description creates an index using the specified name and domain. 41 | * @param {string} name - name of index to create. 42 | * @param {string} domain - domain on which to create index. 43 | * @param {Object} [options] 44 | * @param {Object} [options.payload={}] - payload for request to index endpoint. 45 | * @returns {Object} response to the request 46 | */ 47 | export const createIndex = async ( 48 | domain, 49 | index, 50 | { payload = {} } = {} 51 | ) => { 52 | const path = index; 53 | const parsedPayload = typeof payload !== 'string' ? JSON.stringify(payload) : payload; 54 | const request = buildRequest(domain, path, 'PUT', { payload: parsedPayload }); 55 | const { body: response, code } = await makeRequest(request); 56 | if (code !== 200) { 57 | if (response.error.type === 'resource_already_exists_exception') { 58 | console.warn('Index already exists, so was not created'); 59 | } else { 60 | throw new Error(stringify(response)); 61 | } 62 | } 63 | return response; 64 | }; 65 | 66 | /** 67 | * @function deleteIndex 68 | * @description deletes an index using the specified name and domain. If no 69 | * index with specified name exists, function exits gracefully but 70 | * logs this to the user. 71 | * @param {string} name - name of index to delete. 72 | * @param {string} domain - domain on which to delete index. 73 | * @returns {Object} response to the request 74 | */ 75 | export const deleteIndex = async (domain, index) => { 76 | const path = index; 77 | const request = buildRequest(domain, path, 'DELETE'); 78 | const { code } = await makeRequest(request); 79 | if (code === 404) { 80 | console.log(`index '${index}' not found, so was not deleted`); 81 | } 82 | }; 83 | 84 | /** 85 | * @function reindex 86 | * @description copies data from source index to dest index on specified domain. 87 | * @param {string} source - name of source index from which to copy data. 88 | * @param {string} dest - name of destination index on which to copy data. 89 | * @param {string} domain - domain on which to perform reindex. 90 | * @param {Object} [options] 91 | * @param {Object} [options.payload={}] - payload for request to index endpoint. 92 | * @param {string} [options.pipeline=null] - name of the ingestion pipeline to include upon reindex. 93 | * @returns {Object} response to the request 94 | */ 95 | export const reindex = async ( 96 | source, 97 | dest, 98 | domain = arxliveCopy, 99 | { payload = {}, pipeline = null } = {} 100 | ) => { 101 | const path = '_reindex'; 102 | const parsedPayload = typeof payload === 'string' ? 
JSON.parse(payload) : payload;
103 | const expandedPayload = {
104 | ...parsedPayload,
105 | source: {
106 | index: source
107 | },
108 | dest: {
109 | index: dest,
110 | pipeline,
111 | }
112 | };
113 | const request = buildRequest(domain, path, 'POST', { payload: JSON.stringify(expandedPayload) });
114 | const { code, body: response } = await makeRequest(request);
115 | if (code !== 200) {
116 | throw new Error(
117 | `Reindex from ${source} to ${dest} failed. Response:\n${stringify(response)}`
118 | );
119 | }
120 | return response;
121 | };
122 | 
123 | /**
124 | * @function getMappings
125 | * @description gets the mappings for the specified index on the specified domain.
126 | * @param {string} domain - Domain from which to get mappings.
127 | * @param {string} index - Index from which to get mappings.
128 | * @returns {Object} the mappings.
129 | */
130 | export const getMappings = async (domain, index) => {
131 | const path = `${index}/_mappings`;
132 | const request = buildRequest(domain, path);
133 | const { body: response } = await makeRequest(request);
134 | return response;
135 | };
136 | 
137 | /**
138 | * @function updateMapping
139 | * @description updates the mapping on the specified domain and index.
140 | * @param {string} domain - domain on which to update the mappings.
141 | * @param {string} index - index on which to update the mappings.
142 | * @param {Object} [options]
143 | * @param {Object} [options.payload={}] - payload for request.
144 | * @returns {Object} response object.
145 | */
146 | export const updateMapping = async (
147 | domain,
148 | index,
149 | { payload } = {}
150 | ) => {
151 | const path = `${index}/_mappings`;
152 | const request = buildRequest(domain, path, 'PUT', { payload });
153 | const { body: response } = await makeRequest(request);
154 | return response;
155 | };
156 | 
--------------------------------------------------------------------------------
/bin/annotationsDataQuality/requests/flattenedConfidenceHistogram/README.md:
--------------------------------------------------------------------------------
1 | ## Flattened `confidence` Histogram
2 | 
3 | Aggregates all `confidence` values into a histogram, each bucket indicating one of
4 | the 10 possible `confidence` levels annotated. Flattened here denotes the fact
5 | that all annotated entities are treated as a flat list - no per-document
6 | analysis is performed.
7 | 
8 | Endpoint: `POST arxiv_v6/_search`
9 | 
10 | See:
11 | 
12 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/search-aggregations-bucket-terms-aggregation.html
13 | - https://www.elastic.co/guide/en/elasticsearch/reference/7.4/nested.html
14 | 
15 | ## Notes
16 | 
17 | We have decided not to use the `histogram` API here due to errors produced by
18 | rounding of floating point values. When using a histogram with
19 | interval 0.1, the values for the buckets turn out to be incorrect. In
20 | particular, there are no entities found with `confidence` 0.7, which is obviously
21 | wrong.
Instead, it seems like all entities tagged at `confidence` 0.7 are
22 | erroneously counted in the 0.6 bucket, meaning that bucket contains all entities
23 | for `confidence` 0.6 and 0.7:
24 | 
25 | Request:
26 | ```json
27 | {
28 | "size": 0,
29 | "aggs": {
30 | "dbpedia": {
31 | "nested": {
32 | "path": "dbpedia_entities"
33 | },
34 | "aggs": {
35 | "confidence": {
36 | "histogram": {
37 | "field": "dbpedia_entities.confidence",
38 | "interval": 0.1
39 | }
40 | }
41 | }
42 | }
43 | }
44 | }
45 | ```
46 | Truncated response:
47 | ```json
48 | ...
49 | "aggregations": {
50 | "dbpedia": {
51 | "doc_count": 75296846,
52 | "confidence": {
53 | "buckets": [
54 | {
55 | "key": 0.1,
56 | "doc_count": 411055
57 | },
58 | {
59 | "key": 0.2,
60 | "doc_count": 652848
61 | },
62 | {
63 | "key": 0.30000000000000004,
64 | "doc_count": 53424468
65 | },
66 | {
67 | "key": 0.4,
68 | "doc_count": 6007261
69 | },
70 | {
71 | "key": 0.5,
72 | "doc_count": 3751608
73 | },
74 | {
75 | "key": 0.6000000000000001,
76 | "doc_count": 3500601
77 | },
78 | {
79 | "key": 0.7000000000000001,
80 | "doc_count": 0
81 | },
82 | {
83 | "key": 0.8,
84 | "doc_count": 7549005
85 | }
86 | ]
87 | }
88 | }
89 | }
90 | ```
91 | The `terms` aggregation has difficulty creating buckets whose keys are of type
92 | float or double, due to floating point precision errors. As a result, the keys
93 | found in the `response.json` can look bizarre. In actual fact, the keys are
94 | indistinguishable (in the Java Runtime) due to the rounding errors. Example
95 | (using key for `confidence` bucket 0.4):
96 | 
97 | ```java
98 | class Main {
99 | public static void main(String args[]) {
100 | float example = 0.4000000059604645f;
101 | System.out.println(example); // 0.4
102 | }
103 | }
104 | ```
105 | 
106 | You can find a replit for the example
107 | [here](https://replit.com/@doogyb/Floating-Point-Precision-Errors#Main.java).
108 | 
109 | We've decided to document this behavior for now and move on. However, there
110 | exist two possible solutions to the problem. The first involves changing the
111 | schema so that `confidence` values are encoded as integers. The current values
112 | would be mapped using a factor of 10, so that entities tagged at confidence
113 | level 0.3 would have an integer `confidence` value of 3, those tagged at 0.7 an
114 | integer value of 7, and so on. The advantage of this approach is that we
115 | guarantee the correct term bucket keys due to no risk of floating point
116 | precision errors. However, we deviate from the accepted inputs of the Spotlight
117 | API, which only accepts values for `confidence` within the range 0 to 1.
118 | 
119 | The second solution is to use the `histogram` API, with interval set to 0.1 and
120 | an offset set to a value very slightly below zero. The following request is
121 | included for reference:
122 | 
123 | ```json
124 | {
125 | "size": 0,
126 | "aggs": {
127 | "dbpedia": {
128 | "nested": {
129 | "path": "dbpedia_entities"
130 | },
131 | "aggs": {
132 | "confidence": {
133 | "histogram": {
134 | "field": "dbpedia_entities.confidence",
135 | "interval": 0.1,
136 | "offset": -0.0000001
137 | }
138 | }
139 | }
140 | }
141 | }
142 | }
143 | ```
144 | 
145 | Truncated response:
146 | ```json
147 | ...
148 | "aggregations": { 149 | "dbpedia": { 150 | "doc_count": 75296846, 151 | "confidence": { 152 | "buckets": [ 153 | { 154 | "key": 0.0999999, 155 | "doc_count": 411055 156 | }, 157 | { 158 | "key": 0.1999999, 159 | "doc_count": 652848 160 | }, 161 | { 162 | "key": 0.29999990000000004, 163 | "doc_count": 53424468 164 | }, 165 | { 166 | "key": 0.3999999, 167 | "doc_count": 6007261 168 | }, 169 | { 170 | "key": 0.4999999, 171 | "doc_count": 3751608 172 | }, 173 | { 174 | "key": 0.5999999000000001, 175 | "doc_count": 1970197 176 | }, 177 | { 178 | "key": 0.6999999000000001, 179 | "doc_count": 1530404 180 | }, 181 | { 182 | "key": 0.7999999000000001, 183 | "doc_count": 1346700 184 | }, 185 | { 186 | "key": 0.8999999000000001, 187 | "doc_count": 6202305 188 | } 189 | ] 190 | } 191 | } 192 | } 193 | ``` 194 | 195 | However the values still do not strictly match up with the encoded confidence 196 | levels. This approach is however more in line with what is suggested according 197 | to this Github [issue](https://github.com/elastic/elasticsearch/issues/30529) 198 | for Elastic Search, due to the issue surrounding encoding floating point values 199 | accurately when using a base 2 system vs base 10 system. -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/flattenedURITermsByConfidence/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "dbpedia": { 5 | "nested": { 6 | "path": "dbpedia_entities" 7 | }, 8 | "aggs": { 9 | "top_URI_100": { 10 | "filter": { 11 | "term": { 12 | "dbpedia_entities.confidence": 100 13 | } 14 | }, 15 | "aggs": { 16 | "URIs_100": { 17 | "terms": { 18 | "field": "dbpedia_entities.URI", 19 | "size": 100 20 | } 21 | } 22 | } 23 | }, 24 | "top_URI_90": { 25 | "filter": { 26 | "term": { 27 | "dbpedia_entities.confidence": 90 28 | } 29 | }, 30 | "aggs": { 31 | "URI": { 32 | "terms": { 33 | "field": "dbpedia_entities.URI", 34 | "size": 100 35 | } 36 | } 37 | } 38 | }, 39 | "top_URI_80": { 40 | "filter": { 41 | "term": { 42 | "dbpedia_entities.confidence": 80 43 | } 44 | }, 45 | "aggs": { 46 | "URI": { 47 | "terms": { 48 | "field": "dbpedia_entities.URI", 49 | "size": 100 50 | } 51 | } 52 | } 53 | }, 54 | "top_URI_70": { 55 | "filter": { 56 | "term": { 57 | "dbpedia_entities.confidence": 70 58 | } 59 | }, 60 | "aggs": { 61 | "URI": { 62 | "terms": { 63 | "field": "dbpedia_entities.URI", 64 | "size": 100 65 | } 66 | } 67 | } 68 | }, 69 | "top_URI_60": { 70 | "filter": { 71 | "term": { 72 | "dbpedia_entities.confidence": 60 73 | } 74 | }, 75 | "aggs": { 76 | "URI": { 77 | "terms": { 78 | "field": "dbpedia_entities.URI", 79 | "size": 100 80 | } 81 | } 82 | } 83 | }, 84 | "top_URI_50": { 85 | "filter": { 86 | "term": { 87 | "dbpedia_entities.confidence": 50 88 | } 89 | }, 90 | "aggs": { 91 | "URI": { 92 | "terms": { 93 | "field": "dbpedia_entities.URI", 94 | "size": 100 95 | } 96 | } 97 | } 98 | }, 99 | "top_URI_40": { 100 | "filter": { 101 | "term": { 102 | "dbpedia_entities.confidence": 40 103 | } 104 | }, 105 | "aggs": { 106 | "URI": { 107 | "terms": { 108 | "field": "dbpedia_entities.URI", 109 | "size": 100 110 | } 111 | } 112 | } 113 | }, 114 | "top_URI_30": { 115 | "filter": { 116 | "term": { 117 | "dbpedia_entities.confidence": 30 118 | } 119 | }, 120 | "aggs": { 121 | "URI": { 122 | "terms": { 123 | "field": "dbpedia_entities.URI", 124 | "size": 100 125 | } 126 | } 127 | } 128 | }, 129 | "top_URI_20": { 130 | "filter": { 131 | 
"term": { 132 | "dbpedia_entities.confidence": 20 133 | } 134 | }, 135 | "aggs": { 136 | "URI": { 137 | "terms": { 138 | "field": "dbpedia_entities.URI", 139 | "size": 100 140 | } 141 | } 142 | } 143 | }, 144 | "top_URI_10": { 145 | "filter": { 146 | "term": { 147 | "dbpedia_entities.confidence": 10 148 | } 149 | }, 150 | "aggs": { 151 | "URI": { 152 | "terms": { 153 | "field": "dbpedia_entities.URI", 154 | "size": 100 155 | } 156 | } 157 | } 158 | }, 159 | "top_URI_0": { 160 | "filter": { 161 | "term": { 162 | "dbpedia_entities.confidence": 0 163 | } 164 | }, 165 | "aggs": { 166 | "URI": { 167 | "terms": { 168 | "field": "dbpedia_entities.URI", 169 | "size": 100 170 | } 171 | } 172 | } 173 | } 174 | } 175 | } 176 | } 177 | } -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/entitiesCountOverTokenCountByConfidence/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "entities_count_over_token_count_at_0_extended_stats": { 5 | "extended_stats": { 6 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.0'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.0'].value) / doc['textBody_abstract_article.token_count'].value;" 7 | } 8 | }, 9 | "entities_count_over_token_count_at_0_histogram": { 10 | "histogram": { 11 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.0'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.0'].value) / doc['textBody_abstract_article.token_count'].value;", 12 | "interval": 0.01, 13 | "min_doc_count": 1 14 | } 15 | }, 16 | "entities_count_over_token_count_at_10_extended_stats": { 17 | "extended_stats": { 18 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.10'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.10'].value) / doc['textBody_abstract_article.token_count'].value;" 19 | } 20 | }, 21 | "entities_count_over_token_count_at_10_histogram": { 22 | "histogram": { 23 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.10'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.10'].value) / doc['textBody_abstract_article.token_count'].value;", 24 | "interval": 0.01, 25 | "min_doc_count": 1 26 | } 27 | }, 28 | "entities_count_over_token_count_at_20_extended_stats": { 29 | "extended_stats": { 30 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.20'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.20'].value) / doc['textBody_abstract_article.token_count'].value;" 31 | } 32 | }, 33 | "entities_count_over_token_count_at_20_histogram": { 34 | "histogram": { 35 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.20'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.20'].value) / doc['textBody_abstract_article.token_count'].value;", 36 | "interval": 0.01, 37 | "min_doc_count": 1 38 | } 39 | }, 40 | 
"entities_count_over_token_count_at_30_extended_stats": { 41 | "extended_stats": { 42 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.30'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.30'].value) / doc['textBody_abstract_article.token_count'].value;" 43 | } 44 | }, 45 | "entities_count_over_token_count_at_30_histogram": { 46 | "histogram": { 47 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.30'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.30'].value) / doc['textBody_abstract_article.token_count'].value;", 48 | "interval": 0.01, 49 | "min_doc_count": 1 50 | } 51 | }, 52 | "entities_count_over_token_count_at_40_extended_stats": { 53 | "extended_stats": { 54 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.40'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.40'].value) / doc['textBody_abstract_article.token_count'].value;" 55 | } 56 | }, 57 | "entities_count_over_token_count_at_40_histogram": { 58 | "histogram": { 59 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.40'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.40'].value) / doc['textBody_abstract_article.token_count'].value;", 60 | "interval": 0.01, 61 | "min_doc_count": 1 62 | } 63 | }, 64 | "entities_count_over_token_count_at_50_extended_stats": { 65 | "extended_stats": { 66 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.50'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.50'].value) / doc['textBody_abstract_article.token_count'].value;" 67 | } 68 | }, 69 | "entities_count_over_token_count_at_50_histogram": { 70 | "histogram": { 71 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.50'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.50'].value) / doc['textBody_abstract_article.token_count'].value;", 72 | "interval": 0.01, 73 | "min_doc_count": 1 74 | } 75 | }, 76 | "entities_count_over_token_count_at_60_extended_stats": { 77 | "extended_stats": { 78 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.60'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.60'].value) / doc['textBody_abstract_article.token_count'].value;" 79 | } 80 | }, 81 | "entities_count_over_token_count_at_60_histogram": { 82 | "histogram": { 83 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.60'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.60'].value) / doc['textBody_abstract_article.token_count'].value;", 84 | "interval": 0.01, 85 | "min_doc_count": 1 86 | } 87 | }, 88 | "entities_count_over_token_count_at_70_extended_stats": { 89 | "extended_stats": { 90 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.70'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 
0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.70'].value) / doc['textBody_abstract_article.token_count'].value;" 91 | } 92 | }, 93 | "entities_count_over_token_count_at_70_histogram": { 94 | "histogram": { 95 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.70'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.70'].value) / doc['textBody_abstract_article.token_count'].value;", 96 | "interval": 0.01, 97 | "min_doc_count": 1 98 | } 99 | }, 100 | "entities_count_over_token_count_at_80_extended_stats": { 101 | "extended_stats": { 102 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.80'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.80'].value) / doc['textBody_abstract_article.token_count'].value;" 103 | } 104 | }, 105 | "entities_count_over_token_count_at_80_histogram": { 106 | "histogram": { 107 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.80'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.80'].value) / doc['textBody_abstract_article.token_count'].value;", 108 | "interval": 0.01, 109 | "min_doc_count": 1 110 | } 111 | }, 112 | "entities_count_over_token_count_at_90_extended_stats": { 113 | "extended_stats": { 114 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.90'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.90'].value) / doc['textBody_abstract_article.token_count'].value;" 115 | } 116 | }, 117 | "entities_count_over_token_count_at_90_histogram": { 118 | "histogram": { 119 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.90'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.90'].value) / doc['textBody_abstract_article.token_count'].value;", 120 | "interval": 0.01, 121 | "min_doc_count": 1 122 | } 123 | }, 124 | "entities_count_over_token_count_at_100_extended_stats": { 125 | "extended_stats": { 126 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.100'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.100'].value) / doc['textBody_abstract_article.token_count'].value;" 127 | } 128 | }, 129 | "entities_count_over_token_count_at_100_histogram": { 130 | "histogram": { 131 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.100'].size() == 0 || doc['textBody_abstract_article.token_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.100'].value) / doc['textBody_abstract_article.token_count'].value;", 132 | "interval": 0.01, 133 | "min_doc_count": 1 134 | } 135 | } 136 | } 137 | } -------------------------------------------------------------------------------- /bin/annotationsDataQuality/requests/entitiesCountAggsByConfidenceOverEntitiesCount/request.json: -------------------------------------------------------------------------------- 1 | { 2 | "size": 0, 3 | "aggs": { 4 | "confidence_0_normalised_with_entities_count_extended_stats": { 5 | "extended_stats": { 6 | "script": 
"if (doc['dbpedia_entities_metadata.confidence_counts.0'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.0'].value) / doc['dbpedia_entities_metadata.entities_count'].value;" 7 | } 8 | }, 9 | "confidence_0_normalised_with_entities_count_histogram": { 10 | "histogram": { 11 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.0'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.0'].value) / doc['dbpedia_entities_metadata.entities_count'].value;", 12 | "interval": 0.01, 13 | "min_doc_count": 1 14 | } 15 | }, 16 | "confidence_10_normalised_with_entities_count_extended_stats": { 17 | "extended_stats": { 18 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.10'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.10'].value) / doc['dbpedia_entities_metadata.entities_count'].value;" 19 | } 20 | }, 21 | "confidence_10_normalised_with_entities_count_histogram": { 22 | "histogram": { 23 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.10'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.10'].value) / doc['dbpedia_entities_metadata.entities_count'].value;", 24 | "interval": 0.01, 25 | "min_doc_count": 1 26 | } 27 | }, 28 | "confidence_20_normalised_with_entities_count_extended_stats": { 29 | "extended_stats": { 30 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.20'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.20'].value) / doc['dbpedia_entities_metadata.entities_count'].value;" 31 | } 32 | }, 33 | "confidence_20_normalised_with_entities_count_histogram": { 34 | "histogram": { 35 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.20'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.20'].value) / doc['dbpedia_entities_metadata.entities_count'].value;", 36 | "interval": 0.01, 37 | "min_doc_count": 1 38 | } 39 | }, 40 | "confidence_30_normalised_with_entities_count_extended_stats": { 41 | "extended_stats": { 42 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.30'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.30'].value) / doc['dbpedia_entities_metadata.entities_count'].value;" 43 | } 44 | }, 45 | "confidence_30_normalised_with_entities_count_histogram": { 46 | "histogram": { 47 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.30'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.30'].value) / doc['dbpedia_entities_metadata.entities_count'].value;", 48 | "interval": 0.01, 49 | "min_doc_count": 1 50 | } 51 | }, 52 | "confidence_40_normalised_with_entities_count_extended_stats": { 53 | "extended_stats": { 54 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.40'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() 
== 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.40'].value) / doc['dbpedia_entities_metadata.entities_count'].value;" 55 | } 56 | }, 57 | "confidence_40_normalised_with_entities_count_histogram": { 58 | "histogram": { 59 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.40'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.40'].value) / doc['dbpedia_entities_metadata.entities_count'].value;", 60 | "interval": 0.01, 61 | "min_doc_count": 1 62 | } 63 | }, 64 | "confidence_50_normalised_with_entities_count_extended_stats": { 65 | "extended_stats": { 66 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.50'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.50'].value) / doc['dbpedia_entities_metadata.entities_count'].value;" 67 | } 68 | }, 69 | "confidence_50_normalised_with_entities_count_histogram": { 70 | "histogram": { 71 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.50'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.50'].value) / doc['dbpedia_entities_metadata.entities_count'].value;", 72 | "interval": 0.01, 73 | "min_doc_count": 1 74 | } 75 | }, 76 | "confidence_60_normalised_with_entities_count_extended_stats": { 77 | "extended_stats": { 78 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.60'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.60'].value) / doc['dbpedia_entities_metadata.entities_count'].value;" 79 | } 80 | }, 81 | "confidence_60_normalised_with_entities_count_histogram": { 82 | "histogram": { 83 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.60'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.60'].value) / doc['dbpedia_entities_metadata.entities_count'].value;", 84 | "interval": 0.01, 85 | "min_doc_count": 1 86 | } 87 | }, 88 | "confidence_70_normalised_with_entities_count_extended_stats": { 89 | "extended_stats": { 90 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.70'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.70'].value) / doc['dbpedia_entities_metadata.entities_count'].value;" 91 | } 92 | }, 93 | "confidence_70_normalised_with_entities_count_histogram": { 94 | "histogram": { 95 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.70'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.70'].value) / doc['dbpedia_entities_metadata.entities_count'].value;", 96 | "interval": 0.01, 97 | "min_doc_count": 1 98 | } 99 | }, 100 | "confidence_80_normalised_with_entities_count_extended_stats": { 101 | "extended_stats": { 102 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.80'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.80'].value) / 
doc['dbpedia_entities_metadata.entities_count'].value;" 103 | } 104 | }, 105 | "confidence_80_normalised_with_entities_count_histogram": { 106 | "histogram": { 107 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.80'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.80'].value) / doc['dbpedia_entities_metadata.entities_count'].value;", 108 | "interval": 0.01, 109 | "min_doc_count": 1 110 | } 111 | }, 112 | "confidence_90_normalised_with_entities_count_extended_stats": { 113 | "extended_stats": { 114 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.90'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.90'].value) / doc['dbpedia_entities_metadata.entities_count'].value;" 115 | } 116 | }, 117 | "confidence_90_normalised_with_entities_count_histogram": { 118 | "histogram": { 119 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.90'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.90'].value) / doc['dbpedia_entities_metadata.entities_count'].value;", 120 | "interval": 0.01, 121 | "min_doc_count": 1 122 | } 123 | }, 124 | "confidence_100_normalised_with_entities_count_extended_stats": { 125 | "extended_stats": { 126 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.100'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.100'].value) / doc['dbpedia_entities_metadata.entities_count'].value;" 127 | } 128 | }, 129 | "confidence_100_normalised_with_entities_count_histogram": { 130 | "histogram": { 131 | "script": "if (doc['dbpedia_entities_metadata.confidence_counts.100'].size() == 0 || doc['dbpedia_entities_metadata.entities_count'].size() == 0) { return 0; } return ((double) doc['dbpedia_entities_metadata.confidence_counts.100'].value) / doc['dbpedia_entities_metadata.entities_count'].value;", 132 | "interval": 0.01, 133 | "min_doc_count": 1 134 | } 135 | } 136 | } 137 | } -------------------------------------------------------------------------------- /aws/s3.mjs: -------------------------------------------------------------------------------- 1 | import { 2 | S3Client, 3 | GetObjectCommand, 4 | GetObjectAttributesCommand, 5 | CreateMultipartUploadCommand, 6 | UploadPartCommand, 7 | CompleteMultipartUploadCommand, 8 | HeadBucketCommand, 9 | HeadObjectCommand, 10 | GetBucketAclCommand, 11 | PutObjectCommand 12 | } from '@aws-sdk/client-s3'; 13 | import { defaultProvider } from "@aws-sdk/credential-provider-node"; 14 | import * as cliProgress from 'cli-progress'; 15 | 16 | import * as _ from 'lamb'; 17 | 18 | import { bulkRequest } from '../es/bulk.mjs'; 19 | import { scroll } from '../es/search.mjs'; 20 | import { count, createIndex } from '../es/index.mjs'; 21 | 22 | 23 | // https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html 24 | export const MIN_PART_SIZE = 5242880; 25 | 26 | const config = { 27 | credentials: defaultProvider(), 28 | region: 'eu-west-2', 29 | }; 30 | const client = new S3Client(config); 31 | 32 | const parseMost = (chunk, type) => { 33 | const [ start, end ] = type === 'object' ? 
['{', '}'] : ['[', ']'];
34 | for (let i = chunk.length - 1; i >= 0; i--) {
35 | if (chunk[i] === ',' || chunk[i] === end) {
36 | const test = `${start}${_.slice(chunk, 0, i).join('')}${end}`;
37 | try {
38 | const documents = JSON.parse(test);
39 | const leftover = _.slice(chunk, i+1, chunk.length).join('');
40 | return { documents, leftover };
41 | } catch {}
42 | }
43 | }
44 | return { documents: null, leftover: chunk };
45 | };
46 | 
47 | const getObject = (bucket, key, { start=0, end=-1 }={}) => {
48 | return new Promise(async (resolve, error) => {
49 | const get = new GetObjectCommand({
50 | Bucket: bucket,
51 | Key: key,
52 | Range: `bytes=${start}-${end}`
53 | });
54 | const { Body, ContentLength } = await client.send(get);
55 | const finished = end === -1 || ContentLength < end - start;
56 | const data = [];
57 | Body.on('error', err => error(err));
58 | Body.on('data', chunk => data.push(chunk));
59 | Body.on('end', () => resolve({data: data.join(''), finished}));
60 | });
61 | };
62 | 
63 | const getObjectAttributes = async(
64 | bucket,
65 | key,
66 | attributeList=['ETag', 'Checksum', 'ObjectParts', 'StorageClass', 'ObjectSize']
67 | ) => {
68 | const get = new GetObjectAttributesCommand({
69 | Bucket: bucket,
70 | Key: key,
71 | ObjectAttributes: attributeList
72 | });
73 | const attributes = await client.send(get);
74 | return attributes;
75 | };
76 | 
77 | async function *stream(
78 | bucket,
79 | key,
80 | type,
81 | { increment=512_000 }={}
82 | ) {
83 | let current = 0;
84 | let chunk, finished;
85 | let data = '';
86 | 
87 | // check at very beginning that types match up
88 | const { data: first } = await getObject(
89 | bucket, key, { start: 0, end: 0 }
90 | );
91 | if (
92 | first === '{' && type !== 'object' ||
93 | first === '[' && type !== 'array' ||
94 | first !== '{' && first !== '[') {
95 | throw new Error(
96 | `Type error. 
Are you sure the bucket object\'s type is correct?` 97 | ); 98 | } 99 | 100 | const { ObjectSize: size } = await getObjectAttributes(bucket, key); 101 | const bar = new cliProgress.SingleBar(cliProgress.Presets.shades_classic); 102 | bar.start(size, 0); 103 | do { 104 | 105 | // always omit first byte, as we know it's either '{' or '[' 106 | // eslint-disable-next-line no-await-in-loop 107 | ({ data: chunk, finished } = await getObject( 108 | bucket, key, { start: current+1, end: current + increment } 109 | )); 110 | data += chunk; 111 | const { documents, leftover } = parseMost(data, type); 112 | if (documents) { 113 | yield documents; 114 | data = leftover; 115 | } 116 | current += increment; 117 | bar.update(current); 118 | } while (!finished); 119 | bar.update(size); 120 | bar.stop(); 121 | } 122 | 123 | export const streamObject = ( 124 | bucket, 125 | key, 126 | { increment=64_000 }={} 127 | ) => stream(bucket, key, 'object', { increment }); 128 | 129 | export const streamArray = ( 130 | bucket, 131 | key, 132 | { increment=64_000 }={} 133 | ) => stream(bucket, key, 'array', { increment }); 134 | 135 | export const initialiseMultiPartUpload = async (bucket, key) => { 136 | const create = new CreateMultipartUploadCommand({ 137 | Bucket: bucket, 138 | Key: key 139 | }); 140 | const { UploadId: uploadId } = await client.send(create); 141 | return uploadId; 142 | }; 143 | 144 | export const uploadPart = async ( 145 | data, 146 | bucket, 147 | key, 148 | uploadId, 149 | partNumber 150 | ) => { 151 | const upload = new UploadPartCommand({ 152 | Body: data, 153 | Bucket: bucket, 154 | Key: key, 155 | UploadId: uploadId, 156 | PartNumber: partNumber 157 | }); 158 | 159 | const { ETag } = await client.send(upload); 160 | return ETag; 161 | }; 162 | 163 | export const completeMultiPartUpload = async ( 164 | bucket, 165 | key, 166 | parts, 167 | uploadId 168 | ) => { 169 | const complete = new CompleteMultipartUploadCommand({ 170 | Bucket: bucket, 171 | Key: key, 172 | MultipartUpload: { Parts: parts }, 173 | UploadId: uploadId 174 | }); 175 | const completeResponse = await client.send(complete); 176 | return completeResponse; 177 | }; 178 | 179 | 180 | export const bucketToIndex = async ( 181 | index, 182 | domain, 183 | bucket, 184 | key, 185 | { 186 | idField=null, 187 | format='array', 188 | chunkSize=8_388_608, // 8MB, 189 | refresh=false 190 | }={} 191 | ) => { 192 | 193 | let count_ = 0; 194 | const method = idField ? 
'create' : 'index'; 195 | const formatObject = _.pipe([ 196 | _.pairs, 197 | _.mapWith(([k, value]) => ({ _id: k, data: { value } })) 198 | ]); 199 | const formatArray = _.mapWith( 200 | ({ [idField]: id, ...rest }) => ({ 201 | ...id && {_id: id}, 202 | data: rest 203 | }) 204 | ); 205 | const funcs = { 206 | object: [streamObject, formatObject], 207 | array: [streamArray, formatArray] 208 | }; 209 | 210 | await createIndex(domain, index); 211 | const [stream_, formatter] = funcs[format]; 212 | const streamer = stream_( 213 | bucket, 214 | key, 215 | { increment: chunkSize } 216 | ); 217 | 218 | for await (let docs of streamer) { 219 | const bulkFormat = formatter(docs); 220 | await bulkRequest( 221 | domain, 222 | index, 223 | bulkFormat, 224 | method, 225 | { refresh } 226 | ); 227 | count_ += docs.length; 228 | } 229 | return count_; 230 | }; 231 | 232 | /* Index to Bucket Specific Functions */ 233 | 234 | const separate = (start, stop, data, page, total) => { 235 | let raw = JSON.stringify(data).slice(1, -1); 236 | if (page === 1) { 237 | raw = `${start}${raw}`; 238 | } 239 | if (page === total) { 240 | raw = `${raw}${stop}`; 241 | } else { 242 | raw = `${raw},`; 243 | } 244 | return raw; 245 | }; 246 | 247 | const arrayFormatter = (data, page, total) => { 248 | return separate('[', ']', data, page, total); 249 | }; 250 | 251 | const objectFormatter = (data, page, total, { key=null }={}) => { 252 | 253 | const getter = key ? _.getPath(key) : _.identity; 254 | const documents = _.reduce( 255 | data, 256 | (acc, doc) => { 257 | acc[doc.id] = getter(doc); 258 | return acc; 259 | }, 260 | {} 261 | ); 262 | return separate('{', '}', documents, page, total); 263 | }; 264 | 265 | const entitiesFormatter = (data, page, total) => 266 | objectFormatter(data, page, total, { key: 'dbpedia_entities' }); 267 | 268 | const extractSource = _.mapWith(doc => ({ id: doc._id, ...doc._source })); 269 | 270 | const extractURIandConfidence = _.mapWith( 271 | doc => { 272 | doc.dbpedia_entities = _.map( 273 | doc.dbpedia_entities || [], 274 | entity => ({ URI: entity.URI, confidence: entity.confidence }) 275 | ); 276 | return doc; 277 | } 278 | ); 279 | 280 | const filterByConfidence = threshold => _.mapWith( 281 | doc => { 282 | if (doc._source.dbpedia_entities) { 283 | doc._source.dbpedia_entities = _.filter( 284 | doc._source.dbpedia_entities || [], 285 | entity => entity.confidence > threshold 286 | ); 287 | } 288 | return doc; 289 | } 290 | ); 291 | 292 | export const indexToBucket = async( 293 | index, 294 | domain, 295 | bucket, 296 | key, 297 | { 298 | threshold=0, 299 | pages='all', 300 | pageSize=10000, 301 | format='array', 302 | processor='default' 303 | }={} 304 | ) => { 305 | 306 | const formats = { 307 | array: arrayFormatter, 308 | object: objectFormatter, 309 | entities: entitiesFormatter 310 | }; 311 | 312 | const processors = { 313 | es: _.identity, 314 | default: extractSource, 315 | simple: _.pipe([extractSource, extractURIandConfidence]) 316 | }; 317 | 318 | const filter = filterByConfidence(threshold); 319 | const processor_ = processors[processor]; 320 | const etl = _.pipe([filter, processor_]); 321 | const formatter = formats[format]; 322 | 323 | const scroller = scroll(domain, index, { 324 | pages, 325 | size: pageSize, 326 | }); 327 | 328 | const totalDocuments = await count(domain, index); 329 | const totalWork = pages === 'all' 330 | ? 
totalDocuments
331 | : pages * pageSize;
332 | 
333 | const pagesNeeded = Math.floor(totalDocuments / pageSize) + 1;
334 | const pages_ = pages === 'all'
335 | ? pagesNeeded
336 | : Math.min(pagesNeeded, pages);
337 | 
338 | const bar = new cliProgress.SingleBar(
339 | cliProgress.Presets.shades_classic
340 | );
341 | 
342 | const uploadId = await initialiseMultiPartUpload(bucket, key);
343 | bar.start(totalWork, 0);
344 | 
345 | let partNumber = 1;
346 | let currentPage = 1;
347 | let parts = [];
348 | let chunk = '';
349 | 
350 | for await (let page of scroller) {
351 | 
352 | const data = etl(page.hits.hits);
353 | const raw = formatter(data, currentPage, pages_);
354 | chunk += raw;
355 | 
356 | // check if the chunk is large enough to upload as a part to s3
357 | if (Buffer.byteLength(chunk) >= MIN_PART_SIZE) {
358 | const ETag = await uploadPart(
359 | chunk, bucket, key, uploadId, partNumber
360 | );
361 | parts.push({ PartNumber: partNumber, ETag });
362 | partNumber++;
363 | chunk = '';
364 | }
365 | bar.increment(page.hits.hits.length);
366 | currentPage++;
367 | }
368 | 
369 | // if chunk has not been reset on last iteration, there's still one last
370 | // upload to perform
371 | if (chunk.length) {
372 | const ETag = await uploadPart(
373 | chunk, bucket, key, uploadId, partNumber
374 | );
375 | parts.push({ PartNumber: partNumber, ETag });
376 | partNumber++;
377 | }
378 | await completeMultiPartUpload(bucket, key, parts, uploadId);
379 | bar.stop();
380 | };
381 | 
382 | export const headBucket = async bucket => {
383 | const command = new HeadBucketCommand({ Bucket: bucket });
384 | const response = await client.send(command);
385 | return response;
386 | };
387 | 
388 | export const headObject = async (bucket, key) => {
389 | const command = new HeadObjectCommand({ Key: key, Bucket: bucket });
390 | const response = await client.send(command);
391 | return response;
392 | };
393 | 
394 | export const bucketACL = async bucket => {
395 | const command = new GetBucketAclCommand({ Bucket: bucket });
396 | const response = await client.send(command);
397 | return response;
398 | };
399 | 
400 | export const putObject = async (bucket, key, data) => {
401 | const command = new PutObjectCommand({
402 | Bucket: bucket,
403 | Key: key,
404 | Body: data
405 | });
406 | const response = await client.send(command);
407 | return response;
408 | };
409 | 
--------------------------------------------------------------------------------
/dbpedia/spotlight.mjs:
--------------------------------------------------------------------------------
1 | import * as cliProgress from 'cli-progress';
2 | import * as _ from 'lamb';
3 | 
4 | import { getLength, mergeWithMerge, stringify } from '@svizzle/utils';
5 | import { fetch } from 'undici';
6 | 
7 | import { defaultMapping, metaDataMapping } from '../conf/mappings.mjs';
8 | import { count, updateMapping } from '../es/index.mjs';
9 | import { update } from '../es/update.mjs';
10 | import { scroll, clearScroll } from '../es/search.mjs';
11 | import { bulkRequest } from '../es/bulk.mjs';
12 | import { batch } from '../util/array.mjs';
13 | import { logger } from '../logging/logging.mjs';
14 | import { promisesHandler } from '../util/promises.mjs';
15 | import { spotlightEndpoint, confidenceValues } from '../conf/config.mjs';
16 | 
17 | /**
18 | * The resource object that the spotlight tool responds with. Each resource corresponds to a DBpedia URI.
19 | * @typedef SpotlightResource
20 | * @type {Object}
21 | * @property {string} @URI - the Unique Resource Identifier for this resource.
22 | * @property {number} @support - the support for the annotated resource (see {@link SpotlightAnnotation})
23 | * @property {string} @types - the types the resource belongs to in the ontology.
24 | * @property {string} @surfaceForm - the original string used to produce this resource.
25 | * @property {number} @offset - the index at which the surface form was found in the provided text.
26 | * @property {number} @similarityScore - cosine similarity between the context vectors and the context surrounding the surface form.
27 | * @property {number} @percentageOfSecondRank - the relative difference in topic score between the first and the second ranked resource.
28 | */
29 | 
30 | /**
31 | * The annotation response. This object is the response to a call made when annotating a piece of text.
32 | * @typedef SpotlightAnnotation
33 | * @type {Object}
34 | * @property {string} text - text to be annotated.
35 | * @property {string} confidence - confidence score for disambiguation / linking.
36 | * @property {number} support - how prominent this entity is in the Lucene Model, i.e. number of inlinks in Wikipedia
37 | * @property {string} types - types filter (e.g. DBpedia:Place).
38 | * @property {string} sparql - SPARQL filtering
39 | * @property {string} policy - (whitelist) select all entities that have the same type; (blacklist) select all entities that do not have the same type.
40 | * @property {SpotlightResource[]} Resources - the resources found for the supplied text.
41 | */
42 | 
43 | /**
44 | * @function castAnnotation
45 | * @description the Spotlight API returns the annotations with certain values cast as strings.
46 | * This function recasts the values back to their appropriate types.
47 | * @param {SpotlightAnnotation} annotation
48 | * @returns the Spotlight annotation, correctly parsed and cast
49 | */
50 | const castAnnotation = annotation => {
51 | 
52 | // FIXME: Use mapping to determine which types to cast
53 | const Resources = annotation.Resources
54 | ? annotation.Resources.map(r => {
55 | return {
56 | ...r,
57 | '@support': parseInt(r['@support'], 10),
58 | '@offset': parseFloat(r['@offset'], 10),
59 | '@similarityScore': parseFloat(r['@similarityScore'], 10),
60 | '@percentageOfSecondRank': parseFloat(
61 | r['@percentageOfSecondRank'], 10
62 | ),
63 | };
64 | })
65 | : null;
66 | return {
67 | ...annotation,
68 | '@confidence': parseInt(100 * parseFloat(annotation['@confidence']), 10),
69 | '@support': parseInt(annotation['@support'], 10),
70 | Resources,
71 | };
72 | };
73 | 
74 | /**
75 | * @function annotate
76 | * @description Returns an annotation object for the specified inputs.
77 | * @param {string} text - Text to annotate
78 | * @param {float} confidence - Confidence with which to annotate
79 | * @param {Object} [options] - Options object for the annotation process
80 | * @param {string} [options.endpoint] - Endpoint url where the Spotlight process runs. Defaults to the Docker container running on Nesta's EC2 instance.
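 * @example
 * // minimal usage sketch - the text, confidence value and resource shown are illustrative only
 * const annotation = await annotate('Berlin is the capital of Germany.', 0.5);
 * // annotation['@confidence'] -> 50 (castAnnotation rescales the confidence to an integer)
 * // annotation.Resources[0]['@URI'] -> e.g. 'http://dbpedia.org/resource/Berlin'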
81 | * @returns {SpotlightAnnotation} Spotlight annotation for given input parameters
82 | */
83 | export const annotate = async (
84 | text,
85 | confidence,
86 | { endpoint = spotlightEndpoint } = {}
87 | ) => {
88 | const url = new URL(endpoint);
89 | const body = `text=${encodeURIComponent(text)}&confidence=${confidence}`;
90 | const response = await fetch(url, {
91 | method: 'POST',
92 | headers: {
93 | Accept: 'application/json',
94 | 'content-type': 'application/x-www-form-urlencoded',
95 | },
96 | body,
97 | });
98 | if (!response.ok) {
99 | throw new Error(`Annotation failed\nResponse: ${stringify(response)}`);
100 | }
101 | const annotation = await response.json();
102 | return castAnnotation(annotation);
103 | };
104 | 
105 | /**
106 | * For our purposes, we simplify the {@link SpotlightResource} object.
107 | * All properties below are the same as on {@link SpotlightResource}.
108 | * @typedef ReducedResource
109 | * @type {Object}
110 | * @property {string} URI - the Unique Resource Identifier for this resource.
111 | * @property {string} surfaceForm - the original string used to produce this resource.
112 | * @property {number} similarityScore - cosine similarity between the context vectors and the context surrounding the surface form.
113 | * @property {number} percentageOfSecondRank - the relative difference in topic score between the first and the second ranked resource.
114 | */
115 | 
116 | /**
117 | * 
118 | * @typedef ParsedAnnotation
119 | * @type {Object}
120 | * @property {ReducedResource[]} results - an array of simplified results.
121 | * @property {number} confidence - the confidence at which these results were annotated.
122 | */
123 | 
124 | /**
125 | * @function parseAnnotationResults
126 | * @description this function takes a single {@link SpotlightAnnotation} and simplifies its resources.
127 | * @param {SpotlightAnnotation} spotlightAnnotation - an object returned by {@link annotate}
128 | * @returns {Object[]} a flat array of simplified results, each carrying the annotation's confidence.
129 | */
130 | export const parseAnnotationResults = spotlightAnnotation =>
131 | spotlightAnnotation.Resources
132 | ? _.flatMap(spotlightAnnotation.Resources, result => ({
133 | confidence: spotlightAnnotation['@confidence'],
134 | URI: result['@URI'],
135 | surfaceForm: result['@surfaceForm'],
136 | similarityScore: result['@similarityScore'],
137 | percentageOfSecondRank: result['@percentageOfSecondRank'],
138 | }))
139 | : [];
140 | 
141 | /**
142 | * The final form of resource, this is the same as {@link ReducedResource}, however
143 | * the confidence property has been added to the object's values.
144 | * All properties below are the same as on {@link ReducedResource}.
145 | * @typedef DBpediaEntity
146 | * @type {Object}
147 | * @property {string} URI - the Unique Resource Identifier for this resource.
148 | * @property {string} surfaceForm - the original string used to produce this resource.
149 | * @property {number} similarityScore - cosine similarity between the context vectors and the context surrounding the surface form.
150 | * @property {number} percentageOfSecondRank - the relative difference in topic score between the first and the second ranked resource.
151 | * @property {number} confidence - the highest confidence at which this resource could be found. This means that all annotations performed at a lower confidence than the one given here will also produce this annotation.
152 | */
153 | 
154 | /**
155 | * @function reduceAnnotationResults
156 | * @description reduces the flat list of annotation results, keeping the highest-confidence entry per URI and counting duplicate annotations at the tracked confidence levels.
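 * @example
 * // illustrative input/output sketch (other resource properties omitted):
 * // reduceAnnotationResults([
 * //   { URI: 'http://dbpedia.org/resource/Physics', confidence: 10 },
 * //   { URI: 'http://dbpedia.org/resource/Physics', confidence: 60 }
 * // ])
 * // => [{ URI: 'http://dbpedia.org/resource/Physics', confidence: 60, duplicates_10: 1, duplicates_60: 1 }]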
157 | * @param {Object[]} spotlightTerms - a flat array of {@link ReducedResource} objects, each tagged with the confidence at which it was produced.
158 | * @returns {DBpediaEntity[]} - a list of annotated entities.
159 | */
160 | export const reduceAnnotationResults = spotlightTerms => {
161 | 
162 | const reduceTerms = _.mapValuesWith(
163 | _.reduceWith(
164 | (acc, curr) => curr.confidence > acc.confidence ? curr : acc,
165 | )
166 | );
167 | 
168 | const countDuplicatesOf = confidence => _.mapValuesWith(
169 | _.pipe([
170 | _.filterWith(_.hasKeyValue('confidence', confidence)),
171 | getLength,
172 | value => ({ [`duplicates_${confidence}`]: value })
173 | ]),
174 | );
175 | 
176 | const reduceAndCountDuplicates = confidences => _.pipe([
177 | _.groupBy(_.getKey('URI')),
178 | _.collect([
179 | reduceTerms,
180 | ..._.map(confidences, countDuplicatesOf)
181 | ]),
182 | _.reduceWith(mergeWithMerge),
183 | _.values
184 | ]);
185 | 
186 | const reduceAndCountDuplicatesOf = reduceAndCountDuplicates([10, 60]);
187 | const finalResults = reduceAndCountDuplicatesOf(spotlightTerms);
188 | return finalResults;
189 | };
190 | 
191 | export const generateMetaData = (reducedTerms, spotlightResults) => {
192 | 
193 | const metaReducer = (prev, curr) => {
194 | return {
195 | entities_count: prev.entities_count + 1,
196 | confidence_avg: prev.confidence_avg + curr.confidence,
197 | confidence_max: curr.confidence > prev.confidence_max ? curr.confidence : prev.confidence_max,
198 | confidence_min: curr.confidence < prev.confidence_min ? curr.confidence : prev.confidence_min,
199 | dupes_10_count: prev.dupes_10_count + (curr.duplicates_10 > 1 ? 1 : 0),
200 | dupes_60_count: prev.dupes_60_count + (curr.duplicates_60 > 1 ? 1 : 0),
201 | confidence_counts: {
202 | ...prev.confidence_counts,
203 | [curr.confidence]: prev.confidence_counts[curr.confidence]
204 | ?
prev.confidence_counts[curr.confidence] + 1
205 | : 1,
206 | },
207 | };
208 | };
209 | 
210 | const initialMetaData = {
211 | entities_count: 0,
212 | confidence_avg: 0,
213 | confidence_max: 0,
214 | confidence_min: 100,
215 | dupes_10_count: 0,
216 | dupes_60_count: 0,
217 | confidence_counts: {},
218 | };
219 | const reducedMetaData = reducedTerms.reduce(metaReducer, initialMetaData);
220 | const metadata = {
221 | ...reducedMetaData,
222 | confidence_avg: reducedMetaData.confidence_avg / reducedMetaData.entities_count,
223 | dupes_10_ratio: reducedMetaData.dupes_10_count / reducedMetaData.entities_count,
224 | dupes_60_ratio: reducedMetaData.dupes_60_count / reducedMetaData.entities_count
225 | };
226 | return metadata;
227 | };
228 | 
229 | export const annotateText = async (
230 | text,
231 | { endpoint = spotlightEndpoint, includeMetaData = null } = {}
232 | ) => {
233 | const spotLightPromises = _.map(
234 | confidenceValues,
235 | confidence => annotate(text, confidence, { endpoint, })
236 | );
237 | 
238 | /** @type {SpotlightAnnotation[]} */
239 | const spotlightResults = (await Promise.all(spotLightPromises)).filter(
240 | r => 'Resources' in r
241 | );
242 | 
243 | /** @type {ParsedAnnotation[]} */
244 | const reducedTerms = _.pipe([
245 | _.mapWith(parseAnnotationResults),
246 | _.flatten,
247 | reduceAnnotationResults
248 | ])(spotlightResults);
249 | 
250 | const metadata =
251 | includeMetaData && generateMetaData(reducedTerms, spotlightResults);
252 | 
253 | return {
254 | annotations: reducedTerms,
255 | ...metadata && { metadata },
256 | };
257 | };
258 | 
259 | export const annotateArray = async (texts, endpoint) => {
260 | const body = JSON.stringify({ texts });
261 | const headers = { 'Content-Type': 'application/json' };
262 | const result = await fetch(endpoint, { body, headers, method: 'POST' });
263 | const annotations = await result.json();
264 | return annotations;
265 | };
266 | 
267 | /**
268 | * Results for the higher level process of annotating an ElasticSearch document.
269 | * @typedef documentAnnotationResult
270 | * @type {Object}
271 | * @property {Object} document - the ElasticSearch document supplied for annotation.
272 | * @property {DBpediaEntity[]} annotations - a list of annotations for the supplied document.
273 | */
274 | 
275 | /**
276 | * @function annotateDocument
277 | * @description takes an ElasticSearch document from Arxlive and annotates the abstract_article field.
278 | * @param {Object} doc - an ElasticSearch document from the Arxlive domain.
279 | * @param {string} field - the field of the document to use as text for the annotation.
280 | * @param {string} [options.endpoint] - the endpoint pointing to the Spotlight REST API.
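 * @example
 * // illustrative sketch only - the document shape and field name here are hypothetical
 * // const result = await annotateDocument(
 * //   { _id: '1', _source: { textBody_abstract_article: 'We study quantum entanglement...' } },
 * //   'textBody_abstract_article',
 * //   { includeMetaData: true }
 * // );
 * // result -> { id: '1', annotations: [...], metadata: {...} }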
281 | * @return {documentAnnotationResult} - the annotations for this document 282 | */ 283 | export const annotateDocument = async ( 284 | doc, 285 | field, 286 | { includeMetaData = null, endpoint = spotlightEndpoint } = {} 287 | ) => { 288 | const annotationData = await annotateText(doc._source[field], { endpoint, includeMetaData }); 289 | return { id: doc._id, ...annotationData }; 290 | }; 291 | 292 | /** 293 | * @function uploadAnnotatedDocument 294 | * @description abstracts process of uploading document, to avoid uploading empty annotations 295 | * @param {Object} annotations - the dbpedia annotations provided by {@link annotatedDocument} 296 | * @param {string} id - id of document to update 297 | * @param {string} domain - domain on which to upload 298 | * @param {*} index - index on which to upload 299 | * @returns {Promise} a promise indicating status of upload process 300 | */ 301 | export const uploadAnnotatedDocument = ( 302 | { annotations, id, metadata }, 303 | fieldName, 304 | domain, 305 | index 306 | ) => { 307 | 308 | // no point in uploading if the doc/payload is empty 309 | if (Object.keys(annotations).length === 0) { 310 | return Promise.resolve(); 311 | } 312 | return update(domain, index, id, { 313 | [fieldName]: annotations, 314 | ...metadata && { [`${fieldName}_metadata`]: metadata }, 315 | }); 316 | }; 317 | 318 | const annotateBatch = async ( 319 | docs, 320 | fieldName, 321 | newFieldName, 322 | endpoint, 323 | includeMetaData 324 | ) => { 325 | 326 | const toBulkFormat = doc => ({ 327 | '_id': doc._id, 328 | data: { 329 | [newFieldName]: doc.annotations, 330 | ...doc.metadata && { [`${newFieldName}_metadata`]: doc.metadata } 331 | } 332 | }); 333 | 334 | // filter out docs with empty text 335 | const nonEmptyDocs = docs.filter(doc => doc._source[fieldName]); 336 | const emptyDocs = docs.filter(doc => !doc._source[fieldName]); 337 | _.forEach( 338 | emptyDocs, 339 | doc => logger.warn(`Empty field: ${JSON.stringify(doc)}`) 340 | ); 341 | const texts = _.map(nonEmptyDocs, _.getPath(`_source.${fieldName}`)); 342 | const results = await annotateArray(texts, endpoint); 343 | const inputs = _.map( 344 | _.zip(nonEmptyDocs, results), 345 | ([doc, data]) => ({ ...doc, ...data }) 346 | ); 347 | const [annotations, empties] = _.partition( 348 | inputs, 349 | doc => doc.annotations.length !== 0 350 | ); 351 | 352 | if (empties.length) { 353 | _.forEach( 354 | empties, 355 | doc => logger.warn(`Empty doc: ${JSON.stringify(doc)}`) 356 | ); 357 | } 358 | const bulkFormat = _.map(annotations, toBulkFormat); 359 | return bulkFormat; 360 | }; 361 | 362 | const initialiseIndexProgressBar = async (domain, index, batchSize) => { 363 | const bar = new cliProgress.SingleBar( 364 | { etaBuffer: batchSize * 10 }, 365 | cliProgress.Presets.shades_classic 366 | ); 367 | const totalDocuments = await count(domain, index); 368 | bar.start(totalDocuments, 0); 369 | return bar; 370 | }; 371 | 372 | const generateMappingPayload = (name, includeMetaData) => { 373 | const mappingPayload = { 374 | properties: { 375 | [name]: defaultMapping, 376 | ...includeMetaData && { 377 | [`${name}_metadata`]: metaDataMapping, 378 | }, 379 | }, 380 | }; 381 | return mappingPayload; 382 | }; 383 | 384 | export const annotateIndex = async ( 385 | domain, 386 | index, 387 | endpoint, 388 | field, 389 | { 390 | batchSize=50, 391 | groupSize=4, 392 | includeMetaData=true, 393 | newField='dbpedia_entities', 394 | pages='all', 395 | pageSize=10000, 396 | progress=null, 397 | }={} 398 | ) => { 399 | 400 | const 
mappingPayload = generateMappingPayload(newField, includeMetaData); 401 | await updateMapping(domain, index, { payload: mappingPayload }); 402 | 403 | const bar = progress 404 | ? progress 405 | : await initialiseIndexProgressBar(domain, index, batchSize); 406 | const scroller = scroll(domain, index, { size: pageSize, pages }); 407 | 408 | let page; 409 | for await (page of scroller) { 410 | const batches = batch(page.hits.hits, batchSize); 411 | const groups = batch(batches, groupSize); 412 | const updates = []; 413 | for await (const group of groups) { 414 | // eslint-disable-next-line no-await-in-loop 415 | const promises = _.map(group, docs => 416 | annotateBatch(docs, field, newField, endpoint, includeMetaData) 417 | ); 418 | const resolvedPromises = await promisesHandler(promises); 419 | const annotations = _.flatten(resolvedPromises); 420 | updates.push(annotations); 421 | bar.increment(_.flatten(group).length); 422 | }; 423 | const flattenedUpdates = _.flatten(updates); 424 | 425 | // this is likely to be too big, so separate by default size 426 | const batchedUpdates = batch(flattenedUpdates, 500); 427 | for await (const update_ of batchedUpdates) { 428 | await bulkRequest( 429 | domain, 430 | index, 431 | update_, 432 | 'update', 433 | { error: false, refresh: 'wait_for' } 434 | ); 435 | } 436 | } 437 | 438 | bar.stop(); 439 | 440 | if (page) { 441 | clearScroll(domain, page._scroll_id); 442 | } 443 | 444 | }; 445 | 446 | export const annotateRequest = async request => { 447 | 448 | await annotateIndex( 449 | request.domain, 450 | request.index, 451 | request.annotationEndpoint, 452 | request.field, 453 | request, 454 | ); 455 | return request; 456 | }; 457 | --------------------------------------------------------------------------------