├── .gitignore ├── README.md └── frank /.gitignore: -------------------------------------------------------------------------------- 1 | .project 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Frank 2 | 3 | A frank interface to the LOD Cloud, using the [LOD Laundromat web service](http://lodlaundromat.org "LOD Laundromat web service"). 4 | This interface contains three simple bash scripts. To see the individual documentation of these bash scripts, run them with the `-h` argument. 5 | 6 | * ``frank statements``: fetch statements from the LOD Cloud 7 | * ``frank documents``: fetch LOD Laundromat document reference 8 | * ``frank meta``: fetch metadata for a particular document 9 | -------------------------------------------------------------------------------- /frank: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | curlUserAgent="-H \"User-Agent: Frank\"" 3 | #first check bash version. needs to be 4+ to support associative arrays (used for keeping track of prefixes) 4 | associativeSupported=false 5 | if [ ${BASH_VERSION:0:1} -gt "3" ] ; then 6 | associativeSupported=true; 7 | fi 8 | command -v curl >/dev/null 2>&1 || { echo >&2 "Please install curl first"; exit 1; } 9 | ldfApi="http://ldf.lodlaundromat.org" 10 | endpoint="http://sparql.backend.lodlaundromat.org/" 11 | downloadUrl="http://download.lodlaundromat.org" 12 | resourceUrl="http://lodlaundromat.org/resource" 13 | r2dIndex="http://index.lodlaundromat.org/r2d/" 14 | ns2dIndex="http://index.lodlaundromat.org/ns2d/" 15 | #limit how many values clauses we include in sparql. 
sparqlMaxDocsAsValues=50000

# Print all arguments to stderr, in red.
echoerr() { echo -e "\e[01;31m$@\e[0m" 1>&2; }

# Copy stdin to stderr line by line (used to emit the help heredocs).
caterr() {
  while IFS= read -r line; do
    echo "$line" 1>&2;
  done
}


##
## All documentation
##
## NOTE(review): the angle-bracket placeholders (<command>, <subject>, ...)
## were stripped from the source dump (treated as HTML tags); they have been
## reconstructed below -- confirm against the upstream repository.
show_main_help() {
  caterr << EOF
Usage: ${0##*/} [-h] <command> [<args>]

The three main frank commands are:
   statements   Fetch statements from the LOD Cloud, gathered by the LOD Laundromat (add -h for help)
   documents    Fetch LOD Laundromat document references (add -h for help)
   meta         Fetch meta-data for a given document (add -h for help)
EOF
}

show_statements_help() {
  caterr << EOF
usage: ${0##*/} $mode [<args>] [<resource> ...]

Fetch statements from the LOD Cloud, using data hosted and cleaned by the LOD Laundromat.
When a LOD Laundromat document reference (<resource>) is passed as parameter, statements are fetched from that document.
Otherwise, statements are fetched from the complete LOD Cloud

    -h
        display this help and exit
    -s <subject>
    --subject <subject>
        Filter statements by <subject>
    -p <predicate>
    --predicate <predicate>
        Filter statements by <predicate>
    -o <object>
    --object <object>
        Filter statements by <object>
    -g
    --showGraph
        Return quads, where the named graph is the reference to the LOD Laundromat resource
        of this particular dataset

EOF
}

show_documents_help() {
  caterr << EOF
Usage: ${0##*/} $mode [<args>]

Fetch LOD Laundromat document references

    -h
        display this help and exit
    -b <buffer>
    --buffer <buffer>
        Buffer x documents references in memory. Default: 50
    --minTriples <amount>
        Only fetch documents with at least this amount of triples
    --maxTriples <amount>
        Only fetch documents with at most this amount of triples
    --minAvgOutDegree <amount>
        Only fetch documents with an average out degree of at least this amount
    --maxAvgOutDegree <amount>
        Only fetch documents with an average out degree of maximum this amount
    --minAvgInDegree <amount>
        Only fetch documents with an average in degree of at least this amount
    --maxAvgInDegree <amount>
        Only fetch documents with an average in degree of maximum this amount
    --minAvgDegree <amount>
        Only fetch documents with an average degree of at least this amount
    --maxAvgDegree <amount>
        Only fetch documents with an average degree of maximum this amount
    --namespace <namespace>
        Filter document for this namespace. Provide this flag several times to filter
        for multiple namespaces ('OR' operation)
    --sparql <bgp>
        Use your own SPARQL bgp to filter the results. Use ?doc to refer
        to the current document. For more information on available properties,
        check schema, or any resource
        (e.g. http://lodlaundromat.org/resource/e439f40f187906eb8e6223d57a77d24d)
        Example:
          {?doc llm:metrics/llm:IRILength/llm:std ?std .
           FILTER(?std > 50)}
        Available prefixes:
          ll:  <http://lodlaundromat.org/resource/>
          llo: <http://lodlaundromat.org/ontology/>
          llm: <http://lodlaundromat.org/metrics/ontology/>
    -d
    --downloadUri
        Only print the download URI of each document
    -r
    --resourceUri
        Only print the resource URI of each document
EOF
}

# NOTE(review): this function's definition was lost in the source dump (only
# its heredoc tail survived); reconstructed from the surviving text -- confirm.
show_meta_help() {
  caterr << EOF
Usage: ${0##*/} $mode [<resource>] ...

Fetch LOD Laundromat meta-data for each LOD Laundromat document references (<resource>) passed as arguments.

    -h
        display this help and exit
EOF
}

#statements options
sub=
pred=
obj=
showGraph=false

#documents options
limit=50
download=true
resource=true
minTriples=1;
maxTriples=
minAvgOutDegree=
maxAvgOutDegree=
minAvgInDegree=
maxAvgInDegree=
minAvgDegree=
maxAvgDegree=
bgp=""
namespaces=()
verbose=false;
# Mode dispatch: no arguments at all -> show help and exit.
[ "$#" -eq "0" ] && show_main_help && exit;
mode="$1";shift;
if [[ "$mode" != "statements" && "$mode" != "documents" && "$mode" != "meta" ]]; then show_main_help; exit 1; fi

# NOTE(review): the <...> placeholders inside the error messages were stripped
# from the source dump; reconstructed -- confirm against upstream.
if [[ "$mode" == "statements" ]]; then
  while [ "$#" -gt 0 ]; do
    case $1 in
      -h|-\?|--help) show_statements_help; exit ;;
      -v|--verbose) verbose=true; shift; continue ;;
      ##parse subject
      -s|--subject)
        if [ "$#" -gt 1 ]; then sub=$2; shift 2; continue
        else echoerr 'ERROR: Must specify a non-empty "--subject <subject>" argument.' >&2; exit 1; fi ;;
      --subject=?*) sub=${1#*=} ;;  # delete everything up to "=" and keep the remainder
      --subject=) echoerr 'ERROR: Must specify a non-empty "--subject <subject>" argument.' >&2; exit 1 ;;
      ##parse predicate
      -p|--predicate)
        if [ "$#" -gt 1 ]; then pred=$2; shift 2; continue
        else echoerr 'ERROR: Must specify a non-empty "--predicate <predicate>" argument.' >&2; exit 1; fi ;;
      --predicate=?*) pred=${1#*=} ;;
      --predicate=) echoerr 'ERROR: Must specify a non-empty "--predicate <predicate>" argument.' >&2; exit 1 ;;
      ##parse object
      -o|--object)
        if [ "$#" -gt 1 ]; then obj=$2; shift 2; continue
        else echoerr 'ERROR: Must specify a non-empty "--object <object>" argument.' >&2; exit 1; fi ;;
      --object=?*) obj=${1#*=} ;;
      --object=) echoerr 'ERROR: Must specify a non-empty "--object <object>" argument.' >&2; exit 1 ;;
      -g|--showGraph) showGraph=true ;;
      --) shift; break ;;  # end of all options
      -?*) printf 'WARN: Unknown option (ignored): %s\n' "$1" >&2 ;;
      *) break ;;  # no more options
    esac
    shift
  done
elif [[ "$mode" == "documents" ]]; then
  while [ "$#" -gt 0 ]; do
    case $1 in
      -h|-\?|--help) show_documents_help; exit ;;
      -v|--verbose) verbose=true; shift; continue ;;
      ##parse limit size ('buffer')
      -b|--buffer)
        if [ "$#" -gt 1 ]; then limit=$2; shift 2; continue
        else echoerr 'ERROR: Must specify a non-empty "--buffer <buffer>" argument.' >&2; exit 1; fi ;;
      --buffer=?*) limit=${1#*=} ;;
      --buffer=) echoerr 'ERROR: Must specify a non-empty "--buffer <buffer>" argument.' >&2; exit 1 ;;
      ##parse min/max triple counts
      --minTriples)
        if [ "$#" -gt 1 ]; then minTriples=$2; shift 2; continue
        else echoerr 'ERROR: Must specify a non-empty "--minTriples <amount>" argument.' >&2; exit 1; fi ;;
      --minTriples=?*) minTriples=${1#*=} ;;
      --minTriples=) echoerr 'ERROR: Must specify a non-empty "--minTriples <amount>" argument.' >&2; exit 1 ;;
      --maxTriples)
        if [ "$#" -gt 1 ]; then maxTriples=$2; shift 2; continue
        else echoerr 'ERROR: Must specify a non-empty "--maxTriples <amount>" argument.' >&2; exit 1; fi ;;
      --maxTriples=?*) maxTriples=${1#*=} ;;
      --maxTriples=) echoerr 'ERROR: Must specify a non-empty "--maxTriples <amount>" argument.' >&2; exit 1 ;;
      ##parse degree bounds
      --minAvgOutDegree)
        if [ "$#" -gt 1 ]; then minAvgOutDegree=$2; shift 2; continue
        else echoerr 'ERROR: Must specify a non-empty "--minAvgOutDegree <amount>" argument.' >&2; exit 1; fi ;;
      --minAvgOutDegree=?*) minAvgOutDegree=${1#*=} ;;
      --minAvgOutDegree=) echoerr 'ERROR: Must specify a non-empty "--minAvgOutDegree <amount>" argument.' >&2; exit 1 ;;
      --maxAvgOutDegree)
        if [ "$#" -gt 1 ]; then maxAvgOutDegree=$2; shift 2; continue
        else echoerr 'ERROR: Must specify a non-empty "--maxAvgOutDegree <amount>" argument.' >&2; exit 1; fi ;;
      --maxAvgOutDegree=?*) maxAvgOutDegree=${1#*=} ;;
      --maxAvgOutDegree=) echoerr 'ERROR: Must specify a non-empty "--maxAvgOutDegree <amount>" argument.' >&2; exit 1 ;;
      --minAvgInDegree)
        if [ "$#" -gt 1 ]; then minAvgInDegree=$2; shift 2; continue
        else echoerr 'ERROR: Must specify a non-empty "--minAvgInDegree <amount>" argument.' >&2; exit 1; fi ;;
      --minAvgInDegree=?*) minAvgInDegree=${1#*=} ;;
      --minAvgInDegree=) echoerr 'ERROR: Must specify a non-empty "--minAvgInDegree <amount>" argument.' >&2; exit 1 ;;
      --maxAvgInDegree)
        if [ "$#" -gt 1 ]; then maxAvgInDegree=$2; shift 2; continue
        else echoerr 'ERROR: Must specify a non-empty "--maxAvgInDegree <amount>" argument.' >&2; exit 1; fi ;;
      --maxAvgInDegree=?*) maxAvgInDegree=${1#*=} ;;
      --maxAvgInDegree=) echoerr 'ERROR: Must specify a non-empty "--maxAvgInDegree <amount>" argument.' >&2; exit 1 ;;
      --minAvgDegree)
        if [ "$#" -gt 1 ]; then minAvgDegree=$2; shift 2; continue
        else echoerr 'ERROR: Must specify a non-empty "--minAvgDegree <amount>" argument.' >&2; exit 1; fi ;;
      --minAvgDegree=?*) minAvgDegree=${1#*=} ;;
      --minAvgDegree=) echoerr 'ERROR: Must specify a non-empty "--minAvgDegree <amount>" argument.' >&2; exit 1 ;;
      --maxAvgDegree)
        if [ "$#" -gt 1 ]; then maxAvgDegree=$2; shift 2; continue
        else echoerr 'ERROR: Must specify a non-empty "--maxAvgDegree <amount>" argument.' >&2; exit 1; fi ;;
      --maxAvgDegree=?*) maxAvgDegree=${1#*=} ;;
      --maxAvgDegree=) echoerr 'ERROR: Must specify a non-empty "--maxAvgDegree <amount>" argument.' >&2; exit 1 ;;
      ##parse sparql param
      --sparql)
        if [ "$#" -gt 1 ]; then bgp=$2; shift 2; continue
        else echoerr 'ERROR: Must specify a non-empty "--sparql <bgp>" argument.' >&2; exit 1; fi ;;
      --sparql=?*) bgp=${1#*=} ;;
      --sparql=) echoerr 'ERROR: Must specify a non-empty "--sparql <bgp>" argument.' >&2; exit 1 ;;
      ##parse namespace param
      --namespace)
        if [ "$#" -gt 1 ]; then namespaces+=("$2"); shift 2; continue
        else echoerr 'ERROR: Must specify a non-empty "--namespace <namespace>" argument.' >&2; exit 1; fi ;;
      # BUG FIX: the expansion was unquoted, so a namespace containing spaces
      # or glob characters would be split/expanded before entering the array.
      --namespace=?*) namespaces+=("${1#*=}") ;;
      --namespace=) echoerr 'ERROR: Must specify a non-empty "--namespace <namespace>" argument.' >&2; exit 1 ;;
      -d|--downloadUri) resource=false ;;  # print only download URIs
      -r|--resourceUri) download=false ;;  # print only resource URIs
      --) shift; break ;;
      -?*) printf 'WARN: Unknown option (ignored): %s\n' "$1" >&2 ;;
      *) break ;;
    esac
    shift
  done
elif [[ "$mode" == "meta" ]]; then
  while [ "$#" -gt 0 ]; do
    case $1 in
      -h|-\?|--help) show_meta_help; exit ;;
      -v|--verbose) verbose=true; shift; continue ;;
      --) shift; break ;;
      -?*) printf 'WARN: Unknown option (ignored): %s\n' "$1" >&2 ;;
      *) break ;;
    esac
    shift
  done
fi
# Prefix resolution: allow users to pass e.g. rdf:type instead of a full IRI.
# 'prefixes' doubles as a sentinel: it stays the scalar "false" when the bash
# version cannot support associative arrays.
prefixes=false
if $associativeSupported; then
  # BUG FIX: 'declare -A' refuses to convert the existing scalar set just
  # above ("cannot convert variable to associative array"), which left
  # 'prefixes' as the string "false" -- and every ${prefixes["$ns"]} lookup on
  # a scalar then yields "false" (non-empty), corrupting the filters.
  # Unsetting the scalar first lets the associative declaration succeed.
  unset prefixes
  #top25 of prefix.cc prefixes
  declare -A prefixes=(["yago"]="http://yago-knowledge.org/resource/" ["rdf"]="http://www.w3.org/1999/02/22-rdf-syntax-ns#" ["foaf"]="http://xmlns.com/foaf/0.1/" ["dbp"]="http://dbpedia.org/property/" ["dc"]="http://purl.org/dc/elements/1.1/" ["owl"]="http://www.w3.org/2002/07/owl#" ["rdfs"]="http://www.w3.org/2000/01/rdf-schema#" ["dbo"]="http://dbpedia.org/ontology/" ["ont"]="http://purl.org/net/ns/ontology-annot#" ["onto"]="http://www.ontotext.com/" ["skos"]="http://www.w3.org/2004/02/skos/core#" ["geo"]="http://www.w3.org/2003/01/geo/wgs84_pos#" ["rss"]="http://purl.org/rss/1.0/" ["gldp"]="http://www.w3.org/ns/people#" ["sioc"]="http://rdfs.org/sioc/ns#" ["fb"]="http://rdf.freebase.com/ns/" ["sc"]="http://purl.org/science/owl/sciencecommons/" ["geonames"]="http://www.geonames.org/ontology#" ["xsd"]="http://www.w3.org/2001/XMLSchema#" ["gr"]="http://purl.org/goodrelations/v1#" ["dcterms"]="http://purl.org/dc/terms/" ["dct"]="http://purl.org/dc/terms/" ["org"]="http://www.w3.org/ns/org#" ["dbr"]="http://dbpedia.org/resource/" ["qb"]="http://purl.org/linked-data/cube#" ["void"]="http://rdfs.org/ns/void#")

  # First try to substitute a known prefix in the sub/pred/obj filters.
  if [ -n "$sub" ]; then
    ns=$(cut -d':' -f 1 <<< "$sub")
    if [ -n "${prefixes["$ns"]}" ]; then
      localN=$(cut -d':' -f 2 <<< "$sub")
      sub="${prefixes["$ns"]}$localN"
    fi
  fi
  if [ -n "$pred" ]; then
    ns=$(cut -d':' -f 1 <<< "$pred")
    if [ -n "${prefixes["$ns"]}" ]; then
      localN=$(cut -d':' -f 2 <<< "$pred")
      pred="${prefixes["$ns"]}$localN"
    fi
  fi
  if [ -n "$obj" ]; then
    ns=$(cut -d':' -f 1 <<< "$obj")
    if [ -n "${prefixes["$ns"]}" ]; then
      localN=$(cut -d':' -f 2 <<< "$obj")
      obj="${prefixes["$ns"]}$localN"
    fi
  fi

  # Now try to substitute the entries of the namespaces filter array.
  if [ ${#namespaces[@]} -gt 0 ]; then
    newNamespaces=()
    for ns in "${namespaces[@]}"; do
      if [ "${ns:0:4}" == "http" ]; then
        # already a full IRI: use as-is
        newNamespaces+=("$ns")
      else
        # try substituting a known prefix name
        if [ -n "${prefixes["$ns"]}" ]; then
          newNamespaces+=("${prefixes["$ns"]}")
        else
          echoerr "Could not find prefix IRI for namespace name \"$ns\". Instead, filtering for documents that contain IRIs that start with \"<$ns\""
          newNamespaces+=("$ns")
        fi
      fi
    done
    # BUG FIX: 'namespaces=$newNamespaces' only copied the FIRST element,
    # silently dropping every additional --namespace filter.
    namespaces=("${newNamespaces[@]}")
  fi
fi

# Warn when a prefixed term (name:local) could not be resolved to a full IRI.
prefixRegex='^\w\+:\w\+$'
if [ -n "$sub" ] && grep -q "$prefixRegex" <<< "$sub"; then
  if ! $associativeSupported; then
    echoerr "Prefixes are only supported by bash version 4. Your version does not meet these requirements: $BASH_VERSION"
  else
    echoerr "Could not find prefixed $sub. Use the full URI instead";
  fi
fi
if [ -n "$pred" ] && grep -q "$prefixRegex" <<< "$pred"; then
  if ! $associativeSupported; then
    echoerr "Prefixes are only supported by bash version 4. Your version does not meet these requirements: $BASH_VERSION"
  else
    echoerr "Could not find prefixed $pred. Use the full URI instead";
  fi
fi
if [ -n "$obj" ] && grep -q "$prefixRegex" <<< "$obj"; then
  if ! $associativeSupported; then
    echoerr "Prefixes are only supported by bash version 4. Your version does not meet these requirements: $BASH_VERSION"
  else
    echoerr "Could not find prefixed $obj. Use the full URI instead";
  fi
fi
# Fetch all statements of a single LOD Laundromat document (identified by any
# string containing its 32-char md5), applying the sub/pred/obj/showGraph
# filters via the LDF API when needed.
fetchStatementsForDoc() {
  #start with a sanity check (simple non-empty check)
  if [ -z "$1" ]; then return; fi

  #extract md5 from resource url
  md5=$(sed 's/.*\/\([a-z0-9]\{32\}\).*/\1/' <<< "$1")
  if [ -z "$md5" ]; then echo "Not a valid LOD Laundromat document identifier: $1" && exit 1; fi

  if [ -z "$sub" ] && [ -z "$pred" ] && [ -z "$obj" ] && ! $showGraph; then
    #no filters at all: just stream the gzipped dump
    curl -Gs $curlUserAgent $downloadUrl/$md5 | zcat;
  else
    if [ -n "$sub" ] || [ -n "$pred" ] || [ -n "$obj" ]; then
      #we already looked up (via the r2d index) which documents may contain
      #matches; skip this document when it is not one of them
      if ! $filterForDocsFailed && ! grep -q "$md5" <<< "$filterForDocs"; then
        return;
      fi
    fi

    #pre-compute the sed-escaped graph IRI once for this document
    quadReplace=$(sed -e 's/[\/&]/\\&/g' <<< "$1")

    url="$ldfApi/$md5"
    page=1;
    hasNext=true
    #keep requesting pages until the LDF API stops advertising a nextPage
    while $hasNext; do
      hasNext=false;
      response=$(curl $curlUserAgent -Gs $url -H "Accept: application/n-quads" --data-urlencode "subject=$sub" --data-urlencode "predicate=$pred" --data-urlencode "object=$obj" --data-urlencode "page=$page")
      while read -r quad; do
        #skip blank lines
        if [ -z "$quad" ]; then continue; fi
        if echo $quad | grep -qE '#metadata>\s*\.'; then
          #metadata triple: check for pagination
          if echo $quad | grep -q 'nextPage'; then
            hasNext=true
            ((page++))
          fi
        else
          if $showGraph ; then
            #turn the triple into a quad naming this document's resource IRI
            echo "$quad" | sed "s/\(.*\)\.$/\1 <$quadReplace>./"
          else
            echo $quad;
          fi
        fi
      done <<< "$response"
    done;
  fi;
}

# Page through the SPARQL endpoint (or the namespace index) and print one
# document per line: download URI and/or resource URI, per $download/$resource.
fetchDocs() {
  useSparql=false;
  # NOTE(review): the prefix IRIs below were stripped from the source dump;
  # reconstructed from the LOD Laundromat vocabulary -- confirm upstream.
  prefixes="PREFIX llm: <http://lodlaundromat.org/metrics/ontology/> PREFIX llo: <http://lodlaundromat.org/ontology/> PREFIX ll: <http://lodlaundromat.org/resource/> "
  offset=0
  docTpfs="?doc llo:triples ?triples ; llo:md5 ?md5 ."
  filters="FILTER(?triples >= $minTriples) "
  if [ -n "$maxTriples" ]; then
    useSparql=true;
    filters="$filters FILTER(?triples <= $maxTriples) "
  fi
  if [ "$minTriples" -gt "2" ]; then
    useSparql=true;
  fi
  if [ -n "$minAvgInDegree" ]; then
    useSparql=true;
    filters="$filters ?doc llm:metrics/llm:inDegree/llm:mean ?avgInDegree. FILTER(?avgInDegree >= $minAvgInDegree) "
  fi
  if [ -n "$maxAvgInDegree" ]; then
    useSparql=true;
    filters="$filters ?doc llm:metrics/llm:inDegree/llm:mean ?avgInDegree. FILTER(?avgInDegree <= $maxAvgInDegree) "
  fi
  if [ -n "$minAvgOutDegree" ]; then
    useSparql=true;
    filters="$filters ?doc llm:metrics/llm:outDegree/llm:mean ?avgOutDegree. FILTER(?avgOutDegree >= $minAvgOutDegree) "
  fi
  # BUG FIX: this test read '$maxAvgoutDegree' (lowercase 'o'), a variable that
  # is never set, so the --maxAvgOutDegree filter was silently ignored.
  if [ -n "$maxAvgOutDegree" ]; then
    useSparql=true;
    filters="$filters ?doc llm:metrics/llm:outDegree/llm:mean ?avgOutDegree. FILTER(?avgOutDegree <= $maxAvgOutDegree) "
  fi
  if [ -n "$minAvgDegree" ]; then
    useSparql=true;
    filters="$filters ?doc llm:metrics/llm:degree/llm:mean ?avgDegree. FILTER(?avgDegree >= $minAvgDegree) "
  fi
  if [ -n "$maxAvgDegree" ]; then
    useSparql=true;
    filters="$filters ?doc llm:metrics/llm:degree/llm:mean ?avgDegree. FILTER(?avgDegree <= $maxAvgDegree) "
  fi
  if [ ${#namespaces[@]} -eq 0 ]; then
    #Not filtering by namespace. I.e., we might be filtering on nothing, and
    #want all documents. So, use sparql.
    useSparql=true;
  fi
  if $useSparql; then
    values=
    if [ -n "$namespaceDocs" ]; then
      #first check whether we really want to include the ns docs:
      #having too many might make this sparql query waaaaay too long (~25MB)
      docsLength=${#namespaceDocs}
      # Each entry is a 32-char md5 plus a newline separator -> 33 bytes each.
      # BUG FIX: was $(($docsLength + 1 / 33)), which evaluates 1/33 first and
      # therefore 'counted' one document per byte.
      numDocs=$(( (docsLength + 1) / 33 ))
      if [ "$numDocs" -lt "$sparqlMaxDocsAsValues" ]; then
        values="VALUES (?doc) {"
        while read -r md5; do
          values="$values (ll:$md5)"
        done <<< "$namespaceDocs"
        values="$values }";
      fi
    fi

    #Start fetching docs, one page of $limit at a time
    while true; do
      limitOffset="LIMIT $limit OFFSET $offset";
      query="$prefixes SELECT ?doc ?md5 WHERE {$docTpfs $filters $bgp $values} $limitOffset"
      #nicest approach would be to stream through results. For now, just take
      #this quick approach (sed '1d' drops the CSV header)
      result=$(curl $curlUserAgent -X POST -s "$endpoint" --data-urlencode "query=$query" -H 'Accept: text/csv' | sed '1d');

      while read -r line; do
        if [ -z "$line" ]; then
          #just whitespace string
          continue
        fi
        println=;
        md5=$(echo "$line" | sed 's/.*\"\([^\"]*\)\"$/\1/')
        #we're filtering by namespace without a VALUES clause: skip documents
        #that do not occur in a dataset containing this namespace
        [ ${#namespaces[@]} -gt 0 ] && [ -z "$values" ] && ! grep -q "$md5" <<< "$namespaceDocs" && continue;
        #do some sed parsing. Easy, because we know there will be no quotes in
        #the resultset (won't get literals)
        if $download ; then
          println="$downloadUrl/$md5"
        fi
        if $resource ; then
          r=$(echo "$line" | sed 's/^\"\([^\"]*\)\".*/\1/')
          if [ -z "$println" ]; then
            println="$r"
          else
            println="$println $r"
          fi
        fi
        echo $println;
      done <<< "$result"
      if [ -z "$result" ]; then
        #no results left, we are done!
        exit 0;
      fi

      #up offset for next query
      offset=$(( offset + limit ))
    done
  else
    #no need to use sparql. we're only filtering by namespace. Just return
    #those documents
    while read -r md5; do
      if [ -z "$md5" ]; then
        #just whitespace string
        continue
      fi
      println=;
      $download && println="$downloadUrl/$md5"
      if $resource; then
        if [ -z "$println" ]; then
          println="$resourceUrl/$md5"
        else
          println="$println $resourceUrl/$md5"
        fi
      fi
      [ -n "$println" ] && echo "$println"
    done <<< "$namespaceDocs"
  fi
}
# Fetch all LOD Laundromat meta-data triples for one document resource IRI.
fetchMeta() {
  #start with a sanity check (simple non-empty check)
  if [ -z "$1" ]; then return; fi

  mainTPatterns="<$1> llm:metrics ?metricDoc ; ?pred ?obj ."
  optionalTPattern="?obj ?pred2 ?obj2 ."
  # NOTE(review): the llm prefix IRI was stripped from the source dump;
  # reconstructed from the LOD Laundromat vocabulary -- confirm upstream.
  query="PREFIX llm: <http://lodlaundromat.org/metrics/ontology/> CONSTRUCT { $mainTPatterns $optionalTPattern } WHERE { $mainTPatterns OPTIONAL{$optionalTPattern}}"
  response=$(curl $curlUserAgent -Gs $endpoint --data-urlencode "query=$query" -H "Accept: text/plain")
  #the endpoint may answer with a '# Empty NT' stub instead of an empty body
  echo "$response" | head -n 1 | grep "Empty NT" > /dev/null && return;
  echo "$response";
}


filterForDocs=           # newline-separated md5s that may contain matches
filterForDocsFailed=false
hasStatementFilter=false
# If a subject/predicate/object filter is set, consult the r2d index to prune
# the set of documents we need to scan. Intersections (comm -12) are used when
# several filters are set.
# NOTE(review): comm expects sorted input; the index appears to return sorted
# md5 lists -- confirm.
fetchDocsForResources() {
  if [ -n "$sub" ] || [ -n "$pred" ] || [ -n "$obj" ]; then
    hasStatementFilter=true
    if [ -n "$sub" ]; then
      # BUG FIX: the original built a command string and ran it through
      # 'eval', re-parsing user input as shell code. We keep the string only
      # for error reporting and run curl directly with a quoted URL.
      curlCmd="curl $curlUserAgent --fail -Gs $r2dIndex$sub"
      filterForDocs=$(curl $curlUserAgent --fail -Gs "$r2dIndex$sub" | tr "," "\n" | tail -n+5 | sed '$ d' | tr -d '"'; exit ${PIPESTATUS[0]})
      curlStatus=$?;
      if [ $curlStatus -gt 0 ]; then
        echoerr "Failed fetching resources from index. Command: "
        echoerr "$curlCmd";
        filterForDocsFailed=true
        return
      fi
    fi

    if [ -n "$pred" ]; then
      curlCmd="curl $curlUserAgent --fail -Gs $r2dIndex$pred"
      docs=$(curl $curlUserAgent --fail -Gs "$r2dIndex$pred" | tr "," "\n" | tail -n+5 | sed '$ d' | tr -d '"'; exit ${PIPESTATUS[0]})
      curlStatus=$?;
      if [ $curlStatus -gt 0 ]; then
        echoerr "Failed fetching resources from index. Command: "
        echoerr "$curlCmd";
        filterForDocsFailed=true
        return
      fi
      #if we've already got a doc list, then merge both
      if [ -n "$filterForDocs" ]; then
        filterForDocs=$(comm -12 <(echo "$filterForDocs") <(echo "$docs"))
      else
        filterForDocs="$docs"
      fi
    fi

    if [ -n "$obj" ]; then
      curlCmd="curl $curlUserAgent --fail -Gs $r2dIndex$obj"
      docs=$(curl $curlUserAgent --fail -Gs "$r2dIndex$obj" | tr "," "\n" | tail -n+5 | sed '$ d' | tr -d '"'; exit ${PIPESTATUS[0]})
      curlStatus=$?;
      if [ $curlStatus -gt 0 ]; then
        echoerr "Failed fetching resources from index. Command: "
        echoerr "$curlCmd";
        filterForDocsFailed=true
        return
      fi
      #if we've already got a doc list, then merge both
      if [ -n "$filterForDocs" ]; then
        filterForDocs=$(comm -12 <(echo "$filterForDocs") <(echo "$docs"))
      else
        filterForDocs="$docs"
      fi
    fi
  fi
}
namespaceDocs=
# Resolve every requested namespace to the set of documents containing it,
# intersecting (comm -12) the per-namespace document lists.
fetchNamespaces() {
  if [ ${#namespaces[@]} -eq 0 ]; then return; fi
  # BUG FIX: the original iterated with 'tr " " "\n" <<< $namespaces', which
  # (a) only expanded the FIRST array element and (b) broke namespaces apart
  # on spaces. Iterate the array directly instead.
  for namespace in "${namespaces[@]}"; do
    curlCmd="curl $curlUserAgent --fail -Gs $ns2dIndex$namespace"
    docs=$(curl $curlUserAgent --fail -Gs "$ns2dIndex$namespace" | tr "," "\n" | tail -n+5 | sed '$ d' | tr -d '"'; exit ${PIPESTATUS[0]})
    curlStatus=$?;
    if [ $curlStatus -gt 0 ]; then
      echoerr "Failed fetching namespaces from index. Command: "
      echoerr "$curlCmd";
    fi
    #if we've already got a doc list, then merge both
    if [ -n "$namespaceDocs" ]; then
      namespaceDocs=$(comm -12 <(echo "$namespaceDocs") <(echo "$docs"))
    else
      namespaceDocs="$docs"
    fi
  done
}


##
## This is where we actually start doing stuff
##
if [[ "$mode" == "statements" ]]; then
  fetchDocsForResources;
  runForDocs=true
  if [ "$#" -gt 0 ]; then
    #there are graphs passed as arguments to the script
    for res in "$@"; do fetchStatementsForDoc "$res"; done
    runForDocs=false
  fi
  if [ ! -t 0 ]; then
    #something is piped to this program
    while IFS= read -r res; do fetchStatementsForDoc "$res"; done
    runForDocs=false
  fi
  if $runForDocs ; then
    #no arguments and nothing piped
    if $hasStatementFilter && ! $filterForDocsFailed && [ -z "$filterForDocs" ]; then
      #we're filtering statements, but could not find any related documents
      #via our index. i.e., nothing to return!
      echoerr "No results found";
      exit 0;
    fi

    if ! $filterForDocsFailed && [ -n "$filterForDocs" ]; then
      #we're filtering the results (by sub/pred/obj), i.e. we know exactly
      #which documents to visit: use this list
      while read -r md5; do
        fetchStatementsForDoc "http://lodlaundromat.org/resource/$md5"
      done <<< "$filterForDocs"
    else
      #we only want the resource links from the documents function
      download=false
      resource=true
      #Just use the whole lod cloud
      while IFS= read -r res; do fetchStatementsForDoc "$res"; done < <( fetchDocs )
    fi
  fi
elif [[ "$mode" == "documents" ]]; then
  fetchNamespaces
  fetchDocs
elif [[ "$mode" == "meta" ]]; then
  showHelp=true
  if [ "$#" -gt 0 ]; then
    #there are graphs passed as arguments to the script
    for res in "$@"; do fetchMeta "$res"; done
    showHelp=false
  fi
  if [ ! -t 0 ]; then
    #something is piped to this program
    while IFS= read -r res; do fetchMeta "$res"; done
    showHelp=false
  fi
  if $showHelp ; then
    show_meta_help
  fi
fi