├── .gitignore ├── .rspec ├── .travis.yml ├── .yardopts ├── CONFIGURATION.md ├── Dockerfile ├── Gemfile ├── LICENSE ├── README-OLD.md ├── README-RDF-SAK.md ├── README.md ├── Rakefile ├── TODO.org ├── behaviour.org ├── bin ├── console └── setup ├── docker-compose.yml ├── example ├── add-creator.rq ├── cleanup.xsl ├── extract-mvp.rq ├── flow-diagram.png ├── flow-diagram.svg ├── ibis-mvp.ttl ├── ibis.ttl ├── internal-access.dot ├── internal-access.svg ├── matches.xhtml ├── nuke-old-mvp.rq ├── sh-rule.ttl ├── transforms.ttl └── transforms2.ttl ├── exe └── intertwingler ├── experimental ├── fragment.rq ├── how-about-readline.rb ├── ordered-collection.ttl ├── parse-config.rb ├── stable-fragments.rb └── tty-reader.rb ├── intertwingler.conf ├── intertwingler.gemspec ├── lib ├── intertwingler.rb └── intertwingler │ ├── cli.rb │ ├── config.rb │ ├── console.rb │ ├── docstats.rb │ ├── document.rb │ ├── engine.rb │ ├── error.rb │ ├── graphops.rb │ ├── handler.rb │ ├── handler │ ├── cas.rb │ ├── catalogue.rb │ ├── filesystem.rb │ ├── generated.rb │ └── kv.rb │ ├── harness.rb │ ├── loggable.rb │ ├── nlp.rb │ ├── params.rb │ ├── representation.rb │ ├── representation │ ├── nokogiri.rb │ └── vips.rb │ ├── resolver.rb │ ├── resource.rb │ ├── rubyurn.rb │ ├── source.rb │ ├── surface.rb │ ├── transform.rb │ ├── transform │ ├── markup.rb │ ├── raster.rb │ ├── rdf.rb │ ├── sass.rb │ └── text.rb │ ├── types.rb │ ├── urlrunner.rb │ ├── util.rb │ ├── util │ ├── clean.rb │ └── messy.rb │ ├── version.rb │ ├── vocab.rb │ └── vocab │ ├── adms.rb │ ├── cgto.rb │ ├── ci.rb │ ├── ibis.rb │ ├── itcv.rb │ ├── pav.rb │ ├── pm.rb │ ├── qb.rb │ ├── scovo.rb │ └── tfo.rb ├── sample2.conf └── spec ├── intertwingler ├── document_spec.rb ├── graphops_spec.rb ├── representation_spec.rb ├── resolver_spec.rb ├── source_spec.rb ├── surface_spec.rb └── transform_spec.rb ├── intertwingler_spec.rb └── spec_helper.rb /.gitignore: -------------------------------------------------------------------------------- 1 | 
syntax: glob 2 | \#*\# 3 | .\#* 4 | ._* 5 | /.bundle/ 6 | /.yardoc 7 | /Gemfile.lock 8 | /_yardoc/ 9 | /coverage/ 10 | /doc/ 11 | /pkg/ 12 | /spec/reports/ 13 | /tmp/ 14 | sample.conf 15 | .byebug_history 16 | *.gem 17 | flow-diagram-white.png -------------------------------------------------------------------------------- /.rspec: -------------------------------------------------------------------------------- 1 | --format documentation 2 | --color 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: ruby 2 | rvm: 3 | - 2.3.1 4 | before_install: gem install bundler -v 1.11.2 5 | -------------------------------------------------------------------------------- /.yardopts: -------------------------------------------------------------------------------- 1 | --markup=markdown 2 | --no-private 3 | -------------------------------------------------------------------------------- /CONFIGURATION.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | who knows man i'm just getting started myself 4 | 5 | been so busy trying to get the guts to work i haven't really thought about the ux of this 6 | 7 | i mean let's say you download and install this thing, then what 8 | 9 | (a docker image is forthcoming btw) 10 | 11 | anyway it's gonna be an empty thing so not really very interesting to most people for a while 12 | 13 | # what _i_ will probably do once the damn thing is working 14 | 15 | i already said [in the video](https://youtu.be/d5-lcvKfBM4) what the first things are: 16 | 17 | * [my website](https://doriantaylor.com/) 18 | * my client extranets 19 | * [the nature of software](https://the.natureof/software/) 20 | * [remaking the ibis tool](https://ibis.makethingsmakesense.com/) 21 | 22 | ## my website 23 | 24 | this thing has a bunch of cruft on the file system which will 
stay there for the time being 25 | 26 | excited to finally replace the fake scaled images (rewrite rules) that have been there for like 15 years with real ones 27 | 28 | also excited to do a content inventory for once, basically bootstrap some content inventory ui 29 | 30 | also scan my whole site for terminology, proper nouns etc, finally flesh out the audience modeling stuff 31 | 32 | go back and look at the document stats and try to do something useful with it 33 | 34 | oh how about do some friggin stretchtext 35 | 36 | also noodle with that video stuff 37 | 38 | ## my client extranets 39 | 40 | this is a no-brainer; get everything into content-addressable storage 41 | 42 | ## the nature of software 43 | 44 | this one is a lot newer and can live completely in content-addressable storage 45 | 46 | i am very interested in making an annotation system 47 | 48 | plus also same indexes as my site: concepts, books, people/orgs 49 | 50 | photo index/credits a big one 51 | 52 | there is also a bunch of stuff with the original books that i'd like to do 53 | 54 | ## the ibis tool redux 55 | 56 | this thing lives almost completely in the graph 57 | 58 | this one will need the proxy handler for sure for external links 59 | 60 | here is where most of the work on really dense graph-forward stuff is gonna be 61 | 62 | definitely hook in [pm ontology](https://vocab.methodandstructure.com/process-model#) 63 | 64 | definitely need a data entry interface for people/orgs too 65 | 66 | # anyway so if you were gonna use this to absorb a website 67 | 68 | say static site, like [jekyll](https://jekyllrb.com/); if it's wordpress or whatever i have no idea 69 | 70 | i mean you could probably just load all the files into content-addressable storage, that'd be a good start 71 | 72 | > here's the thing i've been thinking though: stuff like git literally *is* a content-addressable store with an index on top, it probably wouldn't be hard to make an adapter that just hoovers up git repositories 
and sticks their metadata in the graph such that you could perfectly recreate a git repository 73 | > 74 | > only wrinkle is [`Store::Digest`](https://github.com/doriantaylor/rb-store-digest) isn't smart enough to store metadata about functional relationships between objects which is what you would need to represent stuff without it going nuts with redundant data 75 | > 76 | > like if you have a general-purpose content-addressable store, like just a huge gaping maw where you chuck whatever blobs of data, especially if you're using it for cache too, it's gonna get real big real fast 77 | > 78 | > git in particular runs everything through `DEFLATE` no matter what it is and then stores that, but the hashes in the file names correspond to the *un*compressed data 79 | > 80 | > git also diffs the current version of every file against the last version, keeps the current version whole to be fast, and then just stores the diff for all previous versions, which it reconstitutes on the fly (or at least it did, i dunno what it does now) 81 | > 82 | > thing about `Store::Digest` (and really any other content-addressable store) is it knows what it has in it, but it doesn't know _why_; that's what the index layer is for 83 | > 84 | > so if you have two things in the store, a big object and a small object, and can say definitively that the small object is the same as the big object with a particular invertible function applied to it (`b = gzip(a) <=> a = gunzip(b)`), then you can delete the big object and just recompute it when you need to 85 | > 86 | > now that gets trickier when a _third_ object is used as input: `diff(a, b) = c <=> patch(b, c) = a` because you will have to make goddamn certain you _never_ throw that part out 87 | > 88 | > easy enough to manage in git because everything in a git repository belongs to _it_. not the case for a general-purpose content-addressable store. 
89 | > 90 | > worst thing for a content-addressable store is you have something in there and no idea why 91 | > 92 | > i.e., your graph has no record of the object at all 93 | > 94 | > can't delete it; something else might be using it 95 | > 96 | > anyway, strategies for de-chesterton-fencing a content-addressable store probably a decent idea 97 | > 98 | > inclined to partition it so stuff like cache doesn't get mixed in with non-cache; only thing about that is there *will* be collisions so sometimes you'll be storing the same object twice, although that's probably not nearly as bad as an ocean of runaway blobjects with no provenance 99 | > 100 | > i mean you'll be able to scan them or whatever and make determinations on most of them but that is Work™ that somebody has to do 101 | > 102 | > another thing is you could have a definitely-cache flag when you insert an object: if the identical object is ever subsequently inserted with that flag off, then the flag stays off and never gets flipped back on no matter how many other times the object is reinserted with the flag on. 
hey that's actually kind of a not bad idea 103 | > 104 | > could even have some automatic capacity management and LRU policy or whatever 105 | 106 | aaanyway that was my digression on content-addressable stores, back to the absorbing a website business 107 | 108 | thing is, `Intertwingler` doesn't really have much to offer you if all you have is a regular website, i mean besides the whole url stuff and i guess the transforms on pages and images etc are nothing to sneer at 109 | 110 | that and i guess the potential to make things get real weird 111 | 112 | like what do you make when you don't have to think in terms of _pages_ anymore 113 | 114 | very PKM-ey but also kinda not 115 | 116 | makes me think about investing in some kind of sparse-to-dense onboarding process 117 | 118 | that might require me finishing [Loupe](https://vocab.methodandstructure.com/loupe#) though 119 | 120 | anyway i'll pick this up later, ta 121 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | #FROM --platform=linux/amd64 ruby:latest 2 | #FROM --platform=linux/arm64 arm64v8/ruby:latest 3 | FROM ruby:latest 4 | 5 | # installing headers will implicitly install libraries 6 | RUN apt-get -y update 7 | RUN apt-get install -y liblmdb-dev libvips-dev libxml2-dev libxslt1-dev libsass-dev less vim 8 | 9 | RUN adduser --ingroup users --home /var/lib/intertwingler intertwingler 10 | 11 | RUN gem install bundler 12 | RUN bundle config set path.system true 13 | 14 | RUN mkdir /tmp/src 15 | WORKDIR /tmp/src 16 | 17 | # tried to do these with bundler; it absolutely did not cooperate 18 | 19 | # RUN git clone https://gitlab.com/doriantaylor/uri-urn.git && cd uri-urn && gem build && gem install *.gem && cd - 20 | 21 | # XXX TEMPORARY UNTIL 3.3.3 IS RELEASED 22 | RUN git clone -b speed-up-rdf-graph https://github.com/doriantaylor/rdf.git && cd rdf && gem build && gem install 
*.gem && cd - 23 | 24 | RUN git clone -b aliases-etc https://github.com/doriantaylor/rb-mimemagic.git && cd rb-mimemagic && gem build && gem install *.gem && cd - 25 | 26 | RUN git clone -b add-prompt-to-collector https://github.com/doriantaylor/rb-tty-prompt.git && cd rb-tty-prompt && gem build && gem install *.gem && cd - 27 | 28 | RUN git clone -b control-w-please https://github.com/doriantaylor/rb-tty-reader.git && cd rb-tty-reader && gem build && gem install *.gem && cd - 29 | 30 | RUN git clone https://github.com/doriantaylor/rb-store-digest.git && cd rb-store-digest && gem build && gem install *.gem && cd - 31 | 32 | RUN git clone https://github.com/doriantaylor/rb-store-digest-http.git && cd rb-store-digest-http && gem build && gem install *.gem && cd - 33 | 34 | RUN git clone https://github.com/doriantaylor/rb-md-noko.git && cd rb-md-noko && gem build && gem install *.gem && cd - 35 | 36 | RUN git clone https://github.com/doriantaylor/rb-rdf-lmdb.git && cd rb-rdf-lmdb && gem build && gem install *.gem && cd - 37 | 38 | # RUN --mount=type=bind,src=.,dst=/tmp/src bundle install --gemfile /tmp/src/Gemfile 39 | COPY . 
/tmp/src/intertwingler 40 | 41 | # RUN git clone -b transform-queues https://github.com/doriantaylor/rb-intertwingler.git /tmp/src 42 | 43 | WORKDIR /tmp/src/intertwingler 44 | 45 | RUN gem install pry pry-byebug puma engtagger 46 | # this one, i mean wtf 47 | RUN find /usr/local/bundle/gems/engtagger* -type f -print0 | xargs -0 chmod 644 48 | RUN bundle install 49 | RUN gem build ; gem install *.gem 50 | 51 | RUN mkdir -p /var/lib/type 52 | 53 | # RUN mkdir -p /var/lib/app-ibis 54 | # RUN git clone https://github.com/doriantaylor/app-ibis-front-end.git ; cd app-ibis-front-end ; make ; rsync -av target/ /var/lib/app-ibis ; cd - 55 | 56 | RUN rm -rf /tmp/src 57 | 58 | EXPOSE 10101 59 | 60 | WORKDIR /var/lib/intertwingler 61 | 62 | USER intertwingler 63 | ENV INTERTWINGLER_HOME=/var/lib/intertwingler 64 | 65 | RUN mkdir root 66 | COPY --chown=intertwingler:users example/transforms2.ttl config.ttl 67 | 68 | CMD intertwingler 69 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | # uggggggggggghhhhhhhh 4 | # gem 'uri-urn', '0.0.5', 5 | # git: 'https://gitlab.com/doriantaylor/uri-urn.git', branch: 'master' 6 | 7 | # do this until upstream merges 8 | gem 'mimemagic', 9 | git: 'https://github.com/doriantaylor/rb-mimemagic.git', 10 | branch: 'aliases-etc' 11 | 12 | gem 'tty-prompt', '0.23.1', 13 | git: 'https://github.com/doriantaylor/rb-tty-prompt.git', 14 | branch: 'add-prompt-to-collector' 15 | 16 | gem 'tty-reader', '0.9.0', 17 | git: 'https://github.com/doriantaylor/rb-tty-reader.git', 18 | branch: 'control-w-please' 19 | 20 | gem 'store-digest', '0.1.4', 21 | git: 'https://github.com/doriantaylor/rb-store-digest.git' 22 | 23 | gem 'store-digest-http', '0.1.1', 24 | git: 'https://github.com/doriantaylor/rb-store-digest-http.git' 25 | 26 | # XXX why is this in here?? 
oh, version, duh 27 | # gem 'md-noko', '0.1.0', git: 'https://github.com/doriantaylor/rb-md-noko.git' 28 | 29 | # Specify your gem's dependencies in intertwingler.gemspec 30 | gemspec 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 
35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | require "rspec/core/rake_task" 3 | 4 | RSpec::Core::RakeTask.new(:spec) 5 | 6 | task :default => :spec 7 | 8 | #desc 'Generate Vocabularies' 9 | #task :gen_vocabs => %w(ci).map { |v| "lib/intertwingler/#{v}.rb" } 10 | 11 | # XXX turn this into a rake task at some point :P 12 | 13 | task :vocabs do 14 | require 'rdf/vocab/writer' 15 | 16 | # ehh we can circle back on this one 17 | vocabs = { 18 | ADMS: 'http://www.w3.org/ns/adms#', 19 | CI: 'https://vocab.methodandstructure.com/content-inventory#', 20 | IBIS: 'https://vocab.methodandstructure.com/ibis#', 21 | PAV: 'http://purl.org/pav/', 22 | QB: 'http://purl.org/linked-data/cube#', 23 | # SCOVO: 'http://purl.org/NET/scovo#', # this one is dead 24 | TFO: 'https://vocab.methodandstructure.com/transformation#', 25 | } 26 | end 27 | 28 | # rdf serialize --uri 'https://vocab.methodandstructure.com/content-inventory#' --output-format vocabulary --module-name 
Intertwingler::Vocab --class-name CI -o lib/intertwingler/vocab/ci.rb --strict 'https://vocab.methodandstructure.com/content-inventory#' 29 | -------------------------------------------------------------------------------- /behaviour.org: -------------------------------------------------------------------------------- 1 | #+STARTUP: showall hidestars 2 | * generic engine behaviour 3 | - [ ] <> engine initializes with configuration 4 | - [ ] <> we don't want to run one of these things for each site, especially if they're sharing data. more redundant config plus moving parts to fail. if anything it should be /optional/ to run multiple daemons but not /necessary/. 5 | - [ ] <> engine must be configurable with arbitrarily many authorities (host:port pairs) 6 | - [ ] <> authorities dispatched by ~Host:~ request header 7 | - [ ] <> we should be able to alias authorities 8 | - [ ] <> we need a way for the static site generator to know in advance the files it has to write out. 9 | - [ ] <> engine responds with manifest of available URIs 10 | - [ ] <> manifest can be narrowed per site 11 | - [ ] <> manifest should be since whenever 12 | - [ ] <> we want the engine to have a standard interface. 13 | - [ ] <> engine main operation receives HTTP /request/ object 14 | - [ ] <> engine transforms request 15 | - [ ] <> rewrite headers 16 | - e.g. rewrite accept headers based on user-agent header 17 | - e.g. rewrite accept headers based on query parameter 18 | - [ ] <> rewrite request-URI (path + query parameters + /path/ parameters) 19 | - note: path parameters are UI/designators for response transforms 20 | - [ ] <> rewrite request method 21 | - [ ] <> rewrite request body 22 | - [ ] <> manipulate request transform stack 23 | - note: static configuration of response transforms will need to manipulate the request transform stack 24 | - e.g. 
markdown to (x)html transform: presence in the stack should insert a request transform that adds text/markdown to the request's accept header 25 | - [ ] <> manipulate source polling sequence 26 | - [ ] <> manipulate response transform stack 27 | - [ ] <> engine transforms response 28 | - [ ] <> engine main operation returns HTTP /response/ object 29 | - [ ] <> transforms must have access to graph 30 | - [ ] <> transforms must have access to subrequests 31 | - (eg GETing linked resources, POSTing to content-addressable store…) 32 | ** source modules 33 | - [ ] <> a source module *should* be serviceable as a stand-alone Web app. 34 | - [ ] <> how are we going to negotiate which source handles a request? 35 | - [ ] <> have the resolver append alternate URLs to the request object. 36 | - [ ] <> this entails subclassing the Rack request objects. 37 | - [ ] <> have the source modules register one or more URI schemes. 38 | *** filesystem source 39 | - mainly intended to be for transitional (eg to straight CAS + graph) 40 | - want it to JFW with existing content 41 | - [ ] <> handle ~file:~ URIs 42 | - [ ] <> no don't do this actually, we'll treat content handlers as microservices that handle their own URI resolution (with some constraints) 43 | - [ ] <> what about content negotiation? 44 | - [ ] <> we can just say that the filesystem has additional properties that cause it to respond to ~Accept-*~ headers. 45 | - [ ] <> behave like [[https://github.com/doriantaylor/p5-catalystx-action-negotiate/blob/master/lib/CatalystX/Action/Negotiate.pm][~CatalystX::Action::Negotiate~]] 46 | 47 | *** content-addressable store 48 | - [[P022]] 49 | - [ ] <> handle ~ni:~ URIs 50 | - [ ] <> ~Store::Digest~ implements a ~/.well-known/ni/~ root and index pages that could/should be exposed 51 | *** reverse proxy 52 | - [ ] <> it is not safe to proxy just any old URL. 53 | - [ ] <> institute a default-deny policy for reverse proxying. 
54 | - [ ] <> require explicit domains 55 | - [ ] <> do we need more granular than domains? 56 | - [ ] <> probably not. domains should be good enough. 57 | - [ ] <> allow any subdomain of a specified domain; if you need to block a sub-subdomain then add a block; make block take precedence over allow. 58 | *** fully graph-sourced (ie transparent; no opaque representation) 59 | - [ ] <> how do we get the request to the right handler? 60 | - [ ] <> route handlers by subject URI 61 | - [ ] <> subject URI should take precedence 62 | - [ ] <> route handlers by ~rdf:type~ 63 | - [ ] <> resources with type assertions that are "closer" topologically (via ~rdfs:subClassOf~ or ~owl:equivalentClass~) to the handler's configured types get a higher score 64 | - [ ] <> route handlers by ~Accept-*~ headers 65 | - [ ] <> really only ~Accept~ and ~Accept-Language~ because ~Accept-Charset~ and ~Accept-Encoding~ are moot (via transforms) 66 | - note: what about those other dimensions recently added (~Prefer~ etc)? 67 | - [ ] <> make request transforms strip off ~Accept-Charset~ and ~Accept-Encoding~ 68 | **** indexes (skos concept schemes/collections, sioc containers, etc) 69 | - [ ] <> paginate (?) 70 | **** document stats 71 | - [ ] <> respond to ~qb:DataSet~, ~qb:Slice~, and ~qb:ObservationGroup~. 
72 | **** generic 73 | - [ ] <> make this thing powered by [[https://vocab.methodandstructure.com/loupe#][Loupe]] 74 | **** other generated/non-(x)html 75 | ***** feeds 76 | - [ ] <> respond to ~dcat:Catalog~ and others 77 | - [ ] <> compute feed for audience 78 | ***** google site map 79 | - [ ] <> google prefers big sites split up their sitemap 80 | - [ ] <> split up the sitemap according to some manageable scheme and link to it from the root 81 | - [ ] <> let's not pollute the root shall we 82 | - [ ] <> place sitemap at ~/.well-known/sitemap.xml~ with a permanent redirect 83 | ***** ~robots.txt~ 84 | - one might also imagine ~humans.txt~, ~ads.txt~, ~credits.txt~ and so on 85 | - [ ] <> make whatever happens here actually use the graph data so it stays up to date 86 | ***** json-ld? 87 | - more generally a ~;meta~ pseudo-transform that could be polymorphic, rdfa/turtle/json-ld/whatever 88 | - would it really be a /pseudo/-transform though? 89 | ** URI resolver 90 | - [ ] <> 91 | ** transforms in general 92 | - [ ] <> transforms should be microservices with their own URLs that you can POST to directly. 93 | - [ ] <> this facilitates [[https://doriantaylor.com/intelligent-heterogeneity][intelligent heterogeneity]] by making it possible for transforms to be stand-alone microservices 94 | - [ ] <> how do we ensure transforms get executed in the right order? 95 | - [ ] <> create execution phases like apache 96 | *** request transforms 97 | - [ ] <> Certain request transforms will elicit erroneous responses (ie they will return something other than what the client asked for) if they do not have a concomitant response transform. 98 | - [ ] <> make it so request transforms can conditionally push a response transform onto the stack. 99 | - [[I015]] 100 | *** response transforms 101 | - [ ] <> map response transforms to content types. 102 | - [ ] <> certain response transforms will not make sense to run without a concomitant request transform having been run first. 
103 | - [ ] <> have a way to pair request transforms to response transforms. 104 | - [ ] <> have a given response transform install its paired request transform? 105 | * static site generator behaviour 106 | - [ ] <> generator initializes with configuration 107 | ** generator main function 108 | - [ ] <> we need a way for the static site generator to know in advance the files it has to write out. 109 | - [ ] <> generator receives manifest from engine 110 | - [ ] <> generator writes resources to disk 111 | - [ ] <> generator examines target mtimes 112 | - [ ] <> generator only overwrites files changed since last write 113 | - [ ] <> generator writes rewrite maps 114 | - [ ] <> generator writes site map 115 | * "live" engine adapter behaviour 116 | - [ ] <> adapter initializes with configuration 117 | - [ ] <> adapter spawns daemon 118 | - [ ] <> daemon forks/threads as necessary (tunable in config) 119 | - [ ] <> daemon listens on a socket 120 | * async daemon behaviour 121 | - [ ] <> async daemon initializes with configuration 122 | - [ ] <> async daemon runs plain command queue 123 | - [ ] <> queue has persistent state/resumes when interrupted 124 | - [ ] <> AMQP node? 125 | - [ ] <> async daemon behaves like ~at(1)~ (scheduled one-off commands) 126 | - [ ] <> async daemon behaves like ~cron(1)~ (scheduled repeating commands) 127 | ** pluggable operations 128 | - [ ] <> external link crawler 129 | - [ ] <> RSS/Atom feed poller 130 | - [ ] <> PSHB event handler via webhook? 131 | - [ ] <> polling cues from statistics/feed payload? 
(yeah right) 132 | - [ ] <> content-addressable store bulk scanner/compressor 133 | * CLI behaviour 134 | - [ ] <> spawn daemon from CLI 135 | - [ ] <> run static site generator from CLI 136 | ** interactive shell 137 | - [ ] <> tooling for RDF sucks in general 138 | - [ ] <> query and manipulate graph 139 | - [ ] <> shell interprets basic graph manipulation commands (as Turtle/SPARQL syntax) 140 | - [ ] <> autocomplete symbols 141 | - [ ] <> autocomplete all syntax 142 | - [ ] <> set prefix mappings 143 | - [ ] <> shell interprets SPARQL commands 144 | - [ ] <> pipe sparql output to targets 145 | - [ ] <> provide alternative syntaxes 146 | - [ ] <> load graph from file 147 | - [ ] <> auto-detect syntax 148 | - [ ] <> set default graph context (?) 149 | - [ ] <> dump graph to file 150 | - [ ] <> Turtle and others 151 | - [ ] <> find and tag jargon 152 | - [ ] <> must attempt to resolve to existing SKOS concepts or provide UI to create new ones 153 | - [ ] <> must write back to source 154 | -------------------------------------------------------------------------------- /bin/console: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "bundler/setup" 4 | require "intertwingler" 5 | 6 | # You can add fixtures and/or initialization code here to make experimenting 7 | # with your gem easier. You can also use a different console, if you like. 8 | 9 | # (If you use this, don't forget to add pry to your Gemfile!) 
10 | # require "pry" 11 | # Pry.start 12 | 13 | require "irb" 14 | IRB.start 15 | -------------------------------------------------------------------------------- /bin/setup: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | IFS=$'\n\t' 4 | set -vx 5 | 6 | bundle install 7 | 8 | # Do any other automated setup that you need to do here 9 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | web: 5 | image: intertwingler:latest 6 | build: 7 | network: host 8 | context: . 9 | command: intertwingler 10 | volumes: 11 | # Set the source volume to whatever your preference. 12 | - /tmp/intertwingler:/var/lib/intertwingler 13 | networks: 14 | - web 15 | ports: 16 | # Set the port if you want it to be something else, but it 17 | # should match the one inside the image since the engine keys 18 | # off the Host: header which will contain the port number. 19 | - "10101:10101" 20 | # There is currently nothing special in the environment. 21 | # environment: 22 | 23 | networks: 24 | web: 25 | -------------------------------------------------------------------------------- /example/add-creator.rq: -------------------------------------------------------------------------------- 1 | PREFIX dct: 2 | PREFIX skos: 3 | 4 | INSERT { 5 | GRAPH { 6 | ?s dct:creator . 7 | } 8 | } 9 | WHERE { 10 | GRAPH { 11 | ?s (skos:inScheme|skos:topConceptOf|^skos:hasTopConcept)? . 
12 | } 13 | } 14 | -------------------------------------------------------------------------------- /example/cleanup.xsl: -------------------------------------------------------------------------------- 1 | 2 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /example/extract-mvp.rq: -------------------------------------------------------------------------------- 1 | PREFIX skos: 2 | 3 | DESCRIBE ?s 4 | WHERE { 5 | GRAPH { 6 | ?s (skos:inScheme|skos:topConceptOf|^skos:hasTopConcept)? 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /example/flow-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doriantaylor/rb-intertwingler/a0897d47daf96d6a258ec6c3a357ec1d3ce63728/example/flow-diagram.png -------------------------------------------------------------------------------- /example/internal-access.dot: -------------------------------------------------------------------------------- 1 | digraph { 2 | graph [bgcolor=transparent,rankdir=LR]; 3 | // graph [rankdir=TB]; 4 | node [shape=box,color="#777777",fontcolor="#777777",fontname="Lucida Grande, Lucida Sans Unicode, sans-serif"]; 5 | edge [color="#777777",fontcolor="#777777",fontname="Lucida Grande, Lucida Sans Unicode, sans-serif"]; 6 | 7 | subgraph multiples { 8 | node [shape=box3d]; 9 | 10 | subgraph multihandlers { 11 | node [color="#5e8052",fontcolor="#5e8052"]; 12 | Handler; 13 | Engine; 14 | } 15 | 16 | subgraph multiparam { 17 | node [color="#5a7a98",fontcolor="#5a7a98"]; 18 | Parameter [label="Parameter\nTemplate"]; 19 | P_Group [label="Parameter\nGroup"]; 20 | } 21 | 22 | subgraph multitransform { 23 | node [color="#bc566e",fontcolor="#bc566e"]; 24 | Queue [label="Transform\nQueue"]; 25 | Entry [label="Transform\n(Metadata)"]; 26 | Partial [label="Partial\nInvocation"]; 27 | } 28 | 29 | Resource [style=dotted]; 30 
| } 31 | 32 | GraphDB [label="Graph\nDatabase", shape=cylinder]; 33 | 34 | subgraph handlers { 35 | node [color="#5e8052",fontcolor="#5e8052"]; 36 | Harness; 37 | } 38 | 39 | subgraph params { 40 | node [color="#5a7a98",fontcolor="#5a7a98"]; 41 | P_Registry [label="Parameter\nRegistry"]; 42 | P_Instance [label="Parameters\n(Instance)"]; 43 | } 44 | 45 | subgraph transforms { 46 | node [color="#bc566e",fontcolor="#bc566e"]; 47 | T_Harness [label="Transform\nHarness"]; 48 | Chain [label="Queue\nChain"]; 49 | } 50 | 51 | subgraph backrefs { 52 | edge [dir=both]; 53 | 54 | Engine -> Dispatcher -> T_Harness -> Chain; 55 | T_Harness -> Queue; 56 | Engine -> P_Registry -> P_Group -> P_Instance; 57 | P_Registry -> Parameter; 58 | } 59 | 60 | subgraph ephemeral { 61 | edge [style=dotted]; 62 | // T_Harness -> P_Registry; 63 | // Dispatcher -> P_Registry; 64 | // Queue -> Dispatcher -> Resolver; 65 | // T_Harness -> GraphDB; 66 | // P_Registry -> GraphDB; 67 | Entry -> Resource; 68 | Entry -> P_Group; 69 | // Handler -> GraphDB; 70 | } 71 | 72 | CLI -> Harness -> Engine -> Resolver -> GraphDB; 73 | Engine -> GraphDB; 74 | Dispatcher -> Handler -> Resource; 75 | Handler -> Engine; 76 | Queue -> Entry -> T_Harness; 77 | T_Harness -> P_Registry; 78 | Chain -> Queue -> Partial -> Entry; 79 | Partial -> P_Instance; 80 | P_Group -> Parameter; 81 | } 82 | -------------------------------------------------------------------------------- /example/matches.xhtml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | i match lol 5 | 6 | 7 |
8 |

hooray you found the main element

9 |
10 | 11 | 12 | -------------------------------------------------------------------------------- /example/nuke-old-mvp.rq: -------------------------------------------------------------------------------- 1 | PREFIX skos: 2 | 3 | DELETE { 4 | GRAPH { 5 | ?s ?p ?o . 6 | } 7 | } 8 | WHERE { 9 | GRAPH { 10 | ?s (skos:inScheme|skos:topConceptOf|^skos:hasTopConcept)? . 11 | ?s ?p ?o . 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /example/sh-rule.ttl: -------------------------------------------------------------------------------- 1 | @prefix rdf: . 2 | @prefix rdfs: . 3 | @prefix sh: . 4 | @prefix xsd: . 5 | @prefix itcv: . 6 | 7 | # So §9.6.1 of the SKOS reference says that we should infer that the elements of a skos:memberList of a skos:OrderedCollection should also be related to said collection via skos:member, but laments that OWL has no way of expressing this. It turns out, though, that you can say exactly this in SHACL: 8 | 9 | # can we do this with a triple rule? 10 | a sh:SPARQLRule ; 11 | sh:subject sh:this ; 12 | sh:construct "CONSTRUCT { $this itcv:handler ?h } WHERE { ?this itcv:handler-list/rdf:rest*/rdf:first ?h }" ; 13 | sh:condition [ sh:path itcv:handler-list ] . 14 | 15 | # yes, we can 16 | a sh:TripleRule ; 17 | sh:condition [ sh:path itcv:handler-list ] ; 18 | sh:subject sh:this ; 19 | sh:predicate itcv:handler ; 20 | sh:object [ sh:path ( itcv:handler-list [ sh:zeroOrMorePath rdf:rest ] rdf:first ) ] . 21 | 22 | # Granted these examples aren't SKOS, but they consider the exact same problem. -------------------------------------------------------------------------------- /example/transforms.ttl: -------------------------------------------------------------------------------- 1 | @prefix rdf: . 2 | @prefix rdfs: . 3 | @prefix owl: . 4 | @prefix xsd: . 5 | @prefix dct: . 6 | @prefix skos: . 7 | @prefix ci: . 8 | @prefix tfo: . 9 | @prefix xf: . 
10 | 11 | xf:prefix a tfo:Parameter ; 12 | skos:prefLabel "Prefix"@en ; 13 | rdfs:comment "A compact prefix declaration of the form prefix:url"@en ; 14 | dct:identifier "prefix"^^xsd:token ; 15 | rdfs:range xsd:token . 16 | 17 | xf:xpath a tfo:Parameter ; 18 | skos:prefLabel "XPath"@en ; 19 | rdfs:comment "An XPath expression"@en ; 20 | dct:identifier "xpath"^^xsd:token ; 21 | owl:cardinality 1 ; 22 | rdfs:range xsd:string . 23 | 24 | xf:reindent a tfo:Parameter ; 25 | skos:prefLabel "Reindent"@en ; 26 | rdfs:comment "Reindent the XML tree"@en ; 27 | dct:identifier "reindent"^^xsd:token ; 28 | tfo:default true ; 29 | owl:cardinality 1 ; 30 | rdfs:range xsd:boolean . 31 | 32 | xf:subtree a tfo:Transform ; 33 | skos:prefLabel "Subtree"@en ; 34 | rdfs:comment "Isolate an X(HT)ML node using XPath."@en ; 35 | tfo:implementation ; 36 | tfo:accepts "application/xml"^^tfo:content-type ; 37 | tfo:returns "application/xml"^^tfo:content-type ; 38 | tfo:parameter xf:xpath, xf:prefix, xf:reindent ; 39 | tfo:parameter-list ( xf:xpath xf:prefix xf:reindent ) . 40 | 41 | xf:cleanup a tfo:Transform ; 42 | skos:prefLabel "Cleanup"@en ; 43 | rdfs:comment "Apply cleanup.xsl to the input."@en ; 44 | tfo:implementation ; 45 | tfo:accepts "application/xml"^^tfo:content-type ; 46 | tfo:returns "application/xml"^^tfo:content-type . 47 | 48 | a tfo:Partial ; 49 | tfo:transform xf:subtree ; 50 | xf:xpath "//html:main[1]"^^xsd:string ; 51 | xf:prefix "html:http://www.w3.org/1999/xhtml"^^xsd:token . 52 | 53 | a tfo:Application ; 54 | tfo:input ; 55 | tfo:output ; 56 | tfo:transform xf:subtree ; 57 | xf:xpath "//html:main[1]"^^xsd:string ; 58 | xf:prefix "html:http://www.w3.org/1999/xhtml"^^xsd:token . 
59 | -------------------------------------------------------------------------------- /exe/intertwingler: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | lib_path = File.expand_path('../lib', __dir__) 5 | $:.unshift(lib_path) if !$:.include?(lib_path) 6 | require 'intertwingler/cli' 7 | 8 | Signal.trap('INT') do 9 | warn("\n#{caller.join("\n")}: interrupted") 10 | exit(1) 11 | end 12 | 13 | begin 14 | Intertwingler::CLI.start 15 | rescue Intertwingler::CLI::Error => err 16 | puts "ERROR: #{err.message}" 17 | exit 1 18 | end 19 | -------------------------------------------------------------------------------- /experimental/fragment.rq: -------------------------------------------------------------------------------- 1 | # basically what we want here is the cheapest possible thing to execute. 2 | PREFIX rdf: 3 | PREFIX skos: 4 | SELECT DISTINCT ?o WHERE { 5 | ?s ^(skos:memberList/(rdf:first|rdf:rest+/rdf:first)) ?o 6 | { ?o a skos:Collection } UNION { ?o a skos:OrderedCollection } 7 | } 8 | 9 | => "#) (skos: )) (distinct (project (?o) (join (path ?s (reverse (seq (alt (seq (path+ ) )) )) ?o ) (union (bgp (triple ?o a )) (bgp (triple ?o a ))) )) )) )>" 10 | 11 | 12 | (distinct 13 | (project (?o) 14 | (join (path ?s (reverse 15 | (seq 16 | (alt 17 | (seq (path+ ) 18 | )) )) ?o ) 19 | (union 20 | (bgp (triple ?o a )) 21 | (bgp (triple ?o a ))) 22 | )) ) 23 | -------------------------------------------------------------------------------- /experimental/how-about-readline.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # -*- mode: enh-ruby -*- 3 | 4 | # i was specifically trying to avoid readline 5 | require 'readline' 6 | 7 | # typical completion; btw has to be an actual proc, can't be a lambda 8 | Readline.completion_proc = proc do |word| 9 | words = %w[hurr durr lol] 10 | words.grep(/\A#{Regexp.quote word}/) 
11 | end 12 | 13 | # this is how you get text onto the command line 14 | Readline.pre_input_hook = proc do 15 | Readline.insert_text 'h' 16 | Readline.redisplay 17 | end 18 | 19 | # how do you make readline read just one line? 20 | line = Readline.readline '> ', true 21 | 22 | warn line 23 | -------------------------------------------------------------------------------- /experimental/ordered-collection.ttl: -------------------------------------------------------------------------------- 1 | @prefix skos: . 2 | 3 | a skos:OrderedCollection ; 4 | skos:prefLabel "bogus ordered collection"@en ; 5 | skos:memberList ( ) . -------------------------------------------------------------------------------- /experimental/parse-config.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # -*- mode: enh-ruby -*- 3 | 4 | # third-party rdf 5 | require 'rdf' 6 | require 'rdf/vocab' # dat vocab 7 | require 'rdf/turtle' # to parse input 8 | require 'sparql' # dat algebra, lol 9 | 10 | # this should haul in a bunch of stuff 11 | require 'intertwingler/vocab' 12 | require 'intertwingler/graphops' 13 | 14 | # save some typing 15 | ITCV = Intertwingler::Vocab::ITCV 16 | TFO = Intertwingler::Vocab::TFO 17 | CI = Intertwingler::Vocab::CI 18 | SH = RDF::Vocab::SH 19 | XSD = RDF::Vocab::XSD 20 | 21 | SAO = SPARQL::Algebra::Operator 22 | 23 | PATHS = { 24 | SH.alternativePath => -> repo, o { 25 | # this shoouuuuld be a list 26 | list = RDF::List subject: o, graph: repo 27 | list.reverse.reduce do |a, x| 28 | SPARQL::Algebra::Operator::Alt.new x, a 29 | end 30 | }, 31 | SH.inversePath => SPARQL::Algebra::Operator::Reverse, 32 | SH.oneOrMorePath => SPARQL::Algebra::Operator::PathPlus, 33 | SH.zeroOrOnePath => SPARQL::Algebra::Operator::PathOpt, 34 | SH.zeroOrMorePath => SPARQL::Algebra::Operator::PathStar, 35 | } 36 | 37 | # entail a URI and return an alt 38 | def entail_term repo, term 39 | fwd = repo.property_set term 40 | rev = 
repo.property_set term, inverse: true 41 | 42 | fwd = fwd.count == 1 ? fwd.first : fwd.reduce do |a, b| 43 | SPARQL::Algebra::Operator::Alt.new b, a 44 | end 45 | 46 | unless rev.empty? 47 | rev = rev.count == 1 ? rev.first : rev.reduce do |a, b| 48 | SPARQL::Algebra::Operator::Alt.new b, a 49 | end 50 | 51 | fwd = SPARQL::Algebra::Operator::Alt.new fwd, 52 | SPARQL::Algebra::Operator::Reverse.new(rev) 53 | end 54 | 55 | fwd 56 | end 57 | 58 | # entail an arbitrary operator 59 | def entail_op repo, expr 60 | case expr 61 | when RDF::URI 62 | entail_term repo, expr 63 | when SPARQL::Algebra::Operator 64 | ops = expr.operands.map { |o| entail_op repo, o } 65 | # XXX there is no accessor for options 66 | expr.class.new(*ops) 67 | else 68 | expr 69 | end 70 | end 71 | 72 | # recursively trace the shacl predicate paths. the shacl module 73 | # appears to be really only about doing shacl shapes and appears to 74 | # turn the rdf into json-ld first before parsing it??? no idea. 75 | def algebra repo, subject, seen = {} 76 | # it's either a sequence or one of these buggers 77 | if !repo.objects_for(subject, RDF.first, only: :resource).empty? 78 | list = RDF::List.new(subject: subject, graph: repo).map do |x| 79 | algebra repo, x 80 | end 81 | 82 | return list.reverse.reduce do |a, x| 83 | SPARQL::Algebra::Operator::Seq.new x, a 84 | end 85 | else 86 | struct = repo.struct_for subject 87 | # warn struct 88 | # this should yield exactly one thing 89 | keys = PATHS.keys & struct.keys 90 | raise ArgumentError, 91 | "more than one key: #{keys.sort.join ?,}" if keys.size > 1 92 | 93 | if keys.empty? 94 | # then this is a term 95 | return subject 96 | else 97 | op = PATHS[keys.first] 98 | obj = struct[keys.first].first # XXX this should only be one 99 | 100 | return op.is_a?(Proc) ? 
op.call(repo, obj) : op.new(algebra repo, obj) 101 | end 102 | end 103 | end 104 | 105 | # parse a sparql property 106 | def parse_path path, prefixes 107 | out = SPARQL::Grammar::Parser.new(path, prefixes: prefixes).parse(:Path) 108 | out.last 109 | end 110 | 111 | # query the graph for the host document using the shacl/sparql-based spec 112 | def host_for_internal repo, subject, spec, seen = nil, dtypes = nil, graph: nil, 113 | published: false, circulated: false 114 | 115 | # 1. attempt to detect a direct assertion that this is a fragment 116 | host = repo.objects_for( 117 | subject, CI['fragment-of'], graph: graph, only: :resource).sort.first 118 | 119 | # XXX disambiguate if there is more than one direct assertion (is 120 | # document type, is published, newest?, alphabetical) 121 | 122 | ft = spec.keys - [RDF::RDFS.Resource, RDF::OWL.Thing] 123 | dtypes ||= repo.document_types(fragments: true) & repo.all_types 124 | 125 | types = repo.types_for subject, graph: graph 126 | isdoc = repo.type_is? types, dtypes 127 | frags = repo.type_is? types, ft 128 | 129 | unless host or (isdoc and not frags) 130 | # obtain property paths 131 | paths = spec.map do |type, path| 132 | score = repo.type_is?(types, type) or next 133 | [score, path] 134 | end.compact.sort do |a, b| 135 | a.first <=> b.first 136 | end.map(&:last).flatten(1).uniq 137 | 138 | # accumulate candidates 139 | tab = {} # type cache 140 | pab = {} # priority cache 141 | hosts = paths.reduce([]) do |array, path| 142 | # okay here is where we construct the query from the algebra 143 | 144 | o = RDF::Query::Variable.new ?o 145 | c = RDF::Query::Variable.new ?c 146 | # this says SELECT DISTINCT ?o WHERE { 147 | # $subject $path $o . 
$o a $c FILTER (?c in ($dtypes)) } 148 | query = SAO::Distinct.new( 149 | SAO::Project.new([o], SAO::Filter.new(SAO::In.new(c, *dtypes), 150 | SAO::Sequence.new(SAO::Path.new(subject, path, o), 151 | RDF::Query.new { pattern [o, RDF.type, c] })))) 152 | 153 | # this just says SELECT DISTINCT ?o WHERE { $subject $path ?o } 154 | # query = SAO::Distinct.new(SAO::Project.new( 155 | # [o], SAO::Path.new(subject, path, o))) 156 | 157 | array + query.execute(repo).map { |sol| sol[:o] } 158 | end.uniq 159 | 160 | if host = hosts.first and not seen.include? host 161 | parent = host_for_internal repo, host, spec, seen | Set[host], dtypes, 162 | graph: graph, published: published, circulated: circulated 163 | host = parent if parent 164 | end 165 | end 166 | 167 | hcache[key] = host 168 | end 169 | 170 | # load the graph 171 | repo = RDF::Repository.load ARGV.first 172 | 173 | # okay let's get the resolver and look at it 174 | resolver = repo.all_of_type(ITCV.Resolver).first 175 | 176 | # what are the things we need to get from the resolver? 177 | 178 | params = {} 179 | 180 | # 1. addressing information, the base URI and its aliases 181 | 182 | params[:base] = repo.objects_for(resolver, ITCV.manages, only: :resource).first 183 | params[:aliases] = repo.objects_for(resolver, ITCV.alias, only: :resource) 184 | 185 | # 2. prefix and vocab mappings 186 | 187 | pfx = params[:prefix] = {} 188 | 189 | repo.objects_for(resolver, ITCV.prefix, only: :resource).each do |decl| 190 | prefix = repo.objects_for(decl, SH.prefix, only: :literal).sort.first 191 | ns = repo.objects_for( 192 | decl, SH.namespace, only: :literal, datatype: XSD.anyURI).sort.first 193 | # i don't understand why this but 194 | pfx[prefix.to_s.to_sym] = RDF::URI(ns.to_s) 195 | end 196 | 197 | if v = repo.objects_for(resolver, ITCV.vocab).first 198 | v = RDF::URI(v.to_s) if v.literal? 199 | params[:vocab] = v 200 | end 201 | 202 | # 3. 
document class and fragment specifiers 203 | 204 | params[:documents] = repo.objects_for(resolver, ITCV.document, only: :resource) 205 | 206 | params[:fragments] = {} 207 | 208 | repo.objects_for(resolver, ITCV.fragment, only: :resource).each do |frag| 209 | specifier = {} 210 | 211 | specifier[:fragment] = fc = repo.objects_for(frag, 212 | ITCV[:"fragment-class"], only: :resource).sort.first 213 | 214 | if host = repo.objects_for(frag, 215 | ITCV[:"host-class"], only: :resource).sort.first 216 | specifier[:host] = host 217 | end 218 | 219 | repo.objects_for(frag, ITCV.via, only: :resource).each do |path| 220 | # this will be either a term or a path object or a list 221 | via = specifier[:via] ||= [] 222 | via << entail_op(repo, algebra(repo, path)) 223 | end 224 | 225 | params[:fragments][fc] = specifier 226 | end 227 | 228 | # that it? 229 | 230 | require 'pry' 231 | binding.pry 232 | 233 | #puts "#{params}" 234 | -------------------------------------------------------------------------------- /experimental/stable-fragments.rb: -------------------------------------------------------------------------------- 1 | # This is a little side quest to figure out stable fragment identifiers 2 | # based on RFC9562 §5.8 (UUID v8): https://datatracker.ietf.org/doc/html/rfc9562 3 | # 4 | # What we _want_ are fragment identifiers that can be 5 | # deterministically generated based on a set of immutable facts about 6 | # a particular (RDF) subject. By "immutable" we mean if you change the 7 | # assertion then you change the resource, so we're talking about the 8 | # kinds of values that would constitute a (potentially composite) 9 | # primary key in a conventional SQL database. 10 | # 11 | # Why do we want fragment identifiers instead of fully-fledged 12 | # document identifiers? Well, because the resources they identify are 13 | # small and there are a lot of them. My specific use case is 14 | # `qb:Observation`, but other classes of entities apply. 
15 | # 16 | # The desideratum is an identifier that can be derived from known 17 | # facts, that isn't too long (lexically), that nevertheless has enough 18 | # entropy that we won't get collisions. The typical strategy for 19 | # something like this is to use a cryptographic hash, because it takes 20 | # an arbitrary input and returns a fixed-length string of bytes. 21 | # 22 | # Here we note that we already have the compact UUID format in play 23 | # (https://datatracker.ietf.org/doc/html/draft-taylor-uuid-ncname) for 24 | # fragment identifiers, so I have a strong bias to using it for this 25 | # as well. The Base64 UUID-NCName variant produces a symbol that is 22 26 | # characters long, which is long, but not unusably long. The UUID 27 | # specification, furthermore, lays out two methods of deriving 28 | # identifiers by cryptographic hash (MD5 and a truncated SHA1). Both 29 | # of these hash functions are obsolete, but that doesn't matter as 30 | # much for this application as does the method of computing them. 31 | # Specifically, RFC4122 (over)prescribes what goes into the hash as 32 | # input. 33 | # 34 | # RFC9562 defines UUID version 8 which is a fully user-defined 35 | # identifier. This means we can compute a hash over whatever we like. 36 | # Current hash algorithms (SHA256 and up), are of course much longer 37 | # than a UUID. However, since the input domain to the hashes _itself_ 38 | # is highly constrained (whatever the solution will be is some small 39 | # collection of RDF terms), it will be _incredibly_ unlikely to 40 | # produce an input within those constraints that will cause a 41 | # collision in the output in even a heavily-truncated hash. 42 | # 43 | # There is furthermore the matter of picking out these identifiers as 44 | # such. I am inclined to set aside some of the 128 (really 122) bits 45 | # of real estate to do so.
My particular inclination is to _spell_ 46 | # something — something short — at the front of the identifier, and 47 | # use the rest for hash data. UUID-NCNames move the masked bits out to 48 | # be "bookends" of the identifier, and sequence them using the same 49 | # alphabet as Base32/Base64 (they are four bits apiece the first 16 50 | # positions are the same for both encodings). If we mask the remaining 51 | # two bits in the "variant" field (itself useful as a signal that this 52 | # is what we are doing) then we have 120 bits remaining. If we 53 | # truncate the hash to 96 bits, that leaves 24 bits, or four Base64 54 | # characters, to spell something with. We will table the decision on 55 | # precisely _what_ to spell for now, after we have determined how the 56 | # hash is to be computed. 57 | # 58 | # Consider the following SPARQL query: 59 | # 60 | # ``` 61 | # PREFIX qb: 62 | # PREFIX cgto: 63 | # 64 | # SELECT DISTINCT ?s ?d ?c 65 | # WHERE { 66 | # ?s a qb:Observation ; 67 | # qb:dataSet ?d . 68 | # cgto:class ?c . 69 | # } 70 | # ``` 71 | # 72 | # In this example, the variables ?d and ?c would be known, and thus 73 | # bound. The CURIEs can of course be expanded to their canonical URI 74 | # counterparts. The wrinkle is that the subject ?s is the thing we're 75 | # trying to compute. A cryptographic hash requires a definite string 76 | # of bytes as input, so what we need here is a rule for crafting 77 | # precisely what input is to be hashed. 78 | # 79 | # > The type assertion `?s a qb:Observation` is there for the clarity 80 | # > of the example, and would just be a liability when computing the 81 | # > hash. 82 | # 83 | # While the paradigm case is data cube `qb:Observation`s, the method 84 | # of constructing the hash input string should be generic and 85 | # versatile enough that it can be used in other situations. 86 | # 87 | # > What if a subproperty/equivalent property (ie same meaning, 88 | # > different URI) is being used? 
89 | # 90 | # I am loath to put anything into the hash input that _could_ vary but 91 | # that doesn't _have_ to vary. Therein lies tears. Examples: 92 | # 93 | # * subclasses/equivalent classes 94 | # * subproperties/equivalent properties 95 | # * any other statements that don't unambiguously identify a subject 96 | # 97 | # > What if the observation is in more than one dataset? 98 | # 99 | # We are _approaching_ this from the context of the dataset in 100 | # question, so the identity of the dataset we care about is _known_. 101 | # That said, if an observation is in more than one dataset then it may 102 | # make sense to mint a random UUID for it and thus any custom v8 UUID 103 | # derived from its members would be considered secondary. 104 | # 105 | # > This means we will need some kind of ranking system for durable 106 | # > identifiers, which is a good idea anyway, in case a subject has 107 | # > more than one of them. 108 | # 109 | # My inclination is that the function just takes a list of RDF terms 110 | # rather than a set of statements or fragment of SPARQL or something. 111 | # However, the terms should be sequenced _as if_ they were the result 112 | # of a SPARQL query. Sort of. Suppose the dimension property (in this 113 | # example, `cgto:class`) has more than one value. (It shouldn't, but a 114 | # similar construct might.) Just take the set of values and sort them 115 | # lexically. Then go to the next set of values and do the same. This 116 | # will give you a flattened list that should give repeatable results. 117 | # 118 | # Say then that we took the clauses from the SPARQL above in order, 119 | # our input is a serialization of the values for `?d` and `?c`, which 120 | # may look like this: 121 | # 122 | # ``` 123 | # 124 | # ``` 125 | # 126 | # …where the first term represents `?d` and the second represents 127 | # `?c` for the given `?s`, which is the identifier we are trying to 128 | # compute. 
129 | # 130 | # > We will table the matter of Unicode normalization for now and just 131 | # > assume NFC or something (https://www.unicode.org/reports/tr15/). 132 | # > Some literals may also require additional escaping, e.g. things 133 | # > like double quotes which are part of the serialization syntax. 134 | # 135 | # Suppose, furthermore, that there is more than one dimension 136 | # property, or plainly more than one property to consider. Or if a 137 | # property (potentially to be) asserted is a 138 | # subproperty/equivalent/inverse etc of the one in the spec. 139 | # 140 | # The algorithm should be something like, for a given list of 141 | # identifying properties (sorted if not given an explicit order), get 142 | # the (unique) values and sort them. Then concatenate this together 143 | # into a list. Null values/empty sets should be illegal (or 144 | # nonsensical because these values will be known given the context). 145 | # 146 | # > We may be able to repurpose bits of SHACL to represent these rules. 147 | # 148 | # Having decided to have the bulk of the identifier consist of the 149 | # first 96 bits of a SHA256 hash, and we have furthermore decided that 150 | # the input to the hash will be a space-separated string of RDF 151 | # terms (serialized to their NTriples representations), we need to 152 | # consider what the remaining 24 bits should say. What I'm thinking 153 | # here is some sort of fixed code that indicates what kind of 154 | # identifier this is. My proposal here is something like `SF0-` in the 155 | # Base64 representation, which stands for stable fragment revision 156 | # zero, followed by a separator. The three bytes, when transformed, 157 | # will be `48 5d 3e`. 158 | # 159 | # Thus, an identifier given the inputs above will look like 160 | # `ISF0-gYTySpDxALMNjWJ4I`, which translates into an equivalent UUID 161 | # that looks like `485d3e81-84f2-84a9-80f1-00b30d8d6278`. 
162 | # 163 | # > The `I`…`I` characters at either end of the fragment identifier 164 | # > represent the version and variant nybbles, respectively (the 165 | # > latter with the remaining two bits masked). These are explained in 166 | # > the UUID-NCName spec 167 | # > (https://datatracker.ietf.org/doc/html/draft-taylor-uuid-ncname). 168 | 169 | require 'rdf' 170 | require 'rdf/turtle' 171 | require 'rdf/vocab' 172 | require 'intertwingler/vocab' 173 | require 'uuidtools' 174 | require 'uuid-ncname' 175 | require 'digest' 176 | require 'base64' 177 | 178 | def make_stable_fragment *terms 179 | # generate a string of terms separated by spaces 180 | input = terms.flatten.map(&:to_sxp).join(' ') 181 | 182 | # warn input 183 | 184 | # generate a sha256 hash truncated to 96 bits 185 | hash = Digest::SHA256.digest(input).slice 0, 12 186 | 187 | # base64 encode that baby 188 | b64 = Base64.urlsafe_encode64 hash 189 | 190 | # I… is version 8 and …I is the variant 0b1000 (with the last two 191 | # bits masked); `SF0-` (48 5d 3e) stands for stable fragment rev 0 192 | "ISF0-%sI" % b64 193 | end 194 | 195 | if $0 == __FILE__ 196 | require 'pry' 197 | 198 | # start with some imaginary subject 199 | s = RDF::URI("urn:uuid:3ebd018a-149a-4d62-8a84-8f3dc2cd58d8") 200 | 201 | # say we want the observation over skos:Concepts 202 | x = make_stable_fragment s, RDF::Vocab::SKOS.Concept 203 | 204 | # look ma, valid uuid 205 | u = UUIDTools::UUID.parse UUID::NCName.from_ncname x 206 | 207 | # aand rdf subject 208 | r = RDF::URI(u.to_uri) 209 | 210 | # here, poke around 211 | binding.pry 212 | end 213 | -------------------------------------------------------------------------------- /experimental/tty-reader.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # -*- mode: enh-ruby -*- 3 | 4 | # turns out tty-prompt subsumes tty-reader 5 | require 'tty-prompt' 6 | require 'pathname' 7 | 8 | prompt = TTY::Prompt.new 9 | reader = 
prompt.reader

# Tab-completion: treat the buffer as a filesystem path, walk up to
# the nearest existing ancestor, and offer its children.
reader.completion_handler = -> path do
  path = path.strip
  path = ?/ if path.empty?
  path = Pathname(path).expand_path

  # warn path

  # climb until we land on something that actually exists
  until path.exist?
    path = path.parent
  end

  path.children.map(&:to_s).sort
end

# Ctrl-A: jump to beginning of line (emacs-style).
reader.on(:keyctrl_a) do |event|
  event.line.move_to_start
end

# Ctrl-E: jump to end of line.
reader.on(:keyctrl_e) do |event|
  event.line.move_to_end
end

# Ctrl-U: kill the whole line.
reader.on(:keyctrl_u) do |event|
  line = event.line
  line.move_to_start
  line.delete line.text.length
end

# Ctrl-W: kill backwards to the previous path separator.
reader.on(:keyctrl_w) do |event|
  line = event.line
  # BUG FIX: the original matched the regexp against the
  # TTY::Reader::Line object itself; Regexp#match needs a String (or
  # something with #to_str), so the handler could never fire. Match
  # the line's text buffer instead.
  if m = /.*(\/[^\/]*\/*)\z/.match(line.text)
    chars = m.captures.first.length
    event.line.left chars
    event.line.delete chars
  end
end

# this mysteriously does not work
# reader.on(:keyctrl_l) do |event|
#   warn event
#   # reader.cursor.clear_lines 100
#   # reader.clear_display event.line, TTY::Screen.width
# end

home = Pathname(Dir.home).realpath

wd = Pathname.pwd.realpath

# prefer the shorter of the absolute and home-relative spellings
short = wd.relative_path_from home
warn short

wd = short if short.to_s.length < wd.to_s.length

# prime the prompt with the (possibly shortened) working directory
hm = reader.read_line '> ', value: wd.to_s

warn hm

# -*- mode: yaml -*-
#
# This is the version of intertwingler.conf that only controls the
# residual configuration that doesn't make sense to put in the graph,
# like where to find the graph. Also, what host/port for the engine to
# listen on, and what domains to initialize out of the graph.
7 | # 8 | # Since the engine does not have the juice to run in production, nor 9 | # does it have any authentication infrastructure, we default to 10 | # listening on localhost with the expectation that it will be 11 | # reverse-proxied. 12 | host: 127.0.0.1 13 | port: 10101 14 | # Here we specify a default driver for the RDF graph database, along 15 | # with any first-run initialization files. 16 | graph: 17 | # We posit the driver as a made-up URN NID called x-ruby that 18 | # encodes which module to load and which class, plus initialization 19 | # parameters which I hope should be largely self-explanatory (well, 20 | # maybe not `mapsize`; that's an LMDB thing). You will see this 21 | # pattern all over the place in Intertwingler. 22 | driver: "urn:x-ruby:rdf/lmdb;RDF::LMDB::Repository?=dir=/var/lib/intertwingler&mapsize=128M" 23 | # These files will be read into an empty graph. 24 | init: 25 | - ~/projects/active/intertwingler/example/transforms2.ttl 26 | jwt: 27 | secret: skr00b12345 28 | authorities: 29 | "doriantaylor.com": 30 | graph: 31 | init: 32 | - ~/projects/active/doriantaylor.com/experimental/content-inventory.ttl 33 | - ~/projects/active/doriantaylor.com/experimental/concept-scheme.ttl 34 | - ~/projects/active/summer-of-protocols/content-inventory.ttl 35 | - ~/projects/active/summer-of-protocols/ibis.ttl 36 | - ~/projects/active/intertwingler/example/ibis.ttl 37 | static: 38 | target: ~/projects/active/doriantaylor.com/target 39 | "methodandstructure.com": 40 | "natureof.software": 41 | graph: 42 | init: 43 | - ~/projects/active/nature-of-software/content-inventory.ttl 44 | - ~/projects/active/nature-of-software/concepts.ttl 45 | - ~/projects/active/nature-of-software/books.ttl 46 | "intertwingler.net": 47 | graph: 48 | init: 49 | - ~/clients/me/intertwingler.net/content-inventory.ttl 50 | -------------------------------------------------------------------------------- /intertwingler.gemspec: 
# -*- mode: enh-ruby -*-

# Put lib/ on the load path so the version constant resolves when the
# gemspec is evaluated straight from a checkout.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'intertwingler/version'

Gem::Specification.new do |spec|
  spec.name = 'intertwingler'
  spec.version = Intertwingler::VERSION
  spec.authors = ['Dorian Taylor']
  spec.email = ['code@doriantaylor.com']
  spec.license = 'Apache-2.0'
  spec.homepage = 'https://github.com/doriantaylor/rb-intertwingler'
  spec.summary = 'An engine for dense hypermedia.'
  spec.description = <<~DESC
    Intertwingler is an engine for making and managing websites that have
    a characteristically large number of very small pages, that are very
    densely interlinked. It can run as a stand-alone application server,
    or use any other Web server interface that Rack supports.

    Intertwingler has three closely-related goals:

    * Effective custody of accurate information,
    * Reducing the overhead of communicating said information,
    * Artistic and narrative forms that exploit these dynamics.

    Consult the README in this distribution for a more involved
    discussion of the problems Intertwingler was designed to solve.
  DESC

  # ship everything git knows about except tests, examples, and
  # experimental scratch files
  spec.files = `git ls-files -z`.split("\x0").reject do |f|
    f.match(%r{^(test|spec|features|example|experimental)/})
  end
  spec.bindir = 'exe'
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = %w[lib]

  # ruby
  spec.required_ruby_version = '~> 3', '>= 3.2'

  # dev/test dependencies
  spec.add_development_dependency 'bundler', '>= 2.4'
  spec.add_development_dependency 'rake', '>= 13.1'
  spec.add_development_dependency 'rspec', '>= 3.12'

  # stuff we use pretty universally
  spec.add_runtime_dependency 'dry-schema', '>= 1.13.3'
  spec.add_runtime_dependency 'http-negotiate', '>= 0.2.2' # mine
  spec.add_runtime_dependency 'linkeddata', '>= 3.1.2'
  spec.add_runtime_dependency 'mimemagic', '>= 0.5.3' # my patch
  spec.add_runtime_dependency 'rack', '~> 3'
  spec.add_runtime_dependency 'rackup', '~> 2'
  spec.add_runtime_dependency 'rdf-reasoner', '>= 0.9.0'
  spec.add_runtime_dependency 'sparql', '>= 3.3.0'
  spec.add_runtime_dependency 'uri', '~> 1', '>= 1.0.3' # bugs
  spec.add_runtime_dependency 'uri-urn', '~> 0.0', '>= 0.0.5' # git
  spec.add_runtime_dependency 'uuid-ncname', '>= 0.4.1' # mine
  spec.add_runtime_dependency 'uuidtools', '>= 2.1.5'

  # stuff for handlers/transforms
  spec.add_runtime_dependency 'md-noko', '~> 0.1', '>= 0.1.1' # mine
  spec.add_runtime_dependency 'params-registry', '~> 0.2', '>= 0.2.2' # mine
  spec.add_runtime_dependency 'rdf-kv', '~> 0.1', '>= 0.1.8' # mine
  spec.add_runtime_dependency 'sassc', '>= 2.2.1'
  spec.add_runtime_dependency 'store-digest', '>= 0.1.4' # mine
  spec.add_runtime_dependency 'store-digest-http', '>= 0.1.1' # mine
  spec.add_runtime_dependency 'vips', '>= 8.12.2'
  spec.add_runtime_dependency 'xml-mixup', '~> 0.2', '>= 0.2.1' # mine

  # stuff for cli
  spec.add_runtime_dependency 'thor', '>= 1.2.2'
  spec.add_runtime_dependency 'tty-markdown', '>= 0.7.2'
  spec.add_runtime_dependency 'tty-progressbar', '>= 0.18.2'
  spec.add_runtime_dependency 'tty-prompt', '>= 0.23.1'
  spec.add_runtime_dependency 'tty-reader', '>= 0.9.0' # my patch

  # stuff for urlrunner
  spec.add_runtime_dependency 'concurrent-ruby', '>= 1.1.6'
  spec.add_runtime_dependency 'concurrent-ruby-edge', '>= 0.6.0'
  spec.add_runtime_dependency 'crass', '>= 1.0.6'
  spec.add_runtime_dependency 'tidy_ffi', '>= 1.0.0'

  # stuff for docstats
  spec.add_runtime_dependency 'descriptive_statistics', '>= 2.5.1'
  spec.add_runtime_dependency 'engtagger', '>= 0.4.1'
  spec.add_runtime_dependency 'lemmatizer', '>= 0.2.2'
end

require 'intertwingler/resolver'

# Placeholder namespace for residual (non-graph) configuration.
module Intertwingler::Config
  # uhh do we even need this??
end

# rack?
# sure (tail of the "# rack?" quip that opens this file)

require 'intertwingler/util'

require 'rack'
require 'rack/request'
require 'rack/response'
require 'http/negotiate'
require 'uuidtools'
require 'pathname'

# An experimental Rack application: a minimal web console over the
# RDF graph. Request-URIs are matched against DISPATCH below and the
# matching lambda is run via instance_exec so it can call this
# object's private accessors (context, graph, base, prefixes).
class Intertwingler::Console

  class Request < Rack::Request
    # this should be in vanilla Rack::Request
    def full_uri
      URI(base_url) + env['REQUEST_URI']
    end

  end

  # A Rack::Response that knows how to serialize a Nokogiri document
  # body, setting Content-Type (from the root namespace) and
  # Content-Length along the way.
  class Response < Rack::Response

    private

    # mapping xml namespaces to content types
    TYPES = {
      nil => {
        html: 'application/xhtml+xml',
      },
      'http://www.w3.org/1999/xhtml' => {
        nil => 'application/xhtml+xml',
      },
    }

    # Pick a content type (plus charset) for a Nokogiri node based on
    # its document's root element namespace/name; falls back to
    # application/xml for unknown namespaces.
    def resolve_type node
      doc = node.is_a?(Nokogiri::XML::Document) ? node : node.document
      node = doc.root
      ns = node.namespace.href if node.namespace
      type = TYPES.key?(ns) ?
        TYPES[ns].fetch(node.name.to_sym, TYPES[ns][nil]) : 'application/xml'

      "#{type};charset=#{(doc.encoding || 'utf-8').downcase}"
    end

    public

    def initialize body, status, headers
      body ||= ''
      status ||= 501 # Not Implemented lol
      headers ||= {}

      if body.is_a? Nokogiri::XML::Document
        # XXX failure modes in here like there not being a root
        headers[Rack::CONTENT_TYPE] = resolve_type body

        body = body.to_xml
        # XXX why is there no explicit set?
        # (.b gives the byte length rather than the character length)
        headers[Rack::CONTENT_LENGTH] = body.b.length.to_s
      end

      super body, status, headers
    end
  end

  private

  # http errors we care about
  # NOTE(review): these lambdas are stubs; a nil return will produce
  # an empty array via NilClass#to_a in #dispatch below.
  ERROR = {
    403 => -> req {
    },
    404 => -> req {
      # nothing here, make something?

      doc = context.xhtml_stub().document
    },
    405 => -> req {
    },
    406 => -> req {
    },
  }

  # content-negotiating static handler (stub)
  STATIC = -> req {

  }

  # Request-URI => { method => handler } table. Keys are compared via
  # #=== in #dispatch, so a Regexp key matches patterns while a String
  # key matches only the exact path.
  DISPATCH = {
    Intertwingler::Util::UUID_PATH => {
      GET: -> req {
        uri = req.full_uri
        match = Intertwingler::Util::UUID_PATH.match uri.request_uri

        subject = context.canonical_uuid match.captures.first

        # first check if the subject is present; if not then 404

        # this could include being an object

        # do content negotiation; hive off non-xhtml responses

        # lol caching fuhgedaboudit; you would need metadata about
        # metadata about resources and how that changes over time
        # which wouldn't be impossible but also wouldn't be reliable
        # without baking it into the quad store

        # generate the response (here is where loupe would be handy)
        doc = Intertwingler::Util.generate_doc graph, subject, base: base,
          langs: req.accept_language.to_h, prefixes: context.prefixes

        # return 200
        [200, {}, doc]
      },
      POST: -> req {
        # negotiate input

        # modify graph

        # return 303 to itself
        [303, nil, { 'Location' => req.full_uri.to_s }]
      },
      DELETE: -> req {
        # nuke all statements to and from this subject
        # return 204
        [204, {}, nil]
      },
    },
    # internut hoem paeg maybe just show all the (non-blank) subjects
    # in the graph? iunno how about subject (with label), outbound
    # links collated by type?
    ?/ => {
      GET: -> req {
        subjects = graph.subjects.sort.select &:uri?
        body = subjects.map(&:to_s).join "\n"
        [200, { 'Content-Type' => 'text/plain' }, body]
      },
    },
    # this is the /me resource that tells the ui about who is looking at it
    '/me' => {
      GET: -> req {
      },
    },
    # these are utility resources so the ui has some content to work with
    '/classes' => {
      GET: -> req {
        # needs at least one in-domain-of or in-range-of query
        # parameter or it will disgorge everything it knows lol

        # althouuugh that might not actually be that big a deal in
        # practice; how many classes can there be (spoiler: lots)
      },
    },
    '/properties' => {
      GET: -> req {
        # this on the other hand, well, there are always gonna be way
        # more properties than classes so it should probably be pruned

        # so say by default it returns all known properties with an
        # unspecified domain; then you can specify one or more classes
        # in domain= (and range= for parity although this might be dumb)
      },
    },
    /^\/.*/ => {
      GET: -> req {
      },
    },
  }.transform_values do |methods|
    # copies GET to HEAD but only if there is a GET (and no HEAD already)
    methods[:GET] ? { HEAD: methods[:GET] }.merge(methods) : methods
  end.freeze

  # Resolve a request to a handler lambda and run it in the context
  # of this console instance.
  def dispatch req
    uri = req.full_uri

    # match uri or 404; stash the match data while we're at it
    # (Hash#detect returns nil when nothing matches, leaving `methods` nil)
    _, methods = DISPATCH.detect { |test, _| test === uri.request_uri }

    # match method or 405
    handler = methods ?
      methods.fetch(req.request_method.to_sym, ERROR[405]) : STATIC

    # run the handler, whatever that may be; instance_exec gives it
    # access to private accessors like `context` and `graph`
    resp = instance_exec(req, &handler).to_a
    # XXX what if there is not one of these? lol

    body = resp.last

    # if the body is a document with an <html:base>, rewrite its
    # scheme/authority to match how the client actually reached us
    if body.is_a? Nokogiri::XML::Document
      if btag = body.at_xpath(
        '/html:html/html:head[1]/html:base[1]', Intertwingler::Util::XPATHNS)
        buri = RDF::URI(btag['href'])
        if buri.authority == base.authority
          buri.scheme = uri.scheme
          buri.authority = RDF::URI(uri.to_s).authority
          btag['href'] = buri.to_s
        end
      end
    end

    # NOTE(review): relies on Rack::Response.[] taking (status,
    # headers, body) and our Response#initialize taking (body, status,
    # headers) — confirm against the installed Rack version.
    resp = Response[*resp.to_a]

    # XXX maybe nuke the body if it is a HEAD request? ehh i think
    # that happens already
    resp
  end

  public

  attr_reader :context

  # Initialize a new Web console.
  #
  # @param context [Intertwingler::Context] let's just be lazy for now
  #
  def initialize context
    @context = context
  end

  # Returns the RDF graph (repository, not quad graph identifier).
  #
  # @return [RDF::Repository]
  #
  def graph
    @context.graph
  end

  # Returns the base URI (if configured, which it should be).
  #
  # @return [RDF::URI]
  #
  def base
    @context.base
  end

  # Returns the configured prefix mapping
  #
  # @return [Hash]
  #
  def prefixes
    @context.prefixes
  end

  # Run the response.
  #
  # @param env [Hash] the #Rack environment
  #
  def call env
    # normalize the environment in the case of ssl tomfoolery
    env['HTTPS'] = 'on' if env.key?
'REQUEST_SCHEME' and 250 | env['REQUEST_SCHEME'].to_s.strip.downcase == 'https' 251 | req = Request.new env 252 | 253 | # here is where we would rewrite the request i guess 254 | 255 | # you know and maybe resolve it to an actual handler or something 256 | resp = dispatch req 257 | 258 | # here is where we would rewrite the response i guess 259 | 260 | # aand kick it out the door 261 | resp.finish 262 | end 263 | end 264 | -------------------------------------------------------------------------------- /lib/intertwingler/docstats.rb: -------------------------------------------------------------------------------- 1 | require 'intertwingler/version' 2 | require 'set' 3 | require 'descriptive_statistics' 4 | require 'nokogiri' 5 | 6 | class Intertwingler::DocStats < Nokogiri::XML::SAX::Document 7 | private 8 | 9 | MAYBE = %i[dt dd li td th caption figcaption] 10 | SKIP = %i[html head title base link meta script] 11 | BLOCKS = Set.new(%i[body p h1 h2 h3 h4 h5 h6 ul ol pre dl main header footer 12 | article section aside figure nav div noscript blockquote form hr 13 | table fieldset address] + MAYBE).freeze 14 | SECTIONS = Set.new(%i[body article section]).freeze 15 | IMAGES = Set.new(%i[img picture]).freeze 16 | VIDEOS = Set.new(%i[video]).freeze 17 | EMBEDS = Set.new(%i[embed object iframe]) 18 | COUNTS = { 19 | sections: %i[body article section header footer nav aside], 20 | images: %i[img picture], 21 | videos: %i[video], 22 | embeds: %i[embed object iframe], 23 | tables: %i[table], 24 | lists: %i[ul ol dl], 25 | forms: %i[form], 26 | scripts: %i[script], 27 | sheets: %i[style], 28 | }.transform_values { |v| Set.new v }.freeze 29 | 30 | 31 | NODEXP = '/html:html/html:body[not(*)]|/html:html/html:body//*[not(*)]'.freeze 32 | XHTMLNS = 'http://www.w3.org/1999/xhtml'.freeze 33 | XPATHNS = { html: XHTMLNS }.freeze 34 | 35 | # ok listen up fools here is the new html document stats algo: 36 | 37 | # okay we want to count characters, words, blocks, and sections, as 38 | # 
well as gather stats on words per block (and probably blocks per section) 39 | 40 | # the problem is we don't want to count blocks that only contain other blocks 41 | 42 | # we also don't want to count the text of sub-blocks in a superordinate block 43 | 44 | # there are also quasi-blocks that we may not ordinarily count, 45 | # except if they themselves contain two or more adjacent 46 | # blocks. (examples: li, th/td, h1-6, caption/figcaption) 47 | 48 | # count the block only if it contains text and inline elements (and 49 | # only count the text and inline elements) 50 | 51 | # if 52 | 53 | # we can also 54 | 55 | # use xpath to find all the leaf node elements 56 | # 57 | 58 | def pretend_sax node 59 | case node.type 60 | when Nokogiri::XML::Node::DOCUMENT_NODE 61 | # if node is a document run begin and end document and then run 62 | # for children 63 | start_document 64 | node.children.each { |c| pretend_sax c } 65 | end_document 66 | when Nokogiri::XML::Node::ELEMENT_NODE 67 | # if node is an element run begin and end element and run for children 68 | prefix, uri = if ns = node.namespace 69 | [ns.prefix, ns.href] 70 | end 71 | ns = node.namespace_scopes.map { |n| [ns.prefix, ns.href] } 72 | attrs = node.attribute_nodes.map do |a| 73 | an = a.name 74 | an = "#{a.namespace.prefix}:#{an}" if 75 | a.namespace and a.namespace.prefix 76 | [an, a.content] 77 | end 78 | start_element_namespace node.name, attrs, prefix, uri, ns 79 | node.children.each { |c| pretend_sax c } 80 | end_element_namespace node.name, prefix, uri 81 | when Nokogiri::XML::Node::TEXT_NODE 82 | characters node.content 83 | when Nokogiri::XML::Node::CDATA_SECTION_NODE 84 | cdata_block node.content 85 | end 86 | end 87 | 88 | def do_block name 89 | if BLOCKS.include? name.to_sym 90 | w = @text.strip.split 91 | t = w.join ' ' 92 | 93 | unless w.empty? 
94 | words = w.length 95 | @counts[:chars] += t.length 96 | @counts[:words] += words 97 | @counts[:blocks] += 1 98 | @wpb << words 99 | @stack << t 100 | @text = '' 101 | end 102 | end 103 | end 104 | 105 | def clear_text 106 | @text = '' 107 | end 108 | 109 | public 110 | 111 | attr_reader :chars, :words, :blocks 112 | 113 | def start_element_namespace name, attrs = [], prefix = nil, uri = nil, ns = [] 114 | unless uri != XHTMLNS or SKIP.include? name.to_sym 115 | @on = true 116 | do_block name 117 | end 118 | end 119 | 120 | def end_element_namespace name, prefix = nil, uri = nil 121 | if uri == XHTMLNS 122 | SKIP.include?(name.to_sym) ? clear_text : do_block(name) 123 | COUNTS.each do |type, set| 124 | @counts[type] += 1 if set.include? name.to_sym 125 | end 126 | @counts[:sections] -= 1 if name == 'body' 127 | @on = false if name == 'body' 128 | end 129 | end 130 | 131 | def characters string 132 | @text += string if @on 133 | end 134 | 135 | def cdata_block string 136 | characters string 137 | end 138 | 139 | # @return [Float] mean of words per block 140 | def mean 141 | @wpb.mean 142 | end 143 | 144 | # @return [Float] standard deviation of words per block 145 | def sd 146 | @wpb.standard_deviation 147 | end 148 | 149 | # @return 150 | def quartiles 151 | [0, 25, 50, 75, 100].map { |pct| @wpb.percentile(pct) } 152 | end 153 | 154 | def counts 155 | @counts.dup.freeze 156 | end 157 | 158 | def initialize 159 | @on = false 160 | @text = '' 161 | @stack = [] # XXX i don't think we use this one 162 | @wpb = [] 163 | @counts = %i[chars words blocks sections images videos embeds 164 | tables lists forms scripts sheets].map { |k| [k, 0] }.to_h 165 | end 166 | 167 | def scan doc 168 | if doc.is_a? 
Nokogiri::XML::Node
      pretend_sax doc
    else
      parser = Nokogiri::XML::SAX::Parser.new self
      parser.parse doc
    end

    self
  end

  # Convenience constructor: build an instance and scan in one go.
  def self.scan doc
    new.scan doc
  end

  # @return [Hash] derived stats (mean/sd/quartiles) merged with raw counts
  def to_h
    { mean: mean, sd: sd, quartiles: quartiles }.merge counts
  end

  # stub: eventually serialize these stats as RDF
  def to_rdf uri: nil, subject: nil
  end
end

require 'intertwingler/version'

# Namespace for Intertwingler-specific error classes.
module Intertwingler::Error

  # Raised for invalid configuration.
  class Config < ArgumentError
  end

end

require 'intertwingler/error'
require 'rack/request'
require 'rack/response'

# Everything in {Intertwingler} is a handler.
#
class Intertwingler::Handler

  # do this to declare the symbol
  class ::Intertwingler::Engine < self
  end

  # This is the abstract parent {::Exception} class that acts as an escape
  # hatch for responses that are something _other_ than 200-series,
  # i.e. they are not-successful (albeit not strictly _unsuccessful_)
  # responses.
  #
  # NOTE: subclassing ::Exception (not StandardError) is deliberate:
  # these must not be swallowed by a bare `rescue`.
  class AnyButSuccess < Exception
    # subclasses override this to supply their default status code
    STATUS = nil

    def initialize message, status: nil
      # fall back to the (most-derived) class's STATUS constant
      @status = status || self.class.const_get(:STATUS)

      super message
    end

    attr_reader :status
    alias_method :code, :status

    # @return [Rack::Response] a minimal text/plain response
    def response
      Rack::Response[status, { 'content-type' => 'text/plain' }, [message]]
    end
  end

  # Redirects are an example of not-successful-yet-not-unsuccessful responses.
  class Redirect < AnyButSuccess
    # Make a new redirect "exception"
    #
    # @param message [#to_s] the error message
    # @param status [Integer] the response code
    # @param location [URI, RDF::URI, #to_s, nil]
    # @param as [:uri, :rdf] URI coercion type
    #
    def initialize message, status: nil, location: nil, as: :uri
      @location =
        Intertwingler::Resolver.coerce_resource location, as: as if location
      super message, status: status || 302
    end

    attr_reader :location

    def response
      hdr = { 'content-type' => 'text/plain' }
      hdr['location'] = location.to_s if location
      # NOTE(review): this body is a StringIO while the parent class
      # uses an Array — confirm both satisfy the Rack body contract in use.
      Rack::Response[status, hdr, StringIO.new(message)]
    end
  end

  # This is the superclass of HTTP errors.
  #
  class Error < AnyButSuccess
    # Errors also include a backtrace in the body for debugging.
    def response
      Rack::Response[status, { 'content-type' => 'text/plain' },
        [message, (backtrace || []).join("\n")]]
    end

    # 4xx-series errors.
    class Client < self
      def initialize message, status: nil
        super message, status: status
      end
    end

    class BadRequest < Client
      STATUS = 400
    end

    class Forbidden < Client
      STATUS = 403
    end

    class NotFound < Client
      STATUS = 404
    end

    class NotAllowed < Client
      STATUS = 405

      # the HTTP method that was not allowed
      attr_reader :request_method

      def initialize message, status: 405, method: 'GET'
        @request_method = method

        super message, status: status
      end
    end

    class Conflict < Client
      STATUS = 409
    end

    # 5xx-series errors.
    class Server < self
      STATUS = 500
      def initialize message, status: nil
        super message, status: status || 500
      end
    end
  end

  # Handle a {Rack::Request}. Return a {Rack::Response}.
  #
  # @param req [Rack::Request] the request.
  #
  # @return [Rack::Response] the response.
  #
  def handle req
    raise NotImplementedError, 'Subclasses must implement their own `handle`'
  end

  # Handle a Rack request from the wire.
  #
  # @param env [Hash, Rack::Request] the Rack environment or request.
  #
  # @return [Array<(Integer, Hash, #each)>] the response.
  #
  def call env
    # XXX maybe wrap this or put it in a base class i dunno
    req = env.is_a?(Rack::Request) ? env : Rack::Request.new(env)

    # Fold proxy forwarding headers (RFC 7239 Forwarded, then the
    # legacy X-Forwarded-*) back into the environment.
    # XXX DO WE WANT THIS HERE??
    if forwarded = req.env['HTTP_FORWARDED']
      # we only care about the first one
      forwarded = forwarded.strip.downcase.split(/\s*,\s*/).first
      forwarded = forwarded.split(/\s*;\s*/).map do |pair|
        # XXX we should really parse this properly but echhh
        k, v = pair.gsub(/['"]/, '').split(/\s*=\s*/, 2)
        [k.to_sym, v]
      end.to_h

      req.env['HTTP_HOST'] = forwarded[:host] if forwarded[:host]
      req.env['REMOTE_ADDR'] = forwarded[:for] if forwarded[:for]
    elsif forwarded = req.env['HTTP_X_FORWARDED_HOST']
      fwdfor = req.env['HTTP_X_FORWARDED_FOR']
      fwdproto = req.env['HTTP_X_FORWARDED_PROTO']

      req.env['HTTP_HOST'] = forwarded
      req.env['REMOTE_ADDR'] = fwdfor if fwdfor
      if /https/i =~ fwdproto.to_s
        req.env['HTTPS'] = 'on'
      end
    end

    handle(req).finish
  end

  # Normalize a set of request headers into something that can be
  # counted on downstream.
  #
  # @note This method is 100% provisional.
  #
  # @param req [Rack::Request] a Rack request.
  # @param as_symbols [false, true] whether to coerce keys to symbols
  # @param split [false, true] whether to split multi-valued headers
  #
  # @return [Hash] the normalized header set
  #
  def normalize_headers req, as_symbols: false, split: false
    req.env.select do |k|
      %w[CONTENT_TYPE CONTENT_LENGTH].include?(k) or k.start_with? 'HTTP'
    end.reduce({}) do |hash, pair|
      # e.g. HTTP_X_FORWARDED_FOR -> x-forwarded-for
      key = pair.first.downcase.delete_prefix('http_').tr_s(?_, ?-)
      key = key.to_sym if as_symbols
      val = pair.last
      val = val.split(/\s*,+\s*/) if split
      hash[key] = val
      hash
    end
  end

  # Initialize a handler.
  #
  # @param engine [Intertwingler::Engine]
  # @param args [Hash{Symbol => Object}]
  #
  def initialize engine, **args
    raise ArgumentError, 'engine must be an Intertwingler::Engine' unless
      engine.is_a? ::Intertwingler::Engine
    @engine = engine
  end

  attr_reader :engine

  # Get the {Intertwingler::Resolver} for the given request.
  #
  # @return [Intertwingler::Resolver, nil] the resolver, maybe
  #
  def resolver
    @engine.resolver
  end

  # Get the resolver's graph for the given request.
  #
  # @return [RDF::Repository] the graph.
  #
  def repo
    @engine.repo
  end

  # Get the engine's logger.
  #
  # @return [Logger] the logger object.
  #
  def log
    @engine.log
  end
end

require 'intertwingler/handler'
require 'store/digest'
require 'store/digest/http'

# Content-addressable store handler: delegates requests to the
# Store::Digest::HTTP proxy over a Store::Digest instance.
class Intertwingler::Handler::CAS < Intertwingler::Handler
  def initialize engine, **options
    super

    @store = Store::Digest.new(**options)
    @proxy = Store::Digest::HTTP.new(@store, base: @engine.resolver.base)
  end

  def handle req

    # if body = req.body
    #   File.open('/tmp/wtf.lol', 'wb') { |fh| fh << body.read }
    #   body.seek 0 if body.respond_to?
:seek 18 | # end 19 | 20 | # XXX handle OPTIONS * 21 | @proxy.handle req 22 | end 23 | end 24 | -------------------------------------------------------------------------------- /lib/intertwingler/handler/filesystem.rb: -------------------------------------------------------------------------------- 1 | require 'intertwingler/handler' 2 | 3 | require 'mimemagic' 4 | require 'http-negotiate' 5 | 6 | # This is a toy content handler for serving content-negotiated files 7 | # directly from the file system. It is intended to be for residual 8 | # interoperability with long-standing expectations about how a 9 | # website ought to work, eventually to be supplanted by more 10 | # sophisticated methods of storing opaque resources. This handler 11 | # only responds to `GET` and `HEAD` methods, as it assumes to be 12 | # situated in an ecosystem where other request methods are handled 13 | # by other handlers. This handler does limited resolution to and 14 | # from request-URIs of the form `/{uuid}`, and will likewise perform 15 | # limited redirections from non-canonical request-URIs to canonical 16 | # ones (except if the original request is `/{uuid}`). The 17 | # `If-Modified-Since` request header will be honoured with a `304 18 | # Not Modified` response when appropriate. Content negotiation is 19 | # similar to Apache's `mod_negotiation`, insofar as content types 20 | # are derived from (possibly multiple) file extensions. Where this 21 | # handler departs from Apache is that it will serve (a negotiated) 22 | # `/dir.ext` (i.e., `/dir` with no trailing slash) in the presence 23 | # of a `/dir/index.ext`, while Apache does the opposite. 24 | # 25 | # This content handler provides rudimentary security in the form of 26 | # checking that symbolic links resolve inside the document root, and 27 | # declining to serve dotfiles (files that begin with a period `.`). 28 | # Otherwise, ordinary files system permissions apply. This handler 29 | # does not generate directory indexes. 
30 | # 31 | # Error responses are expected to be handled downstream, though this 32 | # handler will produce the correct ones: 403 for attempts to request 33 | # unreadable files 405 for methods other than `GET` or `HEAD`, 406 34 | # for incompatible `Accept*` headers to variants, and 404 when no 35 | # variants can be found. If a resource is present in the graph but 36 | # not on the file system, the handler can be configured to return 37 | # `410 Gone` (though this may conflict with attempts to poll other 38 | # content handlers). The error response bodies are minimal, intended 39 | # for debugging purposes. 40 | # 41 | class Intertwingler::Handler::FileSystem < Intertwingler::Handler 42 | 43 | # Initialize a handler with parameters. 44 | # 45 | # @param resolvers [Array] the URI resolver(s) 46 | # @param root [Pathname, #to_s, Array] the document root(s) 47 | # @param indices [#to_s, Array<#to_s>] slugs to use for directory index 48 | # 49 | def initialize engine, root: nil, indices: %w[index].freeze 50 | # coerce document root(s) 51 | @roots = (root.respond_to?(:to_a) ? root.to_a : [root]).map do |r| 52 | Pathname(r).expand_path.realpath 53 | end 54 | 55 | @indices = indices.respond_to?(:to_a) ? indices.to_a : [indices] 56 | 57 | super engine 58 | end 59 | 60 | attr_reader :roots, :indices 61 | 62 | # XXX do we wanna do method methods? is this dumb? 63 | # def GET req 64 | # end 65 | 66 | # Handle the request. 67 | # 68 | # @param req [Rack::Request] the request 69 | # 70 | # @return [Rack::Response] the response 71 | # 72 | def handle req 73 | # XXX do the thing i said on stream about mapping request 74 | # methods to actual methods so you don't have to keep typing the 75 | # 405 thing 76 | 77 | # step zero return 405 unless GET or HEAD (or OPTIONS but that 78 | # is a special case) 79 | return Rack::Response[405, {}, []] unless 80 | %w[HEAD GET].include? 
req.request_method 81 | 82 | # basically what we want is this thing to do as little work as 83 | # it can get away with since most of the URI resolution will be 84 | # done upstream, but still be robust enough to run as a 85 | # standalone content-negotiating static filesystem handler, but 86 | # also not heavily duplicate any redirection or access control 87 | # behaviour. that said, it should not follow symlinks outside 88 | # the document root, or try to serve raw directories, or things 89 | # like dotfiles that would otherwise be readable. 90 | 91 | # general strategy is to build up a list of candidates and then 92 | # eliminate them 93 | 94 | # * we start with the actual URI that was requested, which may 95 | # also be the UUID (or at least *a* UUID) 96 | # * then we get the UUID (if we didn't have it already) 97 | # * then we get the subset of `uri_for` on this scheme/authority 98 | # (that we don't already have) 99 | 100 | # determine if the requested path terminates with a slash (~ parameters) 101 | slash = resolver.slash? req.path 102 | 103 | path = resolver.clean_path(req.path, slash: false).delete_prefix ?/ 104 | 105 | # preemptively check if the request-uri is /{uuid}, otherwise get uuid 106 | is_uuid = !!(uuid = resolver.uuid_path path, as: :uri) 107 | uuid ||= resolver.uuid_for path, as: :uri 108 | 109 | paths = [] 110 | 111 | roots.each do |root| 112 | if uuid 113 | paths << root + uuid.uuid 114 | paths << root + path 115 | paths += resolver.uri_for(uuid, scalar: false, as: :uri, 116 | slugs: true, fragments: false, local: true).reduce([]) do |a, u| 117 | next a if resolver.uuid_path u 118 | a << root + resolver.clean_path(u, slash: false).delete_prefix(?/) 119 | end 120 | paths.uniq! 121 | else 122 | # who knows maybe there's a thing on the file system 123 | # XXX maybe make this verboten if it's not in the graph?? 
124 | paths << root + path 125 | end 126 | end 127 | 128 | re = /^#{roots.map { |r| Regexp.quote r.to_s }.join ?|}\//o 129 | 130 | # we'll just make a big chonkin' hash of variants which we can 131 | # use for the negotiation and afterwards 132 | variants = paths.reduce({}) do |h, p| 133 | 134 | # don't do this if this is the root 135 | unless r = roots.include?(p) 136 | dn, bn = p.split 137 | dn.glob("#{bn}{,.*}").each do |x| 138 | if stat = x.stat rescue nil 139 | next if stat.directory? 140 | type = MimeMagic.by_path(x).to_s 141 | incl = re.match? x.realpath.to_s 142 | h[x] ||= { dir: false, stat: stat, type: type, included?: incl } 143 | end 144 | end 145 | end 146 | 147 | @indices.each do |i| 148 | p.glob("#{i}{,.*}").each do |x| 149 | if stat = x.stat rescue nil 150 | next if stat.directory? 151 | type = MimeMagic.by_path(x).to_s 152 | incl = re.match? x.realpath.to_s 153 | h[x] ||= { dir: true, stat: stat, type: type, included?: incl } 154 | end 155 | end 156 | end unless !r and resolver.uuid? bn 157 | 158 | h 159 | end 160 | 161 | # if there are no variants then this is a genuine 404 162 | return Rack::Response[404, {}, []] if variants.empty? 163 | 164 | # okay now subsequently process the variants 165 | variants.transform_values! do |val| 166 | stat = val[:stat] 167 | qs = 1.0 168 | 169 | # the perl CatalystX::Action::Negotiate one does some 170 | # twiddling here; not sure if i wanna copy it 171 | # if val[:dir] 172 | # else 173 | # end 174 | 175 | # this i thought was clever: you demote the variant to 176 | # oblivion so if it gets selected anyway you know to return a 177 | # 403 rather than eliminating it and having to return 404 178 | ok = stat.file? and stat.readable? and val[:included?] 
179 | qs /= 100.0 unless ok 180 | 181 | val.merge( 182 | { weight: qs, size: stat.size, mtime: stat.mtime.getgm, ok: ok }) 183 | end 184 | 185 | # warn variants 186 | 187 | # now we actually perform the negotiation and get our selected variant 188 | if selected = HTTP::Negotiate.negotiate(req, variants) 189 | var = variants[selected] 190 | 191 | # warn paths.inspect 192 | # warn selected 193 | 194 | # test if readable 195 | return Rack::Response[403, {}, []] unless var[:ok] 196 | 197 | # test if uri matches requested 198 | # redirect if requested uri was not just a uuid 199 | 200 | # test mtime 201 | if ims = req.get_header('HTTP_IF_MODIFIED_SINCE') 202 | ims = (Time.httpdate(ims) rescue Time.at(0)).getgm 203 | # warn "mtime: #{var[:mtime]} (#{var[:mtime].to_i}), IMS: #{ims} (#{ims.to_i}), lt: #{var[:mtime] < ims}, cmp: #{var[:mtime] <=> ims}" 204 | # return not modified if the variant is *older* than ims 205 | # XXX TIL Time objects can be equal but not 206 | return Rack::Response[304, {}, []] if var[:mtime].to_i <= ims.to_i 207 | end 208 | 209 | return Rack::Response[200, { 210 | 'content-type' => var[:type], 211 | 'content-length' => var[:size].to_s, # rack should do this 212 | 'last-modified' => var[:mtime].httpdate, 213 | }, selected.open] 214 | end 215 | 216 | # there were variants but none were chosen so 406 217 | Rack::Response[406, {}, []] 218 | end 219 | 220 | end 221 | -------------------------------------------------------------------------------- /lib/intertwingler/handler/generated.rb: -------------------------------------------------------------------------------- 1 | require 'intertwingler/handler' 2 | 3 | # for generated 4 | require 'intertwingler/document' 5 | require 'stringio' 6 | require 'uri' 7 | 8 | class Intertwingler::Handler::Generated < Intertwingler::Handler 9 | 10 | private 11 | 12 | public 13 | 14 | def handle req 15 | 16 | # yaww 17 | if repo.respond_to?(:mtime) and 18 | ims = req.get_header('HTTP_IF_MODIFIED_SINCE') 19 | ims = 
(Time.httpdate(ims) rescue Time.at(0)).utc 20 | lm = repo.mtime 21 | return Rack::Response[304, {}, []] if lm.to_i <= ims.to_i 22 | end 23 | 24 | # warn req.url.inspect 25 | # warn resolver.base.inspect 26 | 27 | uri = RDF::URI(req.url) 28 | 29 | orig = uri.dup 30 | 31 | # XXX lol 32 | uri.authority = resolver.base.authority if 33 | /(spigot|localhost):9292/i.match? uri.authority 34 | uri.scheme = 'https' if uri.scheme == 'http' 35 | 36 | # warn uri 37 | 38 | # resolve subject 39 | subject = resolver.uuid_for uri 40 | 41 | # bail out if this doesn't return anything 42 | 43 | return Rack::Response[404, { 44 | 'content-type' => 'text/plain', 45 | }, ['lol fail']] unless subject 46 | 47 | # okay now we see if there are any sub-handlers that will take this request 48 | 49 | # types = repo.types_for subject 50 | # strata = repo.type_strata types 51 | 52 | # otherwise we fall back to the main handler 53 | 54 | # doc = Intertwingler::Document.generate_doc resolver, subject, 55 | # prefixes: engine.resolver.prefixes 56 | generator = Intertwingler::Document.new resolver, subject 57 | 58 | doc = generator.doc 59 | 60 | # XXX nuke this later 61 | if base = doc.at_xpath('/html:html/html:head/html:base', 62 | { html: 'http://www.w3.org/1999/xhtml' }) 63 | href = RDF::URI(base['href']) 64 | href.scheme = orig.scheme 65 | href.authority = orig.authority 66 | base['href'] = href.to_s 67 | end 68 | 69 | str = doc.to_xml.b 70 | 71 | # warn 'ouate de phoque' 72 | 73 | hdrs = { 74 | 'content-type' => 'application/xhtml+xml', 75 | 'content-length' => str.length.to_s, 76 | } 77 | hdrs['last-modified'] = repo.mtime.httpdate if repo.respond_to? 
:mtime 78 | 79 | Rack::Response[200, hdrs, StringIO.new(str, ?r, encoding: Encoding::BINARY)] 80 | end 81 | end 82 | -------------------------------------------------------------------------------- /lib/intertwingler/handler/kv.rb: -------------------------------------------------------------------------------- 1 | require 'intertwingler/handler' 2 | require 'rdf/kv' 3 | 4 | # This is a `POST` handler for {RDF::KV}, which will likely be 5 | # supplanted by a request transform that converts RDF-KV protocol data 6 | # to an LD-Patch request, and a concomitant `PATCH` handler. So don't 7 | # get too comfy with it. 8 | # 9 | class Intertwingler::Handler::KV < Intertwingler::Handler 10 | 11 | private 12 | 13 | ERR415 = 14 | 'Request must be application/x-www-form-urlencoded or multipart/form-data' 15 | 16 | public 17 | 18 | # Handle a POST request that complies with the RDF-KV protocol. 19 | # 20 | # @param req [Rack::Request] 21 | # 22 | # @return [Rack::Response] 23 | # 24 | def handle req 25 | # we only respond to post 26 | return Rack::Response[405, {}, []] unless req.request_method == 'POST' 27 | 28 | warn "content-type: #{req.content_type.to_s}" 29 | 30 | # we only respond to ordinary web forms 31 | return Rack::Response[ 32 | 415, { 'content-type' => 'text/plain' }, [ERR415]] unless 33 | %w[application/x-www-form-urlencoded 34 | multipart/form-data].include? req.content_type.to_s.downcase 35 | 36 | subject = RDF::URI(req.url) 37 | 38 | kv = RDF::KV.new subject: subject, prefixes: resolver.prefixes, 39 | callback: -> term do 40 | log.debug "TERM: #{term.inspect}" 41 | if term.iri? 42 | # XXX THIS SHOULD PROBABLY BE LESS DUMB 43 | resolver.uuid_for(term) || term # rescue term 44 | # anyway note the parentheses. 
45 | else 46 | term 47 | end 48 | end 49 | 50 | # XXX WATCH OUT THIS MIGHT SILENTLY THROW AWAY DATA 51 | req.POST.each { |k, v| log.debug "POST #{k} => #{v}" } 52 | 53 | begin 54 | # generate the changeset 55 | cs = kv.process req.POST 56 | 57 | log.debug "inserts: #{cs.inserts} deletes: #{cs.deletes}" 58 | 59 | graph = RDF::Graph.new data: repo, 60 | graph_name: RDF::URI("dns:#{resolver.base.host}") 61 | 62 | log.debug "graph size before: #{graph.size}" 63 | 64 | # apply it to the graph 65 | cs.apply graph 66 | 67 | log.debug "graph size after: #{graph.size}" 68 | 69 | # XXX we should figure out a way to hook up a rider or otherwise 70 | # smuggle callback functions in; that would entail coming up 71 | # with a more robust solution for configuring handlers though. 72 | rescue Exception => e 73 | log.error e.full_message 74 | return Rack::Response[409, { 75 | 'content-type' => 'text/plain', 76 | 'content-length' => e.full_message.b.length.to_s }, [e.full_message]] 77 | end 78 | 79 | # XXX y'know there could be some cross-site wankery in this, 80 | # requests that redirect to other sites or something, but so what? 
81 | 82 | # now we redirect to self or whatever the new subject is 83 | redir = resolver.uri_for kv.subject, slugs: true, via: subject 84 | engine.log.debug "RDF::KV redirecting to #{redir}" 85 | Rack::Response[303, { 'location' => redir.to_s }, []] 86 | end 87 | end 88 | -------------------------------------------------------------------------------- /lib/intertwingler/harness.rb: -------------------------------------------------------------------------------- 1 | require 'intertwingler/handler' 2 | require 'intertwingler/resolver' 3 | require 'intertwingler/engine' 4 | require 'intertwingler/loggable' 5 | require 'pathname' 6 | 7 | # This is the multiplexing harness introduced to partition the 8 | # bootstrapping configuration 9 | class Intertwingler::Harness < Intertwingler::Handler 10 | include Intertwingler::Loggable 11 | 12 | # Create a new instance of the harness. 13 | # 14 | # @param mapping [Hash{String=>RDF::Repository}] The relation 15 | # mapping authorities (domains) to RDF repositories. 16 | # 17 | def initialize mapping, home: nil, log: nil, jwt: {} 18 | @home = home 19 | @log = log 20 | 21 | if jwt and not jwt.empty? 22 | begin 23 | log.debug "Enabling user set by JWT" 24 | require 'jwt' 25 | require 'jwt-eddsa' if %w[ED25519].include? jwt[:algorithm] 26 | @jwt = jwt 27 | rescue LoadError => e 28 | if e.path == 'eddsa' 29 | warn "The 'rbnacl' gem is required for ED25519." 30 | else 31 | warn "You have a JWT configured but no 'jwt' gem installed." 
32 | end 33 | 34 | raise e 35 | end 36 | end 37 | 38 | @engines = mapping.reduce({}) do |hash, pair| 39 | authority, repo = pair 40 | # get the resolver for the authority 41 | resolver = Intertwingler::Resolver.configure repo, 42 | authority: authority, log: self.log # note this is to call Loggable 43 | 44 | # from there, load the engine 45 | engine = Intertwingler::Engine.configure resolver: resolver, home: home 46 | 47 | # map the domain aliases as well 48 | ([resolver.base] + resolver.aliases).each do |uri| 49 | hash[uri.authority] = engine if /^https?$/i.match? uri.scheme 50 | end 51 | 52 | hash 53 | end 54 | 55 | end 56 | 57 | private 58 | 59 | BEARER = /^\s*Bearer\s+([0-9A-Za-z_-]+(?:\.[0-9A-Za-z_-]+)*)/ 60 | 61 | public 62 | 63 | attr_reader :engines, :home 64 | 65 | # Dispatch the request to the appropriate engine. 66 | # 67 | # 68 | def handle req 69 | # read off the Host: header 70 | authority = req.get_header('HTTP_HOST').to_s.strip.downcase 71 | # get an override map for the authority otherwise assign itself 72 | # authority = @override.fetch authority.to_s.strip.downcase, authority 73 | 74 | # match the authority to an engine or otherwise 404 75 | engine = @engines[authority] or return Rack::Response[404, {}, []] 76 | 77 | # log.debug "Authorization: #{req.env['HTTP_AUTHORIZATION']}" 78 | # log.debug 'Bearer matches' if BEARER.match? req.env['HTTP_AUTHORIZATION'] 79 | # log.debug @jwt.inspect 80 | 81 | # handle jwt 82 | if @jwt and bearer = req.env['HTTP_AUTHORIZATION'] and 83 | m = BEARER.match(bearer) 84 | 85 | token = m.captures.first 86 | 87 | key, algo = @jwt.values_at :secret, :algorithm 88 | 89 | begin 90 | obj = JWT.decode(token, key, true, { algorithm: algo }) 91 | rescue JWT::DecodeError => e 92 | return [409, {}, ["Could not decode JWT (#{token}), #{e} (#{e.class})"]] 93 | rescue e 94 | warn e.message 95 | return [500, {}, ["Server error (check logs)"]] 96 | end 97 | 98 | # XXX better logging ??? 99 | if obj and obj.first.is_a? 
Hash and principal = obj.first['sub'] 100 | req.env['REMOTE_USER'] = principal 101 | log.debug "Retrieved #{req.env['REMOTE_USER']} from JWT" 102 | else 103 | log.debug "JWT: #{obj.inspect}" 104 | end 105 | end 106 | 107 | # forward request to engine 108 | engine.handle req 109 | end 110 | end 111 | -------------------------------------------------------------------------------- /lib/intertwingler/loggable.rb: -------------------------------------------------------------------------------- 1 | require 'intertwingler/version' 2 | 3 | # This module implements a snap-on #log method which will load and 4 | # initialize {::Logger} in the instance variable `@log` if one is not 5 | # found. 6 | # 7 | module Intertwingler::Loggable 8 | 9 | # Return a {::Logger} instance or otherwise load one with defaults. 10 | # 11 | # @return [Logger] the logger. 12 | # 13 | def log 14 | unless @log 15 | require 'logger' 16 | @log = Logger.new $stderr 17 | end 18 | @log 19 | end 20 | 21 | end 22 | -------------------------------------------------------------------------------- /lib/intertwingler/nlp.rb: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | require 'intertwingler/version' # initialize the symbols 3 | 4 | require 'lemmatizer' 5 | require 'engtagger' 6 | 7 | private 8 | 9 | XHTMLNS = { html: 'http://www.w3.org/1999/xhtml'.freeze }.freeze 10 | 11 | # XXX why does this feel super familiar 12 | POS_MAP = { 13 | noun: %i[xnn nnp nnps nns], 14 | verb: %i[vb vbd vbg vbn vbp vbz], 15 | adj: %i[jj jjr jjs], 16 | adv: %i[rb rbr rbs rp], 17 | }.reduce({}) do |hash, pair| 18 | target = pair.first 19 | pair.last.each { |v| hash[v] = pair.first } 20 | hash 21 | end.freeze 22 | 23 | public 24 | 25 | # This is the *extremely* lightweight NLP functionality. Goals: 26 | # 27 | # * Identify N-grams in the corpus that can be candidates for terms 28 | # (concepts etc) and/or labels (of other entities). 
29 | # 30 | # * Generate the raw material for `ci:mentions` relations from 31 | # enclosing text segments (documents, fragments thereof) to entities 32 | # (concepts, people, places, things). 33 | # 34 | # Since we want arbitrarily long N-grams, and we want to remember 35 | # fairly accurately where they came from, we want to take a document 36 | # to an array (of arrays) of segments, or rather a hash of arrays of 37 | # segments, keyed by fragment ID. (This implicitly should happen after 38 | # IDs have been assigned to fragments through some other process.) 39 | # Conceivably we can identify document fragments down to the paragraph 40 | # but in practice the innermost sections will probably do. (Figures, 41 | # tables, blockquotes and asides should also be identified.) 42 | # 43 | # Assuming we are beginning with markup that distinguishes between 44 | # block and inline elements, we drill down to the bottom-most blocks 45 | # and then we recursively process the inlines. These can nest 46 | # arbitrarily deeply but what we want is a flat list of text segments 47 | # to pass to the sentence segmenter. Not all inlines are 48 | # equal. Consider: 49 | # 50 | # * inlines that are always considered separate segments 51 | # (e.g. definitions, abbreviations, quotations, variables, code) 52 | # 53 | # * inlines that are never considered separate segments (e.g. links) 54 | # 55 | # * inlines that are concatenated to adjacent segments if there is no 56 | # whitespace on either side of the join (e.g. emphasis; consider 57 | # `unbelievable!` should end up as one segment) 58 | # 59 | # Once we have a flat array of strings (per identifiable document 60 | # fragment), we pass those through the sentence segmenter to get 61 | # sentences. From there we can split the sentences into clauses, which 62 | # is thankfully regexable, as commas, (semi)colons, dashes etc are 63 | # less ambiguous than periods or question/exclamation marks. 
This 64 | # final result is what we send to the tokenizer (if we want, we can 65 | # also remove stop words), and ultimately count as N-grams. 66 | # 67 | # The intermediate product is a mapping from an identified text 68 | # segment (eg a document or section thereof) to a word, its frequency 69 | # in the segment, along with a mapping of that word to the words found 70 | # to its immediate left or right (including nil) and the frequency 71 | # they are found adjacent to one another. From this basic element, we 72 | # can construct arbitrarily long N-grams and just say something like 73 | # P(ABC) = P(AB)P(BC) (which probably breaks all sorts of rules but it 74 | # will be good enough for what we are trying to do, which is to 75 | # display a sorted list of candidates and match them to known terms). 76 | # This should be something we can punt out as JSON and ship around; 77 | # it's gonna be too hairy as RDF. 78 | # 79 | # ```json 80 | # { 81 | # "fragment-31337": { 82 | # "count": 123, 83 | # "words": { 84 | # "Foo": { 85 | # "lemma": "foo" 86 | # "count": 12, 87 | # "left": { "": 12 }, 88 | # "right": { "": 8, "Bar": 3, "bar": 1 } 89 | # }, 90 | # "Bar": { 91 | # "lemma": "bar", 92 | # "count": 3, 93 | # "left": { "Foo": 3 }, 94 | # "right": { "": 3 } 95 | # } 96 | # } 97 | # } 98 | # } 99 | # ``` 100 | # ... etc. 101 | # 102 | # Oh also I suppose we can generate TF-IDF scores or whatever with 103 | # that data too. 
104 | # 105 | # OK so other stuff: say we have a set of extracted terms and we want 106 | # to compare it with a concept scheme w want to 107 | # 108 | module Intertwingler::NLP 109 | # This class encapsulates a cache of SKOS concepts (either a concept 110 | # scheme, a collection, an ordered collection, or just a bundle of 111 | # concepts) and organizes them by label 112 | class TermCache 113 | # initialize from a scheme or collection 114 | def self.from_scheme repo, subject 115 | end 116 | 117 | # Concepts can be either a hash of the form `{ subject => struct }` 118 | # or just an array of subjects with a repo 119 | def initialize concepts, repo: nil 120 | end 121 | 122 | # Match a label (or labels) to one or more items in the cache. 123 | def match label, fuzzy: false 124 | # step zero: coerce label to array of nfkc strings 125 | # step 1: sort labels from longest to shortest 126 | # now we go: exact match, lemmatized, normalized and lemmatized 127 | end 128 | end 129 | 130 | private 131 | 132 | # https://html.spec.whatwg.org/#usage-summary 133 | HARVEST_DEFAULT = { 134 | 'http://www.w3.org/1999/xhtml' => %i[ 135 | dfn abbr span var kbd samp code q cite data time mark].freeze 136 | }.freeze 137 | 138 | public 139 | 140 | # Recurse into an X(HT?)ML document, harvesting a given set of tags 141 | # for a given namespace. Returns an array of arrays of the form 142 | # `[:name, "text", "alt"]`, which can be manipulated by a 143 | # block. Note the block gets the element itself prepended to the 144 | # array for further processing. 
145 | # 146 | # @param node [Nokogiri::XML::Node] the origin node 147 | # @param mapping [Hash] A mapping of namespaces to arrays of tags 148 | # @yieldparam text [String] the element's (flattened) text 149 | # @yieldparam alt [String, nil] the element's alternate text 150 | # (currently hard-coded as the `title` attribute) 151 | # @yieldparam name [Symbol] the element's local name 152 | # @yieldparam node [Nokogiri::XML::Element] the current element 153 | # @yieldreturn [Array] a potentially modified array of inputs 154 | # @return [Array] an array of arrays 155 | # 156 | def harvest_tags node, mapping: HARVEST_DEFAULT, &block 157 | 158 | out = [] 159 | 160 | if node.element? 161 | ns = node.namespace.respond_to?(:href) ? node.namespace.href : nil 162 | name = node.name.to_sym 163 | if mapping[ns] and mapping[ns].include?(name) 164 | text = node.text.strip 165 | text = text.empty? ? nil : text # make this nil if empty 166 | alt = node[:title] # XXX maybe parametrize this? 167 | 168 | # only run the block/append if there is something there 169 | if text or alt 170 | out << (block ? 
block.call(text, alt, name, node) : [text, alt, name]) 171 | end 172 | end 173 | end 174 | 175 | # recurse lol 176 | out + node.children.map do |c| 177 | harvest_tags c, mapping: mapping, &block 178 | end.flatten(1) # shuck off the first layer of array 179 | end 180 | 181 | def pre_segment element 182 | end 183 | 184 | def segment doc 185 | warn 'wat' 186 | 187 | # get the document body 188 | body = doc.at_xpath('.//html:body[1]', XHTMLNS) or return [] 189 | 190 | # current and result 191 | current = nil 192 | blocks = [] 193 | 194 | body.xpath('.//text()').each do |text| 195 | ancestors = text.xpath( 196 | 'ancestor::*[ancestor-or-self::html:body]', XHTMLNS).select do |e| 197 | # XXX pull this out and put it somewhere common 198 | %w[body main header footer article nav section hgroup h1 h2 h3 h4 h5 h6 199 | div p li dt dd th td caption blockquote aside figure figcaption 200 | form fieldset pre].include? e.name 201 | end 202 | 203 | # warn ancestors.map { |e| e.name }.inspect 204 | 205 | # if the current block is the same, append to last string 206 | if current == ancestors.last 207 | blocks.last << text.content 208 | else 209 | # otherwise set a new current block and add a new last string 210 | current = ancestors.last 211 | blocks << text.content 212 | end 213 | end 214 | 215 | blocks.reduce([]) do |out, b| 216 | b = b.gsub(/\u{2014}+/, ' - ').gsub(/[[:space:]]+/, ' ').strip 217 | out << b unless b.empty? 218 | out 219 | end 220 | end 221 | 222 | # this is dumb but whatever 223 | 224 | def lemmatize text, type = nil 225 | # XXX parameters for these? lol 226 | tag = @@tagger ||= EngTagger.new 227 | lem = @@lemma ||= Lemmatizer.new 228 | 229 | tag.tag_pairs(text.strip).map do |pair| 230 | word, t = pair 231 | unless %i[pos pp sym].any? 
{ |s| s == t } 232 | # if the word is merely capitalized we downcase it (XXX maybe 233 | # do something smarter like check if more than 50% of the 234 | # characters are uppercase rather than just the first one; ie 235 | # more than half, the thing is an acronym) 236 | word = word.downcase if word == word.downcase.capitalize 237 | lem.lemma word, POS_MAP[t] || type 238 | end 239 | end.compact.join ' ' 240 | end 241 | 242 | # make these instance methods available to the module 243 | extend self 244 | end 245 | -------------------------------------------------------------------------------- /lib/intertwingler/params.rb: -------------------------------------------------------------------------------- 1 | require 'intertwingler/engine' 2 | require 'intertwingler/graphops' 3 | require 'intertwingler/types' 4 | require 'intertwingler/vocab/ci' 5 | require 'intertwingler/vocab/tfo' 6 | require 'intertwingler/vocab/itcv' 7 | require 'params/registry' 8 | 9 | # This is an {Intertwingler}-specific adaptation of the more generic 10 | # {Params::Registry}. In particular, it adds functionality for 11 | # configuring parameters—and groups thereof—out of the graph. 12 | # 13 | # The main motivating factor for this configuration is that while I 14 | # wanted to make it _possible_ for {Params::Registry} to source its 15 | # configuration data from RDF, I didn't want doing so to be 16 | # _necessary_. Intertwingler, however, _only_ sources its 17 | # configuration from RDF, so any RDF configuration business belongs in 18 | # Intertwingler for the time being. 19 | # 20 | # To complete the functionality that reads parameter specs out of the 21 | # RDF graph, we need a mapping that goes from 22 | # [XSD](https://www.w3.org/TR/xmlschema-2/) and/or 23 | # [RDF](https://www.w3.org/TR/rdf-schema/#ch_literal) literal datatypes to 24 | # {Params::Registry::Types}, which this module provides. 
25 | # 26 | # @note While {Params::Registry} offers quite a bit of control for 27 | # parameter sets, at the time of this writing we are only interested 28 | # in the simple, single-cardinality, scalar parameters that decorate 29 | # [`tfo:Function`](https://vocab.methodandstructure.com/transformation#Function) 30 | # entities. When this behaviour changes, this note will be removed. 31 | # 32 | # @note It is not clear at the time of this writing how a global 33 | # parameter ordering would be configured (e.g., in the graph), or 34 | # even if that's a desirable thing. Currently the only user of this 35 | # registry are the transform infrastructure, and their parameters 36 | # would get registered one group at a time with no consideration for 37 | # a global sequence. Presumably other handlers would as well. 38 | # 39 | class Intertwingler::Params < Params::Registry 40 | 41 | private 42 | 43 | T = ::Params::Registry::Types 44 | I = Intertwingler::Types 45 | CI = Intertwingler::Vocab::CI 46 | TFO = Intertwingler::Vocab::TFO 47 | XSD = RDF::Vocab::XSD 48 | 49 | public 50 | 51 | # This is the group class with additional functionality for fetching 52 | # configuration from the graph. 53 | class Group < ::Params::Registry::Group 54 | include Intertwingler::GraphOps::Addressable 55 | 56 | private 57 | 58 | def repo ; registry.engine.repo ; end 59 | 60 | public 61 | 62 | alias_method :subject, :id 63 | 64 | # This assignor autovivifies the template from the graph. 65 | # 66 | # @note This may be dumb. 67 | # 68 | # @param id [Object] the template's canonical identifier. 69 | # @param spec [Hash{Symbol => Object}, Params::Registry::Template, nil] 70 | # the template specification. 71 | # 72 | # @return [Params::Registry::Template] the new template 73 | # 74 | def []= id, spec 75 | spec = registry.templates[id] || 76 | registry.template_class.new(registry, id) if spec == id or spec.nil? 
77 | 78 | super id, spec 79 | end 80 | 81 | # Refresh the group and (optionally) its constituent parameters. 82 | # 83 | # @param cascade [true, false] whether to cascade into the templates 84 | # 85 | # @return [self] 86 | # 87 | def refresh! cascade: true 88 | # warn subject.inspect 89 | 90 | # only do this if we're a graph buddy 91 | return self unless subject.is_a? RDF::URI 92 | 93 | # fetch the parameters out of the graph 94 | 95 | # XXX TODO better negotiation between ordered and unordered 96 | # (eg subtract ordered from unordered then sort unordered and 97 | # append it to the end of ordered? something like that?) 98 | params = if pl = blanks(TFO['parameter-list']).sort.first 99 | RDF::List.new(subject: pl, graph: repo).to_a.uniq 100 | else 101 | resources(TFO.parameter).sort 102 | end 103 | 104 | # now use the overloaded bulk assign 105 | # templates = params unless params.empty? 106 | 107 | #warn templates.inspect 108 | 109 | # do the templates 110 | params.each do |uri| 111 | if template = self[uri] 112 | template.refresh! 113 | else 114 | # this will trigger it 115 | self[uri] = uri 116 | end 117 | end 118 | 119 | self 120 | end 121 | end 122 | 123 | # This is the template class with additional functionality for fetching 124 | # configuration from the graph. 125 | # 126 | # @note The mapping of XSD tpes 127 | # 128 | class Template < ::Params::Registry::Template 129 | include Intertwingler::GraphOps::Addressable 130 | 131 | private 132 | 133 | # XXX need a solution for object properties, relative URIs, also 134 | # (compact) UUIDs. 
135 | MAPPING = { 136 | nil => T::NormalizedString, 137 | RDF::RDFS.Literal => T::String, 138 | RDF::RDFV.langString => T::String, 139 | XSD.string => T::String, 140 | XSD.token => T::Token, 141 | XSD.boolean => T::Bool, 142 | XSD.integer => T::DecimalInteger, 143 | XSD.negativeInteger => T::NegativeInteger, 144 | XSD.positiveInteger => T::PositiveInteger, 145 | XSD.nonNegativeInteger => T::NonNegativeInteger, 146 | XSD.nonPositiveInteger => T::NonPositiveInteger, 147 | XSD.date => T::Date, 148 | XSD.dateTime => T::Time, 149 | RDF::RDFV.List => T::List, 150 | RDF::RDFV.Bag => T::Set, 151 | TFO.Range => T::Range, 152 | TFO[:term] => I::Term, # XXX '#term' is an api method 153 | } 154 | 155 | # this is to actually parse the defaults out of the graph 156 | COMPOSITES = { 157 | RDF::RDFV.List => -> subject { 158 | RDF::List.new(subject: subject, graph: repo).to_a.map do |x| 159 | # convert the literals 160 | x.literal? ? x.object : x 161 | end 162 | }, 163 | RDF::RDFV.Bag => -> subject { 164 | repo.objects_for(subject, RDF::RDFS.member).map do |x| 165 | # convert the literals 166 | x.literal? ? 
x.object : x 167 | end.to_set 168 | }, 169 | TFO.Range => -> subject { 170 | # XXX we won't mess with open ranges and infimum/supremum right now 171 | lo = repo.objects_for(subject, TFO.low, only: :literal).sort.first 172 | hi = repo.objects_for(subject, TFO.high, only: :literal).sort.first 173 | 174 | # convert the literals if not nil 175 | lo = lo.object if lo 176 | hi = hi.object if hi 177 | 178 | Range.new lo, hi 179 | }, 180 | } 181 | 182 | # XXX we are doing this because apparently the type instances 183 | # don't compare, which is unbelievably fucking annoying 184 | ROOTS = [TFO.Range, RDF::RDFV.Bag, RDF::RDFV.List] 185 | 186 | # this could have been simple, but no 187 | UNWIND = { 188 | T::Range => -> value { value.minmax }, 189 | T::Set => -> value { value.to_a.sort }, 190 | T::List => -> value { value }, 191 | } 192 | 193 | def load_composite subject 194 | # get the domains of the predicates from the struct 195 | types = ( 196 | repo.types_for(subject) + repo.struct_for(subject).keys.select do |p| 197 | p.respond_to? :domain 198 | end.map { |p| p.domain }.flatten).uniq 199 | 200 | candidates = COMPOSITES.keys.reverse & types 201 | 202 | instance_exec subject, &COMPOSITES[candidates.first] unless 203 | candidates.empty? 204 | end 205 | 206 | def repo; registry.engine.repo; end 207 | 208 | # This post-init hook will refresh the template if it has no 209 | # configuration data. 210 | # 211 | def post_init 212 | refresh! if blank? 213 | end 214 | 215 | public 216 | 217 | alias_method :subject, :id 218 | 219 | # Refresh the template from the graph. Currently manages `slug`, 220 | # `aliases`, `type` (mapped from XSD), cardinality, `empty`, and 221 | # `shift`. Will also determine if the entity is composite. 222 | # 223 | # @note Still outstanding are `format`, `depends`, `conflicts`, 224 | # `consumes` `preproc`, `universe`, `complement`, `unwind`, 225 | # and `reverse`. 
#
# @note This method does not configure the full feature set of
#   {Params::Registry::Template}, because there is currently no
#   determination on how the `universe` and `complement` members
#   ought to be implemented; likewise stateful type coercions
#   (i.e., coercions that may change when something within a
#   running instance of Intertwingler changes).
#
# @return [self] because what else do you return
#
def refresh!
  # bail out because nothing here to work with otherwise
  return super unless subject.is_a? RDF::URI

  if slug = literals(CI['canonical-slug']).sort.first
    # i guess this is what we do? lol
    @slug = slug = slug.object.to_s.to_sym
  end

  # every other ci:slug becomes an alias, minus the canonical one
  @aliases = (
    literals(CI.slug).map { |a| a.object.to_s.to_sym } - [slug]).sort.uniq

  if type? TFO.Composite
    comp = resources(RDF::RDFS.range).sort.first || RDF::RDFV.Bag
    # root = ROOTS.detect(-> { RDF::RDFV.Bag }) { |c| repo.type_is? comp, c }

    @composite = MAPPING.fetch(comp, T::Set)
    @unwfunc = UNWIND[@composite]
    # warn @unwfunc.inspect
    @type = MAPPING.fetch(
      resources(TFO.element).sort.first, T::NormalizedString)
    # XXX COMPOSITE DEFAULT ???
    if d = resources(TFO.default).sort.first
      @default = load_composite d
    end
  else
    # XXX this may be subtler
    @type = MAPPING.fetch(
      resources(RDF::RDFS.range).sort.first, T::NormalizedString)

    # XXX is there some less stupid way of doing this
    @default = if d = literals(TFO.default).sort.first
      d.object
    end
  end

  # XXX deal with terms a non-stupid way: the problem is we need
  # state from the resolver ie the prefix mapping to deal
  if @type and @type == I::Term
    # we need to do surgery to preproc and format; note these are
    # instance_exec'd
    @preproc = -> x, _ {
      r = registry.engine.resolver
      x.map { |t| r.resolve_curie t, noop: true }
    }
    @format = -> x { registry.engine.resolver.abbreviate x, noop: true }
  end

  # we need an unwind for trms

  # cardinality: an exact owl:cardinality wins, otherwise take the
  # min/max pair independently
  if c1 = numeric_literals(RDF::OWL.cardinality).sort.first
    @min = @max = c1.object.to_i
  else
    c1 = numeric_literals(RDF::OWL.minCardinality).sort.first
    c2 = numeric_literals(RDF::OWL.maxCardinality).sort.last
    @min = c1.object.to_i if c1
    @max = c2.object.to_i if c2
  end

  # empty/multi-value behaviour
  @empty = if em = literals(TFO.empty, datatype: XSD.boolean).sort.first
    T::Bool[em.object]
  end
  @shift = if sh = literals(TFO.shift, datatype: XSD.boolean).sort.first
    T::Bool[sh.object]
  end

  # we return self (well, `super` does) cause there's nothing here to see
  super
end

end

# Refresh the registry: the templates first (via super), then each
# group, without cascading back into the templates.
#
# @return [self]
#
def refresh!

  # do the templates
  super

  # do the groups
  groups.each { |g| g.refresh! cascade: false }

  self
end

# Construct a registry off the back of the graph: every subject
# bearing a tfo:parameter or tfo:parameter-list becomes a group.
#
# @param engine [Intertwingler::Engine] the engine
#
# @return [Intertwingler::Params::Registry] the configured registry
#
def self.configure engine
  repo = engine.repo
  props = repo.property_set [TFO.parameter, TFO['parameter-list']]
  groups = props.map do |p|
    repo.query([nil, p, nil]).subjects
  end.flatten.select(&:iri?)

  me = self.new engine

  groups.each { |g| me.configure_group g }

  me
end

# This constructor extends its parent {Params::Registry#initialize}
# by prepending a mandatory `engine` parameter.
#
# @param engine [Intertwingler::Engine] for accessing various useful things.
# @param templates [Hash] the hash of template specifications.
# @param groups [Hash, Array] the hash of groups.
# @param complement [Object, Hash] the identifier for the parameter
#   for complementing composites, or otherwise a partial specification.
#
def initialize engine, templates: nil, groups: nil, complement: nil
  @engine = engine

  super templates: templates, groups: groups, complement: complement
end

# @!attribute [r] engine
#   @return [Intertwingler::Engine] the engine.
#
attr_reader :engine

# Tell the parent registry which group class to instantiate.
def group_class; Group; end

# Tell the parent registry which template class to instantiate.
def template_class; Template; end

def self.validate params
  (Params::Registry::Types::Array|Params::Registry::Types::TemplateMap)[params]
end

# Create (if necessary) and refresh the parameter group identified
# by `id`.
def configure_group id, params = []
  # this is dumb because dumb
  unless group = self[id]
    self[id] = params
    group = self[id]
  end

  group.refresh!
end
end
--------------------------------------------------------------------------------
/lib/intertwingler/representation.rb:
--------------------------------------------------------------------------------
require 'intertwingler/version' # for the symbol

require 'forwardable'
require 'mimemagic'
require 'tempfile'

# This class is a cheap knockoff of a
# {https://en.wikipedia.org/wiki/Monad_(functional_programming) monad}
# for handling successive transformations to a given resource
# representation within the same process. The goal is to mitigate the
# amount of times a (potentially large) object gets serialized and
# reparsed. It works by wrapping a Rack body (either request or
# response, it works on both), and associating it with a parser
# (via parser-specific subclasses).
#
#
# This is a base class for what are called "representations" in [the
# Fielding dissertation](https://www.ics.uci.edu/~fielding/pubs/dissertation/top.htm).
#
# In REST parlance, a resource is a relation between a set of one or
# more identifiers (URIs) and a set of one or more representations. A
# representation has (or rather *is*) a segment of bytes of finite
# length, of a given [content-type](https://www.iana.org/assignments/media-types/).
# If applicable, a representation also may have a [character
# set](https://www.iana.org/assignments/character-sets/) and a
# content-encoding (i.e. compression).
#
#
#
# Of additional interest to us is the fact that a resource's
# representation has to *come* from somewhere: say a file system or
# analogous blob storage. Otherwise, it has to be generated.
#
# Finally, we want representations to be amenable to *transformation*,
# and we want those transformations to be *composable*. Within the
# confines of a running process (and therefore, programming language),
# it is advantageous, where applicable, to have, in addition to the
# byte segment, the parsed representation ready for manipulation,
# rather than the expensive proposition of executing successive
# parsing and serialization operations.
#
# > A couple asides here, since I don't know where else to put them:
# > one is that having multiple in-memory copies of a representation
# > (e.g. the results of successive transformations) is going to be
# > crazy wasteful memory-wise. We could therefore consider the
# > in-memory representation as a *mutable* execution context for
# > transformation (pseudo)functions. In other words, each function
# > nominally *returns* the in-memory representation, but it's
# > actually applying successive operations to the same chunk of
# > memory. The role of the representation object would therefore be
# > to record which transformations have been applied.
# >
# > Another concern here is that a number of (?? i took a shower and
# > lost the thread) (was it that we're going to want to apply a
# > sequence of stock transformations to a number of content types?)
#
# We want to have an "origin" object (pattern? function?) that
# coalesces the possible places where a representation might be
# found. Origins can be things like directories on the ordinary file
# system, repositories under version control (e.g. git),
# content-addressable stores, or even other servers (e.g. reverse
# proxy, in which case it wouldn't be an "origin" as much as a "next
# hop"/"go fish" scenario). For purely-generated ("transparent")
# representations, we could have an origin function that constructs
# and emits a representation from scratch.
66 | # 67 | # > Indeed, all origins can be modeled as functions that take a URI 68 | # > plus `Accept-*` headers (and potentially a few other parameters 69 | # > like version) and resolve them to a representation. 70 | # 71 | # Any sufficiently complex system is going to have at least one origin 72 | # and likely more than one. Some origins will be amenable to 73 | # *manifests* (so you know what's on them). Some will be able to 74 | # respond to requests for variants e.g. by `Content-Type`, 75 | # `Content-Language`, `Content-Encoding`, as well as character set, 76 | # and version. Preference for origin may vary depending on the 77 | # resource in question or any of the `Accept-*` headers. We could 78 | # imagine a successive process of elimination that tests each origin 79 | # for variants, to which interesting return codes are 401, 403, 404, 80 | # 406, or of course 200. Ranged requests are probably not the best 81 | # idea, but they might be okay. Redirects are definitely off-limits. 82 | # 83 | # 1. use (???) 84 | # 85 | class Intertwingler::Representation 86 | extend Forwardable 87 | 88 | # just enough io methods 89 | def_delegators :io, 90 | :each, :read, :gets, :seek, :pos, :tell, :length, :size, :flush, :close 91 | 92 | private 93 | 94 | # subclasses should set this 95 | OBJECT_CLASS = nil 96 | DEFAULT_TYPE = 'application/octet-stream'.freeze 97 | VALID_TYPES = [DEFAULT_TYPE].freeze 98 | 99 | # make a temp file with presets 100 | def tempfile 101 | Tempfile.new 'repr-', encoding: Encoding::BINARY 102 | end 103 | 104 | def coerce_io_like obj 105 | # just give us the io if it's one of ours 106 | return obj.io if obj.is_a? Intertwingler::Representation 107 | 108 | if obj.respond_to? :each 109 | return obj if obj.is_a? IO or 110 | %i[getc read seek close].all? { |m| obj.respond_to? m } 111 | # this would be where we upgrade the IO object to a seekable thing 112 | io = tempfile 113 | obj.each { |x| io << x } 114 | return io 115 | elsif objs.respond_to? 
:call 116 | # okay then it's a rack streaming response body 117 | obj.call(io = tempfile) 118 | return io 119 | end 120 | 121 | raise ArgumentError, "object of #{obj.class} is not IO-ey enough" 122 | end 123 | 124 | def coerce_type type 125 | # this syntax sugar will automatically noop for MimeMagic objects 126 | type = MimeMagic[type] 127 | 128 | raise ArgumentError, "#{type} is not a valid type" unless 129 | valid_types.any? { |t| type.descendant_of? t } 130 | 131 | type 132 | end 133 | 134 | def coerce_rfc5646 language 135 | # i dunno i don't really feel like being smarter than this 136 | language.to_s.downcase.strip.gsub(/[[:space:]_]/, ?-).tr_s(?-, ?-) 137 | end 138 | 139 | def coerce_charset charset 140 | # i dunno right yet 141 | charset.to_s.downcase.strip 142 | end 143 | 144 | def parse io 145 | raise NotImplementedError, 146 | 'subclasses must implement private method `parse`' 147 | end 148 | 149 | def serialize obj, target 150 | raise NotImplementedError, 151 | 'subclasses must implement private method `serialize`' 152 | end 153 | 154 | public 155 | 156 | def self.object_class 157 | const_get :OBJECT_CLASS 158 | end 159 | 160 | def self.default_type 161 | const_get :DEFAULT_TYPE 162 | end 163 | 164 | def self.valid_types 165 | const_get(:VALID_TYPES).map { |t| MimeMagic[t] } 166 | end 167 | 168 | # Determine if this representation handles a given content type. 169 | # 170 | # @param type [MimeMagic, String] 171 | # 172 | # @return [Bool] 173 | # 174 | def self.handles? type 175 | type = MimeMagic[type] 176 | types = valid_types 177 | 178 | types.any? { |t| type.descendant_of? t } 179 | end 180 | 181 | def object_class 182 | self.class.object_class 183 | end 184 | 185 | def default_type 186 | self.class.default_type 187 | end 188 | 189 | def valid_types 190 | self.class.valid_types 191 | end 192 | 193 | def handles? type 194 | self.class.handles? 
type 195 | end 196 | 197 | attr_reader :type 198 | attr_accessor :language, :charset 199 | 200 | def initialize obj, type: nil, language: nil, charset: nil, **options 201 | oc = object_class # call this once cause self.class is slow 202 | 203 | raise NotImplementedError, 'Subclasses need an OBJECT_CLASS' unless oc 204 | 205 | if obj.is_a? oc 206 | @object = obj 207 | else 208 | @io = coerce_io_like obj 209 | end 210 | 211 | @type = coerce_type(type || cl.default_type) 212 | @language = coerce_rfc5646 language if language 213 | @charset = coerce_charset charset if charset 214 | end 215 | 216 | def self.coerce io, type: nil, language: nil, charset: nil, **options 217 | if io.is_a? self 218 | # this might be dumb? 219 | io.type = type if type 220 | io.language = language if language 221 | io.charset = charset if charset 222 | return io 223 | end 224 | new io, type: type, language: language, charset: charset, **options 225 | end 226 | 227 | def type= newtype 228 | newtype = coerce_type newtype 229 | 230 | # if this is different we're converting so we need to parse the io 231 | # if we haven't already 232 | if @type != newtype 233 | @type = newtype 234 | 235 | if @io 236 | @io.seek 0 if @io.respond_to? :seek 237 | @object ||= parse @io 238 | end 239 | end 240 | 241 | @type 242 | end 243 | 244 | def io 245 | # warn "hi lol #{caller}" 246 | 247 | if @object 248 | @io = serialize @object, tempfile 249 | @io.seek 0 if @io.respond_to? :seek 250 | @object = nil 251 | end 252 | 253 | @io 254 | end 255 | 256 | def io= obj 257 | @object = nil 258 | @io = coerce_io_like obj 259 | end 260 | 261 | def object= obj 262 | cls = object_class # do this because self.class is slow 263 | raise ArgumentError, 264 | "object must be a #{cls}, not #{obj.class}" unless object.is_a? cls 265 | 266 | # wipe out the stale io 267 | @io = nil if @io 268 | @object = obj 269 | end 270 | 271 | # Return the in-memory representation of the object. 
  def object
    # parse lazily and cache; @io remains the source of truth until then
    @object ||= parse @io
  end

  # Debugging aid: show the representation's type and object class.
  def inspect
    "<#{self.class} type: #{type}, object: #{object.class}>"
  end

end
--------------------------------------------------------------------------------
/lib/intertwingler/representation/nokogiri.rb:
--------------------------------------------------------------------------------
require 'intertwingler/representation'

require 'intertwingler/document' # this will also import nokogiri

require 'stringio'

# A representation backed by a Nokogiri XML/HTML node.
class Intertwingler::Representation::Nokogiri < Intertwingler::Representation
  private

  OBJECT_CLASS = ::Nokogiri::XML::Node
  DEFAULT_TYPE = 'application/xml'.freeze
  VALID_TYPES = %w[text/html application/xml].freeze

  public

  # Iterate over the serialized representation.
  def each &block
    io.each(&block)
  end

  # Return the IO; if a parsed document is present, serialize it into
  # a fresh binary StringIO (rewound) rather than touching @io.
  def io
    return @io unless @object
    # XXX we want this to be
    out = StringIO.new ''.b, 'wb+'

    @object.write_to out

    out.seek 0

    out
  end

  # Parse (and cache) the document from the IO.
  def object
    unless @object
      io.seek 0 if io.respond_to? :seek
      @object = Intertwingler::Document.coerce_doc io
    end

    @object
  end
end
--------------------------------------------------------------------------------
/lib/intertwingler/representation/vips.rb:
--------------------------------------------------------------------------------
require 'intertwingler/representation'

require 'vips'
require 'vips/sourcecustom'
require 'vips/targetcustom'

require 'stringio'

# A representation backed by a libvips image.
class Intertwingler::Representation::Vips < Intertwingler::Representation
  private

  OBJECT_CLASS = ::Vips::Image

  DEFAULT_TYPE = 'image/png'.freeze
  VALID_TYPES = %w[application/pdf] +
    %w[avif gif heic heif jpeg jp2 jxl png tiff webp x-portable-anymap
       x-portable-bitmap x-portable-graymap x-portable-pixmap].map do |t|
    "image/#{t}".freeze
  end.freeze

  # Parse the IO into a Vips::Image, preferring a raw file descriptor
  # when one is available, otherwise wiring up read/seek callbacks.
  def parse io
    if io.respond_to? :fileno and io.fileno
      # seek and ye shall find
      io.seek 0 if io.respond_to? :seek

      # if there's a file descriptor just use it, don't screw around
      src = ::Vips::Source.new_from_descriptor io.fileno
    else
      # this is weird
      src = ::Vips::SourceCustom.new
      src.on_read do |len|
        warn "reading #{len} bytes"
        io.read len
      end

      src.on_seek do |offset, whence|
        warn "seeking #{offset} #{whence}"
        io.seek offset, whence
      end
    end

    ::Vips::Image.new_from_source src, ''
  end

  # Serialize the image into `target`, again preferring a raw file
  # descriptor over write/finish callbacks; the output format comes
  # from the current content-type's first extension.
  def serialize obj, target
    if target.respond_to? :fileno and target.fileno
      tgt = ::Vips::Target.new_to_descriptor target.fileno
    else
      tgt = ::Vips::TargetCustom.new
      tgt.on_write { |bytes| target << bytes }
      tgt.on_finish do
        if target.respond_to? :fsync
          target.fsync
        elsif target.respond_to? :flush
          target.flush
        end
      end
    end

    obj.write_to_target tgt, ".#{type.extensions.first}"

    # cargo cult: sync/flush/rewind everything that will answer to it
    target.fsync if target.respond_to? :fsync
    target.flush if target.respond_to? :flush
    target.seek 0 if target.respond_to? :seek

    target
  end

  public

end
--------------------------------------------------------------------------------
/lib/intertwingler/resource.rb:
--------------------------------------------------------------------------------
require 'intertwingler/handler'
require 'intertwingler/graphops'

# This class encapsulates an individual (presumably generated) HTTP
# resource. It sits between an {Intertwingler::Handler} and whatever
# code generates the response's representation, and handles any
# preprocessing e.g. of query parameters or the request body. This
# class itself is an abstract superclass and is not meant to be used
# directly. Subclasses of this class are intended to be instantiated
# by a handler, when the handler itself is instantiated or otherwise
# refreshed.
#
# To use this class, subclass it and create a method with the same
# name as the request method except transliterated to
# `.downcase.tr_s(?-, ?_)` (so `GET` becomes `get`, or
# `VERSION-CONTROL` becomes `version_control`). The method _must_
# return a {::Rack::Response} and _may_ raise an
# {Intertwingler::Handler::AnyButSuccess}.
class Intertwingler::Resource
  include Intertwingler::GraphOps::Addressable

  private

  # from https://www.iana.org/assignments/http-methods/http-methods.xhtml
  #
  # any (instance) methods with these names (modulo `.downcase.tr_s(?-,?_)`)
  # are automatically routed; otherwise http method FOO is represented by
  # `def http_foo`…
  #
  METHODS = (<<~METH).strip.split.map(&:to_sym)
    ACL BASELINE-CONTROL BIND CHECKIN CHECKOUT CONNECT COPY DELETE GET
    HEAD LABEL LINK LOCK MERGE MKACTIVITY MKCALENDAR MKCOL MKREDIRECTREF
    MKWORKSPACE MOVE OPTIONS ORDERPATCH PATCH POST PRI PROPFIND PROPPATCH
    PUT REBIND REPORT SEARCH TRACE UNBIND UNCHECKOUT UNLINK UNLOCK UPDATE
    UPDATEREDIRECTREF VERSION-CONTROL
  METH

  # subclasses may pin a canonical subject URI here
  SUBJECT = nil

  public


  # @!attribute [r] handler
  #   @return [Intertwingler::Handler] the associated handler
  #
  attr_reader :handler, :subject

  # @!attribute [r] subject
  #   @return [RDF::URI] the (durable, canonical) URI of the resource
  #
  def self.subject
    const_get :SUBJECT
  end

  # Initialize a new resource object.
  #
  # @param handler [Intertwingler::Handler] the owning handler
  # @param subject [RDF::URI, nil] overrides the class-level SUBJECT
  #
  def initialize handler, subject = nil, **args
    @handler = handler
    @subject = resolver.uuid_for(subject || self.class.subject, verify: false)
  end

  # @!attribute [r] engine
  #   @return [Intertwingler::Engine] shortcut for the handler's engine
  def engine ; @handler.engine ; end

  # @!attribute [r] resolver
  #   @return [Intertwingler::Resolver] shortcut for the engine's resolver
  def resolver ; engine.resolver ; end

  # @!attribute [r] repo
  #   @return [Intertwingler::GraphOps] shortcut for the resolver's graph
  def repo ; resolver.repo ; end

  # Call the resource with the given request method. Include headers,
  # a body, and query parameters when applicable. After preprocessing,
  # the call is forwarded to a method defined in the subclass
  #
  # @param method [Symbol, #to_sym, #to_s] the request method
  # @param uri [URI] the request URI
  # @param params [Hash] query parameters, semi-processed by the handler
  # @param headers [Hash] any relevant request headers
  # @param user [String] the content of `REMOTE_USER`
  # @param body [nil, #read, #call, #to_s] something that can pass for a request body
  #
  # @raise [Intertwingler::Handler::Redirect] when the response needs
  #   to be redirected
  # @raise [Intertwingler::Handler::Error] when there is a client or
  #   server error
  #
  # @return [Rack::Response] the response to pass upstream
  #
  def call method, uri, params: {}, headers: {}, user: nil, body: nil
    # keep the original method untouched
    to_call = method.dup
    # set the method name to `http_whatever` for unregistered methods
    to_call = "http_#{to_call}" unless METHODS.include? method.to_s.strip.to_sym
    # normalize the request method to a ruby method
    to_call = to_call.to_s.strip.downcase.tr_s(?-, ?_).to_sym

    raise Intertwingler::Handler::Error::NotAllowed.new(
      "This resource does not respond to #{method} requests.",
      method: method) unless respond_to? to_call

    # warn engine.registry.groups.inspect

    # warn "inside resource: #{uri}"

    # handle the params XXX MAY RAISE
    # instance = engine.registry[subject].process params
    instance = engine.registry.process params || uri.query, defaults: true

    # warn instance.inspect

    # this will already be wrapped in a rescue block upstream
    send to_call, uri, params: instance, headers: headers,
      user: user, body: body
  end

end
--------------------------------------------------------------------------------
/lib/intertwingler/rubyurn.rb:
--------------------------------------------------------------------------------
require 'intertwingler/version' # for the namespace
require 'uri/urn'

# This is a class for representing Ruby modules as URIs. It is
# probably more appropriate for this to be a URI rather than a URN,
# and it was not my first choice to do so, but changes to the URI
# module that ships with Ruby have made it impossible to register a
# scheme that contains a hyphen like `x-ruby`, which is the convention
# for non-standard schemes. Nevertheless this module tries to conform
# to RFC 8141 (with some provisional corrections to its proximate
# dependency).
#
# The canonical form `urn:x-ruby:module/path;Constant::Name?=p1=v1&p2=v2`
# contains enough information to load a module, and return whatever
# constant it points to, along with (not necessarily but what you
# could interpret as) initialization parameters.
#
# Note that there are no constraints for the kinds of path expressions
# or identifiers other than being syntactically valid. If, for
# example, an attacker had access to your file system, they could use
# this system to load a malicious Ruby module. Path expressions of the
# form `foo/bar` must necessarily resolve to something in the module
# search path, but `require` can take absolute and relative paths, so
# the scenario is potentially very much like the Log4J
# vulnerability. I have yet to decide what to do about it.
class Intertwingler::RubyURN < URI::URN::Generic

  private

  # these are just character classes so we can merge them into actual regexes
  UNRESERVED = %[-0-9A-Za-z._~].freeze
  SUB_DELIMS = %[!$&'()*+,;=].freeze
  MY_SUBS = %[!$&'()*+,=].freeze # same minus semicolon
  PCHAR_STR = %[[#{UNRESERVED}#{SUB_DELIMS}:@]|%[0-9A-Fa-f]{2}].freeze
  MY_PCHAR = %[[#{UNRESERVED}#{MY_SUBS}:@]|%[0-9A-Fa-f]{2}].freeze

  # like this one
  PCHAR = /(?:#{PCHAR_STR})/o
  NID = /[0-9A-Za-z][0-9A-Za-z-]{,30}[0-9A-Za-z]/o
  NSS = /(?:#{PCHAR_STR})(?:#{PCHAR_STR}|\/)*/o
  RQ = /(?:#{PCHAR_STR})(?:#{PCHAR_STR}|[\/\?])*/o
  FRAG = /(?:#{PCHAR_STR}|[\/\?])*/o
  OPAQUE = /\A(#{NID}):(#{NSS})(?:\?\+(#{RQ})?)?(?:\?\=(#{RQ})?)?\z/o
  CONST = /[A-Z]\w*(?:::[A-Z]\w*)*/o
  PATH = /(?:\/*(?:#{MY_PCHAR})+(?:\/+(?:#{MY_PCHAR})+)*)/o
  EXPR = /(?:#{CONST}(?:;(?:#{PCHAR_STR})*)?)/o

  # either a PATH or a CONST or both but not neither
  MY_NSS = /\A(?:(#{PATH})(?:;(#{EXPR})?)?|;(#{EXPR}))\z/o

  # Validate an NSS against our grammar, raising when it doesn't match.
  def check_nss value
    MY_NSS.match? value or raise URI::InvalidComponentError,
      "Invalid urn:x-ruby: NSS: #{value}"
  end

  # Validate the full opaque component, returning the MatchData.
  def check_opaque value
    out = OPAQUE.match(value) or raise URI::InvalidComponentError,
      "Invalid opaque value for URN: #{value}"
    out
  end

  # Decode a www-form query string into a hash: keys become symbols,
  # repeated keys collate their values into arrays.
  def query_hash_for which
    URI.decode_www_form(which.to_s).reduce({}) do |hash, pair|
      key, value = pair
      key = key.to_sym

      if hash.key? key
        hash[key] = [hash[key]] unless hash[key].is_a? Array
        hash[key] << value
      else
        hash[key] = value
      end

      hash
    end
  end

  public

  attr_reader :r_component, :q_component
  alias_method :query, :q_component

  # Nothing to see here, just `URI.new`.
  #
  # @param arg [Array] arguments from the `URI` constructor
  #
  def initialize *arg
    super

    # not sure what the point of this is
    self.opaque = @opaque if arg[-1]
    # it's missing the fragment though
    self.fragment = arg[8] if arg[8]
  end

  # Retrieve the opaque component which is the NSS plus any R and/or Q
  # components.
  #
  # @return [String] the opaque component.
  #
  def opaque
    out = [nid, nss].join ?:
    out << '?+' + r_component if r_component
    out << '?=' + q_component if q_component
    out
  end

  # Set the opaque component.
  #
  # @param value [String] the new opaque value.
  #
  # @return [String] helpfully, what you just passed in.
  #
  def opaque= value
    nid, nss, r, q = check_opaque(value).captures
    self.nid = nid
    self.nss = nss
    self.r_component = r
    self.q_component = q

    value
  end

  # Retrieve the module path.
  #
  # @return [String] said module path.
  #
  def path
    URI.decode_www_form_component nss.split(?;).first
  end

  # Set the module path.
  #
  # @param value [String] the new module path.
  #
  # @return [String] helpfully, what you just passed in.
  #
  def path= value
    _, rest = nss.split ?;, 2
    self.nss = [URI.encode_www_form_component(value.to_s), rest].join ?;
  end

  # Retrieve the R-component as a hash. Keys get transformed into
  # symbols. Multiple values for a given key will get collated into an
  # array.
  #
  # @return [Hash{Symbol => (String,Array)}] the R-component.
  #
  def r_component_hash
    query_hash_for r_component
  end

  # Ditto Q-component.
  #
  # @return [Hash{Symbol => (String,Array)}] the Q-component.
  #
  def q_component_hash
    query_hash_for q_component
  end

  # Set the R-component.
  #
  # @param value [String] the new R-component.
  #
  # @return [String] helpfully, what you just passed in.
  #
  def r_component= value
    # unsetting with nil is permitted
    return @r_component = nil unless value
    /\A#{RQ}\z/o.match? value or raise URI::InvalidComponentError,
      "Invalid r-component value for URN: #{value}"
    @r_component = value
  end

  # Set the Q-component.
  #
  # @param value [String] the new Q-component.
  #
  # @return [String] helpfully, what you just passed in.
  #
  def q_component= value
    # unsetting with nil is permitted
    return @q_component = nil unless value
    /\A#{RQ}\z/o.match? value or raise URI::InvalidComponentError,
      "Invalid q-component value for URN: #{value}"
    @q_component = value
  end

  alias_method :query=, :q_component=

  # Serialize the URN to a string.
  #
  # @return [String] said URN.
  #
  def to_s
    out = [scheme, opaque].join ?:
    out << ?# + fragment if fragment
    out
  end

  # it is highly annoying that string coercion only recognizes `to_str`
  alias_method :to_str, :to_s

  # Return the string representation of the constant that will be
  # returned from #object.
  #
  # @return [String,nil] the constant name.
  #
  def constant
    # the constant is the second semicolon-delimited field of the NSS
    if out = nss.split(/[?#]/).first.split(?;)[1]
      URI.decode_www_form_component out
    end
  end

  # Return any identifier that may be present.
  #
  # @return [nil,String] the identifier
  #
  def identifier
    # the identifier is the third semicolon-delimited field of the NSS
    if out = nss.split(/[?#]/).first.split(?;)[2]
      URI.decode_www_form_component out
    end
  end

  # Apply `require` to the module path, if present.
  #
  # @note **This performs no security checks.**
  #
  # @raise [LoadError] if loading the module fails.
  #
  # @return [false, true] the result of `require`.
  #
  def require
    # wtf i guess pry overloads Kernel.require
    super path if path and !path.empty?
  end

  # This will #require the #path and then `eval` whatever's in #constant.
  #
  # @raise [LoadError] if loading the module fails.
  # @raise [URI::InvalidComponentError] if the constant is invalid.
  # @raise [NameError] if the constant isn't in the execution context.
  #
  # @return [Object] whatever constant was named in the URN.
  #
  def object
    if ref = constant
      self.require # this may raise a LoadError
      # NOTE(review): `eval` on URN-supplied text; the regex below
      # restricts it to constant syntax, but see the class-level
      # caveat about loading arbitrary code
      raise URI::InvalidComponentError, "#{ref} is not a valid constant" unless
        /\A\p{Lu}\p{Word}*(?:::\p{Lu}\p{Word}*)*\z/.match? ref
      # this may raise a NameError
      @object ||= eval ref
    end
  end
end

module URI::URN
  # even though double-@ variables are outmoded, at least this lets
  # you properly represent NIDs (cf https://github.com/ruby/uri/issues/89)
  @@nids['X-RUBY'] = Intertwingler::RubyURN

  # XXX WTF, GUY??? This replaces URI::URN.new that has an
  # incomprehensibly baroque (and wrong) regex that fails to resolve
  # the `x-ruby` NID, instead of just using `split` like a normal person.
  # arg[6] is the opaque component; pull the NID off the front with a
  # plain split and dispatch to the registered class, falling back to
  # Generic for unknown NIDs
  def self.new *arg
    nid = arg[6].to_s.split(?:).first
    @@nids.fetch(nid.to_s.upcase, Generic).new(*arg)
  end
end
--------------------------------------------------------------------------------
/lib/intertwingler/source.rb:
--------------------------------------------------------------------------------
require 'intertwingler/version' # for the symbol

require 'mimemagic'
require 'http-negotiate'
require 'store-digest'

# A _source_ is where (opaque) representations of information
# resources originate, either in whole or in part (from the point of
# view of this system). The most familiar kind of source is the file
# system. Other species of sources include content addressable
# storage, as well as version control, and reverse proxies. A source
# (at least in this context) is a read-only component with two salient
# functions:
#
# * `locate`: given a URI and a set of `Accept-*` headers, perform
#   content negotiation and return the "best" internal locator.
#
# * `visit`: return an IO object associated with the locator, along
#   with at least the following metadata:
#   * content (aka MIME) type
#   * modification time (or something that can pass for it)
#
# Other sources may elect to include other metadata.
#
class Intertwingler::Source
  # A {NotAcceptable} error is intended to be raised when content
  # negotiation fails, i.e., when at least one variant is found in the
  # source but the rules preclude selecting any one of them. This is
  # to distinguish from the variant not being found at all. This error
  # is named after and intended to correspond to the HTTP response
  # 406, and so bears the same name, even though a name like
  # "negotiation failed" or even "no proposal chosen" would be more
  # descriptive.
# (interior of Intertwingler::Source)

  # Raised when content negotiation fails: at least one variant was
  # found but the negotiation rules selected none of them (the HTTP
  # 406 situation), as distinct from finding nothing at all.
  class NotAcceptable < RuntimeError
    # XXX do we want to do anything cute in here??
  end

  # Initialize the source.
  #
  # @param resolver [Intertwingler::Resolver] the URI resolver
  # @param options [Hash] catch-all for keyword options
  #
  def initialize resolver, **options
    @resolver = resolver
  end

  # Locate the "best" resource representation for a given URI/header
  # set. Returns `nil` if a URI can't be found. Otherwise, it will
  # return either the internal URI that positively identifies the
  # representation, or the pair of this plus the content-type. Raises
  # {NotAcceptable} if variants are found but not selected.
  #
  # @param uri [URI, RDF::URI, #to_s] the URI (will be coerced)
  # @param headers [Hash, Rack::Request, #env, #to_h] a header set
  # @param pair [false, true] whether to return a pair including the
  #   content-type of the selected variant or just the variant itself.
  #
  # @raise [NotAcceptable] The operation may fail to select a variant.
  #
  # @note There is currently no concept of access control in sources;
  #   the requestor is assumed to be permitted to retrieve the content.
  #
  # @return [nil, URI, (URI, String)] URI or URI/content-type pair
  #
  def locate uri, headers: {}, pair: false
    # abstract: subclasses supply the actual lookup
    raise NotImplementedError
  end

  # Return a minimalist structure containing information suitable for
  # downstream processing, including the modification time, the
  # content type, and the content {IO} object itself. Keys include but
  # are not limited to:
  #
  # * `:content` — an {IO} object containing the payload,
  # * `:mtime` — the modification time (as a {Time} object),
  # * `:type` — the content-type, as a string.
77 | # 78 | # Other drivers may include other metadata in their results, like 79 | # the (natural) language, the encoding (compression), and character 80 | # set (where applicable; not to be confused with encoding). 81 | # 82 | # @param uri [URI, RDF::URI, #to_s] the URI (will be coerced) 83 | # @param headers [Hash, Rack::Request, #env, #to_h] a header set 84 | # 85 | # @raise [NotAcceptable] The operation may fail to select a variant. 86 | # 87 | # @note We may eventually move to something like {Dry::Types} for 88 | # the return value to impose some validation on it, but not quite yet. 89 | # 90 | # @return [Hash] a hash containing the information described above. 91 | # 92 | def visit uri, headers: {} 93 | raise NotImplementedError 94 | end 95 | 96 | class FileSystem < Intertwingler::Source 97 | # XXX THIS MIGHT NEED TO BE TUNED 98 | HEADERS = { 99 | 'Accept' => %w[ 100 | application/xhtml+xml application/xml text/html;q=0.8 101 | text/markdown;q=0.5 */*;q=0.1].join(', ').freeze 102 | }.freeze 103 | 104 | def initialize resolver, dir: nil, **options 105 | # coerce dir to pathname 106 | raise ArgumentError, 'root directory must be defined' unless dir 107 | @dir = Pathname(dir).expand_path 108 | raise ArgumentError, 'dir must be a readable directory' unless 109 | dir.readable? and dir.directory? 110 | 111 | # XXX do we want a default language? 
112 | 113 | super 114 | end 115 | 116 | def locate uri, headers: HEADERS, pair: false 117 | uri = @resolver.coerce_resource uri 118 | base = @resolver.base 119 | 120 | tu = URI(uri.to_s) # copy of uri for testing content 121 | unless tu.scheme == 'urn' and tu.nid == 'uuid' 122 | raise "could not find UUID for #{uri}" unless 123 | uuid = @resolver.uuid_for(uri) 124 | tu = URI(uri = uuid) 125 | end 126 | 127 | # xxx bail if the uri isn't a subject in the graph 128 | 129 | candidates = [@dir + tu.uuid] 130 | 131 | # try all canonical URIs 132 | (@resolver.uri_for uri, scalar: false, slugs: true).each do |u| 133 | # warn u.inspect 134 | u = URI(u.to_s) 135 | # warn "#{u.hostname} #{base.hostname}".inspect 136 | next unless u.hostname == base.hostname 137 | p = CGI.unescape u.path[/^\/*(.*?)$/, 1] 138 | candidates.push(@dir + p) 139 | end 140 | 141 | variants = candidates.uniq.map do |c| 142 | Pathname.glob(c.to_s + '{,.*,/index{,.*}}') 143 | end.reduce(:+).select { |x| x.file? && x.readable? }.map do |x| 144 | [x, { type: MimeMagic.by_path(x).to_s, size: x.size }] 145 | end.to_h 146 | 147 | # XXX in some future we can imagine telling the difference 148 | # between strictly nonexistent and not readable 149 | return if variants.empty? 150 | 151 | # et voila 152 | chosen = HTTP::Negotiate.negotiate(headers, variants) or 153 | raise NotAcceptable 154 | # XXX do we wanna make this optional? or do something else? 155 | pair ? 
[chosen, variants[chosen][:type]] : chosen 156 | end 157 | 158 | def visit uri, headers: {} 159 | path, type = locate(uri, headers: headers) 160 | return unless path 161 | 162 | mtime = path.mtime 163 | 164 | # XXX i suppose this could fail 165 | io = path.open 166 | 167 | # this is the minimum set that should be returned 168 | return { 169 | content: io, 170 | mtime: mtime, 171 | type: type, 172 | # charset: charset, 173 | # encoding: encoding, 174 | # language: lang, 175 | } 176 | end 177 | end 178 | 179 | # XXX break this out 180 | class ContentAddressable < Intertwingler::Source 181 | def initialize resolver, store: nil, **options 182 | @store = store || Store::Digest.new(**options) 183 | 184 | super 185 | end 186 | end 187 | 188 | # XXX TODO LOL 189 | class VersionControl < Intertwingler::Source 190 | # URI could have branch and version parameters? 191 | 192 | # struct returned from visit could have vcs-specific metadata 193 | 194 | class Git < VersionControl 195 | end 196 | end 197 | 198 | # XXX ReverseProxy 199 | 200 | end 201 | -------------------------------------------------------------------------------- /lib/intertwingler/surface.rb: -------------------------------------------------------------------------------- 1 | require 'intertwingler/version' 2 | 3 | # A _surface_ is where {Intertwingler::Representation representations} of 4 | # _information resources_ are _projected_. The behaviour of a surface 5 | # is intentionally opaque, and as such its main functionality is 6 | # accessed through the `call` method. Whatever is on the other side of 7 | # that is assumed to know what to do with it. 8 | class Intertwingler::Surface 9 | 10 | attr_reader context 11 | 12 | # Instantiate the surface. 
13 | # 14 | # @param context [Intertwingler::Context] the configuration context 15 | # @param options [Hash] dummy keyword options 16 | # 17 | def initialize context, **options 18 | @context = context 19 | end 20 | 21 | # No-op `call` that should be overridden in a subclass. 22 | def call *args, **options, &block 23 | raise NotImplementedError 24 | end 25 | 26 | # This is the document-root surface, intended for a very particular 27 | # flavour of static website generation. 28 | # 29 | # * Files are deposited in the root as `.`. 30 | # * Three `RewriteMap`s are deposited alongside, respectively to 31 | # handle rewrites, redirections, and `410 Gone` responses. 32 | # * Private files (how you determine access is up to you) are 33 | # deposited in the private directory (which is configured by 34 | # default to be "under" the root but could be elsewhere). 35 | # * Content negotiation (`Options +MultiViews` in Apache; whatever 36 | # equivalent in `nginx` or IIS) is assumed to be enabled. 37 | # 38 | class DocumentRoot < Surface 39 | 40 | private 41 | 42 | PRIVATE = '.private'.freeze 43 | 44 | public 45 | 46 | def initialize context, dir: nil, private: PRIVATE, **options 47 | raise ArgumentError, 'dir must be string or Pathname' unless dir 48 | @dir = (dir.is_a? Pathname ? dir : Pathname(dir)).expand_path 49 | raise ArgumentError, "#{dir} must be a readable directory" unless 50 | @dir.directory? and dir.readable? 51 | # we deal with it this way because `private` is a keyword 52 | @private = @dir + binding.local_variable_get(:private) 53 | 54 | super 55 | end 56 | 57 | # Write a single resource to the document root. 
58 | # 59 | # @param resource [Intertwingler::Document, RDF::URI, URI, #to_s] the resource we 60 | # want to write 61 | # @param published [true, false] whether to write the published 62 | # version, if it exists 63 | # @param rehydrate [false, true] whether to run the rehydrate operation 64 | # @param rescan [false, true] whether to rescan the document 65 | # @param sponge [false, true] whether to sponge the RDFa into the graph 66 | # 67 | # @return [Array] the path(s) written to disk. 68 | # 69 | # @note The `rehydrate`, `rescan` and `sponge` parameters are 70 | # probably unnecessary here. 71 | # 72 | # @note While we're at it, is that really the most sensible return value? 73 | # 74 | def write resource, published: true, rehydrate: false, 75 | rescan: false, sponge: false 76 | unless resource.is_a? Intertwingler::Document 77 | begin 78 | resource = @context.visit resource 79 | rescue Intertwingler::Source::NotAcceptable 80 | warn "No variant found for #{uri}" 81 | return 82 | end 83 | 84 | return unless resource 85 | end 86 | 87 | states = [false] 88 | states << true if published && resource.published? 89 | 90 | ok = [] 91 | states.each do |state| 92 | target = state ? @dir : @private 93 | 94 | # XXX this only handles Intertwingler::Document objects; we will 95 | # need to rethink this for the move to the 96 | # Intertwingler::Representation regime (which should have a unified 97 | # interface for serialization no matter what the payload is). 98 | # This is fine for now though. 99 | 100 | doc = resource.transform(published: state, rehydrate: rehydrate, 101 | rescan: rescan, sponge: sponge) or next 102 | 103 | begin 104 | fh = Tempfile.create('xml-', target) 105 | path = Pathname(fh.path) 106 | 107 | # write the doc to the target 108 | doc.write_to fh 109 | fh.close 110 | 111 | uuid = URI(resource.uuid.to_s) 112 | newpath = path.dirname + "#{uuid.uuid}.xml" 113 | ok << newpath 114 | 115 | # XXX do we wanna include umask?? 
116 | File.chmod 0644, path 117 | File.rename path, newpath 118 | File.utime resource.mtime, resource.mtime, newpath 119 | rescue Exception => e 120 | # XXX this should only rescue a specific class of errors 121 | # XXX ps do something more intelligent here 122 | warn e.class, e 123 | File.unlink path if path.exist? 124 | end 125 | end 126 | end 127 | 128 | def call *args, **options, &block 129 | # get all the documents 130 | 131 | # write each one out 132 | # call the block 133 | end 134 | end 135 | 136 | # This is literally a rack app. 137 | class Rack < Surface 138 | end 139 | end 140 | -------------------------------------------------------------------------------- /lib/intertwingler/transform/raster.rb: -------------------------------------------------------------------------------- 1 | require 'intertwingler/transform' 2 | 3 | # the representation 4 | require 'intertwingler/representation/vips' 5 | 6 | class Intertwingler::Transform::Raster < Intertwingler::Transform::Handler 7 | private 8 | 9 | REPRESENTATION = Intertwingler::Representation::Vips 10 | 11 | # these are all the formats that vips will write 12 | OUT = %w[avif gif heic heif jpeg jp2 jxl png tiff webp x-portable-anymap 13 | x-portable-bitmap x-portable-graymap x-portable-pixmap].map do |t| 14 | "image/#{t}".freeze 15 | end.freeze 16 | 17 | # we are not averse to reading pdfs 18 | IN = (%w[application/pdf] + OUT).freeze 19 | 20 | # XXX TODO `page` for pdfs and `frame` for animated gifs, also 21 | # rotate (90deg *and* arbitrary), flip etc. 
22 | URI_MAP = { 23 | '4c817a44-005d-48cb-83be-d962604cddda' => [:convert, IN, OUT], 24 | 'deb428cb-2f88-4726-98ea-d4b8d4589f17' => [:crop, IN, OUT], 25 | '5842e610-c5d3-46cd-8ec6-c1c64bf44d3a' => [:scale, IN, OUT], 26 | 'a3fd7171-ecaf-4f2a-a396-ebddf1b65eb4' => [:desaturate, IN, OUT], 27 | '7beb24fb-9708-4fd5-861a-1b2aaa45d46e' => [:posterize, IN, OUT], 28 | 'e43dc4b8-20e1-4739-9150-c1842d64eb5d' => [:knockout, IN, OUT], 29 | 'f77a0a45-2ba6-4a3b-8291-9eaae2a80a82' => [:brightness, IN, OUT], 30 | '973172a1-261b-4621-b27b-98d660e87544' => [:contrast, IN, OUT], 31 | '2fe3049b-bc1e-496f-9abc-dafa45746ef5' => [:gamma, IN, OUT], 32 | }.freeze 33 | 34 | def accept_header req 35 | accept = 36 | req.get_header('HTTP_ACCEPT').to_s.split(/\s*,+\s*/).first or return 37 | 38 | MimeMagic[accept] 39 | end 40 | 41 | # do nothing but convert 42 | def convert req, params 43 | 44 | body = req.body 45 | 46 | # this will have been set 47 | 48 | accept = accept_header req 49 | 50 | # this fast-tracks to 304 upstream 51 | return if body.type == accept 52 | 53 | # setting the body type (if different) invalidates the io 54 | body.type = accept 55 | 56 | body 57 | end 58 | 59 | # crops the image by xywh 60 | def crop req, params 61 | # XXX sanitize params 62 | x, y, width, height = params.values_at(:x, :y, :width, :height).map do |x| 63 | x.first.to_i 64 | end 65 | 66 | body = req.body 67 | img = body.object 68 | img = img.crop x, y, width, height 69 | 70 | body.type = accept_header req 71 | body.object = img 72 | body 73 | end 74 | 75 | # scales the image down 76 | def scale req, params 77 | # XXX sanitize params 78 | # width, height = params.values_at :width, :height 79 | 80 | body = req.body 81 | img = body.object 82 | img = img.thumbnail_image params[:width].first.to_i 83 | 84 | body.type = accept_header req 85 | body.object = img 86 | body 87 | end 88 | 89 | # parameterless desaturate; maybe we roll it into brightness/contrast? 
iunno 90 | def desaturate req, params 91 | 92 | body = req.body 93 | img = body.object 94 | img = img.colourspace :b_w 95 | 96 | body.type = accept_header req 97 | body.object = img 98 | body 99 | end 100 | 101 | # flatten colours out 102 | def posterize req, params 103 | # okay so apparently posterize involves chonking out the colours; 104 | # the parameter is the number of steps 105 | 106 | body.type = accept_header req 107 | body.object = img 108 | body 109 | end 110 | 111 | # knock out a colour ± radius around it 112 | def knockout req, params 113 | # this one is gonna be tough and require some thought 114 | raise Intertwingler::Handler::Error::Server.new( 115 | 'Transform `knockout` not implemented', status: 501) 116 | end 117 | 118 | # adjust brightness 119 | def brightness req, params 120 | raise Intertwingler::Handler::Error::Server.new( 121 | 'Transform `brightness` not implemented', status: 501) 122 | end 123 | 124 | # adjust contrast 125 | def contrast req, params 126 | raise Intertwingler::Handler::Error::Server.new( 127 | 'Transform `contrast` not implemented', status: 501) 128 | end 129 | 130 | # adjust gamma 131 | def gamma req, params 132 | raise Intertwingler::Handler::Error::Server.new( 133 | 'Transform `gamma` not implemented', status: 501) 134 | end 135 | 136 | # TODO rotate (arbitrary with speedup for 45-degree increments, 137 | # alpha channel), flip (h, v), gaussian blur 138 | 139 | public 140 | end 141 | -------------------------------------------------------------------------------- /lib/intertwingler/transform/rdf.rb: -------------------------------------------------------------------------------- 1 | require 'intertwingler/transform' 2 | 3 | # mainly (exclusively?) 
for converting between syntaxes 4 | class Intertwingler::Transform::RDF < Intertwingler::Transform::Handler 5 | 6 | end 7 | -------------------------------------------------------------------------------- /lib/intertwingler/transform/sass.rb: -------------------------------------------------------------------------------- 1 | require 'intertwingler/transform' 2 | 3 | # all this is gonna do is run the sass thinger. this one will need 4 | # access to the engine to run subrequests. actually since active sass 5 | # development has moved to dart this could be an early candidate for 6 | # reverse-proxying to a handler written in dart. 7 | class Intertwingler::Transform::Sass < Intertwingler::Transform::Handler 8 | 9 | end 10 | -------------------------------------------------------------------------------- /lib/intertwingler/transform/text.rb: -------------------------------------------------------------------------------- 1 | require 'intertwingler/transform' 2 | 3 | # load this anyway cause we'll return it 4 | require 'intertwingler/representation/nokogiri' 5 | 6 | # this is gonna be mainly stuff like markdown->html and tidy since 7 | # they both ingest only text, but we could consider other fun 8 | # plain-text transforms too. 9 | class Intertwingler::Transform::Text < Intertwingler::Transform::Handler 10 | 11 | end 12 | -------------------------------------------------------------------------------- /lib/intertwingler/types.rb: -------------------------------------------------------------------------------- 1 | 2 | require 'dry-schema' 3 | require 'uri' 4 | require 'pathname' 5 | require 'mimemagic' 6 | 7 | require_relative 'util/clean' 8 | require_relative 'rubyurn' 9 | 10 | module Intertwingler 11 | # XXX pop this out into its own module when we're ready 12 | module Types 13 | include Dry::Types() 14 | 15 | Dry::Types.define_builder :hash_default do |type| 16 | reqd = type.keys.select &:required? 17 | 18 | if reqd.empty? 
19 | return type.default({}.freeze) unless type.keys.empty? 20 | else 21 | return type.default(reqd.map { |k| [k.name, k.value] }.to_h.freeze) if 22 | reqd.all?(&:default?) 23 | end 24 | 25 | type 26 | end 27 | 28 | # XXX THIS IS A BAD SOLUTION TO THE URI PROBLEM 29 | Dry::Schema::PredicateInferrer::Compiler.infer_predicate_by_class_name false 30 | 31 | private 32 | 33 | HOSTRE = 34 | /[0-9a-z](?:[-0-9a-z]*[0-9a-z])?(?:\.[0-9a-z](?:[-0-9a-z]*[0-9a-z])?)*/i 35 | AUTHRE = /#{HOSTRE}(:\d+)?/o 36 | 37 | UNITS = { nil => 1 } 38 | 'kmgtpe'.split('').each_with_index do |x, i| 39 | UNITS[x] = 1000 ** (i + 1) 40 | UNITS[x.upcase] = 1024 ** (i + 1) 41 | end 42 | UNITS.freeze 43 | 44 | public 45 | 46 | # @!group Atoms 47 | 48 | Term = Types.Constructor(::RDF::URI) { |x| RDF::URI(x) } 49 | 50 | # A relative path name. 51 | RelativePathname = Types.Constructor(::Pathname) { |x| Pathname(x) } 52 | 53 | # A path name that actually exists on the drive. 54 | ExtantPathname = Types.Constructor(::Pathname) do |x| 55 | out = Pathname(x).expand_path 56 | dir = out.dirname 57 | raise Dry::Types::CoercionError, "#{dir} does not exist" unless 58 | out.exist? || dir.exist? 59 | 60 | out 61 | end 62 | 63 | # A path name that is actually writable by the process. 64 | WritablePathname = Types.Constructor(::Pathname) do |x| 65 | out = Pathname(x) 66 | dir = out.expand_path.dirname 67 | raise Dry::Types::CoercionError, "#{dir} is not writable" unless 68 | dir.writable? 69 | raise Dry::Types::CoercionError, "#{out} can't be overwritten" if 70 | out.exist? and !out.writable? 71 | out 72 | end 73 | 74 | # A normalized symbol. Takes its input first to a string, then 75 | # strips it, lowercases it, and transforms whitespace and hyphens 76 | # to underscores. 77 | NormSym = Symbol.constructor do |k| 78 | k.to_s.strip.downcase.tr_s(' _-', ?_).to_sym 79 | end 80 | 81 | # A byte count. Use optional suffixes `K`, `M`, `G`, `T`, `P`, `E` 82 | # to raise to the appropriate power of two. 
Use lower-case 83 | # suffixes for powers of ten. 84 | Bytes = Integer.constructor do |x| 85 | m = /\A\s*(\d+)([kmgtpeKMGTPE])?\s*\Z/s.match x.to_s 86 | raise Dry::Types::CoercionError, "#{x} not a viable byte size" unless m 87 | 88 | factor, unit = m.captures 89 | factor.to_i * UNITS[unit] 90 | end 91 | 92 | # A hostname is of course a string constrained like `foo.bar.com`. 93 | Hostname = String.constrained(format: /^#{HOSTRE}$/o).constructor do |k| 94 | # XXX you have to do this instead of just say :downcase 95 | k.downcase 96 | end 97 | 98 | Port = Coercible::Integer.constrained(gt: 0, lt: 65536) 99 | 100 | # An authority differs from a hostname in that it can have a port 101 | # number separated by a colon. 102 | Authority = String.constrained(format: /^#{AUTHRE}$/o).constructor do |k| 103 | k.downcase 104 | end 105 | 106 | # A media type, e.g. `text/plain`. 107 | MediaType = Types::Constructor(MimeMagic).constrained( 108 | format: /^[^\/]+\/[^\/]+$/) 109 | 110 | # An unprocessed CURIE or IRI, i.e. prior to prefix expansion. 111 | CURIEOrIRIString = String.constrained format: URI.regexp 112 | 113 | # A "negatable" CURIE/IRI borrows from SPARQL's invert property notation. 114 | # Coerces `^foo:bar` to a pair of the form `["foo:bar", true]`. 
115 | NegatableCURIEOrIRI = Array.constructor do |x| 116 | m = /^(\^)?(#{URI.regexp})/o.match(x) or raise Dry::Types::CoercionError, 117 | "#{x} is not a viable CURIE/IRI path" 118 | out = m.captures[0,2].reverse 119 | out[-1] = !!out.last # coerce that to a boolean 120 | out 121 | end 122 | 123 | URI = Types.Constructor(::URI) do |x| 124 | begin 125 | out = ::URI.parse(x) 126 | rescue ::URI::InvalidURIError => e 127 | raise Dry::Types::CoercionError, e 128 | end 129 | 130 | out 131 | end 132 | 133 | # A single RDF vocabulary 134 | Vocab = Types::Constructor(RDF::Vocabulary) do |vocab| 135 | Intertwingler::Util::Clean.sanitize_vocab vocab 136 | end 137 | 138 | RubyURN = URI.constrained format: /\Aurn:x-ruby:/i 139 | 140 | # @!group Molecules 141 | 142 | # A hash where the keys are normalized symbols. 143 | # @note XXX it is fucking stupid that you have to do this. 144 | SymbolHash = Hash.constructor do |h| 145 | h = {} if h.nil? 146 | raise Dry::Types::CoercionError, 147 | "#{h.inspect} is a #{h.class}, not a Hash" unless h.is_a? 
::Hash 148 | h.transform_keys { |k| NormSym[k] } 149 | end 150 | 151 | # RDF vocabularies 152 | Vocabs = SymbolHash.constructor do |x| 153 | Intertwingler::Util::Clean.sanitize_prefixes x, nonnil: true 154 | end 155 | 156 | # this is the harness configuration 157 | 158 | LibsConfig = SymbolHash.schema path?: Array.of(RelativePathname), 159 | preload?: Array.of(RubyURN) 160 | 161 | GraphConfig = SymbolHash.schema driver?: RubyURN, 162 | init?: Array.of(RelativePathname) 163 | 164 | StaticConfig = SymbolHash.schema target: ExtantPathname 165 | 166 | DomainConfig = SymbolHash.schema graph?: GraphConfig, static?: StaticConfig 167 | 168 | JWTAlgo = String.default('HS256'.freeze).enum( 169 | *(%w[ES256K ED25519] + 170 | %w[HS ES RS PS].product([256, 384, 512]).map(&:join))) 171 | 172 | JWTConfig = SymbolHash.schema( 173 | algorithm?: JWTAlgo, 174 | secret: String, 175 | ).hash_default 176 | 177 | HarnessConfig = SymbolHash.schema( 178 | host?: Hostname, port?: Port, 179 | libs?: LibsConfig, graph?: GraphConfig, 180 | jwt?: JWTConfig, 181 | authorities?: Hash.map(Hostname, DomainConfig), 182 | ) 183 | end 184 | end 185 | -------------------------------------------------------------------------------- /lib/intertwingler/util.rb: -------------------------------------------------------------------------------- 1 | # bring in the namespace 2 | require 'intertwingler/version' 3 | 4 | require 'mimemagic' 5 | # XXX this is not strictly correct but good enough for now, also 6 | # application/x-www-form-urlencoded is not in the mime types (and thus 7 | # has no `canonical)`, so that was fun to debug (oh look turns out 8 | # multipart/form-data isn't in there either) 9 | [ 10 | ['application/x-www-form-urlencoded', [], %w(text/plain), []], 11 | ['multipart/form-data', [], %w(application/octet-stream), []], 12 | ['text/n3', %w(n3 ttl nt), %w(text/plain), [[0..256, '@prefix']]], 13 | ['application/x-vnd.sass', %w(sass), %w(text/plain), []], 14 | ['application/x-vnd.sass.scss', 
%w(scss), %w(text/css), []], 15 | ].each do |magic| 16 | MimeMagic.add magic[0], extensions: magic[1], 17 | parents: magic[2], magic: magic[3] 18 | end 19 | 20 | # bring in the patients 21 | require 'intertwingler/util/clean' 22 | require 'intertwingler/util/messy' 23 | 24 | # 2021-12-27: Here's the plan for this thing: 25 | # 26 | # * rename {Intertwingler::Util} to {Intertwingler::Util::Messy} 27 | # 28 | # * create {Intertwingler::Util::Clean} which will eventually be the new 29 | # {Intertwingler::Util} 30 | # 31 | # * create a temporary {Intertwingler::Util} that yokes `Clean` and `Messy` 32 | # back together 33 | # 34 | # * move all genuine bona fide *stateless* utility functions that are 35 | # used in more than one place to {Intertwingler::Util::Clean} 36 | # 37 | # * refactor `Messy` until it ceases to exist 38 | # 39 | # * rename `Clean` to {Intertwingler::Util} 40 | # 41 | module Intertwingler::Util 42 | include Clean 43 | include Messy 44 | 45 | # Look, it was easier to write an LRU cache than figure out which 46 | # off-the-shelf one to use. 47 | # 48 | class LRU < Hash 49 | attr_accessor :capacity 50 | 51 | def initialize default = nil, capacity: nil 52 | @capacity = capacity || Float::INFINITY 53 | super default 54 | end 55 | 56 | def [] key 57 | if key? key 58 | # delete the key from the contents 59 | value = delete key 60 | self[key] = value 61 | end 62 | end 63 | 64 | def []= key, value 65 | n = size - capacity 66 | 67 | keys.take(n).each { |k| delete k } if n > 0 68 | 69 | super key, value 70 | end 71 | 72 | def fetch key, default = nil, &block 73 | return self[key] if key? 
key 74 | return block.call key if block 75 | return default 76 | 77 | end 78 | 79 | def fetch_values *keys, &block 80 | keys.map { |k| fetch k, &block } 81 | end 82 | 83 | def values_at *keys 84 | fetch_values(*keys) 85 | end 86 | 87 | def to_h 88 | dup 89 | end 90 | end 91 | end 92 | -------------------------------------------------------------------------------- /lib/intertwingler/util/clean.rb: -------------------------------------------------------------------------------- 1 | # bring in the namespace 2 | require 'intertwingler/version' 3 | 4 | require 'rdf' 5 | require 'uri' 6 | 7 | # do this so we don't have to indent all to hell 8 | module Intertwingler::Util; end 9 | 10 | module Intertwingler::Util::Clean 11 | private 12 | 13 | URI_COERCIONS = { 14 | nil => -> t { t.to_s.strip }, 15 | false => -> t { t.to_s.strip }, 16 | uri: -> t { t.is_a?(URI) ? t : URI(t.to_s.strip) }, 17 | rdf: -> t { 18 | return t if t.is_a? RDF::Resource 19 | t = t.to_s.strip 20 | t.start_with?('_:') ? RDF::Node(t.delete_prefix '_:') : RDF::URI(t) 21 | }, 22 | term: -> t { 23 | # if it's a vocab term check if it's also a vocab, otherwise noop 24 | if t.is_a? RDF::Vocabulary::Term 25 | tt = RDF::Vocabulary.find t 26 | t = tt if tt and tt.to_uri == t.to_uri 27 | # skull emoji 28 | t = RDF::RDFV if t == RDF 29 | 30 | return t 31 | end 32 | 33 | # noop if it's a vocab 34 | if t.is_a? RDF::Vocabulary 35 | # not taking any chances 36 | t = RDF::RDFV if t == RDF 37 | return t 38 | end 39 | 40 | # turn to uri or bnode if neither 41 | unless t.is_a? RDF::Resource 42 | t = t.to_s.strip 43 | t = t.start_with?('_:') ? RDF::Node(t.delete_prefix '_:') : RDF::URI(t) 44 | end 45 | 46 | # try to resolve it to a vocab term 47 | if t.uri? 
48 | t = (RDF::Vocabulary.find_term(t) rescue t) || t 49 | tt = RDF::Vocabulary.find t 50 | t = tt if tt and tt.to_uri == t.to_uri 51 | 52 | # ugh this thing 53 | t = RDF::RDFV if t == RDF 54 | end 55 | 56 | t 57 | }, 58 | vocab: -> t { 59 | raise NotImplementedError, 'lol vocab coercion not implemented' 60 | }, 61 | } 62 | 63 | URI_COERCION_TYPES = { 64 | nil => String, 65 | false => String, 66 | uri: URI, 67 | rdf: RDF::URI, 68 | term: RDF::Vocabulary::Term, 69 | vocab: RDF::Vocabulary, 70 | } 71 | 72 | public 73 | 74 | def assert_uri_coercion coerce 75 | if coerce 76 | coerce = coerce.to_s.to_sym if coerce.respond_to? :to_s 77 | raise ArgumentError, "coerce must be in #{URI_COERCIONS.keys}" unless 78 | URI_COERCIONS.key?(coerce) 79 | end 80 | coerce 81 | end 82 | 83 | # assertions and coercions 84 | 85 | # Coerce the argument into a resource, either {URI} or {RDF::URI} 86 | # (or {RDF::Node}). The type can be specified 87 | # 88 | # @param arg [#to_s, URI, RDF::URI, RDF::Node] the argument to 89 | # coerce into a resource 90 | # @param as [:rdf, :uri, :term, false, nil] how to coerce the result 91 | # @yieldparam arg [String] the argument for further processing 92 | # @yieldreturn [#to_s] the preprocessed argument 93 | # 94 | # @return [RDF::URI, URI, RDF::Vocabulary::Term, RDF::Vocabulary, String] 95 | # 96 | def coerce_resource arg, as: :rdf, base: nil, &block 97 | # noop if this is already done 98 | return arg if as and arg.is_a? URI_COERCION_TYPES[as] 99 | 100 | arg = arg.to_s.strip 101 | 102 | if arg.start_with? '_:' and as 103 | # override the coercion if this is a blank node 104 | as = :rdf 105 | elsif arg.start_with?(?#) and 106 | uuid = UUID::NCName.from_ncname(arg.delete_prefix(?#), format: :urn) 107 | return URI_COERCIONS[as].call uuid 108 | elsif block 109 | arg = block.call arg 110 | return if arg.nil? 
111 | end 112 | 113 | URI_COERCIONS[as].call arg 114 | end 115 | 116 | # Apply #coerce_resource to each element of an array, returning an 117 | # array of resources. If `arg` can't be turned into an array, it 118 | # will be wrapped in one. Returns an array of whatever type `as:` is 119 | # set to return. 120 | # 121 | # @param arg [#to_a, #to_s, URI, RDF::URI, RDF::Node] the thing(s) 122 | # to coerce 123 | # @param as [:rdf, :uri, :term, false, nil] how to coerce the 124 | # result(s) 125 | # 126 | # @return [Array] the coerced elements 128 | # 129 | def coerce_resources arg, as: :rdf, &block 130 | # note nil.to_a is [] 131 | (arg.respond_to?(:to_a) ? arg.to_a : [arg]).map do |c| 132 | coerce_resource c, as: as, &block 133 | end.compact 134 | end 135 | 136 | def assert_term term 137 | raise ArgumentError, "term must be an RDF::Value" unless 138 | term.is_a? RDF::Value 139 | term 140 | end 141 | 142 | # Assert that the given argument is an {RDF::Resource}. 143 | # 144 | # @param term [RDF::Resource] the input being tested 145 | # @param message [String] an overriding error message (not sure if 146 | # dumb) 147 | # @param blank [true, false] whether to permit blank nodes 148 | # @param vocab [false, true] whether to try to resolve the term 149 | # using {RDF::Vocabulary.find_term} 150 | # 151 | # @raise [ArgumentError] if the input is not an {RDF::Resource} 152 | # 153 | # @return [RDF::Resource] the argument 154 | # 155 | def assert_resource term, 156 | message = "Term must be a resource, not #{term.inspect}", 157 | blank: true, vocab: false 158 | raise ArgumentError, message unless 159 | term.is_a?(blank ? RDF::Resource : RDF::URI) 160 | 161 | term = (RDF::Vocabulary.find_term(term) rescue term) || term if 162 | vocab and term.uri? and not term.is_a? RDF::Vocabulary::Term 163 | 164 | term 165 | end 166 | 167 | # Normalize (and assert) the input as an array of {RDF::Resource}s. 
168 | # 169 | # @param terms [RDF::Resource, Array] the term(s) 170 | # @param blank [true, false] whether to allow blank nodes ({RDF::Node}) 171 | # @param empty [true, false] whether the list can be empty 172 | # @param unique [true, false] whether to remove duplicates 173 | # @param vocab [false, true] whether to try to resolve the terms 174 | # using {RDF::Vocabulary.find_term} 175 | # 176 | # @raise [ArgumentError] if the input contains non-{RDF::Resource}s 177 | # 178 | # @return [Array] the normalized array of resources 179 | # 180 | def assert_resources terms, blank: true, empty: true, 181 | unique: true, vocab: false 182 | # normalize to array 183 | terms = [] if terms.nil? 184 | terms = terms.respond_to?(:to_a) ? terms.to_a.dup : [terms] 185 | 186 | # ensure everything is a resource (or uri) 187 | tc = blank ? RDF::Resource : RDF::URI 188 | if bad = terms.detect { |t| !t.is_a? tc } 189 | # XXX i know i know but too lazy to do a proper error message 190 | raise ArgumentError, "Term(s) must be an #{tc}, not #{bad.class}" 191 | end 192 | 193 | # ditto lol 194 | raise ArgumentError, "Need at least one term" if terms.empty? and !empty 195 | 196 | # resolve to vocabulary terms 197 | terms = terms.map do |t| 198 | t.uri? ? (RDF::Vocabulary.find_term t rescue t) || t : t 199 | end if vocab 200 | 201 | # prune out any duplicates 202 | terms.uniq! if unique 203 | 204 | # et voilà 205 | terms 206 | end 207 | 208 | # Test that the hash is a struct 209 | # 210 | # @param struct [Hash] (we hope) 211 | # 212 | # @return [Hash] the struct 213 | # 214 | def assert_struct struct 215 | if struct.is_a? Hash and struct.all? do |pair| 216 | pair.first.is_a? RDF::URI and 217 | pair.last.is_a? Array and pair.last.all? { |x| x.is_a? 
RDF::Value } 218 | end 219 | return struct 220 | end 221 | raise ArgumentError, "struct is not valid: #{struct.inspect}" 222 | end 223 | 224 | # Find a subset of a struct for a given set of predicates, 225 | # optionally inverting to give the objects as keys and predicates as 226 | # values. 227 | # 228 | # @param struct [Hash] 229 | # @param preds [RDF::URI, #to_a] 230 | # @param entail [true, false] whether to entail the predicate(s) 231 | # @param invert [true, false] whether to invert the resulting hash 232 | # 233 | # @return [Hash] the selected subset (which could be empty) 234 | # 235 | def find_in_struct struct, preds, entail: false, invert: false 236 | raise ArgumentError, 'preds must not be nil' if preds.nil? 237 | preds = preds.respond_to?(:to_a) ? preds.to_a : [preds] 238 | preds = predicate_set preds if entail 239 | 240 | struct = struct.select { |p, _| preds.include? p } 241 | 242 | invert ? invert_struct(struct) : struct 243 | end 244 | 245 | # Turns any data structure containing {RDF::Term} objects into a 246 | # flat set thereof. If `:uri` is true, then any {RDF::Literal} 247 | # objects are mined for their datatypes and those are returned 248 | # alongside other {RDF::URI}s instead. 249 | # 250 | # @param struct [RDF::Term, #to_a] the struct 251 | # @param uris [false, true] whether to prune out all but {RDF::URI} 252 | # objects 253 | # 254 | # @return [Set] The set of terms 255 | # 256 | def smush_struct struct, uris: false 257 | out = Set[] 258 | 259 | if struct.is_a? RDF::Term 260 | if uris 261 | case 262 | when struct.literal? 263 | out << struct.datatype if struct.datatype? 264 | when struct.uri? then out << struct 265 | end 266 | else 267 | out << struct 268 | end 269 | elsif struct.respond_to? 
:to_a 270 | out |= struct.to_a.map do |s| 271 | smush_struct(s, uris: uris).to_a 272 | end.flatten.to_set 273 | end 274 | 275 | out 276 | end 277 | 278 | # Invert a given struct so that `{ predicate => Set[object] }` 279 | # becomes `{ object => Set[predicate] }`. Optionally run a block in 280 | # the inner loop. If the block returns an `Array`, the first two 281 | # values will be assigned to the predicate and object in the 282 | # returned inverted struct. Return an explicit `nil` in the block to 283 | # 284 | # @param struct [Hash{RDF::Resource => Set}] a structure containing 285 | # the predicates and objects for a given subject. 286 | # @yieldparam predicate [RDF::Resource] the predicate of the statement 287 | # @yieldparam object [RDF::Value] the object of the statement 288 | # @yieldreturn [nil, Array] an optional predicate-object pair. 289 | # 290 | # @return [Hash] the inverted struct. 291 | # 292 | def invert_struct struct, &block 293 | nodes = {} 294 | 295 | struct.each do |p, v| 296 | v.each do |o| 297 | # copy the predicate so we don't overwrite it 298 | pi = p 299 | 300 | if block 301 | tmp = block.call pi, o 302 | # assign block return if it has one 303 | pi, o = *tmp if tmp.is_a? Array 304 | end 305 | 306 | # now assign to output 307 | nodes[o] ||= Set.new 308 | nodes[o] << pi 309 | end 310 | end 311 | 312 | nodes 313 | end 314 | 315 | private 316 | 317 | # anything that could possibly be construed as whitespace 318 | WS_RE = /[\s\u{0085 00a0 1680 2028 2029 202f 205f 3000}\u2000-\u200a]+/ 319 | 320 | public 321 | 322 | def normalize_space string 323 | string.gsub(WS_RE, ' ').strip 324 | end 325 | 326 | # Sanitize a term as an {::RDF::Vocabulary}. 327 | # 328 | # @param term [#to_s,RDF::URI,URI] the term to sanitize. 329 | # @param cache [Hash] an optional cache. 330 | # 331 | # @return [RDF::Vocabulary] 332 | # 333 | def sanitize_vocab vocab, cache: nil 334 | # def self.sanitize_vocab vocab, cache: nil 335 | cache = {} unless cache.is_a? 
Hash 336 | # 2022-05-18 XXX THIS IS A CLUSTERFUCK 337 | # 338 | # what we want is the official vocab if it exists, an 339 | # on-the-fly vocab if it doesn't, and to use RDF::RDFV 340 | # instead of RDF if it shows up 341 | # 342 | # we notice that bibo:status/ resolves to bibo: with .find 343 | # so we need to check if the uri is the same before accepting it 344 | vocab = RDF::URI(vocab) unless vocab.is_a? RDF::URI 345 | vocab = if cache[vocab.to_s] 346 | cache[vocab.to_s] 347 | elsif vocab.is_a?(Class) and 348 | vocab.ancestors.include?(RDF::Vocabulary) 349 | vocab # arrrrghhh 350 | elsif vv = RDF::Vocabulary.find(vocab) # XXX SLOW AF hence cache 351 | vv.to_uri == vocab ? vv : Class.new(RDF::Vocabulary(vocab)) 352 | else 353 | Class.new(RDF::Vocabulary(vocab)) 354 | end 355 | # GRRRR the punning on RDF messes things up so we have to replace it 356 | vocab = RDF::RDFV if vocab == RDF 357 | 358 | cache[vocab.to_s] = vocab 359 | end 360 | 361 | # Return a hash mapping a set of RDF prefixes to their vocabularies. 362 | # 363 | # @param prefixes [Hash, #to_h, String, #to_s] the input prefixes 364 | # @param downcase [true, false] whether to normalize key symbolss to downcase 365 | # @param nonnil [false, true] whether to remove the nil prefix 366 | # @param cache [Hash] an optional cache for the slowness 367 | # 368 | # @return [Hash{Symbol=>RDF::Vocabulary}] sanitized prefix map 369 | # 370 | def sanitize_prefixes prefixes, downcase: true, nonnil: false, cache: nil 371 | # def self.sanitize_prefixes prefixes, downcase: true, nonnil: false, cache: nil 372 | prefixes = {} unless prefixes # noop prefixes 373 | cache = {} unless cache.is_a? 
Hash # noop cache 374 | 375 | # turn raw text from a `prefix` attribute into a hash 376 | if !prefixes.respond_to?(:to_h) && prefixes.respond_to?(:to_s) 377 | prefixes = prefixes.to_s.strip.split.each_slice(2).map do |k, v| 378 | [k.split(?:).first, v] 379 | end.to_h 380 | end 381 | 382 | raise ArgumentError, 'prefixes must be a hash' unless 383 | prefixes.is_a? Hash or prefixes.respond_to? :to_h 384 | 385 | prefixes = prefixes.to_h.map do |k, v| 386 | unless k.nil? 387 | k = k.to_s.strip 388 | if k.empty? 389 | k = nil 390 | else 391 | k.downcase! if downcase 392 | k = k.to_sym 393 | end 394 | end 395 | [k, sanitize_vocab(v, cache: cache)] if (k or !nonnil) and v 396 | end.compact.to_h 397 | 398 | prefixes 399 | end 400 | 401 | extend self 402 | end 403 | -------------------------------------------------------------------------------- /lib/intertwingler/version.rb: -------------------------------------------------------------------------------- 1 | module Intertwingler 2 | VERSION = '0.2.2' 3 | end 4 | -------------------------------------------------------------------------------- /lib/intertwingler/vocab.rb: -------------------------------------------------------------------------------- 1 | # ensure this is loaded 2 | require 'rdf' 3 | 4 | module Intertwingler 5 | module Vocab 6 | %i[ADMS CI CGTO IBIS ITCV PAV PM QB SCOVO TFO].each do |sym| 7 | autoload sym, "intertwingler/vocab/#{sym.to_s.downcase}.rb" 8 | end 9 | 10 | def self.load_vocabs 11 | constants.each { |c| const_get c } 12 | end 13 | end 14 | end 15 | -------------------------------------------------------------------------------- /lib/intertwingler/vocab/scovo.rb: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # frozen_string_literal: true 3 | # This file generated automatically using rdf vocabulary format from http://purl.org/NET/scovo# 4 | require 'intertwingler/vocab' 5 | module Intertwingler::Vocab 6 | # @!parse 7 | # # 
Vocabulary for 8 | # # 9 | # class SCOVO < RDF::StrictVocabulary 10 | # # a statistical dataset 11 | # # @return [RDF::Vocabulary::Term] 12 | # attr_reader :Dataset 13 | # 14 | # # a dimension of a statistical data item 15 | # # @return [RDF::Vocabulary::Term] 16 | # attr_reader :Dimension 17 | # 18 | # # a statistical data item 19 | # # @return [RDF::Vocabulary::Term] 20 | # attr_reader :Item 21 | # 22 | # # @return [RDF::Vocabulary::Term] 23 | # attr_reader :dataset 24 | # 25 | # # @return [RDF::Vocabulary::Term] 26 | # attr_reader :datasetOf 27 | # 28 | # # @return [RDF::Vocabulary::Term] 29 | # attr_reader :dimension 30 | # 31 | # # @return [RDF::Vocabulary::Term] 32 | # attr_reader :max 33 | # 34 | # # @return [RDF::Vocabulary::Term] 35 | # attr_reader :min 36 | # 37 | # end 38 | SCOVO = Class.new(RDF::StrictVocabulary("http://purl.org/NET/scovo#")) do 39 | 40 | # Class definitions 41 | term :Dataset, 42 | comment: "a statistical dataset".freeze, 43 | label: "Dataset".freeze, 44 | type: ["owl:Class".freeze, "rdfs:Class".freeze] 45 | term :Dimension, 46 | comment: "a dimension of a statistical data item".freeze, 47 | label: "Dimension".freeze, 48 | type: ["owl:Class".freeze, "rdfs:Class".freeze] 49 | term :Item, 50 | comment: "a statistical data item".freeze, 51 | label: "Item".freeze, 52 | type: ["owl:Class".freeze, "rdfs:Class".freeze] 53 | 54 | # Property definitions 55 | property :dataset, 56 | domain: "scovo:Item".freeze, 57 | label: "belongs to dataset".freeze, 58 | range: "scovo:Dataset".freeze, 59 | type: "rdf:Property".freeze 60 | property :datasetOf, 61 | domain: "scovo:Dataset".freeze, 62 | label: "is the dataset of".freeze, 63 | range: "scovo:Item".freeze, 64 | type: "rdf:Property".freeze 65 | property :dimension, 66 | domain: "scovo:Item".freeze, 67 | label: "has a dimension".freeze, 68 | range: "scovo:Dimension".freeze, 69 | type: "rdf:Property".freeze 70 | property :max, 71 | domain: "scovo:Dimension".freeze, 72 | label: "has a maximum range 
value".freeze, 73 | type: "rdf:Property".freeze 74 | property :min, 75 | domain: "scovo:Dimension".freeze, 76 | label: "has a minimum range value".freeze, 77 | type: "rdf:Property".freeze 78 | 79 | RDF::Vocabulary.register :scovo, self if 80 | RDF::Vocabulary.respond_to? :register 81 | end 82 | end 83 | -------------------------------------------------------------------------------- /sample2.conf: -------------------------------------------------------------------------------- 1 | # -*- mode: yaml -*- 2 | live: 3 | listen: localhost:10101 4 | static: 5 | targets: 6 | "doriantaylor.com": 7 | engine: 8 | sites: 9 | "doriantaylor.com": 10 | aliases: 11 | - "www.doriantaylor.com" 12 | - "localhost:5000" 13 | # home directory so we don't have to type it out over and over 14 | home: ~/projects/active/doriantaylor.com 15 | graph: 16 | driver: RDF::LMDB::Repository 17 | options: 18 | dir: /var/lib/rdf-lmdb 19 | mapsize: 128M 20 | # turtle files for instantiating the knowledge graph 21 | init: 22 | - experimental/content-inventory.ttl 23 | - experimental/concept-scheme.ttl 24 | # these are source drivers for resource representations, in order of search 25 | sources: 26 | - driver: ContentAddressable 27 | options: 28 | dir: /var/lib/store-digest 29 | mapsize: 128M 30 | - driver: FileSystem 31 | options: 32 | dir: trunk 33 | # these are rendering surfaces 34 | surfaces: 35 | # we need an identifier to be able to pick out the target 36 | static: 37 | driver: DocumentRoot 38 | options: 39 | dir: target 40 | private: .private # note: this is relative to dir 41 | # this is a sequence of stock transformation functions applied to 42 | # representations of various content types. 
43 | transforms: 44 | "application/xhtml+xml": 45 | - name: strip-comments # what it says on the tin 46 | - name: repair-rdfa # scan document for unmapped rdfa prefixes; add 'em 47 | - name: rehydrate # link up terminology etc 48 | - name: add-social-meta # this could be a composition of sdo/ogp/twitter 49 | - name: rewrite-links # give non-dereferenceable URIs a web equivalent 50 | - name: mangle-mailto # obfuscate mailto: URIs eg with javascript 51 | - name: amazon-tag # normalize amazon links and add affiliate tag 52 | params: 53 | tag: doriantaylor-20 54 | - name: normalize-prefixes # prune rdfa prefix mappings to subset used 55 | - name: stylesheet-pi # prepend xml-stylesheet processing instruction 56 | params: 57 | type: text/xsl 58 | href: /transform # this is a default argument 59 | - name: reindent # eliminate funky indenting 60 | params: 61 | char: " " # this is the default 62 | count: 2 # so is this 63 | # All config that follows is global; it would be awfully nice if the 64 | # site-specific config could just delta against it 65 | # 66 | # certain surfaces span multiple sites 67 | surfaces: 68 | rack: 69 | driver: Rack 70 | options: 71 | listen: localhost:10101 72 | # these are the prefix-namespace mappings, including the default vocabulary. 
73 | vocab: http://www.w3.org/1999/xhtml/vocab# 74 | prefixes: 75 | rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns# 76 | adms: http://www.w3.org/ns/adms# 77 | awol: http://bblfish.net/work/atom-owl/2006-06-06/# 78 | bibo: http://purl.org/ontology/bibo/ 79 | bs: http://purl.org/ontology/bibo/status/ 80 | ci: https://vocab.methodandstructure.com/content-inventory# 81 | dcat: http://www.w3.org/ns/dcat# 82 | dc: http://purl.org/dc/elements/1.1/ 83 | dct: http://purl.org/dc/terms/ 84 | foaf: http://xmlns.com/foaf/0.1/ 85 | http: http://www.w3.org/2011/http# 86 | ibis: https://vocab.methodandstructure.com/ibis# 87 | og: http://ogp.me/ns# 88 | org: http://www.w3.org/ns/org# 89 | owl: http://www.w3.org/2002/07/owl# 90 | qb: http://purl.org/linked-data/cube# 91 | rdfs: http://www.w3.org/2000/01/rdf-schema# 92 | schema: https://schema.org/ 93 | sioc: http://rdfs.org/sioc/ns# 94 | sioct: http://rdfs.org/sioc/types# 95 | skos: http://www.w3.org/2004/02/skos/core# 96 | xhv: http://www.w3.org/1999/xhtml/vocab# 97 | xsd: http://www.w3.org/2001/XMLSchema# 98 | # these are always treated as "documents" 99 | documents: 100 | - "foaf:Document" 101 | - "bibo:Collection" 102 | - "skos:Collection" 103 | - "skos:ConceptScheme" 104 | - "qb:DataSet" 105 | - "sioc:Container" 106 | - "dcat:Resource" 107 | - "adms:Asset" 108 | - "http:Response" 109 | # these are fragments only if they can relate to a document 110 | fragments: 111 | "rdfs:Resource": 112 | - "foaf:isPrimaryTopicOf" 113 | - "^dct:hasPart" 114 | - "dct:isPartOf" 115 | "skos:Concept": 116 | - "skos:topConceptOf" 117 | - "skos:inScheme" 118 | - "^skos:member" 119 | - "^skos:memberList" 120 | "qb:Observation": 121 | - "qb:dataSet" 122 | - "^qb:observation" 123 | -------------------------------------------------------------------------------- /spec/intertwingler/document_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | require 'intertwingler/document' 4 | 5 | 
RSpec.describe Intertwingler::Document do 6 | end 7 | 8 | RSpec.describe Intertwingler::Document::Parsed do 9 | end 10 | -------------------------------------------------------------------------------- /spec/intertwingler/graphops_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | require 'rdf' 4 | require 'intertwingler/graphops' 5 | 6 | RSpec.describe Intertwingler::GraphOps do 7 | context 'stateless behaviour' do 8 | # type_strata 9 | # all_related (basically a shorthand for type_strata descend: true) 10 | # type_is? 11 | # property_set (née predicate_set) 12 | # symmetric? 13 | end 14 | 15 | context 'basic extended (entailing) graph queries' do 16 | # objects_for 17 | # subjects_for 18 | end 19 | 20 | context 'comparator function factories' do 21 | # cmp_literal 22 | # cmp_label 23 | # cmp_resource 24 | # cmp_term 25 | end 26 | 27 | context 'shorthands' do 28 | # types_for 29 | # rdf_type? 30 | # all_types 31 | # all_of_type 32 | 33 | # these return literals: 34 | # label_for 35 | # authors_for 36 | 37 | # these return non-nodes: 38 | # dates_for 39 | # formats_for 40 | 41 | # these return routing data: 42 | # published? 43 | # indexed? 
44 | # host_for 45 | # replacements_for 46 | end 47 | 48 | context 'resource-centric interface' do 49 | # resource factory method 50 | # resource properties 51 | end 52 | end 53 | -------------------------------------------------------------------------------- /spec/intertwingler/representation_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | require 'intertwingler/representation' 4 | 5 | RSpec.describe Intertwingler::Representation do 6 | end 7 | -------------------------------------------------------------------------------- /spec/intertwingler/resolver_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | require 'intertwingler/resolver' 4 | 5 | RSpec.describe Intertwingler::Resolver do 6 | 7 | context 'stateless methods' do 8 | # preproc 9 | # split_pp 10 | # split_qp 11 | # terminal_slug 12 | end 13 | 14 | context 'uuid<->uri resolution methods' do 15 | # uuid_for 16 | # uri_for 17 | end 18 | 19 | context 'curie methods' do 20 | # resolve_curie 21 | # abbreviate 22 | # prefix_subset 23 | end 24 | 25 | context 'proxy methods??' do 26 | # still not sure if we even wanna do this: graph proxy methods 27 | # that also do the uuid<->uri resolution. like, do we care? is it 28 | # more cumbersome than helpful? 29 | 30 | # struct_for 31 | # host_for 32 | # published? 
33 | end 34 | end 35 | -------------------------------------------------------------------------------- /spec/intertwingler/source_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | require 'intertwingler/source' 4 | 5 | RSpec.describe Intertwingler::Source do 6 | end 7 | -------------------------------------------------------------------------------- /spec/intertwingler/surface_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | require 'intertwingler/surface' 4 | 5 | RSpec.describe Intertwingler::Surface do 6 | end 7 | -------------------------------------------------------------------------------- /spec/intertwingler/transform_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | require 'rdf' 4 | require 'rdf/turtle' 5 | require 'intertwingler/transform' 6 | 7 | RSpec.describe Intertwingler::Transform do 8 | 9 | root = Pathname(Dir.getwd) 10 | 11 | VOCAB = RDF::Vocabulary.new 'tag:makethingsmakesense.com,2020:transform/' 12 | 13 | repo = RDF::Repository.new 14 | repo.load root + 'example/transforms.ttl' 15 | 16 | TRANSFORMS = %w[subtree cleanup].map { |t| VOCAB[t].to_uri } 17 | PARAMS = [VOCAB.prefix, VOCAB.reindent, VOCAB.xpath] 18 | 19 | let(:harness) { Intertwingler::Transform::Harness.load repo, root } 20 | 21 | context 'harness behaviour' do 22 | it 'initializes' do 23 | expect(harness).to be_a Intertwingler::Transform::Harness 24 | end 25 | 26 | it 'loads all transforms' do 27 | expect(harness.transforms.sort).to eql TRANSFORMS.sort 28 | xf = harness.transforms.map { |t| harness.resolve t } 29 | expect(xf.select(&:implemented?).size).to eql xf.size 30 | end 31 | 32 | it 'loads partials' do 33 | partials = harness.partials 34 | expect(partials).to be_a Intertwingler::Transform::PartialCache 35 | end 36 | end 37 | 38 | context 'resolving 
transforms' do 39 | it 'finds a transform in the graph' do 40 | transform = harness.resolve VOCAB.subtree 41 | expect(transform).to be_a Intertwingler::Transform 42 | expect(transform.keys.sort).to eql PARAMS 43 | impl = RDF::URI('urn:x-ruby:Intertwingler::Transform::XPath') 44 | expect(transform.implementation).to eql impl 45 | expect(transform.accepts? 'application/xhtml+xml').to be true 46 | end 47 | 48 | it 'resolves an XSLT implementation' do 49 | transform = Intertwingler::Transform.resolve harness, VOCAB.cleanup 50 | expect(transform).to be_a Intertwingler::Transform::XSLT 51 | end 52 | 53 | end 54 | 55 | context 'resolving partials' do 56 | it 'should find a partial with given parameters' do 57 | partial = harness.resolve_partial transform: VOCAB.subtree, 58 | params: { xpath: '//html:main[1]', 59 | prefix: 'html:http://www.w3.org/1999/xhtml' } 60 | expect(partial).to be_a Intertwingler::Transform::Partial 61 | end 62 | 63 | it 'should not find the partial if only some of the parameters are given' do 64 | nothing = harness.resolve_partial transform: VOCAB.subtree, 65 | params: { xpath: '//html:main[1]' } 66 | expect(nothing).to be nil 67 | end 68 | end 69 | 70 | context 'resolving applications' do 71 | input = 72 | RDF::URI('ni:///sha-256;0GHHmDtxh9CRZttXdr-cX78u72auS2P-O6tDXxvz2kU') 73 | output = 74 | RDF::URI('ni:///sha-256;RqgCb4_A3x2ZmjRs65bfXBzsV-4CejRteaxmlPNyWpc') 75 | params = { xpath: '//html:main[1]', 76 | prefix: 'html:http://www.w3.org/1999/xhtml' } 77 | 78 | it 'resolves an application' do 79 | application = harness.resolve_application transform: VOCAB.subtree, 80 | input: input, params: params 81 | expect(application).to be_a Intertwingler::Transform::Application 82 | expect(application.transform).to be_a Intertwingler::Transform 83 | end 84 | 85 | it 'creates a new application from scratch' do 86 | start = Time.now.getgm 87 | stop = start + 1 88 | repo2 = RDF::Repository.new 89 | 90 | transform = harness.resolve VOCAB.subtree 91 | me = 
RDF::URI('urn:x-dummy:function-application') 92 | app = Intertwingler::Transform::Application.new me, 93 | transform, input, output, params, start: start, stop: stop 94 | app.to_triples.each { |stmt| repo2 << stmt } 95 | # warn repo2.dump :ntriples 96 | 97 | # ehh we'll test this later i can't think of any good tests right now 98 | end 99 | end 100 | 101 | context 'applying transforms' do 102 | 103 | it 'resolves the transform' do 104 | transform = harness.resolve VOCAB.subtree 105 | expect(transform).to_not be_nil 106 | end 107 | 108 | it 'applies a transform against parameters' do 109 | transform = harness.resolve VOCAB.subtree 110 | 111 | params = { 112 | VOCAB.xpath => RDF::Literal('//html:main[1]'), 113 | VOCAB.prefix => RDF::Literal('html:http://www.w3.org/1999/xhtml'), 114 | } 115 | 116 | input = (root + 'example/matches.xhtml').open 117 | 118 | output, parseout = transform.apply input, params 119 | expect(parseout).to be_a Nokogiri::XML::Document 120 | expect(parseout.root.name).to eql 'main' 121 | 122 | nothing = transform.apply input, params, 123 | accept: 'application/x-foo, */*;q=0' 124 | expect(nothing).to be_nil 125 | end 126 | 127 | it 'applies a transform against a partial' do 128 | end 129 | end 130 | 131 | context 'applying XSLT transforms' do 132 | 133 | it 'resolves the transform' do 134 | transform = harness.resolve VOCAB.cleanup 135 | expect(transform).to_not be_nil 136 | end 137 | 138 | it 'applies the transform to the input' do 139 | transform = harness.resolve VOCAB.cleanup 140 | 141 | input = (root + 'example/matches.xhtml').open 142 | 143 | output, parseout = transform.apply input 144 | expect(parseout).to be_a Nokogiri::XML::Document 145 | expect(parseout.root.name).to eql 'main' 146 | end 147 | end 148 | 149 | end 150 | -------------------------------------------------------------------------------- /spec/intertwingler_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 
3 | ### 4 | # 5 | # okay since there is not really anywhere better to do this, here is 6 | # the tentative testing plan: 7 | # 8 | # 1. test the extended graph operations 9 | # 10 | # 2. test the URI resolver 11 | # 12 | # 3. test the document parser (Intertwingler::Document::Parsed) 13 | # 14 | # 4. test the markup analysis/surgery 15 | # 16 | # 4.1 augmenting structure/addressability 17 | # 18 | # 4.2 NLP/term harvesting 19 | # 20 | # 5. test the markup generators 21 | # 22 | # 5.1 common traits 23 | # 24 | # 5.1.1 generated document fragments 25 | # 26 | # 5.1.2 seo/social media metadata 27 | # 28 | # 5.1.3 backlinks 29 | # 30 | # 5.2 concept scheme/collection 31 | # 32 | # 5.3 document stats 33 | # 34 | # 5.4 reading list/"books mentioned" 35 | # 36 | # 5.5 social graph/"people mentioned" 37 | # 38 | # 5.6 annotation set 39 | # 40 | # 6. test the target(s) 41 | # 42 | # 6.1 config harness 43 | # 44 | # 6.2 filesystem target 45 | # 46 | # 6.3 web app target 47 | # 48 | # 7. test the transforms 49 | # 50 | # 8. test the scraper 51 | # 52 | # 9. test the CLI 53 | # 54 | ### 55 | 56 | describe Intertwingler do 57 | it 'has a version number' do 58 | expect(Intertwingler::VERSION).not_to be nil 59 | end 60 | 61 | it 'does something useful' do 62 | expect(false).to eq(true) 63 | end 64 | end 65 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__) 2 | require 'intertwingler' 3 | --------------------------------------------------------------------------------