├── .github
├── dependabot.yml
└── workflows
│ ├── codeql-analysis.yml
│ └── go.yml
├── .gitignore
├── LICENSE
├── README.md
├── actor-.go
├── actor-WebFinger.go
├── actor-activityStreams.go
├── actor-feed-.go
├── actor-feed-JSON.go
├── actor-feed-RSS.go
├── actor-feed-icon.go
├── actor-feed-links.go
├── actor-feed-microFormats.go
├── authorized-fetch.go
├── client-.go
├── client-applyLinks.go
├── client-clientOption.go
├── client-loadOption.go
├── constants.go
├── document-.go
├── document-activityStream.go
├── document-html-.go
├── document-html-jsonld-.go
├── document-html-jsonld-embedded.go
├── document-html-jsonld-linked.go
├── document-html-microformats.go
├── document-html-oembed.go
├── document-html-opengraph.go
├── document-html-opengraph_test.go
├── document-html-wordpress.go
├── go.mod
├── go.sum
├── htmlparser
├── htmlparser.go
├── opengraph.go
└── opengraph_test.go
├── meta
└── The_Adventure_of_Silver_Blaze.jpg
├── sherlock-extras.go
├── sherlock.go
├── sherlock_local_actor_test.go
├── sherlock_local_document_test.go
├── sherlock_remote_actor_test.go
├── sherlock_remote_document_test.go
├── test-files
├── actor-atom-1.xml
├── actor-json-1.json
├── actor-microformats-1.html
├── actor-microformats-3.html
├── actor-rss-1.html
├── actor-rss-1.xml
├── actor-rss-2.xml
├── document-ap-mastodon.html
├── document-ap-mastodon.json
├── document-microformats-1.html
├── document-microformats-2.html
├── document-microformats-3.html
├── document-microformats-4.html
└── document-opengraph-1.html
├── utils.go
└── utils_test.go
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5 |
6 | version: 2
7 | updates:
8 | - package-ecosystem: "gomod" # See documentation for possible values
9 | directory: "/" # Location of package manifests
10 | target-branch: "main"
11 | schedule:
12 | interval: "daily"
13 |
--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | # For most projects, this workflow file will not need changing; you simply need
2 | # to commit it to your repository.
3 | #
4 | # You may wish to alter this file to override the set of languages analyzed,
5 | # or to provide custom queries or build logic.
6 | #
7 | # ******** NOTE ********
8 | # We have attempted to detect the languages in your repository. Please check
9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 |
14 | on:
15 | push:
16 | branches: [ main ]
17 | pull_request:
18 | # The branches below must be a subset of the branches above
19 | branches: [ main ]
20 | schedule:
21 | - cron: '27 14 * * 1'
22 |
23 | jobs:
24 | analyze:
25 | name: Analyze
26 | runs-on: ubuntu-latest
27 | permissions:
28 | actions: read
29 | contents: read
30 | security-events: write
31 |
32 | strategy:
33 | fail-fast: false
34 | matrix:
35 | language: [ 'go' ]
36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
37 | # Learn more about CodeQL language support at https://git.io/codeql-language-support
38 |
39 | steps:
40 | - name: Checkout repository
41 | uses: actions/checkout@v4
42 |
43 | # Initializes the CodeQL tools for scanning.
44 | - name: Initialize CodeQL
45 | uses: github/codeql-action/init@v3
46 | with:
47 | languages: ${{ matrix.language }}
48 | # If you wish to specify custom queries, you can do so here or in a config file.
49 | # By default, queries listed here will override any specified in a config file.
50 | # Prefix the list here with "+" to use these queries and those in the config file.
51 | # queries: ./path/to/local/query, your-org/your-repo/queries@main
52 |
53 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
54 | # If this step fails, then you should remove it and run the build manually (see below)
55 | - name: Autobuild
56 | uses: github/codeql-action/autobuild@v3
57 |
58 | # ℹ️ Command-line programs to run using the OS shell.
59 | # 📚 https://git.io/JvXDl
60 |
61 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
62 | # and modify them (or add more) to build your code if your project
63 | # uses a compiled language
64 |
65 | #- run: |
66 | # make bootstrap
67 | # make release
68 |
69 | - name: Perform CodeQL Analysis
70 | uses: github/codeql-action/analyze@v3
71 |
--------------------------------------------------------------------------------
/.github/workflows/go.yml:
--------------------------------------------------------------------------------
1 | name: Go
2 |
3 | on:
4 | push:
5 | branches: [ main ]
6 | pull_request:
7 | branches: [ main ]
8 |
9 | jobs:
10 |
11 | build:
12 | runs-on: ubuntu-latest
13 | steps:
14 | - uses: actions/checkout@v4
15 |
16 | - name: Set up Go
17 | uses: actions/setup-go@v5
18 | with:
19 | go-version: '1.23'
20 |
21 | - name: Test
22 | run: go test -race -coverprofile=coverage.txt -covermode=atomic -v ./...
23 |
24 | - name: Report Code Coverage
25 | uses: codecov/codecov-action@v5
26 | with:
27 | fail_ci_if_error: true
28 | flags: unittests
29 | token: ${{ secrets.CODECOV_TOKEN }}
30 | verbose: true
31 |
32 | - name: GolangCI-Lint
33 | uses: golangci/golangci-lint-action@v6
34 | with:
35 | # Require: The version of golangci-lint to use.
36 | # When `install-mode` is `binary` (default) the value can be v1.2 or v1.2.3 or `latest` to use the latest version.
37 | # When `install-mode` is `goinstall` the value can be v1.2.3, `latest`, or the hash of a commit.
38 | version: latest
39 | skip-cache: true
40 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # If you prefer the allow list template instead of the deny list, see community template:
2 | # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore
3 | #
4 | # Binaries for programs and plugins
5 | *.exe
6 | *.exe~
7 | *.dll
8 | *.so
9 | *.dylib
10 |
11 | # Test binary, built with `go test -c`
12 | *.test
13 |
14 | # Output of the go coverage tool, specifically when used with LiteIDE
15 | *.out
16 |
17 | # Dependency directories (remove the comment below to include it)
18 | # vendor/
19 |
20 | # Go workspace file
21 | go.work
22 | .DS_Store
23 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Sherlock
2 |
3 |
4 |
5 | [](http://pkg.go.dev/github.com/benpate/sherlock)
6 | [](https://github.com/benpate/sherlock/releases)
7 | [](https://github.com/benpate/sherlock/actions/workflows/go.yml)
8 | [](https://goreportcard.com/report/github.com/benpate/sherlock)
9 | [](https://codecov.io/gh/benpate/sherlock)
10 |
11 | ## Relentless Metadata Inspector
12 |
13 | Sherlock is a Go library that inspects a URL for any and all available metadata, pulling from whatever metadata formats are available, and returning it as an [ActivityStreams 2.0](https://www.w3.org/TR/activitystreams-core/) document.
14 |
15 | The goal is to have a standard interface into all web content, regardless of competing data standards.
16 |
17 | ### Supported Formats
18 |
19 | ✅ [ActivityPub](https://www.w3.org/TR/activitypub/)/[ActivityStreams](https://www.w3.org/TR/activitystreams-core/)
20 |
21 | ✅ [MicroFormats](https://microformats.org)
22 |
23 | ✅ [Open Graph](https://ogp.me)
24 |
25 | ### In Progress
26 |
27 | 🚧 [WebFinger](https://webfinger.net)
28 |
29 | 🚧 [JSON-LD (Linked)](https://json-ld.org/)
30 |
31 | 🚧 [Twitter Metadata](https://developer.twitter.com/en/docs/twitter-for-websites/cards/overview/abouts-cards)
32 |
33 | 🚧 [Microdata](https://html.spec.whatwg.org/multipage/microdata.html#microdata)
34 |
35 | 🚧 [RDFa](https://rdfa.info)
36 |
37 | 🚧 [oEmbed data provider](https://oembed.com)
38 |
39 |
40 | ### Using Sherlock
41 |
42 | ```go
43 | client := sherlock.NewClient()
44 |
45 | // If you only have a URL, then pass it in to .Load()
46 | result, err := client.Load("https://my-url-here")
47 |
48 | // If you have already downloaded a file, then pass it to sherlock.ParseHTML()
49 | result, err := sherlock.ParseHTML("https://original-url", &bytes.Buffer)
50 |
51 | ```
52 |
53 | ### Using Sherlock with Hannibal
54 |
55 | Sherlock can also be used as an http client for [Hannibal](https://github.com/benpate/hannibal), the ActivityPub library for Go. This allows many other online resources to *look like* they're ActivityPub-enabled.
56 |
--------------------------------------------------------------------------------
/actor-.go:
--------------------------------------------------------------------------------
1 | package sherlock
2 |
3 | import (
4 | "github.com/benpate/derp"
5 | "github.com/benpate/hannibal/streams"
6 | "github.com/rs/zerolog/log"
7 | )
8 |
9 | // Actor returns an ActivityPub Actor representation of the provided URL.
10 | // If and ActivityPub Actor cannot be found, it attempts to create a fake one
11 | // using RSS/Atom feeds, and MicroFormats instead.
12 | func (client Client) loadActor(identifier string, config *LoadConfig) (streams.Document, error) {
13 |
14 | const location = "sherlock.Client.Actor"
15 |
16 | // RULE: Prevent too many redirects
17 | if config.MaximumRedirects < 0 {
18 | return streams.NilDocument(), derp.InternalError(location, "Maximum redirects exceeded", identifier)
19 | }
20 |
21 | // Validate the identifier
22 | idType := identifierType(identifier)
23 |
24 | if idType == IdentifierTypeNone {
25 | return streams.NilDocument(), derp.BadRequestError(location, "Invalid identifier", identifier)
26 | }
27 |
28 | log.Trace().Str("loc", location).Str("type", idType).Msg("searching for: " + identifier)
29 |
30 | // 1. If this looks like a username, then try WebFinger
31 | if idType == IdentifierTypeUsername {
32 |
33 | if actor := client.loadActor_WebFinger(identifier, config); actor.NotNil() {
34 | log.Trace().Str("loc", location).Msg("Found via WebFinger")
35 | return actor, nil
36 | }
37 |
38 | // If we can't look up the user via WebFinger, then stop here
39 | return streams.NilDocument(), derp.NotFoundError(location, "Unable to load actor by username", identifier)
40 | }
41 |
42 | // RULE: identifier must begin with a valid protocol
43 | identifier = defaultHTTPS(identifier)
44 |
45 | // 2. Try ActivityStreams
46 | if actor := client.loadActor_ActivityStreams(identifier); actor.NotNil() {
47 | log.Trace().Str("loc", location).Msg("Found via ActivityStream")
48 | return actor, nil
49 | }
50 |
51 | // 3. Try RSS/Atom/JSONFeed/MicroFormats
52 | if actor := client.loadActor_Feed(identifier, config); actor.NotNil() {
53 | log.Trace().Str("loc", location).Msg("Found via Feed")
54 | return actor, nil
55 | }
56 |
57 | // 4. Abject failure. Your mother would be ashamed.
58 | return streams.NilDocument(), derp.NotFoundError(location, "Unable to load actor", identifier)
59 | }
60 |
--------------------------------------------------------------------------------
/actor-WebFinger.go:
--------------------------------------------------------------------------------
1 | package sherlock
2 |
3 | import (
4 | "strings"
5 |
6 | "github.com/benpate/digit"
7 | "github.com/benpate/hannibal"
8 | "github.com/benpate/hannibal/streams"
9 | "github.com/rs/zerolog/log"
10 | )
11 |
12 | func (client *Client) loadActor_WebFinger(uri string, config *LoadConfig) streams.Document {
13 |
14 | const location = "sherlock.Client.loadActor_WebFinger"
15 |
16 | // If the ID doesn't look like an email/username then skip this step
17 | if !strings.Contains(uri, "@") {
18 | log.Trace().Str("location", location).Msg("Skipping because uri doesn't look like an email address")
19 | return streams.NilDocument()
20 | }
21 |
22 | // Try to load the Actor via WebFinger
23 | response, err := digit.Lookup(uri, client.RemoteOptions...)
24 |
25 | // If we dont' have a valid response, then return nil (skip this step)
26 | if err != nil {
27 | log.Error().Err(err).Msg("loadActor_WebFinger: skipping because of error")
28 | return streams.NilDocument()
29 | }
30 |
31 | log.Trace().Str("location", location).Interface("response", response).Msg("Found WebFinger response")
32 |
33 | // Search for ActivityPub endpoints
34 | for _, link := range response.Links {
35 | if (link.RelationType == digit.RelationTypeSelf) && (hannibal.IsActivityPubContentType(link.MediaType)) {
36 | if result := client.loadActor_ActivityStreams(link.Href); result.NotNil() {
37 | config.MaximumRedirects--
38 | return result
39 | }
40 | }
41 | }
42 |
43 | // Search for Profile pages (as a backup)
44 | for _, link := range response.Links {
45 | if link.RelationType == digit.RelationTypeProfile {
46 | if result := client.loadActor_Feed(link.Href, config); result.NotNil() {
47 | config.MaximumRedirects--
48 | return result
49 | }
50 | }
51 | }
52 |
53 | // Fall through means we couldn't find any relevant links in the WebFinger response
54 | return streams.NilDocument()
55 | }
56 |
--------------------------------------------------------------------------------
/actor-activityStreams.go:
--------------------------------------------------------------------------------
1 | package sherlock
2 |
3 | import (
4 | "github.com/benpate/hannibal/streams"
5 | "github.com/benpate/remote"
6 | "github.com/benpate/remote/options"
7 | "github.com/benpate/rosetta/mapof"
8 | "github.com/rs/zerolog/log"
9 | )
10 |
11 | // loadActor_ActivityStreams attempts to load an ActivityStream directly from
12 | // a uri. If the retrieved document is not an ActivityStream, then
13 | // this method returns a NilDocument.
14 | func (client Client) loadActor_ActivityStreams(uri string) streams.Document {
15 |
16 | const location = "sherlock.Client.loadActor_ActivityStreams"
17 |
18 | // Set up the transaction
19 | data := mapof.NewAny()
20 | txn := remote.Get(uri).
21 | UserAgent(client.UserAgent).
22 | Accept(ContentTypeActivityPub).
23 | With(client.RemoteOptions...).
24 | Result(&data)
25 |
26 | if canTrace() {
27 | txn.With(options.Debug())
28 | }
29 |
30 | // Try to load the data from the remote server
31 | if err := txn.Send(); err != nil {
32 | log.Trace().Str("location", location).Msg("Error loading URI: " + uri)
33 | return streams.NilDocument()
34 | }
35 |
36 | // If the response is not an ActivityPub document, then exit
37 | if !isActivityStream(txn.ResponseContentType()) {
38 | if canTrace() {
39 | log.Trace().Str("location", location).Msg("Response is not an ActivityStream: " + txn.ResponseContentType())
40 | }
41 | return streams.NilDocument()
42 | }
43 |
44 | if canTrace() {
45 | log.Trace().Str("location", location).Str("objectId", uri).Msg("Found ActivityStreams document")
46 | }
47 |
48 | // Otherwise, return the Actor with expected metadata
49 | result := streams.NewDocument(
50 | data,
51 | streams.WithClient(client),
52 | streams.WithHTTPHeader(txn.ResponseHeader()),
53 | )
54 |
55 | return result
56 | }
57 |
--------------------------------------------------------------------------------
/actor-feed-.go:
--------------------------------------------------------------------------------
1 | package sherlock
2 |
3 | import (
4 | "github.com/benpate/hannibal/streams"
5 | "github.com/benpate/remote"
6 | )
7 |
8 | func (client *Client) loadActor_Feed(url string, config *LoadConfig) streams.Document {
9 |
10 | // Retrieve the URL
11 | txn := remote.Get(url).
12 | UserAgent(client.UserAgent).
13 | With(client.RemoteOptions...)
14 |
15 | if err := txn.Send(); err != nil {
16 | return streams.NilDocument()
17 | }
18 |
19 | // Find and follow links in the response.
20 | if document := client.loadActor_Links(txn, config); document.NotNil() {
21 | return document
22 | }
23 |
24 | // 1. Try to generate an Actor from a JSON Feed
25 | if document := client.loadActor_Feed_JSON(txn, config); document.NotNil() {
26 | return document
27 | }
28 |
29 | // 2. Try to generate an Actor from a RSS/Atom Feed
30 | if document := client.loadActor_Feed_RSS(txn, config); document.NotNil() {
31 | return document
32 | }
33 |
34 | // 3. Try to generate an Actor from a HTML MicroFormats
35 | if document := client.loadActor_Feed_MicroFormats(txn); document.NotNil() {
36 | return document
37 | }
38 |
39 | // 4. Failure.
40 | return streams.NilDocument()
41 | }
42 |
--------------------------------------------------------------------------------
/actor-feed-JSON.go:
--------------------------------------------------------------------------------
1 | package sherlock
2 |
3 | import (
4 | "encoding/json"
5 | "net/url"
6 |
7 | "github.com/benpate/hannibal/streams"
8 | "github.com/benpate/hannibal/vocab"
9 | "github.com/benpate/remote"
10 | "github.com/benpate/rosetta/first"
11 | "github.com/benpate/rosetta/html"
12 | "github.com/benpate/rosetta/mapof"
13 | "github.com/benpate/rosetta/slice"
14 | "github.com/kr/jsonfeed"
15 | )
16 |
17 | func (client Client) loadActor_Feed_JSON(txn *remote.Transaction, config *LoadConfig) streams.Document {
18 |
19 | // JSONFeed content only
20 | if !isJSONFeedContentType(txn.ResponseContentType()) {
21 | return streams.NilDocument()
22 | }
23 |
24 | var feed jsonfeed.Feed
25 |
26 | body, err := txn.ResponseBody()
27 | if err != nil {
28 | return streams.NilDocument()
29 | }
30 |
31 | // Parse the JSON feed
32 | if err := json.Unmarshal(body, &feed); err != nil {
33 | return streams.NilDocument()
34 | }
35 |
36 | actorID := first.String(feed.FeedURL, txn.RequestURL())
37 | username := first.String(feed.HomePageURL, txn.RequestURL())
38 | baseURL, _ := url.Parse(actorID)
39 |
40 | // Create an ActivityStream document
41 | result := config.DefaultValue
42 | result[vocab.AtContext] = vocab.ContextTypeActivityStreams
43 | result[vocab.PropertyID] = actorID
44 | result[vocab.PropertyType] = vocab.ActorTypeApplication
45 | result[vocab.PropertyName] = feed.Title
46 | result[vocab.PropertyIcon] = feed.Icon
47 | result[vocab.PropertySummary] = feed.Description
48 | result[vocab.PropertyURL] = username
49 | result[vocab.PropertyOutbox] = mapof.Any{
50 | vocab.PropertyType: vocab.CoreTypeOrderedCollection,
51 | vocab.PropertyTotalItems: len(feed.Items),
52 | vocab.PropertyOrderedItems: slice.Map(feed.Items, func(item jsonfeed.Item) mapof.Any {
53 |
54 | itemURL, _ := baseURL.Parse(item.URL)
55 |
56 | return mapof.Any{
57 | vocab.PropertyType: vocab.ObjectTypePage,
58 | vocab.PropertyID: itemURL,
59 | vocab.PropertyActor: feed.FeedURL,
60 | vocab.PropertyName: item.Title,
61 | vocab.PropertySummary: item.Summary,
62 | vocab.PropertyImage: item.Image,
63 | vocab.PropertyContent: jsonFeedToContentHTML(item),
64 | vocab.PropertyPublished: item.DatePublished.Unix(),
65 | vocab.PropertyAttributedTo: jsonFeedToAuthor(feed, item),
66 | }
67 | }),
68 | }
69 |
70 | // Search for WebSub hubs.
71 | for _, hub := range feed.Hubs {
72 | if hub.Type == "WebSub" {
73 | result[vocab.PropertyEndpoints] = mapof.Any{
74 | "hub": hub.URL,
75 | }
76 | break
77 | }
78 | }
79 |
80 | // Apply links found in the response headers
81 | client.applyLinks(txn, result)
82 |
83 | // Patch icon into the feed (if necessary)
84 | client.loadActor_Feed_FindHomePageIcon(result)
85 |
86 | // Find/Manufacture the icon for the feed
87 | // client.loadActor_Feed_Icon(txn, result)
88 |
89 | return streams.NewDocument(
90 | result,
91 | streams.WithClient(client),
92 | streams.WithHTTPHeader(txn.ResponseHeader()),
93 | )
94 | }
95 |
96 | // Returns TRUE if the contentType is application/activity+json or application/ld+json
97 | func isJSONFeedContentType(contentType string) bool {
98 |
99 | switch contentType {
100 |
101 | case ContentTypeJSONFeed:
102 | return true
103 |
104 | case ContentTypeJSON:
105 | return true
106 |
107 | default:
108 | return false
109 | }
110 | }
111 |
112 | func jsonFeedToAuthor(feed jsonfeed.Feed, item jsonfeed.Item) mapof.Any {
113 |
114 | if item.Author != nil {
115 | return mapof.Any{
116 | vocab.PropertyID: item.Author.URL,
117 | vocab.PropertyName: item.Author.Name,
118 | vocab.PropertyImage: item.Author.Avatar,
119 | }
120 | }
121 |
122 | if feed.Author != nil {
123 | return mapof.Any{
124 | vocab.PropertyID: feed.Author.URL,
125 | vocab.PropertyName: feed.Author.Name,
126 | vocab.PropertyImage: feed.Author.Avatar,
127 | }
128 | }
129 |
130 | return mapof.Any{
131 | vocab.PropertyID: feed.FeedURL,
132 | }
133 | }
134 |
135 | func jsonFeedToContentHTML(item jsonfeed.Item) string {
136 |
137 | var result string
138 |
139 | if item.ContentHTML != "" {
140 | result = item.ContentHTML
141 | } else if item.ContentText != "" {
142 | result = html.FromText(item.ContentText)
143 | }
144 |
145 | return sanitizeHTML(result)
146 | }
147 |
--------------------------------------------------------------------------------
/actor-feed-RSS.go:
--------------------------------------------------------------------------------
1 | package sherlock
2 |
3 | import (
4 | "net/url"
5 | "sort"
6 | "time"
7 |
8 | "github.com/benpate/hannibal/streams"
9 | "github.com/benpate/hannibal/vocab"
10 | "github.com/benpate/remote"
11 | "github.com/benpate/rosetta/convert"
12 | "github.com/benpate/rosetta/first"
13 | "github.com/benpate/rosetta/html"
14 | "github.com/benpate/rosetta/list"
15 | "github.com/benpate/rosetta/mapof"
16 | "github.com/benpate/rosetta/slice"
17 | "github.com/mmcdole/gofeed"
18 | )
19 |
20 | // loadActor_Feed_RSS tries generate an Actor from an RSS or Atom feed
21 | func (client Client) loadActor_Feed_RSS(txn *remote.Transaction, config *LoadConfig) streams.Document {
22 |
23 | // Try to find the RSS feed associated with this link
24 | feed, err := gofeed.NewParser().Parse(txn.ResponseBodyReader())
25 |
26 | if err != nil {
27 | return streams.NilDocument()
28 | }
29 |
30 | // Sort the feed items (oldest first)
31 | sort.Slice(feed.Items, func(i, j int) bool {
32 | if firstPublishDate := feed.Items[i].PublishedParsed; firstPublishDate != nil {
33 | if secondPublishDate := feed.Items[j].PublishedParsed; secondPublishDate != nil {
34 | return firstPublishDate.Before(*secondPublishDate)
35 | }
36 | return false
37 | }
38 | return false
39 | })
40 |
41 | actorID := first.String(feed.FeedLink, feed.Link, txn.RequestURL())
42 |
43 | // Create JSON-LD for the Actor
44 | result := config.DefaultValue
45 | result[vocab.AtContext] = vocab.ContextTypeActivityStreams
46 | result[vocab.PropertyType] = vocab.ActorTypeApplication
47 | result[vocab.PropertyID] = actorID
48 | result[vocab.PropertyName] = feed.Title
49 | result[vocab.PropertySummary] = feed.Description
50 | result[vocab.PropertyURL] = txn.RequestURL()
51 | result[vocab.PropertyOutbox] = mapof.Any{
52 | vocab.PropertyType: vocab.CoreTypeOrderedCollection,
53 | vocab.PropertyTotalItems: len(feed.Items),
54 | vocab.PropertyOrderedItems: slice.Map(feed.Items, feedActivity(actorID, feed)),
55 | }
56 |
57 | // Apply links found in the response headers
58 | client.applyLinks(txn, result)
59 |
60 | // Patch icon into the feed (if necessary)
61 | client.loadActor_Feed_FindHomePageIcon(result)
62 |
63 | // Return the result as a streams.Document
64 | return streams.NewDocument(
65 | result,
66 | streams.WithClient(client),
67 | streams.WithHTTPHeader(txn.ResponseHeader()),
68 | )
69 | }
70 |
71 | // feedActivity populates an Activity object from a gofeed.Feed and gofeed.Item
72 | func feedActivity(actorID string, feed *gofeed.Feed) func(*gofeed.Item) any {
73 |
74 | baseURL, _ := url.Parse(actorID)
75 |
76 | return func(item *gofeed.Item) any {
77 |
78 | // Resolve relative URLs
79 | linkURL, _ := baseURL.Parse(item.Link)
80 |
81 | result := mapof.Any{
82 | vocab.PropertyType: vocab.ObjectTypePage,
83 | vocab.PropertyID: linkURL.String(),
84 | vocab.PropertyName: html.ToText(item.Title),
85 | vocab.PropertyActor: feed.FeedLink,
86 | }
87 |
88 | if item.PublishedParsed != nil {
89 | result[vocab.PropertyPublished] = item.PublishedParsed.Unix()
90 | } else {
91 | result[vocab.PropertyPublished] = time.Now().Unix()
92 | }
93 |
94 | if image := feedImage(item); image != nil {
95 | result[vocab.PropertyImage] = image
96 | }
97 |
98 | if summary := feedSummary(item); summary != "" {
99 | result[vocab.PropertySummary] = summary
100 | }
101 |
102 | if contentHTML := feedContent(item); contentHTML != "" {
103 | result[vocab.PropertyContent] = contentHTML
104 | }
105 |
106 | if attributedTo := feedAuthor(actorID, feed, item); attributedTo != nil {
107 | result[vocab.PropertyAttributedTo] = attributedTo
108 | }
109 |
110 | return result
111 | }
112 | }
113 |
114 | func feedAuthor(actorID string, feed *gofeed.Feed, item *gofeed.Item) mapof.Any {
115 |
116 | // Set up default values to override (if we find something better)
117 | result := mapof.Any{
118 | vocab.PropertyID: actorID,
119 | vocab.PropertyName: feed.Title,
120 | vocab.PropertySummary: feed.Description,
121 | }
122 |
123 | // Try to find the image from the feed. It's weird, but easier this way.
124 | if feed.Image != nil {
125 | result[vocab.PropertyImage] = feed.Image.URL
126 |
127 | } else if webfeeds, ok := feed.Extensions["webfeeds"]; ok {
128 | if icon, ok := webfeeds["icon"]; ok {
129 | for _, element := range icon {
130 | if element.Name == "icon" {
131 | result[vocab.PropertyImage] = element.Value
132 | break
133 | }
134 | }
135 | }
136 | }
137 |
138 | // Try to find the author from various sources in the item
139 | if item.Author != nil {
140 | result[vocab.PropertyName] = html.ToText(item.Author.Name)
141 | result[vocab.PropertySummary] = item.Author.Email
142 | return result
143 | }
144 |
145 | if len(item.Authors) > 0 {
146 | if itemAuthor := item.Authors[0]; itemAuthor != nil {
147 | result[vocab.PropertyName] = itemAuthor.Name
148 | result[vocab.PropertySummary] = itemAuthor.Email
149 | return result
150 | }
151 | }
152 |
153 | // Try to find the author from various sources in the feed
154 | if feed.Author != nil {
155 | result[vocab.PropertyName] = html.ToText(feed.Author.Name)
156 | result[vocab.PropertySummary] = feed.Author.Email
157 | return result
158 | }
159 |
160 | if len(feed.Authors) > 0 {
161 | if feedAuthor := feed.Authors[0]; feedAuthor != nil {
162 | result[vocab.PropertyName] = feedAuthor.Name
163 | result[vocab.PropertySummary] = feedAuthor.Email
164 | return result
165 | }
166 | }
167 |
168 | return result
169 | }
170 |
171 | // feedSummary returns a summary of the item in plain text format
172 | func feedSummary(item *gofeed.Item) string {
173 | return sanitizeText(item.Description)
174 | }
175 |
176 | // feedContent returns a sanitized version of the HTML content for this feed
177 | func feedContent(item *gofeed.Item) string {
178 | return sanitizeHTML(item.Content)
179 | }
180 |
181 | // rssImage returns the URL of the first image in the item's enclosure list.
182 | func feedImage(item *gofeed.Item) map[string]any {
183 |
184 | if item == nil {
185 | return nil
186 | }
187 |
188 | if item.Image != nil {
189 | return map[string]any{
190 | vocab.PropertyType: vocab.ObjectTypeImage,
191 | vocab.PropertyHref: item.Image.URL,
192 | vocab.PropertySummary: item.Image.Title,
193 | }
194 | }
195 |
196 | // Search for an image in the enclosures
197 | for _, enclosure := range item.Enclosures {
198 | if list.Slash(enclosure.Type).First() == "image" {
199 | return map[string]any{
200 | vocab.PropertyType: vocab.ObjectTypeImage,
201 | vocab.PropertyHref: enclosure.URL,
202 | }
203 | }
204 | }
205 |
206 | // Search for media extensions (YouTube uses this)
207 | if media, ok := item.Extensions["media"]; ok {
208 | for _, group := range media {
209 | for _, extension := range group {
210 | if medium := extension.Attrs["medium"]; medium == "image" {
211 | return map[string]any{
212 | vocab.PropertyType: vocab.ObjectTypeImage,
213 | vocab.PropertyHref: extension.Attrs["url"],
214 | vocab.PropertyWidth: convert.Int(extension.Attrs["width"]),
215 | vocab.PropertyHeight: convert.Int(extension.Attrs["height"]),
216 | }
217 | }
218 | }
219 | }
220 | }
221 |
222 | return nil
223 | }
224 |
--------------------------------------------------------------------------------
/actor-feed-icon.go:
--------------------------------------------------------------------------------
1 | package sherlock
2 |
3 | import (
4 | "slices"
5 | "strings"
6 |
7 | "github.com/benpate/digit"
8 | "github.com/benpate/hannibal/vocab"
9 | "github.com/benpate/remote"
10 | "github.com/benpate/rosetta/convert"
11 | "github.com/benpate/rosetta/slice"
12 | "github.com/rs/zerolog/log"
13 | )
14 |
15 | // loadActor_Feed_FindRootLevelIcon searches for an icon from the website homepage
16 | // and adds it to the document if found.
17 | func (client *Client) loadActor_Feed_FindHomePageIcon(document map[string]any) {
18 |
19 | // if the document already has an icon, then NOOP
20 | if icon := convert.String(document[vocab.PropertyIcon]); icon != "" {
21 | return
22 | }
23 |
24 | // Get the document ID from the document
25 | documentID := convert.String(document[vocab.PropertyID])
26 | documentID = hostOnly(documentID)
27 |
28 | // Get the root-level document from the server
29 | txn := remote.Get(documentID)
30 |
31 | if err := txn.Send(); err != nil {
32 | log.Error().Err(err).Str("documentID", documentID).Msg("Error sending request")
33 | return
34 | }
35 |
36 | // Find Icons and apply them to the document
37 | client.loadActor_Feed_FindIcon(txn, document)
38 | }
39 |
40 | // loadActor_Feed_FindIcon searches for an icon from the remote transaction and
41 | // adds it into the document if found.
42 | func (client *Client) loadActor_Feed_FindIcon(txn *remote.Transaction, document map[string]any) {
43 |
44 | // if the document already has an icon, then NOOP
45 | if icon := convert.String(document[vocab.PropertyIcon]); icon != "" {
46 | return
47 | }
48 |
49 | // Find all links in the root-level document
50 | links := client.loadActor_DiscoverLinks(txn)
51 |
52 | // Choose the best icon and add it to the result
53 | if icon := client.loadActor_Feed_FindIconLink(links); icon != "" {
54 | document[vocab.PropertyIcon] = icon
55 | }
56 | }
57 |
58 | // Search for a sitewide Favicon, and add it to the default document if found
59 | func (client *Client) loadActor_Feed_FindIconLink(links digit.LinkSet) string {
60 |
61 | // Find all icon links
62 | icons := slice.Filter(links, func(link digit.Link) bool {
63 | return strings.Contains(link.RelationType, "icon")
64 | })
65 |
66 | // Empty results are empty
67 | if len(icons) == 0 {
68 | return ""
69 | }
70 |
71 | // Find the "best" icon, and set it as the default value
72 | slices.SortFunc(icons, sortImageLinks)
73 | return icons[0].Href
74 |
75 | // Are there other kinds of icons we can search for?
76 | }
77 |
--------------------------------------------------------------------------------
/actor-feed-links.go:
--------------------------------------------------------------------------------
1 | package sherlock
2 |
3 | import (
4 | "net/url"
5 | "strings"
6 |
7 | "github.com/PuerkitoBio/goquery"
8 | "github.com/benpate/digit"
9 | "github.com/benpate/hannibal/streams"
10 | "github.com/benpate/remote"
11 | "github.com/tomnomnom/linkheader"
12 | "golang.org/x/net/html"
13 | )
14 |
15 | // loadActor_Links finds and follows all relevant links for an http.Response.
16 | // If it finds a link to an ActivityStream, RSS Feed, or similar, then it returns
17 | // the corresponding Actor document.
18 | // Otherwise, it returns an empty streams.Document that includes metadata for
19 | func (client *Client) loadActor_Links(txn *remote.Transaction, config *LoadConfig) streams.Document {
20 |
21 | // Extranct all Links from the HTTP Header and HTML Document
22 | links := client.loadActor_DiscoverLinks(txn)
23 |
24 | // If links point directly to something we can use (ActivityPub, RSS, etc) then use it
25 | if document := client.loadActor_FollowLinks(txn, links, config); document.NotNil() {
26 | return document
27 | }
28 |
29 | // Otherwise, populate additional links (such as Hubs, Icons, etc)
30 | // and return an empty streams.Document
31 | // TODO: https://trello.com/c/t51YFiA2/234-sherlock-restore-websub-links
32 | return streams.NilDocument()
33 | }
34 |
35 | // loadActor_DiscoverLinks finds all links in a transaction, from both the
36 | // http header and in the HTML document.
37 | func (client *Client) loadActor_DiscoverLinks(txn *remote.Transaction) digit.LinkSet {
38 |
39 | // Retrieve Links in HTTP Header
40 | headerValue := txn.ResponseHeader().Get(HTTPHeaderLink)
41 | links := linkheader.Parse(headerValue)
42 | result := make(digit.LinkSet, 0, len(links))
43 | requestURL := txn.RequestURL()
44 |
45 | for _, link := range links {
46 | result = append(result, digit.Link{
47 | RelationType: link.Rel,
48 | MediaType: link.Param("type"),
49 | Href: getRelativeURL(requestURL, link.URL),
50 | })
51 | }
52 |
53 | // Retrieve Links in HTML Document
54 | if htmlDocument, err := goquery.NewDocumentFromReader(txn.ResponseBodyReader()); err == nil {
55 |
56 | // Get "relevant" links from the document
57 | selection := htmlDocument.Find("[rel=alternate],[rel=self],[rel=feed],[rel=hub],[rel=icon],[rel=apple-touch-icon],[rel=apple-touch-icon-precomposed],[rel=mask-icon]")
58 |
59 | // Add links to the accumulator
60 | for _, link := range selection.Nodes {
61 | result = append(result, digit.Link{
62 | RelationType: nodeAttribute(link, "rel"),
63 | MediaType: nodeAttribute(link, "type"),
64 | Href: getRelativeURL(requestURL, nodeAttribute(link, "href")),
65 | Properties: map[string]string{
66 | "sizes": nodeAttribute(link, "sizes"),
67 | },
68 | })
69 | }
70 | }
71 |
72 | return result
73 | }
74 |
75 | // actor_ScanHTMLForWebMentions tries to load/use any linked feeds
76 | func (client *Client) loadActor_FollowLinks(txn *remote.Transaction, links digit.LinkSet, config *LoadConfig) streams.Document {
77 |
78 | // If the client is not allowed to follow redirects (or has used all of them already),
79 | // then there is nothing to do here. Return an empty document instead.
80 | if config.MaximumRedirects < 1 {
81 | return streams.NilDocument()
82 | }
83 |
84 | // If we have one or more links, then search them in order...
85 | if len(links) > 0 {
86 |
87 | for _, mediaType := range []string{ContentTypeActivityPub, ContentTypeJSONFeed, ContentTypeJSON, ContentTypeAtom, ContentTypeRSS} {
88 |
89 | link := findSelfOrAlternateLink(links, mediaType)
90 |
91 | if link.IsEmpty() {
92 | continue
93 | }
94 |
95 | // If the link points to the same URL as the original request, then we're
96 | // already at the right place. So don't traverse the link.
97 | if link.Href == txn.RequestURL() {
98 | return streams.NilDocument()
99 | }
100 |
101 | if document, err := client.loadActor(link.Href, config); err == nil {
102 | if document.NotNil() {
103 | config.MaximumRedirects--
104 | return document
105 | }
106 | }
107 | }
108 | }
109 |
110 | return streams.NilDocument()
111 | }
112 |
113 | /******************************************
114 | * Helper Functions
115 | ******************************************/
116 |
117 | // nodeAttribute searches for a specific attribute in a node and returns its value
118 | func nodeAttribute(node *html.Node, name string) string {
119 |
120 | if node == nil {
121 | return ""
122 | }
123 |
124 | for _, attr := range node.Attr {
125 | if attr.Key == name {
126 | return attr.Val
127 | }
128 | }
129 |
130 | return ""
131 | }
132 |
// getRelativeURL resolves relativeURL against baseURL and returns the
// resulting absolute URL as a string.
//
//   - URLs that already begin with "http://" or "https://" are returned as-is.
//   - Protocol-relative URLs ("//host/path") are assumed to be HTTPS.
//   - Everything else is resolved with url.URL.ResolveReference, which follows
//     RFC 3986: a root-relative reference ("/path?query") replaces the base
//     path AND query, and a path-relative reference ("feed.xml") replaces the
//     last segment of the base path rather than being appended to it.
//
// If either URL cannot be parsed, relativeURL is returned unchanged.
func getRelativeURL(baseURL string, relativeURL string) string {

	// If the relative URL is already absolute, then just return it
	if strings.HasPrefix(relativeURL, "http://") || strings.HasPrefix(relativeURL, "https://") {
		return relativeURL
	}

	// If the relative URL is a protocol-relative URL, then assume HTTPS
	if strings.HasPrefix(relativeURL, "//") {
		return "https:" + relativeURL
	}

	// Parse both URLs so that we can do URL-math on them.
	// url.Parse returns a nil URL on error, so bail out before using it.
	base, err := url.Parse(baseURL)

	if err != nil {
		return relativeURL
	}

	reference, err := url.Parse(relativeURL)

	if err != nil {
		return relativeURL
	}

	// Standard RFC 3986 reference resolution
	return base.ResolveReference(reference).String()
}
159 |
160 | func findSelfOrAlternateLink(links []digit.Link, mediaType string) digit.Link {
161 |
162 | for _, link := range links {
163 |
164 | switch link.RelationType {
165 | case LinkRelationSelf, LinkRelationAlternate:
166 | if link.MediaType == mediaType {
167 | return link
168 | }
169 | }
170 | }
171 |
172 | return digit.Link{}
173 | }
174 |
--------------------------------------------------------------------------------
/actor-feed-microFormats.go:
--------------------------------------------------------------------------------
1 | package sherlock
2 |
3 | import (
4 | "net/url"
5 | "time"
6 |
7 | "github.com/benpate/hannibal/streams"
8 | "github.com/benpate/hannibal/vocab"
9 | "github.com/benpate/remote"
10 | "github.com/benpate/rosetta/mapof"
11 | "github.com/benpate/rosetta/slice"
12 | "willnorris.com/go/microformats"
13 | )
14 |
15 | // actor_MicroFormats searches and HTML document for for an h-feed Microformat
16 | func (client Client) loadActor_Feed_MicroFormats(txn *remote.Transaction) streams.Document {
17 |
18 | // Parse the document URL
19 | parsedURL, err := url.Parse(txn.RequestURL())
20 |
21 | if err != nil {
22 | return streams.NilDocument()
23 | }
24 |
25 | // Parse the HTML document
26 | data := microformats.Parse(txn.ResponseBodyReader(), parsedURL)
27 |
28 | // Search Microformats for an h-feed
29 | for _, feed := range data.Items {
30 |
31 | if slice.Contains(feed.Type, "h-feed") {
32 |
33 | items := make([]mapof.Any, 0, len(feed.Children))
34 |
35 | for _, child := range feed.Children {
36 | if slice.Contains(child.Type, "h-entry") {
37 | items = append(items, microformat_Item(feed, child))
38 | }
39 | }
40 |
41 | if len(items) > 0 {
42 |
43 | data := mapof.Any{
44 | vocab.PropertyID: parsedURL.String(),
45 | vocab.PropertyType: vocab.ActorTypeApplication,
46 | vocab.PropertyName: microformat_Property(feed, "name"),
47 | vocab.PropertyImage: microformat_Property(feed, "photo"),
48 | vocab.PropertyAttributedTo: microformat_Property(feed, "author"),
49 | vocab.PropertyOutbox: microformat_Outbox(items),
50 | }
51 |
52 | // Apply links found in the response headers
53 | client.applyLinks(txn, data)
54 |
55 | // Patch icon into the feed (if necessary)
56 | client.loadActor_Feed_FindHomePageIcon(data)
57 |
58 | // Return the (successfully?) parsed document to the caller.
59 | return streams.NewDocument(
60 | data,
61 | streams.WithClient(client),
62 | streams.WithHTTPHeader(txn.ResponseHeader()),
63 | )
64 | }
65 | }
66 | }
67 |
68 | return streams.NilDocument()
69 | }
70 |
71 | // microformat_Outbox wraps a slice of items in an ActivityStreams OrderedCollection
72 | func microformat_Outbox(items []mapof.Any) mapof.Any {
73 |
74 | return mapof.Any{
75 | vocab.PropertyType: vocab.CoreTypeOrderedCollection,
76 | vocab.PropertyTotalItems: len(items),
77 | vocab.PropertyOrderedItems: items,
78 | }
79 | }
80 |
81 | // microformat_Item converts a Microformat entry into an ActivityStreams document
82 | func microformat_Item(feed *microformats.Microformat, entry *microformats.Microformat) mapof.Any {
83 |
84 | result := mapof.Any{
85 | vocab.PropertyID: microformat_Property(entry, "url"),
86 | vocab.PropertyName: microformat_Property(entry, "name"),
87 | vocab.PropertySummary: microformat_Property(entry, "summary"),
88 | }
89 |
90 | // Get properties from entry
91 |
92 | // Get photo from entry, then feed
93 | if photoURL := microformat_Property(entry, "photo"); photoURL != "" {
94 | result[vocab.PropertyImage] = photoURL
95 | } else if photoURL := microformat_Property(feed, "photo"); photoURL != "" {
96 | result[vocab.PropertyImage] = photoURL
97 | }
98 |
99 | // Get author from entry, then feed
100 | if author := microformat_First(entry.Properties["author"]); author != nil {
101 | result[vocab.PropertyAttributedTo] = microformat_Author(author)
102 | } else if author := microformat_First(feed.Properties["author"]); author != nil {
103 | result[vocab.PropertyAttributedTo] = microformat_Author(author)
104 | }
105 |
106 | // Get the publish date from the entry
107 | if published := microformat_Property(entry, "published"); published != "" {
108 | if publishDate, err := time.Parse(time.RFC3339, published); err == nil {
109 | result[vocab.PropertyPublished] = publishDate.Unix()
110 | }
111 | }
112 |
113 | // Default PublishDate just in case
114 | if result[vocab.PropertyPublished] == 0 {
115 | result[vocab.PropertyPublished] = time.Now().Unix()
116 | }
117 |
118 | return result
119 | }
120 |
121 | // microformat_Author converts a Microformat entry into an ActivityStreams document
122 | func microformat_Author(entry *microformats.Microformat) mapof.Any {
123 |
124 | if entry == nil {
125 | return mapof.NewAny()
126 | }
127 |
128 | return mapof.Any{
129 | vocab.PropertyID: microformat_Property(entry, "url"),
130 | vocab.PropertyName: microformat_Property(entry, "name"),
131 | vocab.PropertyImage: microformat_Property(entry, "photo", "logo"),
132 | }
133 | }
134 |
135 | // microformat_First returns the first item in a slice of items
136 | func microformat_First(value any) *microformats.Microformat {
137 |
138 | switch o := value.(type) {
139 | case []any:
140 | if len(o) > 0 {
141 | return microformat_First(o[0])
142 | }
143 |
144 | case *microformats.Microformat:
145 | return o
146 | }
147 |
148 | return nil
149 | }
150 |
151 | // microformat_Property returns the first value of a property
152 | func microformat_Property(entry *microformats.Microformat, names ...string) string {
153 |
154 | if entry == nil {
155 | return ""
156 | }
157 |
158 | for _, name := range names {
159 |
160 | if value, ok := entry.Properties[name]; ok {
161 |
162 | for _, item := range value {
163 | switch o := item.(type) {
164 | case string:
165 | return o
166 |
167 | case *microformats.Microformat:
168 | return o.Value
169 | }
170 | }
171 | }
172 | }
173 |
174 | return ""
175 | }
176 |
--------------------------------------------------------------------------------
/authorized-fetch.go:
--------------------------------------------------------------------------------
1 | package sherlock
2 |
3 | import (
4 | "crypto"
5 | "net/http"
6 |
7 | "github.com/benpate/derp"
8 | "github.com/benpate/hannibal/sigs"
9 | "github.com/benpate/remote"
10 | "github.com/rs/zerolog/log"
11 | )
12 |
13 | // AuthorizedFetch is a remote.Option that signs all outbound requests according to the
14 | // ActivityPub "Authorized Fetch" convention: https://funfedi.dev/testing_tools/http_signatures/
15 | func AuthorizedFetch(publicKeyID string, privateKey crypto.PrivateKey) remote.Option {
16 |
17 | if publicKeyID == "" {
18 | log.Info().Msg("AuthorizedFetch: No publicKeyID provided")
19 | return remote.Option{}
20 | }
21 |
22 | if privateKey == nil {
23 | log.Info().Msg("AuthorizedFetch: No privateKey provided")
24 | return remote.Option{}
25 | }
26 |
27 | return remote.Option{
28 |
29 | // ModifyRequest is called after an http.Request has been generated, but before it is sent to the
30 | // remote server. It can be used to modify the request, or to replace it entirely.
31 | // If it returns a non-nil http.Response, then that is used INSTEAD OF calling the remote server.
32 | // If it returns a nil http.Response, then the request is sent to the remote server as normal.
33 | ModifyRequest: func(t *remote.Transaction, request *http.Request) *http.Response {
34 |
35 | signer := sigs.NewSigner(
36 | publicKeyID,
37 | privateKey,
38 | sigs.SignerFields("(request-target)", "host", "date"),
39 | )
40 |
41 | if err := signer.Sign(request); err != nil {
42 | derp.Report(derp.Wrap(err, "sherlock.AuthorizedFetch", "Error signing request"))
43 | }
44 |
45 | return nil
46 | },
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/client-.go:
--------------------------------------------------------------------------------
1 | package sherlock
2 |
3 | import (
4 | "github.com/benpate/hannibal/streams"
5 | "github.com/benpate/remote"
6 | )
7 |
8 | // Client implements the hannibal/streams.Client interface, and is used to load JSON-LD documents from remote servers.
9 | // The sherlock client maps additional meta-data into a standard ActivityStreams document.
10 | type Client struct {
11 | UserAgent string // User-Agent string to send with every request
12 | RemoteOptions []remote.Option // Additional options to pass to the remote library
13 | }
14 |
15 | // NewClient returns a fully initialized Client object
16 | func NewClient(options ...ClientOption) Client {
17 |
18 | // Create a default Client
19 | result := Client{
20 | UserAgent: "Sherlock: github.com/benpate/sherlock",
21 | RemoteOptions: make([]remote.Option, 0),
22 | }
23 |
24 | // Apply options
25 | result.WithOptions(options...)
26 |
27 | // Success
28 | return result
29 | }
30 |
31 | // Load retrieves a document from a remote server and returns it as a streams.Document
32 | // It uses either the "Actor" or "Document" methods of generating it ActivityStreams
33 | // result.
34 | // "Document" treats the URL as a single ActivityStreams document, translating
35 | // OpenGraph, MicroFormats, and JSON-LD into an ActivityStreams equivalent.
36 | // "Actor" treats the URL as an Actor, translating RSS, Atom, JSON, and
37 | // MicroFormats feeds into an ActivityStream equivalent.
38 | func (client Client) Load(url string, options ...any) (streams.Document, error) {
39 |
40 | config := NewLoadConfig(options...)
41 |
42 | // If "Actor" is requested, then use that discovery method
43 | if config.DocumentType == LoadDocumentTypeActor {
44 | return client.loadActor(url, &config)
45 | }
46 |
47 | // Otherwise, use "Document" discovery method
48 | return client.loadDocument(url, config)
49 | }
50 |
51 | // WithOptions applies one or more ClientOption functions to the client
52 | func (client *Client) WithOptions(options ...ClientOption) {
53 | for _, option := range options {
54 | option(client)
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/client-applyLinks.go:
--------------------------------------------------------------------------------
1 | package sherlock
2 |
3 | import (
4 | "github.com/benpate/hannibal/vocab"
5 | "github.com/benpate/remote"
6 | "github.com/benpate/rosetta/mapof"
7 | "github.com/tomnomnom/linkheader"
8 | )
9 |
10 | // applyLinks searches for common link headers in the response, and applies them to the data map
11 | func (client *Client) applyLinks(txn *remote.Transaction, data mapof.Any) {
12 |
13 | links := linkheader.ParseMultiple(txn.Response().Header["Link"])
14 |
15 | for _, link := range links {
16 | switch link.Rel {
17 |
18 | case LinkRelationIcon:
19 |
20 | // Add an icon if it doesn't already exist
21 | if _, ok := data[vocab.PropertyIcon]; !ok {
22 | data[vocab.PropertyIcon] = link.URL
23 | }
24 |
25 | case LinkRelationHub:
26 |
27 | // Guarantee that the `endpoints` value exists
28 | if _, ok := data[vocab.PropertyEndpoints]; !ok {
29 | data[vocab.PropertyEndpoints] = make(map[string]any)
30 | }
31 |
32 | // Set the `endpoints.websub` value
33 | if endpoints, ok := data[vocab.PropertyEndpoints].(map[string]any); ok {
34 | endpoints["websub"] = link.URL
35 | }
36 | }
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/client-clientOption.go:
--------------------------------------------------------------------------------
1 | package sherlock
2 |
3 | import (
4 | "crypto"
5 |
6 | "github.com/benpate/remote"
7 | )
8 |
9 | // ClientOption defines a functional option that modifies a Client object
10 | type ClientOption func(*Client)
11 |
12 | // WithUserAgent is a ClientOption that sets the UserAgent property on the Client object
13 | func WithUserAgent(userAgent string) ClientOption {
14 | return func(client *Client) {
15 | client.UserAgent = userAgent
16 | }
17 | }
18 |
19 | // WithRemoteOptions is a ClientOption that appends one or more remote.Option
20 | // objects to the Client object RemoteOptions are executed on every remote request
21 | func WithRemoteOptions(options ...remote.Option) ClientOption {
22 | return func(client *Client) {
23 | client.RemoteOptions = append(client.RemoteOptions, options...)
24 | }
25 | }
26 |
27 | // WithActor is a ClientOption that set up the AuthorizedFetch remote middleware,
28 | // which will sign all outbound requests according to the ActivityPub "Authorized Fetch"
29 | // convention: https://funfedi.dev/testing_tools/http_signatures/
30 | func WithActor(publicKeyID string, privateKey crypto.PrivateKey) ClientOption {
31 | return func(client *Client) {
32 | client.RemoteOptions = append(client.RemoteOptions, AuthorizedFetch(publicKeyID, privateKey))
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/client-loadOption.go:
--------------------------------------------------------------------------------
1 | package sherlock
2 |
// LoadDocumentType values identify how a URL should be interpreted
// when it is loaded. They are stored in LoadConfig.DocumentType.
const (
	// LoadDocumentTypeUnknown is the default used when no type is requested
	LoadDocumentTypeUnknown = 0

	// LoadDocumentTypeActor requests that the URL be loaded as an Actor
	LoadDocumentTypeActor = 1

	// LoadDocumentTypeCollection requests that the URL be loaded as a Collection
	LoadDocumentTypeCollection = 2

	// LoadDocumentTypeDocument requests that the URL be loaded as a Document
	LoadDocumentTypeDocument = 3
)
10 |
// LoadConfig collects the settings that control a single Load call.
type LoadConfig struct {

	// DocumentType is one of the LoadDocumentType constants
	DocumentType int

	// MaximumRedirects is the remaining budget for following links
	MaximumRedirects int

	// DefaultValue holds the values that a loaded document starts with
	DefaultValue map[string]any
}
16 |
17 | type LoadOption func(*LoadConfig)
18 |
19 | func NewLoadConfig(options ...any) LoadConfig {
20 | result := LoadConfig{
21 | MaximumRedirects: 6,
22 | DocumentType: LoadDocumentTypeUnknown,
23 | DefaultValue: make(map[string]any),
24 | }
25 |
26 | for _, option := range options {
27 | if typed, ok := option.(LoadOption); ok {
28 | typed(&result)
29 | }
30 | }
31 | return result
32 | }
33 |
34 | func AsActor() LoadOption {
35 | return asDocumentType(LoadDocumentTypeActor)
36 | }
37 |
38 | func AsDocument() LoadOption {
39 | return asDocumentType(LoadDocumentTypeDocument)
40 | }
41 |
42 | func AsCollection() LoadOption {
43 | return asDocumentType(LoadDocumentTypeCollection)
44 | }
45 |
46 | func asDocumentType(documentType int) LoadOption {
47 | return func(config *LoadConfig) {
48 | config.DocumentType = documentType
49 | }
50 | }
51 |
52 | func WithMaximumRedirects(maximumRedirects int) LoadOption {
53 | return func(config *LoadConfig) {
54 | config.MaximumRedirects = maximumRedirects
55 | }
56 | }
57 |
58 | func WithDefaultValue(defaultValue map[string]any) LoadOption {
59 | return func(config *LoadConfig) {
60 | config.DefaultValue = defaultValue
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/constants.go:
--------------------------------------------------------------------------------
1 | package sherlock
2 |
3 | /******************************************
4 | * ContentTypes
5 | ******************************************/
6 |
const (
	// ContentType is the name of the HTTP header that designates a MIME type
	ContentType = "Content-Type"

	// ContentTypeActivityPub is the standard MIME type for ActivityPub content
	ContentTypeActivityPub = "application/activity+json"

	// ContentTypeAtom is the standard MIME type for Atom feeds
	ContentTypeAtom = "application/atom+xml"

	// ContentTypeForm is the standard MIME type for form-encoded content
	ContentTypeForm = "application/x-www-form-urlencoded"

	// ContentTypeHTML is the standard MIME type for HTML content
	ContentTypeHTML = "text/html"

	// ContentTypeJSON is the standard MIME type for JSON content
	ContentTypeJSON = "application/json"

	// ContentTypeJSONFeed is the standard MIME type for JSON Feed content
	// https://en.wikipedia.org/wiki/JSON_Feed
	ContentTypeJSONFeed = "application/feed+json"

	// ContentTypeJSONLD is the standard MIME type for JSON-LD content
	// https://en.wikipedia.org/wiki/JSON-LD
	ContentTypeJSONLD = "application/ld+json"

	// ContentTypeJSONResourceDescriptor is the standard MIME type for JSON Resource Descriptor
	// content, which is used by WebFinger: https://datatracker.ietf.org/doc/html/rfc7033#section-10.2
	ContentTypeJSONResourceDescriptor = "application/jrd+json"

	// ContentTypePlain is the default plaintext MIME type
	ContentTypePlain = "text/plain"

	// ContentTypeRSS is the standard MIME type for RSS feeds
	ContentTypeRSS = "application/rss+xml"

	// ContentTypeXML is the standard MIME type for XML content
	ContentTypeXML = "application/xml"
)
45 |
46 | /******************************************
47 | * Document Formats
48 | ******************************************/
49 |
// Identifiers for the document formats recognized by this package.
const (
	// FormatActivityStream is the identifier for the ActivityStream document format
	FormatActivityStream = "ACTIVITYSTREAM"

	// FormatRSS is the identifier for the RSS document format
	FormatRSS = "RSS"

	// FormatJSONFeed is the identifier for the JSON Feed document format
	FormatJSONFeed = "JSONFEED"

	// FormatMicroFormats is the identifier for the MicroFormats document format
	FormatMicroFormats = "MICROFORMATS"
)
57 |
58 | /******************************************
59 | * HTTP Headers
60 | ******************************************/
61 |
// Names of the HTTP headers referenced by this package.
const (
	// HTTPHeaderAccept is the string used in the HTTP header to request a response be encoded as a MIME type
	HTTPHeaderAccept = "Accept"

	// HTTPHeaderCacheControl is the name of the HTTP "Cache-Control" header
	HTTPHeaderCacheControl = "Cache-Control"

	// HTTPHeaderLink is the name of the HTTP "Link" header
	HTTPHeaderLink = "Link"
)
68 |
69 | /******************************************
70 | * Link Relations
71 | ******************************************/
72 |
// Link relation values referenced by this package.
const (
	// LinkRelationAlternate is the "alternate" link relation
	LinkRelationAlternate = "alternate"

	// LinkRelationFeed is the "feed" link relation
	LinkRelationFeed = "feed"

	// LinkRelationIcon is the "icon" link relation
	LinkRelationIcon = "icon"

	// LinkRelationHub is the "hub" link relation
	LinkRelationHub = "hub"

	// LinkRelationSelf is the "self" link relation
	LinkRelationSelf = "self"
)
82 |
83 | /******************************************
84 | * Identifier Types
85 | ******************************************/
86 |
// Identifier type values referenced by this package.
const (
	// IdentifierTypeUsername is the "USERNAME" identifier type
	IdentifierTypeUsername = "USERNAME"

	// IdentifierTypeURL is the "URL" identifier type
	IdentifierTypeURL = "URL"

	// IdentifierTypeNone is the "NONE" identifier type
	IdentifierTypeNone = "NONE"
)
92 |
--------------------------------------------------------------------------------
/document-.go:
--------------------------------------------------------------------------------
1 | package sherlock
2 |
3 | import (
4 | "github.com/benpate/derp"
5 | "github.com/benpate/hannibal/streams"
6 | )
7 |
8 | // LoadDocument tries to retrieve a URL from the internet, then return it into a streams.Document.
9 | // If the remote resource is not already an ActivityStreams document, it will attempt to convert from
10 | // RSS, Atom, JSONFeed, and HTML MicroFormats.
11 | func (client Client) loadDocument(url string, config LoadConfig) (streams.Document, error) {
12 |
13 | const location = "sherlock.Client.loadDocument"
14 |
15 | // RULE: url must not be empty
16 | if url == "" {
17 | return streams.NilDocument(), derp.BadRequestError(location, "Empty URI")
18 | }
19 |
20 | // RULE: Prevent too many redirects
21 | if config.MaximumRedirects < 0 {
22 | return streams.NilDocument(), derp.InternalError(location, "Maximum redirects exceeded", url)
23 | }
24 |
25 | // RULE: url must begin with a valid protocol
26 | url = defaultHTTPS(url)
27 |
28 | // 1. If we can load the document as an ActivityStream, then there you go.
29 | if document := client.loadDocument_ActivityStream(url); document.NotNil() {
30 | return document, nil
31 | }
32 |
33 | // 2. If we can load the document as HTML, then that will do.
34 | if document := client.loadDocument_HTML(url, config.DefaultValue); document.NotNil() {
35 | return document, nil
36 | }
37 |
38 | // 3. If the default value is good enough, then use that.
39 | // This may happen when RSS feeds have *some* information, but a website CAPTCHA
40 | // block us from loading more details.
41 | if len(config.DefaultValue) > 0 {
42 | return streams.NewDocument(config.DefaultValue, streams.WithClient(client)), nil
43 | }
44 |
45 | // 4. Abject failure.
46 | return streams.NilDocument(), derp.BadRequestError(location, "Unable to load document", url, config)
47 | }
48 |
--------------------------------------------------------------------------------
/document-activityStream.go:
--------------------------------------------------------------------------------
1 | package sherlock
2 |
3 | import (
4 | "github.com/benpate/hannibal/streams"
5 | "github.com/benpate/hannibal/vocab"
6 | "github.com/benpate/remote"
7 | "github.com/benpate/rosetta/mapof"
8 | )
9 |
10 | // loadDocument_ActivityStream tries to load a remote document as an ActivityStream
11 | // If successful, it will return a streams.Document with the appropriate metadata.
12 | // Otherwise, it returns a nil document.
13 | func (client *Client) loadDocument_ActivityStream(uri string) streams.Document {
14 |
15 | data := mapof.NewAny()
16 |
17 | txn := remote.Get(uri).
18 | UserAgent(client.UserAgent).
19 | Accept(vocab.ContentTypeActivityPub).
20 | With(client.RemoteOptions...).
21 | Result(&data)
22 |
23 | if err := txn.Send(); err != nil {
24 | return streams.NilDocument()
25 | }
26 |
27 | if !isActivityStream(txn.ResponseContentType()) {
28 | return streams.NilDocument()
29 | }
30 |
31 | return streams.NewDocument(
32 | data,
33 | streams.WithClient(client),
34 | streams.WithHTTPHeader(txn.ResponseHeader()),
35 | )
36 | }
37 |
--------------------------------------------------------------------------------
/document-html-.go:
--------------------------------------------------------------------------------
1 | package sherlock
2 |
3 | import (
4 | "github.com/benpate/hannibal/streams"
5 | "github.com/benpate/hannibal/vocab"
6 | "github.com/benpate/remote"
7 | "github.com/benpate/rosetta/mapof"
8 | )
9 |
10 | // loadDocument_HTML tries to mimic an ActivityPub document by parsing meta-data on
11 | // a remote HTML page. The `data` argument is a map that may already contain some
12 | // data, and will be updated with any new data that is discovered.
13 | func (client *Client) loadDocument_HTML(uri string, data mapof.Any) streams.Document {
14 |
15 | // Retrieve the HTML document
16 | txn := remote.Get(uri).
17 | UserAgent(client.UserAgent).
18 | With(client.RemoteOptions...)
19 |
20 | if err := txn.Send(); err != nil {
21 | return streams.NilDocument()
22 | }
23 |
24 | // Read the response body
25 | body, err := txn.ResponseBody()
26 |
27 | if err != nil {
28 | return streams.NilDocument()
29 | }
30 |
31 | // Default values for Web Pages
32 | data[vocab.PropertyID] = uri
33 | data[vocab.PropertyURL] = uri
34 | data[vocab.PropertyType] = vocab.ObjectTypePage
35 |
36 | // Apply links found in the response headers
37 | client.applyLinks(txn, data)
38 |
39 | // Add JSON-LD data to the data
40 | client.loadDocument_JSONLD(body, data)
41 |
42 | // Add OpenGraph (via HTMLInfo) data to the data
43 | client.loadDocument_OpenGraph(uri, body, data)
44 |
45 | // Add Microformats2 data to the data
46 | client.loadDocument_MicroFormats(uri, body, data)
47 |
48 | // Return success!
49 | return streams.NewDocument(data,
50 | streams.WithClient(client),
51 | streams.WithHTTPHeader(txn.ResponseHeader()),
52 | )
53 | }
54 |
--------------------------------------------------------------------------------
/document-html-jsonld-.go:
--------------------------------------------------------------------------------
1 | package sherlock
2 |
3 | import (
4 | "bytes"
5 |
6 | "github.com/PuerkitoBio/goquery"
7 | )
8 |
9 | func (client *Client) loadDocument_JSONLD(body []byte, result map[string]any) {
10 |
11 | // Search the returned HTML for JSON-LD
12 | if gqDoc, err := goquery.NewDocumentFromReader(bytes.NewReader(body)); err == nil {
13 |
14 | if client.loadDocument_JSONLD_Embedded(gqDoc, result) {
15 | withContext(result)
16 | return
17 | }
18 |
19 | if client.loadDocument_JSONLD_Linked(gqDoc, result) {
20 | withContext(result)
21 | return
22 | }
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/document-html-jsonld-embedded.go:
--------------------------------------------------------------------------------
1 | package sherlock
2 |
3 | import (
4 | "encoding/json"
5 |
6 | "github.com/PuerkitoBio/goquery"
7 | "github.com/benpate/rosetta/mapof"
8 | )
9 |
10 | // loadDocument_JSONLD_Embedded searches the GoQuery document for links to ActivityPub-like documents.
11 | func (client *Client) loadDocument_JSONLD_Embedded(document *goquery.Document, result mapof.Any) bool {
12 | // TODO: LOW: Add support for JSON-LD metadata embedded in a
50 |
51 |
52 |
53 |
54 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
66 |
101 |
102 |
103 |
Published on
168 |
169 | under the Fun with Words category.
170 |
171 |
172 |
173 |
174 |
Aficionados of lexicons, linguistics, and all things literary, here I am with another edition of Fun with Words. This is the first post I can recall in which the first sentence was architected to use a power-of-three featuring three words that begin with "l" (lexicon, linguistics, literary). In any case, I have saved a few words in my notes -- nestled between "Crane: thought it was a dinosaur" and "surprisal embeddings" in my TODO list [^1] -- to feature in this post.
175 |
Here are the words for today:
176 |
177 |
Sans: Without.
178 |
Artificer: A skilled craftsperson.
179 |
Elucidate: Explain, with the purpose of making something clear.
180 |
Deuteragonist: The second most important character in a story.
181 |
Corpora: A collection of texts. This collection may be grouped by a theme, such as articles written by a single author, published by a specific publication, or articles relating to a given subject matter.
182 |
183 |
[^1]: My TODO list contains tasks, notes, and words for this series. I need to sort out the list to move some notes from my recent foray into quantitative linguistics into another document.