├── .github ├── FUNDING.yml └── workflows │ ├── gradle-test.yml │ ├── dependency-review.yml │ └── release-jar.yml ├── .gitattributes ├── settings.gradle.kts ├── gradle ├── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties └── libs.versions.toml ├── .gitignore ├── test_data ├── image_extractor_css_style.html ├── image_extractor_simple_img.html ├── facebook_notification_single.html └── youtube.html ├── gradle.properties.sample ├── src ├── main │ └── kotlin │ │ └── com │ │ └── chimbori │ │ └── crux │ │ ├── common │ │ ├── NumberExtensions.kt │ │ ├── JsoupExtensions.kt │ │ ├── Log.kt │ │ ├── StringExtensions.kt │ │ ├── OkHttpExtensions.kt │ │ └── HttpUrlExtensions.kt │ │ ├── plugins │ │ ├── FacebookUrlRewriter.kt │ │ ├── GoogleUrlRewriter.kt │ │ ├── FaviconExtractor.kt │ │ ├── DocumentFetcher.kt │ │ ├── AmpRedirector.kt │ │ ├── TrackingParameterRemover.kt │ │ ├── WebAppManifestParser.kt │ │ └── HtmlMetadataExtractor.kt │ │ ├── extractors │ │ ├── LinkUrlExtractor.kt │ │ ├── ImageUrlExtractor.kt │ │ └── MetadataHelpers.kt │ │ ├── api │ │ ├── Plugins.kt │ │ ├── Resource.kt │ │ └── Fields.kt │ │ └── Crux.kt └── test │ └── kotlin │ └── com │ └── chimbori │ ├── crux │ ├── common │ │ ├── NumberExtensionsTest.kt │ │ ├── TestHelper.kt │ │ ├── OkHttpExtensionsTest.kt │ │ ├── StringExtensionsTest.kt │ │ └── HttpUrlExtensionsTest.kt │ ├── api │ │ └── ResourceTest.kt │ ├── plugins │ │ ├── TrackingParameterRemoverTest.kt │ │ ├── GoogleUrlRewriterTest.kt │ │ ├── FacebookUrlRewriterTest.kt │ │ ├── FaviconExtractorTest.kt │ │ ├── HtmlMetadataExtractorTest.kt │ │ ├── WebAppManifestParserTest.kt │ │ └── AmpRedirectorTest.kt │ ├── extractors │ │ ├── LinkUrlExtractorTest.kt │ │ ├── ImageUrlExtractorTest.kt │ │ └── MetadataHelpersTest.kt │ └── CruxTest.kt │ └── sample │ └── KotlinPublicAPITest.kt ├── gradle.properties ├── CONTRIBUTING.md ├── RELEASING.md ├── gradlew.bat ├── .editorconfig ├── CLA.md ├── gradlew ├── README.md └── LICENSE.txt /.github/FUNDING.yml: 
-------------------------------------------------------------------------------- 1 | github: chimbori 2 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | **/*.html linguist-detectable=false -------------------------------------------------------------------------------- /settings.gradle.kts: -------------------------------------------------------------------------------- 1 | rootProject.name = "Crux" 2 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chimbori/crux/HEAD/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.iml 2 | *.private 3 | .classpath 4 | .DS_Store 5 | .gradle/ 6 | .idea/ 7 | .project 8 | .settings/ 9 | bin/ 10 | build/ 11 | local.properties 12 | -------------------------------------------------------------------------------- /test_data/image_extractor_css_style.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /test_data/image_extractor_simple_img.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
Hermit
5 |
6 | -------------------------------------------------------------------------------- /gradle.properties.sample: -------------------------------------------------------------------------------- 1 | # Place this file at `~/.gradle/gradle.properties` and fill in the REDACTED credentials. 2 | 3 | signing.keyId=REDACTED 4 | signing.password=REDACTED 5 | signing.secretKeyRingFile=~/.gnupg/secring.gpg 6 | mavenCentralUsername=chimbori 7 | mavenCentralPassword=REDACTED 8 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-8.12-bin.zip 4 | networkTimeout=10000 5 | validateDistributionUrl=true 6 | zipStoreBase=GRADLE_USER_HOME 7 | zipStorePath=wrapper/dists 8 | -------------------------------------------------------------------------------- /src/main/kotlin/com/chimbori/crux/common/NumberExtensions.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.common 2 | 3 | import kotlin.math.ceil 4 | import kotlin.math.roundToInt 5 | 6 | /** Cannot use [TimeUnit.MILLISECONDS.toMinutes()]; it rounds down, so anything under 1 min is reported as 0. 
*/ 7 | public fun Int?.millisecondsToMinutes(): Int = this?.let { milliseconds -> 8 | ceil(milliseconds.toDouble() / 60_000).roundToInt() 9 | } ?: 0 10 | -------------------------------------------------------------------------------- /src/main/kotlin/com/chimbori/crux/common/JsoupExtensions.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.common 2 | 3 | import org.jsoup.nodes.Element 4 | import org.jsoup.select.Elements 5 | 6 | internal fun Element.parseAttrAsInt(attr: String) = try { 7 | attr(attr).toInt() 8 | } catch (e: NumberFormatException) { 9 | 0 10 | } 11 | 12 | internal fun Elements.anyChildTagWithAttr(attribute: String): String? = 13 | firstOrNull { element -> element.attr(attribute).isNotBlank() } 14 | ?.attr(attribute) 15 | -------------------------------------------------------------------------------- /src/test/kotlin/com/chimbori/crux/common/NumberExtensionsTest.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.common 2 | 3 | import org.junit.Assert.assertEquals 4 | import org.junit.Test 5 | 6 | class NumberExtensionsTest { 7 | @Test 8 | fun testMillisecondsToMinutes() { 9 | assertEquals(0, 0.millisecondsToMinutes()) 10 | assertEquals(1, 1.millisecondsToMinutes()) 11 | assertEquals(10, (10 * 60 * 1000).millisecondsToMinutes()) 12 | assertEquals(-1, (-1 * 60 * 1000).millisecondsToMinutes()) 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /.github/workflows/gradle-test.yml: -------------------------------------------------------------------------------- 1 | name: Gradle Test 2 | on: 3 | push: 4 | branches: [ main ] 5 | pull_request: 6 | branches: [ main ] 7 | 8 | jobs: 9 | run-tests: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout Source 13 | uses: actions/checkout@v4 14 | - name: Set up JDK 15 | uses: actions/setup-java@v4 16 | with: 17 | java-version: '17' 
18 | distribution: 'temurin' 19 | - name: Run Gradle Checks 20 | run: ./gradlew check --info 21 | -------------------------------------------------------------------------------- /src/main/kotlin/com/chimbori/crux/plugins/FacebookUrlRewriter.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.plugins 2 | 3 | import com.chimbori.crux.api.Rewriter 4 | import okhttp3.HttpUrl 5 | import okhttp3.HttpUrl.Companion.toHttpUrlOrNull 6 | 7 | /** Unwraps `*.facebook.com/l.php` redirect links by following their `u` query parameter to the destination URL. */ 8 | public class FacebookUrlRewriter : Rewriter { 9 | private fun canRewrite(url: HttpUrl) = url.host.endsWith(".facebook.com") && url.encodedPath == "/l.php" 10 | 11 | /** Returns the innermost destination URL, or the last URL reached if no valid `u` parameter is present. */ 12 | override fun rewrite(url: HttpUrl): HttpUrl { 13 | var outputUrl: HttpUrl = url 14 | /* Redirect links may be nested; `break` when no destination can be parsed, or the loop would never end. */ 15 | while (canRewrite(outputUrl)) { 16 | outputUrl = outputUrl.queryParameter("u")?.toHttpUrlOrNull() ?: break 17 | } 18 | return outputUrl 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/kotlin/com/chimbori/crux/plugins/GoogleUrlRewriter.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.plugins 2 | 3 | import com.chimbori.crux.api.Rewriter 4 | import okhttp3.HttpUrl 5 | import okhttp3.HttpUrl.Companion.toHttpUrlOrNull 6 | 7 | /** Unwraps `*.google.com/url` redirect links by following their `q` (or, failing that, `url`) query parameter. */ 8 | public class GoogleUrlRewriter : Rewriter { 9 | private fun canRewrite(url: HttpUrl) = url.host.endsWith(".google.com") && url.encodedPath == "/url" 10 | 11 | /** Returns the innermost destination URL, or the last URL reached if no valid destination parameter is present. */ 12 | override fun rewrite(url: HttpUrl): HttpUrl { 13 | var outputUrl: HttpUrl = url 14 | /* Redirect links may be nested; `break` when no destination can be parsed, or the loop would never end. */ 15 | while (canRewrite(outputUrl)) { 16 | outputUrl = (outputUrl.queryParameter("q") ?: outputUrl.queryParameter("url")) 17 | ?.toHttpUrlOrNull() 18 | ?: break 19 | } 20 | return outputUrl 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/kotlin/com/chimbori/crux/common/Log.kt: 
-------------------------------------------------------------------------------- 1 | package com.chimbori.crux.common 2 | 3 | import org.jsoup.nodes.Node 4 | 5 | internal object Log { 6 | private const val DEBUG = false 7 | 8 | private const val TRUNCATE = true 9 | 10 | fun i(message: String, vararg args: Any?) { 11 | if (DEBUG) { 12 | System.err.println(String.format(message, *args)) 13 | } 14 | } 15 | 16 | fun i(reason: String, node: Node) { 17 | if (DEBUG) { 18 | val nodeToString = if (TRUNCATE) { 19 | node.outerHtml().take(80).replace("\n", "") 20 | } else { 21 | "\n------\n${node.outerHtml()}\n------\n" 22 | } 23 | i("%s [%s]", reason, nodeToString) 24 | } 25 | } 26 | 27 | fun printAndRemove(reason: String, node: Node) { 28 | i(reason, node) 29 | node.remove() 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /gradle.properties: -------------------------------------------------------------------------------- 1 | GROUP=com.chimbori.crux 2 | POM_ARTIFACT_ID=crux 3 | VERSION_NAME=5.1.0 4 | 5 | POM_NAME=Crux 6 | POM_DESCRIPTION=Crux offers a flexible plugin-based API & implementation to extract metadata from Web pages. 
7 | POM_INCEPTION_YEAR=2016 8 | 9 | POM_URL=https://github.com/chimbori/crux 10 | POM_SCM_URL=https://github.com/chimbori/crux 11 | POM_SCM_CONNECTION=scm:git:git://github.com/chimbori/crux.git 12 | POM_SCM_DEV_CONNECTION=scm:git:ssh://github.com:chimbori/crux.git 13 | 14 | POM_LICENSE_NAME=The Apache Software License, Version 2.0 15 | POM_LICENSE_URL=https://www.apache.org/licenses/LICENSE-2.0.txt 16 | POM_LICENSE_DIST=repo 17 | 18 | POM_DEVELOPER_ID=chimbori 19 | POM_DEVELOPER_NAME=Chimbori 20 | POM_DEVELOPER_URL=https://github.com/chimbori 21 | 22 | SONATYPE_HOST=DEFAULT 23 | SONATYPE_AUTOMATIC_RELEASE=true 24 | -------------------------------------------------------------------------------- /src/main/kotlin/com/chimbori/crux/plugins/FaviconExtractor.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.plugins 2 | 3 | import com.chimbori.crux.api.Extractor 4 | import com.chimbori.crux.api.Fields.FAVICON_URL 5 | import com.chimbori.crux.api.Resource 6 | import com.chimbori.crux.common.isLikelyArticle 7 | import com.chimbori.crux.extractors.extractCanonicalUrl 8 | import com.chimbori.crux.extractors.extractFaviconUrl 9 | import okhttp3.HttpUrl 10 | 11 | public class FaviconExtractor : Extractor { 12 | /** Skip handling any file extensions that are unlikely to be HTML pages. 
*/ 13 | public override fun canExtract(url: HttpUrl): Boolean = url.isLikelyArticle() 14 | 15 | override suspend fun extract(request: Resource): Resource { 16 | val canonicalUrl = request.document?.extractCanonicalUrl()?.let { request.url?.resolve(it) } ?: request.url 17 | return Resource(metadata = mapOf(FAVICON_URL to request.document?.extractFaviconUrl(canonicalUrl))).removeNullValues() 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /gradle/libs.versions.toml: -------------------------------------------------------------------------------- 1 | [versions] 2 | kotlin = "1.9.0" 3 | coroutines = "1.7.2" 4 | okhttp = "4.11.0" 5 | 6 | [libraries] 7 | kotlin-plugin = { module = "org.jetbrains.kotlin:kotlin-gradle-plugin", version.ref = "kotlin" } 8 | coroutines-core = { module = "org.jetbrains.kotlinx:kotlinx-coroutines-core", version.ref = "coroutines" } 9 | jsoup = "org.jsoup:jsoup:1.16.1" 10 | klaxon = "com.beust:klaxon:5.6" 11 | okhttp = { module = "com.squareup.okhttp3:okhttp", version.ref = "okhttp" } 12 | junit = "junit:junit:4.13.2" 13 | okhttp-logging = { module = "com.squareup.okhttp3:logging-interceptor", version.ref = "okhttp" } 14 | okhttp-mockwebserver = { module = "com.squareup.okhttp3:mockwebserver", version.ref = "okhttp" } 15 | 16 | [plugins] 17 | kotlin-jvm = { id = "org.jetbrains.kotlin.jvm", version.ref = "kotlin" } 18 | ben-manes-versions = { id = "com.github.ben-manes.versions", version = "0.47.0" } 19 | maven-publish = { id = "com.vanniktech.maven.publish", version = "0.24.0" } 20 | -------------------------------------------------------------------------------- /.github/workflows/dependency-review.yml: -------------------------------------------------------------------------------- 1 | # Dependency Review Action 2 | # 3 | # This Action will scan dependency manifest files that change as part of a Pull Request, surfacing 4 | # known-vulnerable versions of the packages declared or updated in the 
PR. Once installed, if the 5 | # workflow run is marked as required, PRs introducing known-vulnerable packages will be blocked 6 | # from merging. 7 | # 8 | # Source repository: https://github.com/actions/dependency-review-action 9 | # Public documentation: https://docs.github.com/en/code-security/supply-chain-security/understanding-your-software-supply-chain/about-dependency-review#dependency-review-enforcement 10 | 11 | name: 'Dependency Review' 12 | on: [pull_request] 13 | 14 | permissions: 15 | contents: read 16 | 17 | jobs: 18 | dependency-review: 19 | runs-on: ubuntu-latest 20 | steps: 21 | - name: 'Checkout Repository' 22 | uses: actions/checkout@v4 23 | - name: 'Dependency Review' 24 | uses: actions/dependency-review-action@v2 25 | -------------------------------------------------------------------------------- /src/main/kotlin/com/chimbori/crux/extractors/LinkUrlExtractor.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.extractors 2 | 3 | import com.chimbori.crux.common.anyChildTagWithAttr 4 | import com.chimbori.crux.common.nullIfBlank 5 | import okhttp3.HttpUrl 6 | import org.jsoup.nodes.Element 7 | 8 | /** 9 | * Given a single DOM Element root, this extractor inspects the sub-tree and returns the best possible link URL 10 | * available within it. The use case for this application is to pick a single representative link from a DOM sub-tree, 11 | * in a way that works without explicit CSS selector foo. Check out the test cases for markup that is supported. 12 | */ 13 | @Suppress("unused") 14 | public class LinkUrlExtractor(private val url: HttpUrl, private val root: Element) { 15 | public var linkUrl: HttpUrl? 
= null 16 | private set 17 | 18 | public fun findLink(): LinkUrlExtractor { 19 | ( 20 | root.attr("abs:href").nullIfBlank() 21 | ?: root.select("*").anyChildTagWithAttr("href") 22 | )?.let { 23 | linkUrl = url.resolve(it) 24 | } 25 | return this 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/test/kotlin/com/chimbori/crux/api/ResourceTest.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.api 2 | 3 | import com.chimbori.crux.api.Fields.BANNER_IMAGE_URL 4 | import com.chimbori.crux.api.Fields.CANONICAL_URL 5 | import com.chimbori.crux.api.Fields.DESCRIPTION 6 | import com.chimbori.crux.api.Fields.TITLE 7 | import okhttp3.HttpUrl.Companion.toHttpUrl 8 | import org.junit.Assert.assertEquals 9 | import org.junit.Assert.assertNull 10 | import org.junit.Test 11 | 12 | class ResourceTest { 13 | @Test 14 | fun testResourceMetadataApiExamples() { 15 | val resource = Resource( 16 | url = "https://chimbori.com/".toHttpUrl(), 17 | metadata = mapOf( 18 | TITLE to "Life, the Universe, and Everything", 19 | DESCRIPTION to "42", 20 | CANONICAL_URL to "https://chimbori.com/".toHttpUrl() 21 | ) 22 | ) 23 | assertEquals("Life, the Universe, and Everything", resource[TITLE]) 24 | assertEquals("42", resource[DESCRIPTION]) 25 | assertEquals("https://chimbori.com/".toHttpUrl(), resource[CANONICAL_URL]) 26 | assertNull(resource[BANNER_IMAGE_URL]) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/test/kotlin/com/chimbori/crux/plugins/TrackingParameterRemoverTest.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.plugins 2 | 3 | import okhttp3.HttpUrl.Companion.toHttpUrl 4 | import org.junit.Assert.assertEquals 5 | import org.junit.Test 6 | 7 | class TrackingParameterRemoverTest { 8 | @Test 9 | fun testThatParametersAreRemoved() { 10 | val trackingRemover 
= TrackingParameterRemover() 11 | mapOf( 12 | "https://example.org/" to null, 13 | "https://example.org?utm_source" to "https://example.org/", 14 | "https://www.example.com/?utm_source=summer-mailer&utm_medium=email&utm_campaign=summer-sale" 15 | to "https://www.example.com/", 16 | "http://www.example.com/?utm_source=newsletter1&utm_medium=email&utm_campaign=summer-sale&utm_content=toplink" 17 | to "http://www.example.com/", 18 | "https://www.example.com/?utm_source=tracker&non-tracking-parameter=dont-remove" 19 | to "https://www.example.com/?non-tracking-parameter=dont-remove", 20 | ).forEach { (key, value) -> 21 | assertEquals((value ?: key).toHttpUrl(), trackingRemover.rewrite(key.toHttpUrl())) 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /test_data/facebook_notification_single.html: -------------------------------------------------------------------------------- 1 | 2 |
3 | 4 |
5 | 6 |
7 | Person Name mentioned you in a comment. 8 |
9 |   10 | 45 minutes ago 11 |
12 |
13 |
14 | -------------------------------------------------------------------------------- /src/test/kotlin/com/chimbori/crux/plugins/GoogleUrlRewriterTest.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.plugins 2 | 3 | import okhttp3.HttpUrl.Companion.toHttpUrl 4 | import org.junit.Assert.assertEquals 5 | import org.junit.Test 6 | 7 | class GoogleUrlRewriterTest { 8 | @Test 9 | fun testGoogleRedirectorPlugin() { 10 | val googleRedirectorPlugin = GoogleUrlRewriter() 11 | mapOf( 12 | "http://example.com/" to null, 13 | "https://plus.url.google.com/url?q=https://arstechnica.com/business/2017/01/before-the-760mph-hyperloop-dream-there-was-the-atmospheric-railway/&rct=j&ust=1485739059621000&usg=AFQjCNH6Cgp4iU0NB5OoDpT3OtOXds7HQg" 14 | to "https://arstechnica.com/business/2017/01/before-the-760mph-hyperloop-dream-there-was-the-atmospheric-railway/", 15 | "https://www.google.com/url?q=https://www.google.com/url?rct%3Dj%26sa%3Dt%26url%3Dhttps://www.facebook.com/permalink.php%253Fid%253D111262459538815%2526story_fbid%253D534292497235807%26ct%3Dga%26cd%3DCAEYACoTOTQxMTQ5NzcyMzExMjAwMTEyMzIcZWNjZWI5M2YwM2E5ZDJiODpjb206ZW46VVM6TA%26usg%3DAFQjCNFSwGsQjcbeVCaSO2rg90RgBpQvzA&source=gmail&ust=1589164930980000&usg=AFQjCNF37pEGpMAz7azFCry-Ib-hwR0VVw" 16 | to "https://www.facebook.com/permalink.php?id=111262459538815&story_fbid=534292497235807", 17 | ).forEach { (key, value) -> 18 | assertEquals((value ?: key).toHttpUrl(), googleRedirectorPlugin.rewrite(key.toHttpUrl())) 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/test/kotlin/com/chimbori/crux/common/TestHelper.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.common 2 | 3 | import com.chimbori.crux.api.Resource 4 | import java.io.File 5 | import okhttp3.HttpUrl 6 | import okhttp3.OkHttpClient 7 | import 
okhttp3.logging.HttpLoggingInterceptor 8 | import okhttp3.logging.HttpLoggingInterceptor.Level.BASIC 9 | import org.jsoup.Jsoup 10 | import org.junit.Assert.fail 11 | 12 | internal val loggingOkHttpClient: OkHttpClient = OkHttpClient.Builder() 13 | .followRedirects(true) 14 | .followSslRedirects(true) 15 | .retryOnConnectionFailure(true) 16 | .addNetworkInterceptor { chain -> 17 | chain.proceed( 18 | chain.request().newBuilder() 19 | .header("User-Agent", CHROME_USER_AGENT).build() 20 | ) 21 | } 22 | .addInterceptor(HttpLoggingInterceptor().apply { level = BASIC }) 23 | .build() 24 | 25 | internal fun Resource.Companion.fromTestData(url: HttpUrl, testFile: String) = Resource( 26 | url = url, 27 | document = Jsoup.parse(File("test_data/$testFile"), "UTF-8", url.toString()), 28 | ) 29 | 30 | /** Fails unless [actual] is non-null and starts with [expected]. */ 31 | internal fun assertStartsWith(expected: String, actual: String?) { 32 | if (actual?.startsWith(expected) != true) { 33 | fail("Expected \n[$expected]\n at start of \n[$actual]\n") 34 | } 35 | } 36 | 37 | /** Fails unless [actual] is non-null and contains [expected]. */ 38 | internal fun assertContains(expected: String, actual: String?) 
{ 39 | if (actual?.contains(expected) != true) { 40 | fail("Expected \n[$expected]\n in \n[$actual]\n") 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/kotlin/com/chimbori/crux/common/StringExtensions.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.common 2 | 3 | import java.lang.Character.isLetter 4 | import java.util.concurrent.TimeUnit.MINUTES 5 | import kotlin.math.ceil 6 | 7 | /** Counts non-overlapping occurrences of [substring], scanning left to right. An empty [substring] yields 0. */ 8 | internal fun String.countMatches(substring: String): Int { 9 | if (substring.isEmpty()) return 0 /* An empty needle matches at every index, so the search would never advance. */ 10 | return generateSequence(indexOf(substring)) { indexOf(substring, it + substring.length) } 11 | .takeWhile { it >= 0 } 12 | .count() 13 | } 14 | 15 | /** Collapses every run of whitespace (spaces, tabs, newlines) into a single space, then trims both ends. */ 16 | internal fun String.removeWhiteSpace() = replace("\\s+".toRegex(), " ").trim { it <= ' ' } 17 | 18 | internal fun String.countLetters() = count { isLetter(it) } 19 | 20 | public fun String.nullIfBlank(): String? = ifBlank { null } 21 | 22 | internal fun String.cleanTitle() = if (lastIndexOf("|") > length / 2) { 23 | substring(0, indexOf("|")).trim() 24 | } else { 25 | removeWhiteSpace() 26 | } 27 | 28 | /** Estimated time, in milliseconds, to read this text at [AVERAGE_WORDS_PER_MINUTE]. */ 29 | public fun String.estimatedReadingTimeMs(): Int { 30 | val wordCount = split("\\s+".toRegex()).size 31 | return ((wordCount * MINUTES.toMillis(1)) / AVERAGE_WORDS_PER_MINUTE).toInt() 32 | } 33 | 34 | /** Estimated time, in whole minutes (rounded up), to read this text at [AVERAGE_WORDS_PER_MINUTE]. */ 35 | public fun String.estimatedReadingTimeMinutes(): Int { 36 | val wordCount = split("\\s+".toRegex()).size 37 | return ceil(wordCount.toDouble() / AVERAGE_WORDS_PER_MINUTE).toInt() /* Divide as Double; Int division truncates before ceil() can round up. */ 38 | } 39 | 40 | /** Number of words that can be read by an average person in one minute. 
*/ 41 | internal const val AVERAGE_WORDS_PER_MINUTE = 275 42 | -------------------------------------------------------------------------------- /src/main/kotlin/com/chimbori/crux/plugins/DocumentFetcher.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.plugins 2 | 3 | import com.chimbori.crux.api.Extractor 4 | import com.chimbori.crux.api.Fields.CANONICAL_URL 5 | import com.chimbori.crux.api.Resource 6 | import com.chimbori.crux.common.fetchFromUrl 7 | import com.chimbori.crux.common.isLikelyArticle 8 | import com.chimbori.crux.extractors.extractCanonicalUrl 9 | import okhttp3.HttpUrl 10 | import okhttp3.OkHttpClient 11 | 12 | /** 13 | * Fetches an HTML document from a remote URL, if not already fetched. 14 | * If a parsed JSoup Document is already available, this is a no-op. 15 | */ 16 | public class DocumentFetcher(private val okHttpClient: OkHttpClient) : Extractor { 17 | /** Skip handling any file extensions that are unlikely to be HTML pages. */ 18 | public override fun canExtract(url: HttpUrl): Boolean = url.isLikelyArticle() 19 | 20 | override suspend fun extract(request: Resource): Resource { 21 | val resourceToUse = if (request.document != null) { 22 | request 23 | } else if (request.url != null) { 24 | Resource.fetchFromUrl(request.url, okHttpClient) 25 | } else { 26 | Resource() 27 | } 28 | 29 | val canonicalUrl = resourceToUse.document?.extractCanonicalUrl() 30 | ?.let { resourceToUse.url?.resolve(it) } 31 | ?: resourceToUse.url 32 | 33 | return Resource( 34 | url = canonicalUrl, 35 | document = resourceToUse.document, 36 | metadata = mapOf(CANONICAL_URL to canonicalUrl) 37 | ).removeNullValues() 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribute to Crux 2 | 3 | Crux is a modern, robust library for parsing HTML articles. 
With all the content on the Web out 4 | there, there is always lots of room for improvement. We will gladly accept your pull requests that 5 | make parsing more accurate, or add new features & metadata detection. 6 | 7 | To maintain the integrity of the library, we have a few simple expectations from all code submitted. 8 | 9 | 1. Before sending a pull request, please open an issue to discuss your changes. Maintainers 10 | will offer feedback and help validate your idea as well as overall design before you spend any 11 | time writing code. 12 | 1. The expected style for code formatting is available in the repo using the 13 | [EditorConfig](https://editorconfig.org/) standard. We recommend using a JetBrains IDE for 14 | Kotlin, and configuring it to automatically use the `.editorconfig` file included in this 15 | repository. 16 | 1. Crux is fully unit-tested, and we want to keep it that way. All new code should include unit 17 | tests. 18 | 1. For parsing improvements, Crux’s rich suite of integration tests should be updated to reflect 19 | the parsing changes. Authors may either choose to test their improvements with existing HTML test 20 | files, or add new ones, as appropriate. 21 | 1. All current tests should continue to pass. Either update the tests in the same commit, or modify 22 | new code so that existing tests continue to pass. 23 | 1. Changes should be self-contained as far as possible. When implementing multiple independent 24 | improvements, each one should be in its own commit. 
25 | -------------------------------------------------------------------------------- /src/main/kotlin/com/chimbori/crux/plugins/AmpRedirector.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.plugins 2 | 3 | import com.chimbori.crux.api.Extractor 4 | import com.chimbori.crux.api.Fields.CANONICAL_URL 5 | import com.chimbori.crux.api.Resource 6 | import com.chimbori.crux.common.fetchFromUrl 7 | import com.chimbori.crux.common.isLikelyArticle 8 | import com.chimbori.crux.common.nullIfBlank 9 | import okhttp3.HttpUrl 10 | import okhttp3.HttpUrl.Companion.toHttpUrlOrNull 11 | import okhttp3.OkHttpClient 12 | 13 | /** 14 | * If the current page is an AMP page, then [AmpRedirector] extracts the canonical URL & replaces the DOM tree for the AMP 15 | * page with the DOM tree for the canonical page. 16 | */ 17 | public class AmpRedirector( 18 | private val refetchContentFromCanonicalUrl: Boolean, 19 | private val okHttpClient: OkHttpClient 20 | ) : Extractor { 21 | /** Skip handling any file extensions that are unlikely to be an HTML page. */ 22 | override fun canExtract(url: HttpUrl): Boolean = url.isLikelyArticle() 23 | 24 | override suspend fun extract(request: Resource): Resource? { 25 | request.document?.select("link[rel=canonical]")?.attr("abs:href")?.nullIfBlank()?.let { 26 | val canonicalUrl = it.toHttpUrlOrNull() 27 | if (canonicalUrl != request.url) { // Only redirect if this is not already the canonical URL. 
28 | return if (refetchContentFromCanonicalUrl && canonicalUrl != null) { 29 | Resource.fetchFromUrl(url = canonicalUrl, okHttpClient = okHttpClient) 30 | } else { 31 | Resource(url = canonicalUrl, metadata = mapOf(CANONICAL_URL to canonicalUrl)) 32 | } 33 | } 34 | } 35 | return null 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/test/kotlin/com/chimbori/crux/extractors/LinkUrlExtractorTest.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.extractors 2 | 3 | import com.chimbori.crux.api.Resource 4 | import com.chimbori.crux.common.fromTestData 5 | import java.io.IOException 6 | import okhttp3.HttpUrl 7 | import okhttp3.HttpUrl.Companion.toHttpUrl 8 | import org.junit.Assert.assertEquals 9 | import org.junit.Assert.fail 10 | import org.junit.Test 11 | 12 | class LinkUrlExtractorTest { 13 | @Test 14 | fun testFindLink() { 15 | assertEquals( 16 | "https://m.facebook.com/story.php?story_fbid=11111111111111111&id=1111111111&comment_id=11111111111111111¬if_t=comment_mention¬if_id=1111111111111111&ref=m_notif#11111111111111111".toHttpUrl(), 17 | extractFromTestFile( 18 | "https://m.facebook.com/notifications".toHttpUrl(), 19 | "facebook_notification_single.html" 20 | )?.findLink()?.linkUrl 21 | ) 22 | assertEquals( 23 | "https://hermit.chimbori.com/test-url".toHttpUrl(), 24 | extractFromTestFile( 25 | "https://hermit.chimbori.com".toHttpUrl(), 26 | "image_extractor_simple_img.html" 27 | )?.findLink()?.linkUrl 28 | ) 29 | assertEquals( 30 | "https://hermit.chimbori.com/test".toHttpUrl(), 31 | extractFromTestFile( 32 | "https://hermit.chimbori.com".toHttpUrl(), 33 | "image_extractor_css_style.html" 34 | )?.findLink()?.linkUrl 35 | ) 36 | } 37 | 38 | private fun extractFromTestFile(baseUrl: HttpUrl, testFile: String) = try { 39 | val resource = Resource.fromTestData(baseUrl, testFile) 40 | LinkUrlExtractor(baseUrl, resource.document!!.body()) 41 | } 
catch (e: IOException) { 42 | fail(e.message) 43 | null 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /test_data/youtube.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | YouTube 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /src/test/kotlin/com/chimbori/crux/extractors/ImageUrlExtractorTest.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.extractors 2 | 3 | import com.chimbori.crux.api.Resource 4 | import com.chimbori.crux.common.fromTestData 5 | import java.io.IOException 6 | import okhttp3.HttpUrl 7 | import okhttp3.HttpUrl.Companion.toHttpUrl 8 | import org.junit.Assert.assertEquals 9 | import org.junit.Assert.fail 10 | import org.junit.Test 11 | 12 | class ImageUrlExtractorTest { 13 | @Test 14 | fun testFindImage() { 15 | assertEquals( 16 | "https://scontent-sea1-1.xx.fbcdn.net/v/t1.0-1/cp0/e15/q65/c30.0.120.120/p120x120/1111111_11111111111111111_1111111111_n.jpg?efg=abcdefghijk1&oh=1234567890abcdef1234567890abcdef&oe=ABCDEF12".toHttpUrl(), 17 | extractFromTestFile( 18 | baseUrl = "https://m.facebook.com/notifications".toHttpUrl(), 19 | testFile = "facebook_notification_single.html" 20 | )?.findImage()?.imageUrl 21 | ) 22 | assertEquals( 23 | "https://hermit.chimbori.com/static/media/test.jpg".toHttpUrl(), 24 | extractFromTestFile( 25 | baseUrl = "https://hermit.chimbori.com".toHttpUrl(), 26 | testFile = "image_extractor_simple_img.html" 27 | )?.findImage()?.imageUrl 28 | ) 29 | assertEquals( 30 | "https://hermit.chimbori.com/static/media/test.jpg".toHttpUrl(), 31 | extractFromTestFile( 32 | baseUrl = "https://hermit.chimbori.com".toHttpUrl(), 33 | testFile = "image_extractor_css_style.html" 34 | )?.findImage()?.imageUrl 35 | ) 36 | } 37 | 38 | private fun 
extractFromTestFile(baseUrl: HttpUrl, testFile: String) = try { 39 | val resource = Resource.fromTestData(baseUrl, testFile) 40 | ImageUrlExtractor(baseUrl, resource.document!!.body()) 41 | } catch (e: IOException) { 42 | fail(e.message) 43 | null 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /RELEASING.md: -------------------------------------------------------------------------------- 1 | # Publish a New Release 2 | 3 | ## Create a New Release 4 | 5 | 1. Ensure all tests pass & CI indicates that the status is green. 6 | 1. Update `VERSION_NAME` in `gradle.properties`. 7 | 1. Create a new commit for the version number change, naming it `Bump version to x.y.z`. 8 | 1. Tag that commit as `vx.y.z` (must match `v[0-9]+.[0-9]+.[0-9]+`). 9 | 1. Push all commits & tags to GitHub. 10 | 11 | ## Publish to Maven Central 12 | 13 | Confirm that the `gradle.properties` file in the home directory (`~/.gradle/gradle.properties`) is 14 | present and [set up correctly](#set-up-key-signing-on-a-new-machine). 15 | 16 | ```shell 17 | ./gradlew publish 18 | ``` 19 | ## Close & Release Manually 20 | 21 | ### Manually 22 | 23 | These steps assume that `./gradlew publish` has already been run for the newly released version. 24 | 25 | - Go to https://oss.sonatype.org/#stagingRepositories, log in as `chimbori`. 26 | - Select the `comchimboricrux-xxxx` repo. 27 | - Click on `Close` from the top toolbar, wait for it to complete. 28 | - Click on `Release` from the top toolbar. 29 | 30 | ## Set Up Key Signing on a New Machine 31 | 32 | ### Signing 33 | 34 | 1. Install GPG, e.g. `brew install gpg` on macOS. 35 | 1. Locate stored credentials from private storage. 36 | 1. Run `restore-keys.sh` from the stored credentials directory. 37 | 1. Enter the password for `chimbori` when prompted. This password is different from the Sonatype/Nexus password. 38 | 39 | ### Credentials 40 | 41 | 1. 
Copy `gradle.properties.sample` to `~/.gradle/gradle.properties` and fill in the missing redacted credentials. 42 | 1. If `gradle.properties.private` exists, it may be used instead. `gradle.properties.private` is configured to be 43 | `.gitignore`d, so make sure it is never pushed to a public repo. 44 | 1. The new machine is now ready and configured for pushing to Maven Central. 45 | -------------------------------------------------------------------------------- /src/main/kotlin/com/chimbori/crux/api/Plugins.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.api 2 | 3 | import okhttp3.HttpUrl 4 | 5 | public sealed interface Plugin 6 | 7 | /** 8 | * Rewriters are plugins that can modify the URL before it’s processed by other plugins. They should not have access 9 | * to the network, and should execute quickly on the main thread if necessary. 10 | */ 11 | public fun interface Rewriter : Plugin { 12 | public fun rewrite(url: HttpUrl): HttpUrl 13 | } 14 | 15 | /** 16 | * Crux is designed as a chain of plugins, each of which can optionally handle URLs passed to it. Each plugin is 17 | * provided a fully-parsed HTML DOM to extract fields from, and can also make additional HTTP requests if necessary to 18 | * retrieve additional metadata or to follow redirects. 19 | * 20 | * Metadata fields can be set via the [Resource.metadata] property. Plugins can also rewrite the canonical URL, and can 21 | * provide an updated DOM tree if the canonical URL is changed. The updated URL and DOM tree will be passed on to the 22 | * next plugin in sequence, so the exact ordering of plugins is important. 23 | */ 24 | public interface Extractor : Plugin { 25 | /** 26 | * @param url URL for the resource being processed by Crux. 27 | * @return true if this plugin can handle the URL, false otherwise. Plugins can only inspect the [HttpUrl], without 28 | * being able to peek at the content. 
29 | */ 30 | public fun canExtract(url: HttpUrl): Boolean 31 | 32 | /** 33 | * @param request metadata & DOM content for the request being handled. 34 | * @return a partially populated [Resource] with newly-extracted fields. Include only those fields that need to be 35 | * set or updated; they will be merged with the set of previously-extracted fields. If no fields need to be updated, 36 | * return `null`. 37 | */ 38 | public suspend fun extract(request: Resource): Resource? 39 | } 40 | -------------------------------------------------------------------------------- /src/test/kotlin/com/chimbori/crux/common/OkHttpExtensionsTest.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.common 2 | 3 | import com.chimbori.crux.api.Resource 4 | import com.chimbori.crux.createCruxOkHttpClient 5 | import kotlinx.coroutines.runBlocking 6 | import okhttp3.mockwebserver.Dispatcher 7 | import okhttp3.mockwebserver.MockResponse 8 | import okhttp3.mockwebserver.MockWebServer 9 | import okhttp3.mockwebserver.RecordedRequest 10 | import org.junit.After 11 | import org.junit.Assert.assertEquals 12 | import org.junit.Assert.assertNotEquals 13 | import org.junit.Before 14 | import org.junit.Test 15 | 16 | class OkHttpExtensionsTest { 17 | private val okHttpClient = createCruxOkHttpClient() 18 | private lateinit var mockWebServer: MockWebServer 19 | 20 | @Before 21 | fun setUp() { 22 | mockWebServer = MockWebServer().apply { 23 | dispatcher = object : Dispatcher() { 24 | override fun dispatch(request: RecordedRequest) = MockResponse().setBody("${request.path}") 25 | } 26 | start() 27 | } 28 | } 29 | 30 | @After 31 | fun tearDown() { 32 | mockWebServer.shutdown() 33 | } 34 | 35 | @Test 36 | fun testHttpRedirectUrlReturnedInsteadOfOriginalUrl() { 37 | val originalUrl = mockWebServer.url("/original") 38 | val redirectedUrl = mockWebServer.url("/redirected") 39 | mockWebServer.dispatcher = object : Dispatcher() { 40 | override fun 
dispatch(request: RecordedRequest) = when (request.path) { 41 | originalUrl.encodedPath -> MockResponse().setResponseCode(302).setHeader("Location", redirectedUrl) 42 | redirectedUrl.encodedPath -> MockResponse().setBody("") 43 | else -> MockResponse().setResponseCode(404) 44 | } 45 | } 46 | 47 | val resource = runBlocking { 48 | Resource.fetchFromUrl(originalUrl, okHttpClient) 49 | } 50 | assertNotEquals(originalUrl, resource.url) 51 | assertEquals(redirectedUrl, resource.url) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/test/kotlin/com/chimbori/crux/common/StringExtensionsTest.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.common 2 | 3 | import java.net.MalformedURLException 4 | import java.net.URL 5 | import org.junit.Assert.assertEquals 6 | import org.junit.Test 7 | 8 | class StringExtensionsTest { 9 | @Test 10 | fun testInnerTrim() { 11 | assertEquals("", " ".removeWhiteSpace()) 12 | assertEquals("t", " t ".removeWhiteSpace()) 13 | assertEquals("t t t", "t t t ".removeWhiteSpace()) 14 | assertEquals("t t", "t \nt ".removeWhiteSpace()) 15 | assertEquals("t peter", "t peter ".removeWhiteSpace()) 16 | assertEquals("t t", "t \n t ".removeWhiteSpace()) 17 | } 18 | 19 | @Test 20 | fun testCount() { 21 | assertEquals(1, "hi wie &test; gehts".countMatches("&test;")) 22 | assertEquals(1, "&test;".countMatches("&test;")) 23 | assertEquals(2, "&test;&test;".countMatches("&test;")) 24 | assertEquals(2, "&test; &test;".countMatches("&test;")) 25 | assertEquals(3, "&test; test; &test; plu &test;".countMatches("&test;")) 26 | } 27 | 28 | @Test 29 | fun testImageProtocolRelative() { 30 | val result = try { 31 | URL( 32 | URL("http://de.wikipedia.org/wiki/Griechenland"), 33 | "//upload.wikimedia.org/wikipedia/commons/thumb/5/5c/Flag_of_Greece.svg/150px-Flag_of_Greece.svg.png" 34 | ).toString() 35 | } catch (e: MalformedURLException) { 36 | 
"//upload.wikimedia.org/wikipedia/commons/thumb/5/5c/Flag_of_Greece.svg/150px-Flag_of_Greece.svg.png" 37 | } 38 | assertEquals( 39 | "http://upload.wikimedia.org/wikipedia/commons/thumb/5/5c/Flag_of_Greece.svg/150px-Flag_of_Greece.svg.png", 40 | result 41 | ) 42 | } 43 | 44 | @Test 45 | fun testCleanTitle() { 46 | assertEquals( 47 | "World stock markets surge amid confidence Clinton will win US election", 48 | "World stock markets surge amid confidence Clinton will win US election | Business | The Guardian".cleanTitle() 49 | ) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/kotlin/com/chimbori/crux/api/Resource.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.api 2 | 3 | import okhttp3.HttpUrl 4 | import org.jsoup.nodes.Document 5 | import org.jsoup.nodes.Element 6 | 7 | /** A [Resource] encapsulates metadata and content related to an HTTP resource. */ 8 | public data class Resource( 9 | /** Canonical URL for this resource. */ 10 | val url: HttpUrl? = null, 11 | 12 | /** Parsed DOM tree for this resource, if available. */ 13 | val document: Document? = null, 14 | 15 | /** 16 | * Extracted and cleaned-up DOM tree for this resource, if available. 17 | * If this is null, then article extraction has not been performed, or has failed. 18 | */ 19 | val article: Element? = null, 20 | 21 | /** A holder for any kind of custom objects that library users may want to use. */ 22 | val metadata: Map = emptyMap(), 23 | ) { 24 | /** @return value of a named field in [Resource.metadata]. */ 25 | public operator fun get(key: String): Any? = metadata[key] 26 | 27 | /** 28 | * Merges non-null fields from another [Resource] with this object, and returns a new immutable object. Prefer to use 29 | * this operator instead of manually merging the two objects, so that all fields are correctly merged and not clobbered. 
30 | */ 31 | public operator fun plus(anotherResource: Resource?): Resource = Resource( 32 | url = anotherResource?.url ?: url, 33 | document = anotherResource?.document ?: document, 34 | article = anotherResource?.article ?: article, 35 | metadata = if (anotherResource?.metadata == null) metadata else metadata + anotherResource.metadata, 36 | ) 37 | 38 | /** Returns an immutable copy of this [Resource] that only contains non-null values for each key in [metadata]. */ 39 | public fun removeNullValues(): Resource = copy( 40 | metadata = metadata.filterValues { it != null }, 41 | ) 42 | 43 | /** For any potential extension functions to be defined on the companion object. */ 44 | public companion object 45 | } 46 | -------------------------------------------------------------------------------- /src/main/kotlin/com/chimbori/crux/api/Fields.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.api 2 | 3 | /** Well-known keys to use in [Resource.metadata]. 
*/ 4 | public object Fields { 5 | public const val TITLE: String = "title" 6 | public const val DESCRIPTION: String = "description" 7 | public const val SITE_NAME: String = "site-name" 8 | public const val LANGUAGE: String = "language" 9 | public const val DISPLAY: String = "display" 10 | public const val ORIENTATION: String = "orientation" 11 | public const val PUBLISHED_AT: String = "published_at" 12 | public const val MODIFIED_AT: String = "modified_at" 13 | 14 | public const val THEME_COLOR_HEX: String = "theme-color-hex" 15 | public const val THEME_COLOR_HTML: String = "theme-color-html" // Named colors like "aliceblue" 16 | public const val BACKGROUND_COLOR_HEX: String = "background-color-hex" 17 | public const val BACKGROUND_COLOR_HTML: String = "background-color-html" // Named colors like "aliceblue" 18 | 19 | public const val CANONICAL_URL: String = "canonical-url" 20 | public const val AMP_URL: String = "amp-url" 21 | public const val FAVICON_URL: String = "favicon-url" 22 | public const val BANNER_IMAGE_URL: String = "banner-image-url" 23 | public const val FEED_URL: String = "feed-url" 24 | public const val VIDEO_URL: String = "video-url" 25 | public const val WEB_APP_MANIFEST_URL: String = "web-app-manifest-url" // https://www.w3.org/TR/appmanifest/ 26 | public const val NEXT_PAGE_URL: String = "next-page-url" 27 | public const val PREVIOUS_PAGE_URL: String = "previous-page-url" 28 | 29 | // For image or video resources only. 30 | public const val ALT_TEXT: String = "alt-text" 31 | public const val WIDTH_PX: String = "width-px" 32 | public const val HEIGHT_PX: String = "height-px" 33 | 34 | // For articles (estimated reading time) and audio/video content (playback duration). 
35 | public const val DURATION_MS: String = "duration-ms" 36 | 37 | public const val TWITTER_HANDLE: String = "twitter-handle" 38 | public const val KEYWORDS_CSV: String = "keywords-csv" 39 | } 40 | -------------------------------------------------------------------------------- /src/test/kotlin/com/chimbori/crux/plugins/FacebookUrlRewriterTest.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.plugins 2 | 3 | import okhttp3.HttpUrl.Companion.toHttpUrl 4 | import org.junit.Assert.assertEquals 5 | import org.junit.Test 6 | 7 | class FacebookUrlRewriterTest { 8 | @Test 9 | fun testFacebookRedirectorPlugin() { 10 | val facebookPlugin = FacebookUrlRewriter() 11 | mapOf( 12 | "http://example.com" to null, 13 | "http://www.facebook.com/l.php?u=http%3A%2F%2Fwww.bet.com%2Fcollegemarketingreps&h=42263" 14 | to "http://www.bet.com/collegemarketingreps", 15 | "https://lm.facebook.com/l.php?u=https%3A%2F%2Fwww.wired.com%2F2014%2F08%2Fmaryam-mirzakhani-fields-medal%2F&h=ATMfLBdoriaBcr9HOvzkEe68VZ4hLhTiFINvMmq5_e6fC9yi3xe957is3nl8VJSWhUO_7BdOp7Yv9CHx6MwQaTkwbZ1CKgSQCt45CROzUw0C37Tp4V-2EvDSBuBM2H-Qew&enc=AZPhspzfaWR0HGkmbExT_AfCFThsP829S0z2UWadB7ponM3YguqyJXgtn2E9BAv_-IdZvW583OnNC9M6WroEsV1jlilk3FXS4ppeydAzaJU_o9gq6HvoGMj0N_SiIKHRE_Gamq8xVdEGPnCJi078X8fTEW_jrkwpPC6P6p5Z3gv6YkFZfskU6J9qe3YRyarG4dgM25dJFnVgxxH-qyHlHsYbMD69i2MF8QNreww1J6S84y6VbIxXC-m9dVfFlNQVmtWMUvJKDLcPmYNysyQSYvkknfZ9SgwBhimurLFmKWhf39nNNVYjjCszCJ1XT57xX0Q&s=1" 16 | to "https://www.wired.com/2014/08/maryam-mirzakhani-fields-medal/", 17 | 
"http://lm.facebook.com/l.php?u=http%3A%2F%2Fwww.cnn.com%2F2017%2F01%2F25%2Fpolitics%2Fscientists-march-dc-trnd%2Findex.html&h=ATO7Ln_rl7DAjRcqSo8yfpOvrFlEmKZmgeYHsOforgXsUYPLDy3nC1KfCYE-hev5oJzz1zydvvzI4utABjHqU1ruwDfw49jiDGCTrjFF-EyE6xfcbWRmDacY_6_R-lSi9g&enc=AZP1hkQfMXuV0vOHa1VeY8kdip2N73EjbXMKx3Zf4Ytdb1MrGHL48by4cl9_DShGYj9nZXvNt9xad9_4jphO9QBpRJLNGoyrRMBHI09eoFyPmxxjw7hHBy5Ouez0q7psi1uvjiphzOKVxjxyYBWnTJKD7m8rvhFz0HespmfvCf-fUiCpi6NDpxwYEw7vZ99fcjOpkiQqaFM_Gvqeat7r0e8axnqM-pJGY0fkjgWvgwTyfiB4fNMRhH3IaAmyL7DXl0xeYMoYSHuITkjTY9aU5dkiETfDVwBABOO9FJi2nTnRMw92E-gMMbiHFoHENlaSVJc&s=1" 18 | to "http://www.cnn.com/2017/01/25/politics/scientists-march-dc-trnd/index.html", 19 | ).forEach { (key, value) -> 20 | assertEquals((value ?: key).toHttpUrl(), facebookPlugin.rewrite(key.toHttpUrl())) 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /.github/workflows/release-jar.yml: -------------------------------------------------------------------------------- 1 | name: Release JAR 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v[0-9]+.[0-9]+.[0-9]+" # Matches "v1.2.3" but not "v1.2.3.4" 7 | # The JAR filename cannot be changed from "crux-1.2.3.jar" because it’s 8 | # also published to Maven, so use the tag name to derive the JAR filename. 9 | # WARNING: This only works if the tag name format is "v1.2.3". 
10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | steps: 15 | # ---- Create a release first, even if build fails ------------------------- 16 | - name: Create Release 17 | id: create_release 18 | uses: actions/create-release@v1 19 | env: 20 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 21 | with: 22 | tag_name: ${{ github.ref }} 23 | release_name: ${{ github.ref }} 24 | 25 | # ---- Set up environment -------------------------------------------------- 26 | - name: Checkout Source 27 | uses: actions/checkout@v4 28 | - name: Set up JDK 29 | uses: actions/setup-java@v4 30 | with: 31 | java-version: '17' 32 | distribution: 'temurin' 33 | 34 | # ---- Build the JAR ------------------------------------------------------- 35 | - name: Build the JAR 36 | run: ./gradlew jar 37 | 38 | # ---- Compute the JAR filename -------------------------------------------- 39 | - name: Compute the JAR filename 40 | # Remove the "v" prefix from the tag. ("v1.2.3" --> "crux-1.2.3.jar") 41 | run: | 42 | JAR_FILENAME=crux-$(echo "${{ github.ref }}" | sed -e 's|^refs/tags/v||').jar 43 | echo "JAR_FILENAME=$JAR_FILENAME" >> $GITHUB_ENV 44 | 45 | # ---- Upload JAR to GitHub Release ---------------------------------------- 46 | - name: Upload JAR to GitHub Release 47 | uses: actions/upload-release-asset@v1 48 | env: 49 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 50 | with: 51 | upload_url: ${{ steps.create_release.outputs.upload_url }} 52 | asset_path: build/libs/crux.jar 53 | asset_name: ${{ env.JAR_FILENAME }} 54 | asset_content_type: application/java-archive 55 | -------------------------------------------------------------------------------- /src/main/kotlin/com/chimbori/crux/extractors/ImageUrlExtractor.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.extractors 2 | 3 | import com.chimbori.crux.common.anyChildTagWithAttr 4 | import com.chimbori.crux.common.nullIfBlank 5 | import java.util.regex.Pattern 6 | import 
okhttp3.HttpUrl 7 | import org.jsoup.nodes.Element 8 | import org.jsoup.parser.Parser.unescapeEntities 9 | import org.jsoup.select.Elements 10 | 11 | /** 12 | * Given a single DOM Element root, this extractor inspects the sub-tree and returns the best possible image URL 13 | * candidate available within it. The use case for this application is to pick a single representative image from a DOM 14 | * sub-tree, in a way that works without explicit CSS selector foo. Check out the test cases for markup that is 15 | * supported. 16 | */ 17 | @Suppress("unused") 18 | public class ImageUrlExtractor(private val url: HttpUrl, private val root: Element) { 19 | public var imageUrl: HttpUrl? = null 20 | private set 21 | 22 | public fun findImage(): ImageUrlExtractor { 23 | ( 24 | root.attr("src").nullIfBlank() 25 | ?: root.attr("data-src").nullIfBlank() 26 | ?: root.select("img").anyChildTagWithAttr("src") 27 | ?: root.select("img").anyChildTagWithAttr("data-src") 28 | ?: root.select("*").anyChildTagWithAttr("src") 29 | ?: root.select("*").anyChildTagWithAttr("data-src") 30 | ?: parseImageUrlFromStyleAttr(root.select("[role=img]")) 31 | ?: parseImageUrlFromStyleAttr(root.select("*")) 32 | )?.let { imageUrl = url.resolve(it) } 33 | return this 34 | } 35 | 36 | private fun parseImageUrlFromStyleAttr(elements: Elements): String? 
{ 37 | elements.forEach { element -> 38 | var styleAttr = element.attr("style") 39 | if (styleAttr.isNullOrEmpty()) { 40 | return@forEach 41 | } 42 | styleAttr = unescapeEntities(styleAttr, true) 43 | val cssUrlMatcher = CSS_URL.matcher(styleAttr) 44 | if (cssUrlMatcher.find()) { 45 | return cssUrlMatcher.group(1) 46 | } 47 | } 48 | return null 49 | } 50 | 51 | public companion object { 52 | private val CSS_URL = Pattern.compile("url\\([\\\"']{0,1}(.+?)[\\\"']{0,1}\\)") 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/test/kotlin/com/chimbori/crux/plugins/FaviconExtractorTest.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.plugins 2 | 3 | import com.chimbori.crux.api.Fields.FAVICON_URL 4 | import com.chimbori.crux.api.Resource 5 | import com.chimbori.crux.common.fetchFromUrl 6 | import com.chimbori.crux.common.fromTestData 7 | import com.chimbori.crux.common.loggingOkHttpClient 8 | import kotlinx.coroutines.runBlocking 9 | import okhttp3.HttpUrl.Companion.toHttpUrl 10 | import okhttp3.mockwebserver.Dispatcher 11 | import okhttp3.mockwebserver.MockResponse 12 | import okhttp3.mockwebserver.MockWebServer 13 | import okhttp3.mockwebserver.RecordedRequest 14 | import org.junit.After 15 | import org.junit.Assert.assertEquals 16 | import org.junit.Assert.assertTrue 17 | import org.junit.Before 18 | import org.junit.Test 19 | 20 | class FaviconExtractorTest { 21 | private lateinit var mockWebServer: MockWebServer 22 | private lateinit var faviconExtractor: FaviconExtractor 23 | 24 | @Before 25 | fun setUp() { 26 | faviconExtractor = FaviconExtractor() 27 | mockWebServer = MockWebServer().apply { start() } 28 | } 29 | 30 | @After 31 | fun tearDown() { 32 | mockWebServer.shutdown() 33 | } 34 | 35 | @Test 36 | fun testFaviconPlugin() { 37 | mockWebServer.dispatcher = object : Dispatcher() { 38 | override fun dispatch(request: RecordedRequest) = 
MockResponse().setBody( 39 | """| 40 | | 41 | | 42 | | 43 | | 44 | | 45 | | 46 | |""".trimMargin() 47 | ) 48 | } 49 | 50 | val candidateUrl = mockWebServer.url("/") 51 | assertTrue(faviconExtractor.canExtract(candidateUrl)) 52 | 53 | runBlocking { 54 | val parsed = faviconExtractor.extract( 55 | Resource.fetchFromUrl(candidateUrl, loggingOkHttpClient) 56 | ) 57 | assertEquals(mockWebServer.url("/favicon.png"), parsed[FAVICON_URL]) 58 | } 59 | } 60 | 61 | @Test 62 | fun testYouTubeFavicon() { 63 | val candidateUrl = "https://youtube.com".toHttpUrl() 64 | assertTrue(faviconExtractor.canExtract(candidateUrl)) 65 | runBlocking { 66 | val parsed = faviconExtractor.extract(Resource.fromTestData(candidateUrl, "youtube.html")) 67 | assertEquals( 68 | "https://www.youtube.com/s/desktop/c01ea7e3/img/logos/favicon_144x144.png".toHttpUrl(), 69 | parsed[FAVICON_URL] 70 | ) 71 | } 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/main/kotlin/com/chimbori/crux/plugins/TrackingParameterRemover.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.plugins 2 | 3 | import com.chimbori.crux.api.Rewriter 4 | import okhttp3.HttpUrl 5 | 6 | public class TrackingParameterRemover(private val trackingParameters: Array = TRACKING_PARAMETERS) : Rewriter { 7 | override fun rewrite(url: HttpUrl): HttpUrl = url.newBuilder().apply { 8 | url.queryParameterNames.filter { it in trackingParameters }.forEach { 9 | removeAllQueryParameters(it) 10 | } 11 | }.build() 12 | 13 | public companion object { 14 | public val TRACKING_PARAMETERS: Array = arrayOf( 15 | "__hsfp", 16 | "__hssc", 17 | "__hstc", 18 | "__s", 19 | "_hsenc", 20 | "_hsmi", 21 | "_openstat", 22 | "action_object_map", 23 | "action_ref_map", 24 | "action_type_map", 25 | "cvid", 26 | "dclid", 27 | "fb_action_ids", 28 | "fb_action_types", 29 | "fb_ref", 30 | "fb_source", 31 | "fbclid", 32 | "ga_campaign", 33 | "ga_content", 34 | 
"ga_medium", 35 | "ga_place", 36 | "ga_source", 37 | "ga_term", 38 | "gbraid", 39 | "gclid", 40 | "gs_l", 41 | "hsa_acc", 42 | "hsa_ad", 43 | "hsa_cam", 44 | "hsa_grp", 45 | "hsa_kw", 46 | "hsa_mt", 47 | "hsa_net", 48 | "hsa_src", 49 | "hsa_tgt", 50 | "hsa_ver", 51 | "hsCtaTracking", 52 | "ICID", 53 | "igshid", 54 | "mc_cid", 55 | "mc_eid", 56 | "mkt_tok", 57 | "ml_subscriber", 58 | "ml_subscriber_hash", 59 | "msclkid", 60 | "oicd", 61 | "oly_anon_id", 62 | "oly_enc_id", 63 | "otc", 64 | "rb_clickid", 65 | "s_cid", 66 | "soc_src", 67 | "soc_trk", 68 | "stm_campaign", 69 | "stm_cid", 70 | "stm_content", 71 | "stm_medium", 72 | "stm_name", 73 | "stm_reader", 74 | "stm_referrer", 75 | "stm_social", 76 | "stm_social-type", 77 | "stm_source", 78 | "stm_term", 79 | "twclid", 80 | "utm_brand", 81 | "utm_campaign", 82 | "utm_cid", 83 | "utm_content", 84 | "utm_id", 85 | "utm_medium", 86 | "utm_name", 87 | "utm_place", 88 | "utm_pubreferrer", 89 | "utm_reader", 90 | "utm_referrer", 91 | "utm_social", 92 | "utm_social-type", 93 | "utm_source", 94 | "utm_swu", 95 | "utm_term", 96 | "utm_userid", 97 | "utm_viz_id", 98 | "vero_conv", 99 | "vero_id", 100 | "wbraid", 101 | "wickedid", 102 | "yclid", 103 | ) 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /src/test/kotlin/com/chimbori/crux/extractors/MetadataHelpersTest.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.extractors 2 | 3 | import com.chimbori.crux.common.cleanTitle 4 | import org.jsoup.Jsoup 5 | import org.junit.Assert.assertEquals 6 | import org.junit.Test 7 | 8 | class MetadataHelpersTest { 9 | @Test 10 | fun testCleanTitle() { 11 | assertEquals("mytitle irgendwas", "mytitle irgendwas | Facebook".cleanTitle()) 12 | assertEquals("mytitle irgendwas", "mytitle irgendwas | Irgendwas".cleanTitle()) 13 | 14 | // This should fail as most sites do store their name after the post. 
15 | assertEquals("Irgendwas | mytitle irgendwas", "Irgendwas | mytitle irgendwas".cleanTitle()) 16 | } 17 | 18 | @Test 19 | fun testParseSize() { 20 | assertEquals(0, parseSize(null)) 21 | assertEquals(0, parseSize("")) 22 | assertEquals(0, parseSize(" ")) 23 | assertEquals(0, parseSize("x")) 24 | assertEquals(0, parseSize("1")) 25 | assertEquals(128, parseSize("128x128")) 26 | assertEquals(128, parseSize("128x64")) 27 | assertEquals(256, parseSize("128x256")) 28 | assertEquals(128, parseSize("128X128")) 29 | assertEquals(0, parseSize("x 16")) 30 | assertEquals(48, parseSize("16x16 24x24 32x32 48x48")) 31 | assertEquals(128, parseSize("16x16 24x24 128x32 48x48")) 32 | assertEquals(48, parseSize("16x16 24x48")) 33 | assertEquals(16, parseSize("16x16 24")) 34 | assertEquals(0, parseSize("Some string with a 'x' in between")) 35 | } 36 | 37 | @Test 38 | fun testFindLargestIcon() { 39 | assertEquals( 40 | "https://example.org/144.png", 41 | findLargestIcon( 42 | Jsoup.parse( 43 | """ 44 | | 45 | | 46 | | 47 | | 48 | | 49 | """.trimMargin(), "https://example.org/" 50 | ).select("*") 51 | ) 52 | ) 53 | 54 | assertEquals( 55 | "https://example.org/512.png", 56 | findLargestIcon( 57 | Jsoup.parse( 58 | """ 59 | | 60 | | 61 | | 62 | | 63 | | 64 | 65 | """.trimMargin(), "https://example.org/" 66 | ).select("*") 67 | ) 68 | ) 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/main/kotlin/com/chimbori/crux/common/OkHttpExtensions.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.common 2 | 3 | import com.chimbori.crux.api.Resource 4 | import java.io.IOException 5 | import java.net.UnknownHostException 6 | import kotlinx.coroutines.Dispatchers 7 | import kotlinx.coroutines.withContext 8 | import okhttp3.HttpUrl 9 | import okhttp3.OkHttpClient 10 | import okhttp3.Request 11 | import okhttp3.Response 12 | import org.jsoup.Jsoup 13 | import 
org.jsoup.nodes.Document 14 | 15 | private const val LATEST_KNOWN_CHROME_MAJOR_VERSION = 131 16 | 17 | internal const val CHROME_USER_AGENT = 18 | "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/$LATEST_KNOWN_CHROME_MAJOR_VERSION.0.0.0 Mobile Safari/537.36" 19 | 20 | /** Executes [request] synchronously on [Dispatchers.IO]; returns the [Response], or `null` if any exception is thrown. */ public suspend fun OkHttpClient.safeCall(request: Request): Response? = withContext(Dispatchers.IO) { 21 | try { 22 | newCall(request).execute() 23 | } catch (e: IOException) { 24 | null 25 | } catch (e: NullPointerException) { 26 | // OkHttp sometimes tries to read a cookie which is null, causing an NPE here. The root cause 27 | // has not been identified, but this only happens with Twitter so far. 28 | null 29 | } catch (e: IllegalArgumentException) { 30 | // The URL is something like "https://" (no hostname, no path, etc.) which is clearly invalid. 31 | null 32 | } catch (e: UnknownHostException) { 33 | // Device is offline, or this host is unreachable. NOTE(review): [UnknownHostException] is a subclass of [IOException], so the first branch already handles it and this branch looks unreachable. 34 | null 35 | } catch (t: Throwable) { 36 | // Last-resort catch-all for anything not handled above. Note that [java.net.SocketTimeoutException] is an [IOException], so timeouts are already handled by the first branch. 37 | null 38 | } 39 | } 40 | 41 | public suspend fun OkHttpClient.safeHttpGet(url: HttpUrl): Response? = 42 | safeCall(Request.Builder().url(url).get().build()) 43 | 44 | public suspend fun OkHttpClient.safeHttpHead(url: HttpUrl): Response? = 45 | safeCall(Request.Builder().url(url).head().build()) 46 | 47 | public suspend fun OkHttpClient.httpGetContent(url: HttpUrl, onError: ((t: Throwable) -> Unit)? = null): String? 
= 48 | withContext(Dispatchers.IO) { 49 | safeHttpGet(url)?.use { response -> 50 | if (response.isSuccessful && response.body != null) { 51 | try { 52 | response.body!!.string() 53 | } catch (t: Throwable) { 54 | onError?.invoke(t) 55 | null 56 | } 57 | } else null 58 | } 59 | } 60 | 61 | public suspend fun Resource.Companion.fetchFromUrl(url: HttpUrl, okHttpClient: OkHttpClient) 62 | : Resource = withContext(Dispatchers.IO) { 63 | 64 | val httpResponse = okHttpClient.safeHttpGet(url) 65 | 66 | // If the HTTP request resulted in an HTTP redirect, use the redirected URL. 67 | val urlToUse = if (httpResponse?.isSuccessful == true && httpResponse.request.url != url) { 68 | httpResponse.request.url 69 | } else url 70 | 71 | val docToUse: Document? = try { 72 | httpResponse?.body?.let { 73 | Jsoup.parse(it.byteStream(), "UTF-8", urlToUse.toString()) 74 | } 75 | } catch (t: Throwable) { 76 | null 77 | } 78 | 79 | Resource(url = urlToUse, document = docToUse) 80 | } 81 | -------------------------------------------------------------------------------- /src/test/kotlin/com/chimbori/crux/plugins/HtmlMetadataExtractorTest.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.plugins 2 | 3 | import com.chimbori.crux.api.Fields.CANONICAL_URL 4 | import com.chimbori.crux.api.Fields.DESCRIPTION 5 | import com.chimbori.crux.api.Fields.NEXT_PAGE_URL 6 | import com.chimbori.crux.api.Fields.PREVIOUS_PAGE_URL 7 | import com.chimbori.crux.api.Fields.TITLE 8 | import com.chimbori.crux.api.Resource 9 | import com.chimbori.crux.common.fetchFromUrl 10 | import com.chimbori.crux.common.loggingOkHttpClient 11 | import kotlinx.coroutines.runBlocking 12 | import okhttp3.HttpUrl.Companion.toHttpUrl 13 | import okhttp3.mockwebserver.Dispatcher 14 | import okhttp3.mockwebserver.MockResponse 15 | import okhttp3.mockwebserver.MockWebServer 16 | import okhttp3.mockwebserver.RecordedRequest 17 | import org.junit.After 18 | import 
/**
 * Runs [HtmlMetadataExtractor] against pages served from a local [MockWebServer].
 *
 * NOTE(review): the response bodies below appear to have lost their HTML markup in this copy of the file; confirm
 * the literals against the repository before relying on them.
 */
class HtmlMetadataExtractorTest {
  private lateinit var mockWebServer: MockWebServer
  private lateinit var htmlMetadataExtractor: HtmlMetadataExtractor

  /** Starts a fresh server and creates a new extractor before each test. */
  @Before
  fun setUp() {
    mockWebServer = MockWebServer().apply { start() }
    htmlMetadataExtractor = HtmlMetadataExtractor()
  }

  /** Shuts the server down after each test. */
  @After
  fun tearDown() {
    mockWebServer.shutdown()
  }

  /** A page with a title and a whitespace-only description yields TITLE but no DESCRIPTION key. */
  @Test
  fun testParseValidTitleAndBlankDescription() {
    // Every request, whatever its path, receives the same body.
    mockWebServer.dispatcher = object : Dispatcher() {
      override fun dispatch(request: RecordedRequest) =
        MockResponse().setBody("Crux Test\r\t\n ")
    }

    val candidateUrl = mockWebServer.url("/")
    assertTrue(htmlMetadataExtractor.canExtract(candidateUrl))

    runBlocking {
      val parsed = htmlMetadataExtractor.extract(
        Resource.fetchFromUrl(candidateUrl, loggingOkHttpClient)
      )
      assertEquals(candidateUrl, parsed.url)
      assertEquals("Crux Test", parsed[TITLE])
      // The key must be absent altogether, not present with a blank value.
      assertFalse(parsed.metadata.containsKey(DESCRIPTION))
    }
  }

  /** Canonical, next-page, and previous-page URLs are all reported as parsed [okhttp3.HttpUrl] values. */
  @Test
  fun testPaginationLinks() {
    mockWebServer.dispatcher = object : Dispatcher() {
      override fun dispatch(request: RecordedRequest) =
        MockResponse().setBody(
          """
          |
          |
          |""".trimMargin()
        )
    }

    val candidateUrl = mockWebServer.url("/")
    assertTrue(htmlMetadataExtractor.canExtract(candidateUrl))

    runBlocking {
      val parsed = htmlMetadataExtractor.extract(
        Resource.fetchFromUrl(candidateUrl, loggingOkHttpClient)
      )
      assertEquals("http://www.example.com/page=2".toHttpUrl(), parsed[CANONICAL_URL])
      assertEquals("http://www.example.com/page=3".toHttpUrl(), parsed[NEXT_PAGE_URL])
      assertEquals("http://www.example.com/page=1".toHttpUrl(), parsed[PREVIOUS_PAGE_URL])
    }
  }
}
1>&2 49 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 1>&2 50 | echo. 1>&2 51 | echo Please set the JAVA_HOME variable in your environment to match the 1>&2 52 | echo location of your Java installation. 1>&2 53 | 54 | goto fail 55 | 56 | :findJavaFromJavaHome 57 | set JAVA_HOME=%JAVA_HOME:"=% 58 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 59 | 60 | if exist "%JAVA_EXE%" goto execute 61 | 62 | echo. 1>&2 63 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 1>&2 64 | echo. 1>&2 65 | echo Please set the JAVA_HOME variable in your environment to match the 1>&2 66 | echo location of your Java installation. 1>&2 67 | 68 | goto fail 69 | 70 | :execute 71 | @rem Setup the command line 72 | 73 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 74 | 75 | 76 | @rem Execute Gradle 77 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 78 | 79 | :end 80 | @rem End local scope for the variables with windows NT shell 81 | if %ERRORLEVEL% equ 0 goto mainEnd 82 | 83 | :fail 84 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 85 | rem the _cmd.exe /c_ return code! 
/**
 * Finds the Web App Manifest referenced by the page's `link[rel=manifest]` element, downloads it with the supplied
 * [okHttpClient], and copies selected members (name, language, display, orientation, colors, largest icon) into the
 * returned [Resource].
 *
 * The manifest is third-party input, so every member is read defensively: a member that is missing or has an
 * unexpected JSON type is treated as absent instead of aborting the whole extraction.
 */
public class WebAppManifestParser(private val okHttpClient: OkHttpClient) : Extractor {
  override fun canExtract(url: HttpUrl): Boolean = url.isLikelyArticle()

  /**
   * @return `null` when the page does not link to a manifest. Otherwise a [Resource] that always contains
   * [WEB_APP_MANIFEST_URL], plus whichever manifest members could be read. If the manifest cannot be downloaded or
   * parsed, only the manifest URL is reported.
   */
  override suspend fun extract(request: Resource): Resource? {
    val canonicalUrl = request.document?.extractCanonicalUrl()?.let { request.url?.resolve(it) } ?: request.url
    val webAppManifestUrl = request.document?.select("link[rel=manifest]")?.attr("abs:href")?.nullIfBlank()
      ?.let { canonicalUrl?.resolve(it) ?: it.toHttpUrlOrNull() }
      ?: return null

    val manifest: JsonObject? = okHttpClient.httpGetContent(webAppManifestUrl)?.let { rawJSON ->
      try {
        // A syntactically valid document whose root is not an object (e.g. an array) is not a manifest.
        Parser.default().parse(StringBuilder(rawJSON)) as? JsonObject
      } catch (t: Throwable) {
        // Silently ignore all JSON errors, since they are not recoverable.
        null
      }
    }

    val themeColorHtml = manifest.element("theme_color")
    val backgroundColorHtml = manifest.element("background_color")
    // Read "icons" through the map interface so that a non-array value is ignored rather than cast blindly.
    val icons = manifest?.get("icons") as? JsonArray<*>
    return Resource(
      metadata = mapOf(
        WEB_APP_MANIFEST_URL to webAppManifestUrl,
        TITLE to manifest.element("name"),
        LANGUAGE to manifest.element("lang"),
        DISPLAY to manifest.element("display"),
        ORIENTATION to manifest.element("orientation"),
        FAVICON_URL to getLargestIconUrl(webAppManifestUrl, icons),
        // Colors beginning with "#" are reported under the *_HEX key; anything else (e.g. a CSS color name)
        // goes under the *_HTML key.
        (if (themeColorHtml?.startsWith("#") == true) THEME_COLOR_HEX else THEME_COLOR_HTML) to themeColorHtml,
        (if (backgroundColorHtml?.startsWith("#") == true) BACKGROUND_COLOR_HEX else BACKGROUND_COLOR_HTML) to backgroundColorHtml,
      )
    ).removeNullValues()
  }

  /**
   * Picks the icon with the greatest [parseSize] value among the entries that actually declare a `src`, and
   * resolves that `src` against [baseUrl] (the manifest URL).
   *
   * Entries that are not JSON objects, or that have no usable `src`, are skipped up front so that a single
   * malformed entry cannot hide the remaining icons.
   *
   * @return the resolved icon URL, or `null` if there is no usable icon or its `src` cannot be parsed.
   */
  private fun getLargestIconUrl(baseUrl: HttpUrl?, icons: JsonArray<*>?): HttpUrl? {
    val largestIconSrc = icons
      ?.filterIsInstance<JsonObject>()
      ?.mapNotNull { icon ->
        (icon["src"] as? String)
          ?.takeUnless { src -> src.isBlank() }
          ?.let { src -> src to parseSize(icon["sizes"] as? String) }
      }
      ?.maxByOrNull { (_, size) -> size }
      ?.first
      ?: return null
    return if (baseUrl != null) baseUrl.resolve(largestIconSrc) else largestIconSrc.toHttpUrlOrNull()
  }

  /** Returns the trimmed string stored under [name], or `null` if the member is absent or is not a string. */
  private fun JsonObject?.element(name: String): String? = (this?.get(name) as? String)?.trim()
}
/**
 * Reads the well-known metadata fields that an HTML DOM tree can carry. Sources covered:
 * - Twitter Cards Metadata: https://developer.twitter.com/en/docs/twitter-for-websites/cards/overview/markup
 * - Open Graph Protocol: https://ogp.me/
 * - AMP Spec: https://amp.dev/documentation/guides-and-tutorials/learn/spec/amphtml/
 */
public class HtmlMetadataExtractor : Extractor {
  /** Only URLs that look like articles are handled; other file types are unlikely to be HTML pages. */
  public override fun canExtract(url: HttpUrl): Boolean = url.isLikelyArticle()

  /**
   * Builds a new [Resource] whose URL is the page's canonical URL (falling back to the request URL), carrying the
   * same document and every metadata field that could be extracted. Fields with no value are dropped.
   */
  override suspend fun extract(request: Resource): Resource {
    val page = request.document
    val requestUrl = request.url

    // A canonical URL declared by the page is resolved against the request URL; without one, keep the request URL.
    val canonicalUrl = page?.extractCanonicalUrl()?.let { declared -> requestUrl?.resolve(declared) } ?: requestUrl

    return Resource(
      url = canonicalUrl,
      document = page,
      metadata = mapOf(
        CANONICAL_URL to canonicalUrl,
        TITLE to page?.extractTitle(),
        DESCRIPTION to page?.extractDescription(),
        SITE_NAME to page?.extractSiteName(),
        THEME_COLOR_HEX to page?.extractThemeColor(),
        PUBLISHED_AT to page?.extractPublishedAt(),
        MODIFIED_AT to page?.extractModifiedAt(),
        KEYWORDS_CSV to page?.extractKeywords()?.joinToString(separator = ","),
        // Pagination links are resolved against the request URL, the remaining URLs against the canonical one.
        NEXT_PAGE_URL to page?.extractPaginationUrl(requestUrl, "next"),
        PREVIOUS_PAGE_URL to page?.extractPaginationUrl(requestUrl, "prev"),
        BANNER_IMAGE_URL to page?.extractImageUrl(canonicalUrl),
        FEED_URL to page?.extractFeedUrl(canonicalUrl),
        AMP_URL to page?.extractAmpUrl(canonicalUrl),
        VIDEO_URL to page?.extractVideoUrl(canonicalUrl),
      )
    ).removeNullValues()
  }
}
/** Covers the file-type heuristics and the static redirect resolution defined on [okhttp3.HttpUrl]. */
class HttpUrlExtensionsTest {
  /** Video extensions are recognised as video; plain-text style extensions are not. */
  @Test
  fun testIsLikelyType() {
    val videoUrls = listOf(
      "http://example.com/video.mp4",
      "http://example.com/video.mpg",
      "http://example.com/video.avi",
    )
    val nonVideoUrls = listOf(
      "http://example.com/test.txt",
      "http://example.com/test.tmp",
      "http://example.com/test.log",
    )
    for (videoUrl in videoUrls) {
      assertEquals(true, videoUrl.toHttpUrl().isLikelyVideo())
    }
    for (nonVideoUrl in nonVideoUrls) {
      assertEquals(false, nonVideoUrl.toHttpUrl().isLikelyVideo())
    }
  }

  /** OkHttp accepts braces in a query string, so parsing must succeed. */
  @Test
  fun testURLsRejectedByJavaNetURIsStrictParser() {
    val parsed = "http://example.com/?parameter={invalid-character}".toHttpUrlOrNull()
    assertNotNull(parsed)
  }

  /** A URL that is not a known redirector comes back unchanged (apart from normalisation). */
  @Test
  fun testNoOpRedirects() {
    val resolved = "http://example.com".toHttpUrl().resolveRedirects()
    assertEquals("http://example.com/", resolved.toString())
    assertEquals(true, resolved.isLikelyArticle())
  }

  /** Facebook and Google interstitial links resolve to the URL they wrap. */
  @Test
  fun testRedirects() {
    val expectedToRedirector = listOf(
      "http://www.bet.com/collegemarketingreps" to
          "http://www.facebook.com/l.php?u=http%3A%2F%2Fwww.bet.com%2Fcollegemarketingreps&h=42263",
      "https://www.wired.com/2014/08/maryam-mirzakhani-fields-medal/" to
          "https://lm.facebook.com/l.php?u=https%3A%2F%2Fwww.wired.com%2F2014%2F08%2Fmaryam-mirzakhani-fields-medal%2F&h=ATMfLBdoriaBcr9HOvzkEe68VZ4hLhTiFINvMmq5_e6fC9yi3xe957is3nl8VJSWhUO_7BdOp7Yv9CHx6MwQaTkwbZ1CKgSQCt45CROzUw0C37Tp4V-2EvDSBuBM2H-Qew&enc=AZPhspzfaWR0HGkmbExT_AfCFThsP829S0z2UWadB7ponM3YguqyJXgtn2E9BAv_-IdZvW583OnNC9M6WroEsV1jlilk3FXS4ppeydAzaJU_o9gq6HvoGMj0N_SiIKHRE_Gamq8xVdEGPnCJi078X8fTEW_jrkwpPC6P6p5Z3gv6YkFZfskU6J9qe3YRyarG4dgM25dJFnVgxxH-qyHlHsYbMD69i2MF8QNreww1J6S84y6VbIxXC-m9dVfFlNQVmtWMUvJKDLcPmYNysyQSYvkknfZ9SgwBhimurLFmKWhf39nNNVYjjCszCJ1XT57xX0Q&s=1",
      "http://www.cnn.com/2017/01/25/politics/scientists-march-dc-trnd/index.html" to
          "http://lm.facebook.com/l.php?u=http%3A%2F%2Fwww.cnn.com%2F2017%2F01%2F25%2Fpolitics%2Fscientists-march-dc-trnd%2Findex.html&h=ATO7Ln_rl7DAjRcqSo8yfpOvrFlEmKZmgeYHsOforgXsUYPLDy3nC1KfCYE-hev5oJzz1zydvvzI4utABjHqU1ruwDfw49jiDGCTrjFF-EyE6xfcbWRmDacY_6_R-lSi9g&enc=AZP1hkQfMXuV0vOHa1VeY8kdip2N73EjbXMKx3Zf4Ytdb1MrGHL48by4cl9_DShGYj9nZXvNt9xad9_4jphO9QBpRJLNGoyrRMBHI09eoFyPmxxjw7hHBy5Ouez0q7psi1uvjiphzOKVxjxyYBWnTJKD7m8rvhFz0HespmfvCf-fUiCpi6NDpxwYEw7vZ99fcjOpkiQqaFM_Gvqeat7r0e8axnqM-pJGY0fkjgWvgwTyfiB4fNMRhH3IaAmyL7DXl0xeYMoYSHuITkjTY9aU5dkiETfDVwBABOO9FJi2nTnRMw92E-gMMbiHFoHENlaSVJc&s=1",
      "https://arstechnica.com/business/2017/01/before-the-760mph-hyperloop-dream-there-was-the-atmospheric-railway/" to
          "https://plus.url.google.com/url?q=https://arstechnica.com/business/2017/01/before-the-760mph-hyperloop-dream-there-was-the-atmospheric-railway/&rct=j&ust=1485739059621000&usg=AFQjCNH6Cgp4iU0NB5OoDpT3OtOXds7HQg",
    )
    for ((expected, redirector) in expectedToRedirector) {
      assertEquals(expected, redirector.toHttpUrl().resolveRedirects().toString())
    }
  }

  /** A Google redirect wrapping another Google redirect is unwrapped all the way to the final target. */
  @Test
  fun testGoogleRedirectors() {
    val nestedRedirector =
      "https://www.google.com/url?q=https://www.google.com/url?rct%3Dj%26sa%3Dt%26url%3Dhttps://www.facebook.com/permalink.php%253Fid%253D111262459538815%2526story_fbid%253D534292497235807%26ct%3Dga%26cd%3DCAEYACoTOTQxMTQ5NzcyMzExMjAwMTEyMzIcZWNjZWI5M2YwM2E5ZDJiODpjb206ZW46VVM6TA%26usg%3DAFQjCNFSwGsQjcbeVCaSO2rg90RgBpQvzA&source=gmail&ust=1589164930980000&usg=AFQjCNF37pEGpMAz7azFCry-Ib-hwR0VVw"
    assertEquals(
      "https://www.facebook.com/permalink.php?id=111262459538815&story_fbid=534292497235807",
      nestedRedirector.toHttpUrl().resolveRedirects().toString()
    )
  }
}
/**
 * Builds the ordered list of plugins that Crux uses when the caller does not supply one. Callers may replace the
 * list entirely, or construct their own from any subset of these plugins.
 */
public fun createDefaultPlugins(okHttpClient: OkHttpClient): List<Plugin> {
  // Rewriters ----------------------------------------------------------------
  val rewriters: List<Plugin> = listOf(
    // Static redirectors go first, to avoid getting stuck into CAPTCHAs.
    GoogleUrlRewriter(),
    FacebookUrlRewriter(),
    // Remove any tracking parameters remaining.
    TrackingParameterRemover(),
  )

  // Fetcher ------------------------------------------------------------------
  // Fetches the Web page, so this must be the first [Extractor].
  val fetcher: List<Plugin> = listOf(DocumentFetcher(okHttpClient))

  // Extractors ---------------------------------------------------------------
  val extractors: List<Plugin> = listOf(
    // Parses many standard HTML metadata attributes.
    HtmlMetadataExtractor(),
    // Prefer canonical URLs over AMP URLs.
    AmpRedirector(refetchContentFromCanonicalUrl = true, okHttpClient),
    // Fetches and parses the Web Manifest. May replace existing favicon URL with one from the manifest.json.
    WebAppManifestParser(okHttpClient),
    // Extracts the best possible favicon from all the markup available on the page itself.
    FaviconExtractor(),
  )

  return rewriters + fetcher + extractors
}

/**
 * Runs a configurable sequence of plugins over a URL. Each plugin may process the resource metadata, may issue
 * additional HTTP requests, and hands its updated metadata on to the next plugin in the chain.
 */
public class Crux(
  /** Select from available plugins, or provide custom plugins for Crux to use. */
  private val plugins: List<Plugin>? = null,

  /** If the calling app has its own instance of [OkHttpClient], use it, otherwise Crux can create and use its own. */
  okHttpClient: OkHttpClient = createCruxOkHttpClient(),
) {

  private val activePlugins: List<Plugin> = plugins ?: createDefaultPlugins(okHttpClient)

  /**
   * Processes the provided URL, and returns a metadata object containing custom fields.
   * @param originalUrl the URL to extract metadata and content from.
   * @param parsedDoc if the calling app already has access to a parsed DOM tree, Crux can reuse it instead of
   * re-parsing it. If a custom [Document] is provided, Crux will not make any HTTP requests itself, and may not follow
   * HTTP redirects (but plugins may still optionally make additional HTTP requests themselves.)
   */
  public suspend fun extractFrom(originalUrl: HttpUrl, parsedDoc: Document? = null): Resource =
    withContext(Dispatchers.IO) {
      // Pass 1: every Rewriter, in list order, gets to transform the URL produced by the previous one.
      var rewrittenUrl = originalUrl
      for (plugin in activePlugins) {
        if (plugin is Rewriter) {
          rewrittenUrl = plugin.rewrite(rewrittenUrl)
        }
      }

      // Pass 2: every Extractor that accepts the current URL contributes to the accumulated resource.
      var resource = Resource(url = rewrittenUrl, document = parsedDoc)
      for (plugin in activePlugins) {
        if (plugin is Extractor && plugin.canExtract(resource.url ?: rewrittenUrl)) {
          resource = resource + plugin.extract(resource)
        }
      }

      resource.removeNullValues()
    }
}

/**
 * Creates the [OkHttpClient] that Crux falls back to when the caller does not provide one: it follows redirects,
 * retries on connection failure, and sets the `User-Agent` header of every network request to [CHROME_USER_AGENT].
 */
internal fun createCruxOkHttpClient(): OkHttpClient {
  val builder = OkHttpClient.Builder()
  builder.followRedirects(true)
  builder.followSslRedirects(true)
  builder.retryOnConnectionFailure(true)
  builder.addNetworkInterceptor { chain ->
    val requestWithUserAgent = chain.request().newBuilder()
      .header("User-Agent", CHROME_USER_AGENT)
      .build()
    chain.proceed(requestWithUserAgent)
  }
  return builder.build()
}
/**
 * Tests that Crux classes have the proper visibility to be used outside of the
 * `com.chimbori.crux` package, so this is a separate package.
 *
 * NOTE(review): the HTML inside the string literals below appears to have been stripped from this copy of the file;
 * confirm the literals against the repository before relying on them.
 */
class KotlinPublicAPITest {
  /** End-to-end use of the default plugin set with a caller-supplied, already-parsed document. */
  @Test
  fun testKotlinPluginApi() {
    // Create a reusable object configured with the default set of plugins.
    val crux = Crux()

    val httpURL = "https://chimbori.com/".toHttpUrl()

    // You can provide prefetched raw HTML content yourself, or have Crux fetch it for you.
    val htmlContent = """
      |
      |
      | Chimbori
      |
      |
      |
      |
      |
      |""".trimMargin()

    // Crux runs inside a `suspend` function as a Kotlin Coroutine.
    val extractedMetadata = runBlocking {
      crux.extractFrom(originalUrl = httpURL, parsedDoc = Jsoup.parse(htmlContent, httpURL.toString()))
    }

    // Metadata fields such as the Title and Description are available from the returned [Resource] object.
    assertEquals("Chimbori", extractedMetadata[TITLE])

    // Well-known URLs related to this page are available either as strings or OkHttp [HttpUrl]s.
    assertEquals("https://chimbori.com/media/favicon.png".toHttpUrl(), extractedMetadata[FAVICON_URL])

    // Extra markup fields like Twitter Cards metadata or Open Graph metadata are available as metadata fields as well.
    assertEquals("https://chimbori.com/media/cover-photo.png".toHttpUrl(), extractedMetadata[BANNER_IMAGE_URL])
  }

  /** A caller-defined [Extractor] can be the only plugin, and can publish its own metadata key. */
  @Test
  fun testWithCustomPlugin() {
    // If you write a new plugin yourself, you can add any custom fields to the `Resource` object yourself,
    // and consume them in your own app.
    val customerNumberExtractorPlugin = object : Extractor {
      // Indicate that your plugin can handle all URLs on your site, but no others.
      override fun canExtract(url: HttpUrl): Boolean = url.topPrivateDomain() == "your-website.com"

      // Fields in the returned [Resource] overwrite those in the input [request]. If no changes are to be made, then
      // return null from your plugin. Otherwise, only return those fields that are new or changed from the input.
      override suspend fun extract(request: Resource) = Resource(
        metadata = mapOf(CUSTOMER_NUMBER_FIELD to request.url?.queryParameter("customer-number"))
      )

      // Key under which this plugin stores the value of the `customer-number` query parameter.
      val CUSTOMER_NUMBER_FIELD = "customer-number"
    }

    val cruxWithCustomPlugin = Crux(listOf(customerNumberExtractorPlugin))
    val orderDetailsUrl = "https://www.your-website.com/orders?customer-number=42".toHttpUrl()

    val metadata = runBlocking {
      cruxWithCustomPlugin.extractFrom(orderDetailsUrl, Document(orderDetailsUrl.toString()))
    }
    // Input URL was unchanged and is available in the output metadata.
    assertEquals(orderDetailsUrl, metadata.url)
    // Data extracted by the custom plugin is available as a custom field.
    assertEquals("42", metadata[customerNumberExtractorPlugin.CUSTOMER_NUMBER_FIELD])
  }

  /** [ImageUrlExtractor] can be constructed and used directly from outside the library's package. */
  @Test
  fun testCallersCanAccessImageExtractorAPI() {
    val url = "https://chimbori.com/".toHttpUrl()
    val content = "" // Intentionally malformed.
    val imageUrl = ImageUrlExtractor(url, Jsoup.parse(content, url.toString()).body()).findImage().imageUrl
    assertEquals("https://chimbori.com/test.jpg".toHttpUrl(), imageUrl)
  }

  /** [LinkUrlExtractor] can be constructed and used directly from outside the library's package. */
  @Test
  fun testCallersCanAccessLinkExtractorAPI() {
    val url = "https://chimbori.com/".toHttpUrl()
    val content = "" // Intentionally malformed.
    val linkUrl = LinkUrlExtractor(url, Jsoup.parse(content, url.toString()).body()).findLink().linkUrl
    assertEquals("https://chimbori.com/test".toHttpUrl(), linkUrl)
  }
}
  /** Shuts the mock server down after each test. */
  @After
  fun tearDown() {
    mockWebServer.shutdown()
  }

  /**
   * The manifest URL is reported even though the server answers every path, including the manifest's, with the same
   * page body rather than JSON.
   *
   * NOTE(review): the markup inside the literal below appears to have been stripped from this copy of the file.
   */
  @Test
  fun testWebAppManifestLinkTag() {
    mockWebServer.dispatcher = object : Dispatcher() {
      override fun dispatch(request: RecordedRequest) = MockResponse().setBody(
        """|
          |
          |
          |
          |
          |""".trimMargin()
      )
    }

    val candidateUrl = mockWebServer.url("/")
    assertTrue(webAppManifestParser.canExtract(candidateUrl))

    runBlocking {
      val parsedResource = webAppManifestParser.extract(
        Resource.fetchFromUrl(candidateUrl, loggingOkHttpClient)
      )
      assertEquals(
        mockWebServer.url("/static/sub/directory/manifest.json"),
        parsedResource?.get(WEB_APP_MANIFEST_URL)
      )
    }
  }

  /**
   * Serves a complete manifest and checks each field copied out of it, including which color keys are used for
   * non-hex colors and which icon is selected.
   *
   * NOTE(review): the page body served for "/" appears to have been stripped from this copy of the file.
   */
  @Test
  fun testWebManifestJson() {
    // Example JSON from https://w3c.github.io/manifest/#typical-structure
    val manifestJson = """|
      |{
      | "lang": "en",
      | "dir": "ltr",
      | "name": "Super Racer 3000",
      | "short_name": "Racer3K",
      | "icons": [
      | {
      | "src": "icon/lowres.webp",
      | "sizes": "48x48",
      | "type": "image/webp"
      | },{
      | "src": "icon/lowres",
      | "sizes": "48x48"
      | },{
      | "src": "icon/hd_hi.ico",
      | "sizes": "72x72 96x96 128x128 256x256"
      | },{
      | "src": "icon/hd_hi.svg",
      | "sizes": "257x257"
      | }
      | ],
      | "scope": "/",
      | "id": "superracer",
      | "start_url": "/start.html",
      | "display": "fullscreen",
      | "orientation": "landscape",
      | "theme_color": "aliceblue",
      | "background_color": "red"
      |}
    """.trimMargin()

    // The page lives at "/", the manifest at "/static/manifest.json"; anything else is a 404.
    mockWebServer.dispatcher = object : Dispatcher() {
      override fun dispatch(request: RecordedRequest) = when (request.path) {
        "/" -> MockResponse().setBody("""""")
        "/static/manifest.json" -> MockResponse().setBody(manifestJson)
        else -> MockResponse().setResponseCode(404)
      }
    }

    val candidateUrl = mockWebServer.url("/")
    assertTrue(webAppManifestParser.canExtract(candidateUrl))

    runBlocking {
      val parsedResource: Resource? = webAppManifestParser.extract(
        Resource.fetchFromUrl(candidateUrl, loggingOkHttpClient)
      )
      assertNotNull(parsedResource)
      assertEquals("Super Racer 3000", parsedResource?.get(TITLE))
      assertEquals("en", parsedResource?.get(LANGUAGE))
      assertEquals("fullscreen", parsedResource?.get(DISPLAY))
      assertEquals("landscape", parsedResource?.get(ORIENTATION))
      // Named (non-"#") colors are expected under the *_HTML keys, leaving the *_HEX keys unset.
      assertEquals("aliceblue", parsedResource?.get(THEME_COLOR_HTML))
      assertEquals("red", parsedResource?.get(BACKGROUND_COLOR_HTML))
      assertNull(parsedResource?.get(THEME_COLOR_HEX))
      assertNull(parsedResource?.get(BACKGROUND_COLOR_HEX))
      assertEquals(mockWebServer.url("/static/manifest.json"), parsedResource?.get(WEB_APP_MANIFEST_URL))
      // The icon declaring the largest size (257x257) is expected, resolved relative to the manifest URL.
      assertEquals(mockWebServer.url("/static/icon/hd_hi.svg"), parsedResource?.get(FAVICON_URL))
    }
  }
}
/** True when the text "ad" is counted at least twice in the full URL string. */
public fun HttpUrl.isAdImage(): Boolean = toString().countMatches("ad") >= 2

/** True when the URL matches none of the image, video, audio, binary-document, executable, or archive checks. */
public fun HttpUrl.isLikelyArticle(): Boolean {
  if (isLikelyImage()) return false
  if (isLikelyVideo()) return false
  if (isLikelyAudio()) return false
  if (isLikelyBinaryDocument()) return false
  if (isLikelyExecutable()) return false
  if (isLikelyArchive()) return false
  return true
}

/** Lower-cased text following the last "." of the encoded path; the whole path when it contains no ".". */
private fun HttpUrl.lowercasePathSuffix(): String = encodedPath.substringAfterLast(".").lowercase()

private val VIDEO_SUFFIXES = setOf(
  "3g2", "3gp", "amv", "asf", "avi", "drc", "flv", "gif", "gifv", "m2v", "m4p", "m4v", "mkv", "mng", "mov", "mp2",
  "mp4", "mpe", "mpeg", "mpg", "mpg4", "mpv", "mxf", "nsv", "ogg", "ogv", "qt", "rm", "rmvb", "roq", "svi", "swf",
  "viv", "vob", "webm", "wmv", "yuv",
)

private val AUDIO_SUFFIXES = setOf(
  "3gp", "8svx", "aa", "aac", "aax", "act", "aiff", "alac", "amr", "ape", "au", "awb", "cda", "dss", "dvf", "flac",
  "gsm", "iklax", "ivs", "m3u", "m4a", "m4b", "m4p", "mmf", "mogg", "mp3", "mpc", "msv", "nmf", "ogg", "opus", "raw",
  "rf64", "rm", "sln", "tta", "voc", "vox", "wav", "webm", "wma", "wv",
)

private val IMAGE_SUFFIXES = setOf(
  "ai", "arw", "bmp", "cr2", "dib", "eps", "gif", "heic", "heif", "ico", "ind", "indd", "indt", "j2k", "jfi", "jfif",
  "jif", "jp2", "jpe", "jpeg", "jpf", "jpg", "jpm", "jpx", "k25", "mj2", "nrw", "pdf", "png", "psd", "raw", "svg",
  "svgz", "tif", "tiff", "webp",
)

private val BINARY_DOCUMENT_SUFFIXES = setOf("doc", "pdf", "ppt", "rtf", "swf", "xls")

private val ARCHIVE_SUFFIXES = setOf("7z", "deb", "gz", "rar", "rpm", "tgz", "zip")

private val EXECUTABLE_SUFFIXES = setOf("bat", "bin", "dmg", "exe")

/** True when the path suffix is one of the known video extensions. */
public fun HttpUrl.isLikelyVideo(): Boolean = lowercasePathSuffix() in VIDEO_SUFFIXES

/** True when the path suffix is one of the known audio extensions. */
public fun HttpUrl.isLikelyAudio(): Boolean = lowercasePathSuffix() in AUDIO_SUFFIXES

/** True when the path suffix is one of the known image extensions. */
public fun HttpUrl.isLikelyImage(): Boolean = lowercasePathSuffix() in IMAGE_SUFFIXES

/** True when the path suffix is one of the known binary-document extensions. */
public fun HttpUrl.isLikelyBinaryDocument(): Boolean = lowercasePathSuffix() in BINARY_DOCUMENT_SUFFIXES

/** True when the path suffix is one of the known archive extensions. */
public fun HttpUrl.isLikelyArchive(): Boolean = lowercasePathSuffix() in ARCHIVE_SUFFIXES

/** True when the path suffix is one of the known executable extensions. */
public fun HttpUrl.isLikelyExecutable(): Boolean = lowercasePathSuffix() in EXECUTABLE_SUFFIXES

/**
 * Unwraps known redirector URLs without any network access, repeating until a full pass over [REDIRECTORS] leaves
 * the URL unchanged, so that nested redirectors are resolved as well.
 */
@Suppress("unused")
public fun HttpUrl.resolveRedirects(): HttpUrl {
  var current = this
  while (true) {
    // All redirectors inspect the URL as it was at the start of this pass; the last one that matches wins.
    var resolved = current
    for (redirector in REDIRECTORS) {
      if (redirector.matches(current)) {
        resolved = redirector.resolve(current)
      }
    }
    if (resolved == current) {
      return resolved
    }
    current = resolved
  }
}

/**
 * A redirector recognised by a host suffix plus an exact encoded path, whose target is carried in a query parameter.
 * The first of [targetParameters] that is present supplies the target; if it is missing or cannot be parsed, the
 * input URL is returned unchanged.
 */
private class QueryParameterRedirect(
  private val hostSuffix: String,
  private val redirectPath: String,
  private vararg val targetParameters: String,
) : RedirectPattern {
  override fun matches(url: HttpUrl) = url.host.endsWith(hostSuffix) && url.encodedPath == redirectPath

  override fun resolve(url: HttpUrl): HttpUrl {
    val target = targetParameters.firstNotNullOfOrNull { parameter -> url.queryParameter(parameter) }
    return target?.toHttpUrlOrNull() ?: url
  }
}

private val REDIRECTORS = listOf(
  QueryParameterRedirect(".facebook.com", "/l.php", "u"), // Facebook.
  QueryParameterRedirect(".google.com", "/url", "q", "url"), // Google.
)

/**
 * Defines a pattern used by a specific service for URL redirection. This should be stateless, and will be called for
 * each URL that needs to be resolved.
 */
internal interface RedirectPattern {
  /** @return true if this RedirectPattern can handle the provided URL, false if not. */
  fun matches(url: HttpUrl): Boolean

  /** @return the actual URL that is pointed to by this redirector URL. */
  fun resolve(url: HttpUrl): HttpUrl
}
29 | @After 30 | fun tearDown() { 31 | mockWebServer.shutdown() 32 | } 33 | 34 | @Test 35 | fun testExtractsCanonicalUrl() { 36 | val canonicalUrl = mockWebServer.url("/canonical-url") 37 | val ampUrl = mockWebServer.url("/amp-url") 38 | val ampHtml = """ 39 | | 40 | | 41 | | 42 | |""".trimMargin() 43 | 44 | mockWebServer.dispatcher = object : Dispatcher() { 45 | override fun dispatch(request: RecordedRequest) = when (request.path) { 46 | ampUrl.encodedPath -> MockResponse().setBody( 47 | """""" 48 | ) 49 | canonicalUrl.encodedPath -> MockResponse().setBody("CanonicalUrl") 50 | else -> MockResponse().setResponseCode(404) 51 | } 52 | } 53 | 54 | runBlocking { 55 | val parsedResource = 56 | ampRedirector.extract(Resource(url = ampUrl, document = Jsoup.parse(ampHtml, ampUrl.toString()))) 57 | assertEquals(canonicalUrl, parsedResource?.url) 58 | } 59 | } 60 | 61 | @Test 62 | fun testReturnsNullWhenCanonicalUrlIsAbsent() { 63 | val ampUrl = mockWebServer.url("/amp-url") 64 | val ampHtmlWithNoCanonicalUrl = """""".trimMargin() 65 | 66 | runBlocking { 67 | val parsedResource = 68 | ampRedirector.extract( 69 | Resource( 70 | url = ampUrl, 71 | document = Jsoup.parse(ampHtmlWithNoCanonicalUrl, ampUrl.toString()) 72 | ) 73 | ) 74 | assertNull(parsedResource?.url) 75 | } 76 | } 77 | 78 | @Test 79 | fun testReturnsOriginalWhenAlreadyOnCanonicalUrl() { 80 | val canonicalUrl = mockWebServer.url("/canonical-url") 81 | val canonicalHtml = """ 82 | | 83 | | 84 | | 85 | |""".trimMargin() 86 | 87 | mockWebServer.dispatcher = object : Dispatcher() { 88 | override fun dispatch(request: RecordedRequest) = when (request.path) { 89 | canonicalUrl.encodedPath -> MockResponse().setBody(canonicalHtml) 90 | else -> MockResponse().setResponseCode(404) 91 | } 92 | } 93 | 94 | runBlocking { 95 | val canonicalResource = 96 | Resource(url = canonicalUrl, document = Jsoup.parse(canonicalHtml, canonicalUrl.toString())) 97 | val parsedResource = ampRedirector.extract(canonicalResource) 98 | 
assertNull(parsedResource) 99 | } 100 | } 101 | 102 | @Test 103 | fun testReturnsAbsoluteCanonicalUrl() { 104 | val ampUrl = mockWebServer.url("/amp-url") 105 | val canonicalUrl = mockWebServer.url("/canonical-url") 106 | val ampHtml = """""" 107 | 108 | mockWebServer.dispatcher = object : Dispatcher() { 109 | override fun dispatch(request: RecordedRequest) = when (request.path) { 110 | ampUrl.encodedPath -> MockResponse().setBody(ampHtml) 111 | else -> MockResponse().setResponseCode(404) 112 | } 113 | } 114 | 115 | runBlocking { 116 | val ampResource = Resource(url = ampUrl, document = Jsoup.parse(ampHtml, ampUrl.toString())) 117 | val parsedResource = ampRedirector.extract(ampResource) 118 | assertEquals(canonicalUrl, parsedResource?.url) 119 | } 120 | } 121 | 122 | @Test 123 | fun testFetchesContentFromCanonicalUrl() { 124 | val canonicalUrl = mockWebServer.url("/canonical-url") 125 | val ampUrl = mockWebServer.url("/amp-url") 126 | 127 | mockWebServer.dispatcher = object : Dispatcher() { 128 | override fun dispatch(request: RecordedRequest) = when (request.path) { 129 | ampUrl.encodedPath -> MockResponse().setBody( 130 | """ 131 | |AmpUrl 132 | | 133 | |""".trimMargin() 134 | ) 135 | canonicalUrl.encodedPath -> MockResponse().setBody("CanonicalUrl") 136 | else -> MockResponse().setResponseCode(404) 137 | } 138 | } 139 | 140 | runBlocking { 141 | val parsed = ampRedirector.extract( 142 | Resource.fetchFromUrl(url = ampUrl, loggingOkHttpClient) 143 | ) 144 | assertEquals(canonicalUrl, parsed?.url) 145 | assertEquals("CanonicalUrl", parsed?.document?.extractTitle()) 146 | } 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /src/main/kotlin/com/chimbori/crux/extractors/MetadataHelpers.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux.extractors 2 | 3 | import com.chimbori.crux.common.cleanTitle 4 | import com.chimbori.crux.common.nullIfBlank 5 | 
import com.chimbori.crux.common.removeWhiteSpace 6 | import java.util.Locale 7 | import okhttp3.HttpUrl 8 | import okhttp3.HttpUrl.Companion.toHttpUrlOrNull 9 | import org.jsoup.nodes.Document 10 | import org.jsoup.nodes.Element 11 | 12 | public fun Document.extractTitle(): String? = ( 13 | title().nullIfBlank() 14 | ?: select("title").text().nullIfBlank() 15 | ?: select("meta[name=title]").attr("content").nullIfBlank() 16 | ?: select("meta[property=og:title]").attr("content").nullIfBlank() 17 | ?: select("meta[name=twitter:title]").attr("content").nullIfBlank() 18 | )?.cleanTitle()?.nullIfBlank() 19 | 20 | public fun Document.extractCanonicalUrl(): String? = ( 21 | select("link[rel=canonical]").attr("abs:href").nullIfBlank() 22 | ?: select("meta[property=og:url]").attr("content").nullIfBlank() 23 | ?: select("meta[name=twitter:url]").attr("content").nullIfBlank() 24 | )?.removeWhiteSpace()?.nullIfBlank() 25 | 26 | public fun Document.extractPaginationUrl(baseUrl: HttpUrl?, nextOrPrev: String): HttpUrl? = ( 27 | select("link[rel=$nextOrPrev]").attr("abs:href").nullIfBlank() 28 | )?.removeWhiteSpace()?.nullIfBlank() 29 | ?.let { relativeUrl -> baseUrl?.resolve(relativeUrl) ?: relativeUrl.toHttpUrlOrNull() } 30 | 31 | public fun Document.extractDescription(): String? = ( 32 | select("meta[name=description]").attr("content").nullIfBlank() 33 | ?: select("meta[property=og:description]").attr("content").nullIfBlank() 34 | ?: select("meta[name=twitter:description]").attr("content").nullIfBlank() 35 | )?.removeWhiteSpace()?.nullIfBlank() 36 | 37 | public fun Document.extractSiteName(): String? = ( 38 | select("meta[property=og:site_name]").attr("content").nullIfBlank() 39 | ?: select("meta[name=application-name]").attr("content").nullIfBlank() 40 | )?.removeWhiteSpace()?.nullIfBlank() 41 | 42 | public fun Document.extractThemeColor(): String? = 43 | select("meta[name=theme-color]").attr("content").nullIfBlank() 44 | 45 | public fun Document.extractPublishedAt(): String? 
= ( 46 | select("meta[itemprop=dateCreated]").attr("content").nullIfBlank() 47 | ?: select("meta[property=article:published_time]").attr("content").nullIfBlank() 48 | )?.removeWhiteSpace()?.nullIfBlank() 49 | 50 | public fun Document.extractModifiedAt(): String? = ( 51 | select("meta[itemprop=dateModified]").attr("content").nullIfBlank() 52 | ?: select("meta[property=article:modified_time]").attr("content").nullIfBlank() 53 | )?.removeWhiteSpace()?.nullIfBlank() 54 | 55 | public fun Document.extractKeywords(): List = 56 | select("meta[name=keywords]").attr("content") 57 | .removeWhiteSpace() 58 | .removePrefix("[") 59 | .removeSuffix("]") 60 | .split("\\s*,\\s*".toRegex()) 61 | .filter { it.isNotBlank() } 62 | 63 | public fun Document.extractFaviconUrl(baseUrl: HttpUrl?): HttpUrl? { 64 | val allPossibleIconElements = listOf( 65 | select("link[rel~=apple-touch-icon]"), 66 | select("link[rel~=apple-touch-icon-precomposed]"), 67 | select("link[rel~=icon]"), 68 | select("link[rel~=ICON]"), 69 | ) 70 | return findLargestIcon(allPossibleIconElements.flatten()) 71 | ?.let { baseUrl?.resolve(it) ?: it.toHttpUrlOrNull() } 72 | ?: baseUrl?.newBuilder()?.encodedPath("/favicon.ico")?.build() 73 | } 74 | 75 | public fun Document.extractImageUrl(baseUrl: HttpUrl?): HttpUrl? = ( 76 | // Twitter Cards and Open Graph images are usually higher quality, so rank them first. 77 | select("meta[name=twitter:image]").attr("content").nullIfBlank() 78 | ?: select("meta[property=og:image]").attr("content").nullIfBlank() 79 | // image_src or thumbnails are usually low quality, so prioritize them *after* article images. 80 | ?: select("link[rel=image_src]").attr("href").nullIfBlank() 81 | ?: select("meta[name=thumbnail]").attr("content").nullIfBlank() 82 | )?.let { baseUrl?.resolve(it) ?: it.toHttpUrlOrNull() } 83 | 84 | public fun Document.extractFeedUrl(baseUrl: HttpUrl?): HttpUrl? 
= ( 85 | select("link[rel=alternate]").select("link[type=application/rss+xml]").attr("href").nullIfBlank() 86 | ?: select("link[rel=alternate]").select("link[type=application/atom+xml]").attr("href").nullIfBlank() 87 | )?.let { baseUrl?.resolve(it) ?: it.toHttpUrlOrNull() } 88 | 89 | public fun Document.extractAmpUrl(baseUrl: HttpUrl?): HttpUrl? = 90 | select("link[rel=amphtml]").attr("href").nullIfBlank() 91 | ?.let { baseUrl?.resolve(it) ?: it.toHttpUrlOrNull() } 92 | 93 | public fun Document.extractVideoUrl(baseUrl: HttpUrl?): HttpUrl? = 94 | select("meta[property=og:video]").attr("content").nullIfBlank() 95 | ?.let { baseUrl?.resolve(it) ?: it.toHttpUrlOrNull() } 96 | 97 | internal fun findLargestIcon(iconElements: List): String? = 98 | iconElements.maxByOrNull { parseSize(it.attr("sizes")) }?.attr("abs:href")?.nullIfBlank() 99 | 100 | /** 101 | * Given a size represented by "WidthxHeight" or "WidthxHeight ...", will return the largest dimension found. 102 | * 103 | * Examples: "128x128" will return 128. 104 | * "128x64" will return 128. 105 | * "24x24 48x48" will return 48. 106 | * 107 | * @param sizeString String representing the sizes. 108 | * @return largest dimension, or 0 if input could not be parsed. 109 | */ 110 | internal fun parseSize(sizeString: String?): Int { 111 | if (sizeString.isNullOrBlank()) return 0 112 | 113 | val sizes = sizeString.trim(' ').lowercase(Locale.getDefault()) 114 | return when { 115 | // For multiple sizes in the same String, split and parse recursively. 116 | sizes.contains(" ") -> sizes.split(" ").maxOfOrNull { parseSize(it) } ?: 0 117 | // For handling sizes of format 128x128 etc.
118 | sizes.contains("x") -> try { 119 | sizes.split("x").maxOf { it.trim().toInt() } 120 | } catch (e: NumberFormatException) { 121 | 0 122 | } 123 | else -> 0 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /src/test/kotlin/com/chimbori/crux/CruxTest.kt: -------------------------------------------------------------------------------- 1 | package com.chimbori.crux 2 | 3 | import com.chimbori.crux.api.Extractor 4 | import com.chimbori.crux.api.Fields.TITLE 5 | import com.chimbori.crux.api.Resource 6 | import com.chimbori.crux.plugins.GoogleUrlRewriter 7 | import kotlinx.coroutines.runBlocking 8 | import okhttp3.HttpUrl 9 | import okhttp3.HttpUrl.Companion.toHttpUrl 10 | import okhttp3.mockwebserver.Dispatcher 11 | import okhttp3.mockwebserver.MockResponse 12 | import okhttp3.mockwebserver.MockWebServer 13 | import okhttp3.mockwebserver.RecordedRequest 14 | import org.junit.After 15 | import org.junit.Assert.assertEquals 16 | import org.junit.Assert.assertNotNull 17 | import org.junit.Before 18 | import org.junit.Test 19 | 20 | class CruxTest { 21 | private lateinit var mockWebServer: MockWebServer 22 | 23 | @Before 24 | fun setUp() { 25 | mockWebServer = MockWebServer().apply { 26 | dispatcher = object : Dispatcher() { 27 | override fun dispatch(request: RecordedRequest) = MockResponse().setBody("${request.path}") 28 | } 29 | start() 30 | } 31 | } 32 | 33 | @After 34 | fun tearDown() { 35 | mockWebServer.shutdown() 36 | } 37 | 38 | @Test 39 | fun testRewritersAreCompletedBeforeExtraction() { 40 | val crux = Crux(plugins = listOf(GoogleUrlRewriter())) 41 | val metadata = runBlocking { 42 | crux.extractFrom( 43 | 
"https://www.google.com/url?q=https://www.google.com/url?rct%3Dj%26sa%3Dt%26url%3Dhttps://example.com/permalink%253Fid%253D1234567890%26ct%3Dga%26cd%3DCAEYACoTOTQxMTQ5NzcyMzExMjAwMTEyMzIcZWNjZWI5M2YwM2E5ZDJiODpjb206ZW46VVM6TA%26usg%3DAFQjCNFSwGsQjcbeVCaSO2rg90RgBpQvzA&source=gmail&ust=1589164930980000&usg=AFQjCNF37pEGpMAz7azFCry-Ib-hwR0VVw".toHttpUrl() 44 | ) 45 | } 46 | assertNotNull(metadata) 47 | assertEquals("https://example.com/permalink?id=1234567890".toHttpUrl(), metadata.url) 48 | } 49 | 50 | @Test 51 | fun testPluginsAreNotAskedToHandleUrlsTheyCannotHandle() { 52 | val fooHandlerPlugin = object : Extractor { 53 | override fun canExtract(url: HttpUrl) = url.encodedPath == "/foo" 54 | override suspend fun extract(request: Resource) = Resource( 55 | url = request.url?.newBuilder()?.encodedPath("/rewritten-from-foo")?.build() 56 | ) 57 | } 58 | 59 | val barHandlerPlugin = object : Extractor { 60 | override fun canExtract(url: HttpUrl) = url.encodedPath == "/bar" 61 | override suspend fun extract(request: Resource) = Resource( 62 | url = request.url?.newBuilder()?.encodedPath("/rewritten-from-bar")?.build() 63 | ) 64 | } 65 | 66 | val cruxWithFooPlugin = Crux(plugins = listOf(fooHandlerPlugin)) 67 | val fooMetadata = runBlocking { 68 | cruxWithFooPlugin.extractFrom(mockWebServer.url("/foo")) 69 | } 70 | assertEquals("/rewritten-from-foo", fooMetadata.url?.encodedPath) 71 | 72 | val cruxWithBarPlugin = Crux(plugins = listOf(barHandlerPlugin)) 73 | val barMetadata = runBlocking { 74 | cruxWithBarPlugin.extractFrom(mockWebServer.url("/foo")) 75 | } 76 | assertEquals("/foo", barMetadata.url?.encodedPath) 77 | } 78 | 79 | @Test 80 | fun testDefaultPluginsCanParseTitle() { 81 | mockWebServer.dispatcher = object : Dispatcher() { 82 | override fun dispatch(request: RecordedRequest) = MockResponse().setBody("Mock Title") 83 | } 84 | 85 | val crux = Crux() 86 | val metadata = runBlocking { crux.extractFrom(mockWebServer.url("/mock-title")) } 87 | assertNotNull(metadata) 
88 | assertEquals("Mock Title", metadata[TITLE]) 89 | } 90 | 91 | @Test 92 | fun testHttpRedirectUrlReturnedInsteadOfOriginalUrl() { 93 | val originalUrl = mockWebServer.url("/original") 94 | val redirectedUrl = mockWebServer.url("/redirected") 95 | mockWebServer.dispatcher = object : Dispatcher() { 96 | override fun dispatch(request: RecordedRequest) = when (request.path) { 97 | originalUrl.encodedPath -> MockResponse().setResponseCode(301).setHeader("Location", redirectedUrl) 98 | redirectedUrl.encodedPath -> MockResponse().setBody("") 99 | else -> MockResponse().setResponseCode(404) 100 | } 101 | } 102 | 103 | val metadata = runBlocking { Crux().extractFrom(originalUrl) } 104 | assertEquals(redirectedUrl, metadata.url) 105 | } 106 | 107 | @Test 108 | fun testLaterPluginOperatesOnRewrittenUrlFromPreviousPlugin() { 109 | val rewriteFooToBarPlugin = object : Extractor { 110 | override fun canExtract(url: HttpUrl) = url.encodedPath == "/foo" 111 | override suspend fun extract(request: Resource) = 112 | Resource( 113 | url = request.url?.newBuilder()?.encodedPath("/bar")?.build(), 114 | metadata = mapOf(TITLE to "Foo Title") 115 | ) 116 | } 117 | 118 | val generateTitleForBarPlugin = object : Extractor { 119 | override fun canExtract(url: HttpUrl) = url.encodedPath == "/bar" 120 | override suspend fun extract(request: Resource) = Resource(metadata = mapOf(TITLE to "Bar Title")) 121 | } 122 | 123 | // Test Foo before Bar. 124 | val fooBeforeBarCrux = Crux(listOf(rewriteFooToBarPlugin, generateTitleForBarPlugin)) 125 | val fooBeforeBar = runBlocking { 126 | fooBeforeBarCrux.extractFrom(mockWebServer.url("/foo")) 127 | } 128 | assertEquals("Bar Title", fooBeforeBar[TITLE]) 129 | 130 | // Test Bar before Foo. 
131 | val barBeforeFooCrux = Crux(listOf(generateTitleForBarPlugin, rewriteFooToBarPlugin)) 132 | val barBeforeFoo = runBlocking { 133 | barBeforeFooCrux.extractFrom(mockWebServer.url("/foo")) 134 | } 135 | assertEquals("Foo Title", barBeforeFoo[TITLE]) 136 | } 137 | 138 | @Test 139 | fun testNoHttpRequestsAreMadeWhenCallerProvidesParsedDocument() { 140 | } 141 | 142 | @Test 143 | fun testLaterPluginOverridesFieldsSetByPreviousPlugin() { 144 | } 145 | 146 | @Test 147 | fun testLaterPluginOverridesFieldsWithNull() { 148 | } 149 | 150 | @Test 151 | fun testLaterPluginOverridesFieldsWithBlanks() { 152 | } 153 | 154 | @Test 155 | fun testPluginProvidesUpdatedParsedDocument() { 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # Tell EditorConfig to ignore any further .editorconfig files found below this directory. 2 | root = true 3 | 4 | [*] 5 | charset = utf-8 6 | end_of_line = lf 7 | indent_style = space 8 | indent_size = 2 9 | insert_final_newline = true 10 | trim_trailing_whitespace = true 11 | max_line_length = 120 12 | continuation_indent_size = 4 13 | ij_continuation_indent_size = 4 14 | ij_visual_guides = 120,200 15 | ij_wrap_on_typing = false 16 | 17 | [.editorconfig] 18 | ij_editorconfig_align_group_field_declarations = false 19 | ij_editorconfig_space_after_colon = false 20 | ij_editorconfig_space_after_comma = true 21 | ij_editorconfig_space_before_colon = false 22 | ij_editorconfig_space_before_comma = false 23 | ij_editorconfig_spaces_around_assignment_operators = true 24 | 25 | [*.xml] 26 | continuation_indent_size = 2 27 | ij_continuation_indent_size = 2 28 | ij_xml_align_attributes = false 29 | ij_xml_align_text = false 30 | ij_xml_attribute_wrap = normal 31 | ij_xml_block_comment_at_first_column = true 32 | ij_xml_keep_blank_lines = 2 33 | ij_xml_keep_indents_on_empty_lines = false 34 | 
ij_xml_keep_line_breaks = false 35 | ij_xml_keep_line_breaks_in_text = true 36 | ij_xml_keep_whitespaces = false 37 | ij_xml_keep_whitespaces_around_cdata = preserve 38 | ij_xml_keep_whitespaces_inside_cdata = false 39 | ij_xml_line_comment_at_first_column = true 40 | ij_xml_space_after_tag_name = false 41 | ij_xml_space_around_equals_in_attribute = false 42 | ij_xml_space_inside_empty_tag = false 43 | ij_xml_text_wrap = normal 44 | ij_xml_use_custom_settings = true 45 | 46 | [{*.kt,*.kts}] 47 | indent_size = 2 48 | continuation_indent_size = 4 49 | ij_kotlin_align_in_columns_case_branch = false 50 | ij_kotlin_align_multiline_binary_operation = false 51 | ij_kotlin_align_multiline_extends_list = false 52 | ij_kotlin_align_multiline_method_parentheses = false 53 | ij_kotlin_align_multiline_parameters = true 54 | ij_kotlin_align_multiline_parameters_in_calls = false 55 | ij_kotlin_allow_trailing_comma = false 56 | ij_kotlin_allow_trailing_comma_on_call_site = false 57 | ij_kotlin_assignment_wrap = normal 58 | ij_kotlin_blank_lines_after_class_header = 0 59 | ij_kotlin_blank_lines_around_block_when_branches = 0 60 | ij_kotlin_blank_lines_before_declaration_with_comment_or_annotation_on_separate_line = 1 61 | ij_kotlin_block_comment_at_first_column = true 62 | ij_kotlin_call_parameters_new_line_after_left_paren = true 63 | ij_kotlin_call_parameters_right_paren_on_new_line = true 64 | ij_kotlin_call_parameters_wrap = on_every_item 65 | ij_kotlin_catch_on_new_line = false 66 | ij_kotlin_class_annotation_wrap = split_into_lines 67 | ij_kotlin_code_style_defaults = KOTLIN_OFFICIAL 68 | ij_kotlin_continuation_indent_for_chained_calls = false 69 | ij_kotlin_continuation_indent_for_expression_bodies = false 70 | ij_kotlin_continuation_indent_in_argument_lists = false 71 | ij_kotlin_continuation_indent_in_elvis = false 72 | ij_kotlin_continuation_indent_in_if_conditions = false 73 | ij_kotlin_continuation_indent_in_parameter_lists = false 74 | 
ij_kotlin_continuation_indent_in_supertype_lists = false 75 | ij_kotlin_else_on_new_line = false 76 | ij_kotlin_enum_constants_wrap = off 77 | ij_kotlin_extends_list_wrap = normal 78 | ij_kotlin_field_annotation_wrap = split_into_lines 79 | ij_kotlin_finally_on_new_line = false 80 | ij_kotlin_if_rparen_on_new_line = true 81 | ij_kotlin_import_nested_classes = false 82 | ij_kotlin_imports_layout = * 83 | ij_kotlin_insert_whitespaces_in_simple_one_line_method = true 84 | ij_kotlin_keep_blank_lines_before_right_brace = 2 85 | ij_kotlin_keep_blank_lines_in_code = 2 86 | ij_kotlin_keep_blank_lines_in_declarations = 2 87 | ij_kotlin_keep_first_column_comment = true 88 | ij_kotlin_keep_indents_on_empty_lines = false 89 | ij_kotlin_keep_line_breaks = true 90 | ij_kotlin_lbrace_on_next_line = false 91 | ij_kotlin_line_comment_add_space = false 92 | ij_kotlin_line_comment_at_first_column = true 93 | ij_kotlin_method_annotation_wrap = split_into_lines 94 | ij_kotlin_method_call_chain_wrap = normal 95 | ij_kotlin_method_parameters_new_line_after_left_paren = true 96 | ij_kotlin_method_parameters_right_paren_on_new_line = true 97 | ij_kotlin_method_parameters_wrap = on_every_item 98 | ij_kotlin_name_count_to_use_star_import = 2147483647 99 | ij_kotlin_name_count_to_use_star_import_for_members = 2147483647 100 | ij_kotlin_parameter_annotation_wrap = off 101 | ij_kotlin_space_after_comma = true 102 | ij_kotlin_space_after_extend_colon = true 103 | ij_kotlin_space_after_type_colon = true 104 | ij_kotlin_space_before_catch_parentheses = true 105 | ij_kotlin_space_before_comma = false 106 | ij_kotlin_space_before_extend_colon = true 107 | ij_kotlin_space_before_for_parentheses = true 108 | ij_kotlin_space_before_if_parentheses = true 109 | ij_kotlin_space_before_lambda_arrow = true 110 | ij_kotlin_space_before_type_colon = false 111 | ij_kotlin_space_before_when_parentheses = true 112 | ij_kotlin_space_before_while_parentheses = true 113 | ij_kotlin_spaces_around_additive_operators 
= true 114 | ij_kotlin_spaces_around_assignment_operators = true 115 | ij_kotlin_spaces_around_equality_operators = true 116 | ij_kotlin_spaces_around_function_type_arrow = true 117 | ij_kotlin_spaces_around_logical_operators = true 118 | ij_kotlin_spaces_around_multiplicative_operators = true 119 | ij_kotlin_spaces_around_range = false 120 | ij_kotlin_spaces_around_relational_operators = true 121 | ij_kotlin_spaces_around_unary_operator = false 122 | ij_kotlin_spaces_around_when_arrow = true 123 | ij_kotlin_variable_annotation_wrap = off 124 | ij_kotlin_while_on_new_line = false 125 | ij_kotlin_wrap_elvis_expressions = 1 126 | ij_kotlin_wrap_expression_body_functions = 1 127 | ij_kotlin_wrap_first_method_in_call_chain = false 128 | 129 | [{*.har,*.json}] 130 | ij_json_keep_blank_lines_in_code = 0 131 | ij_json_keep_indents_on_empty_lines = false 132 | ij_json_keep_line_breaks = true 133 | ij_json_space_after_colon = true 134 | ij_json_space_after_comma = true 135 | ij_json_space_before_colon = true 136 | ij_json_space_before_comma = false 137 | ij_json_spaces_within_braces = false 138 | ij_json_spaces_within_brackets = false 139 | ij_json_wrap_long_lines = false 140 | 141 | [{*.htm,*.html,*.sht,*.shtm,*.shtml}] 142 | ij_html_add_new_line_before_tags = body,div,p,form,h1,h2,h3 143 | ij_html_align_attributes = true 144 | ij_html_align_text = false 145 | ij_html_attribute_wrap = normal 146 | ij_html_block_comment_at_first_column = true 147 | ij_html_do_not_align_children_of_min_lines = 0 148 | ij_html_do_not_break_if_inline_tags = title,h1,h2,h3,h4,h5,h6,p 149 | ij_html_do_not_indent_children_of_tags = html,body,thead,tbody,tfoot 150 | ij_html_enforce_quotes = false 151 | ij_html_inline_tags = a,abbr,acronym,b,basefont,bdo,big,br,cite,cite,code,dfn,em,font,i,img,input,kbd,label,q,s,samp,select,small,span,strike,strong,sub,sup,textarea,tt,u,var 152 | ij_html_keep_blank_lines = 2 153 | ij_html_keep_indents_on_empty_lines = false 154 | ij_html_keep_line_breaks = true 
155 | ij_html_keep_line_breaks_in_text = true 156 | ij_html_keep_whitespaces = false 157 | ij_html_keep_whitespaces_inside = span,pre,textarea 158 | ij_html_line_comment_at_first_column = true 159 | ij_html_new_line_after_last_attribute = never 160 | ij_html_new_line_before_first_attribute = never 161 | ij_html_quote_style = double 162 | ij_html_remove_new_line_before_tags = br 163 | ij_html_space_after_tag_name = false 164 | ij_html_space_around_equality_in_attribute = false 165 | ij_html_space_inside_empty_tag = false 166 | ij_html_text_wrap = normal 167 | ij_html_uniform_ident = false 168 | 169 | [{*.yaml,*.yml}] 170 | ij_yaml_keep_indents_on_empty_lines = false 171 | ij_yaml_keep_line_breaks = true 172 | ij_yaml_space_before_colon = true 173 | ij_yaml_spaces_within_braces = true 174 | ij_yaml_spaces_within_brackets = true 175 | -------------------------------------------------------------------------------- /CLA.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Chimbori Individual Contributor License Agreement 3 | --- 4 | 5 | # Chimbori Individual Contributor License Agreement 6 | 7 | Thank you for your interest in contributing to open source software projects (“Projects”) made 8 | available by Chimbori. 9 | 10 | In order to clarify the intellectual property license granted with Contributions from any person or 11 | entity, Chimbori must have a Contributor License Agreement ("CLA") on file that has been signed by 12 | each Contributor, indicating agreement to the license terms below. This license is for your 13 | protection as a Contributor as well as the protection of Chimbori; it does not change your rights to 14 | use your own Contributions for any other purpose. 
This CLA sets out the terms governing any source 15 | code, object code, bug fixes, configuration changes, tools, specifications, documentation, data, 16 | materials, feedback, information or other works of authorship that you submit or have submitted, in 17 | any form and in any manner, to Chimbori in respect of any of the Projects (collectively 18 | “Contributions”). 19 | 20 | If you have any questions respecting this Agreement, please contact hello@chimbori.com. 21 | 22 | ## Definitions. 23 | 24 | "You" (or "Your") shall mean the copyright owner or legal entity authorized by the copyright owner 25 | that is making this Agreement with Chimbori. For legal entities, the entity making a Contribution 26 | and all other entities that control, are controlled by, or are under common control with that entity 27 | are considered to be a single Contributor. For the purposes of this definition, "control" means (i) 28 | the power, direct or indirect, to cause the direction or management of such entity, whether by 29 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, 30 | or (iii) beneficial ownership of such entity. 31 | 32 | "Contribution" shall mean any original work of authorship, including any modifications or additions 33 | to an existing work, that is intentionally submitted by You to Chimbori for inclusion in, or 34 | documentation of, any of the products owned or managed by Chimbori (the "Projects"). 
For the 35 | purposes of this definition, "submitted" means any form of electronic, verbal, or written 36 | communication sent to Chimbori or its representatives, including but not limited to communication on 37 | electronic mailing lists, source code control systems, and issue tracking systems that are managed 38 | by, or on behalf of, Chimbori for the purpose of discussing and improving the Projects, but 39 | excluding communication that is conspicuously marked or otherwise designated in writing by You as 40 | "Not a Contribution." 41 | 42 | You accept and agree to the following terms and conditions for Your present and future Contributions 43 | submitted to Chimbori. Except for the license granted herein to Chimbori and recipients of software 44 | distributed by Chimbori, You reserve all right, title, and interest in and to Your Contributions. 45 | 46 | ## Copyright License. 47 | 48 | You hereby grant, and agree to grant, to Chimbori a non-exclusive, perpetual, irrevocable, 49 | worldwide, fully-paid, royalty-free, transferable copyright license to reproduce, prepare derivative 50 | works of, publicly display, publicly perform, and distribute your Contributions and such derivative 51 | works, with the right to sublicense the foregoing rights through multiple tiers of sublicensees. 52 | 53 | ## Patent License. 
54 | 55 | You hereby grant, and agree to grant, to Chimbori, and to recipients of software distributed by 56 | Chimbori, a non-exclusive, perpetual, irrevocable, no-charge, worldwide, fully-paid, royalty-free, 57 | transferable patent license to make, have made, use, offer to sell, sell, import, and otherwise 58 | transfer your Contributions, where such license applies only to those patent claims licensable by 59 | you that are necessarily infringed by your Contributions alone or by combination of your 60 | Contributions with the Project to which such Contributions were submitted, with the right to 61 | sublicense the foregoing rights through multiple tiers of sublicensees. If any entity institutes 62 | patent litigation against You or any other entity (including a cross-claim or counterclaim in a 63 | lawsuit) alleging that your Contribution, or the Projects to which you have contributed, constitutes 64 | direct or contributory patent infringement, then any patent licenses granted to that entity under 65 | this Agreement for that Contribution or Project shall terminate as of the date such litigation is 66 | filed. 67 | 68 | ## Moral Rights. 69 | 70 | To the fullest extent permitted under applicable law, you hereby waive, and agree not to assert, all 71 | of your “moral rights” in or relating to your Contributions for the benefit of Chimbori, its 72 | assigns, and their respective direct and indirect sublicensees. 73 | 74 | ## Third Party Content/Rights. 
75 | 76 | If your Contribution includes or is based on any source code, object code, bug fixes, configuration 77 | changes, tools, specifications, documentation, data, materials, feedback, information or other works 78 | of authorship that were not authored by you (“Third Party Content”) or if you are aware of any third 79 | party intellectual property or proprietary rights associated with your Contribution (“Third Party 80 | Rights”), then you agree to include with the submission of your Contribution full details respecting 81 | such Third Party Content and Third Party Rights, including, without limitation, identification of 82 | which aspects of your Contribution contain Third Party Content or are associated with Third Party 83 | Rights, the owner/author of the Third Party Content and Third Party Rights, where you obtained the 84 | Third Party Content, and any applicable third party license terms or restrictions respecting the 85 | Third Party Content and Third Party Rights. For greater certainty, the foregoing obligations 86 | respecting the identification of Third Party Content and Third Party Rights do not apply to any 87 | portion of a Project that is incorporated into your Contribution to that same Project. 88 | 89 | ## Representations. 90 | 91 | You represent that, other than the Third Party Content and Third Party Rights identified by you in 92 | accordance with this Agreement, you are the sole author of your Contributions and are legally 93 | entitled to grant the foregoing licenses and waivers in respect of your Contributions. If your 94 | Contributions were created in the course of your employment with your past or present employer(s), 95 | you represent that such employer(s) has authorized you to make your Contributions on behalf of such 96 | employer(s) or such employer(s) has waived all of their right, title or interest in or to your 97 | Contributions. 98 | 99 | ## Disclaimer. 
100 | 101 | To the fullest extent permitted under applicable law, your Contributions are provided on an "as is" 102 | basis, without any warranties or conditions, express or implied, including, without limitation, any 103 | implied warranties or conditions of non-infringement, merchantability or fitness for a particular 104 | purpose. You are not required to provide support for your Contributions, except to the extent you 105 | desire to provide support. 106 | 107 | ## No Obligation. 108 | 109 | You acknowledge that Chimbori is under no obligation to use or incorporate your Contributions into 110 | any of the Projects. The decision to use or incorporate your Contributions into any of the Projects 111 | will be made at the sole discretion of Chimbori or its authorized delegates. 112 | 113 | ## Disputes. 114 | 115 | This Agreement shall be governed by and construed in accordance with the laws of the State of 116 | California, United States of America, without giving effect to its principles or rules regarding 117 | conflicts of laws, other than such principles directing application of California law. The parties 118 | hereby submit to venue in, and jurisdiction of the courts located in San Mateo County, California 119 | for purposes relating to this Agreement. In the event that any of the provisions of this Agreement 120 | shall be held by a court or other tribunal of competent jurisdiction to be unenforceable, the 121 | remaining portions hereof shall remain in full force and effect. 122 | 123 | ## Assignment. 124 | 125 | You agree that Chimbori may assign this Agreement, and all of its rights, obligations and licenses 126 | hereunder. 127 | 128 | You agree to notify Chimbori of any facts or circumstances of which you become aware that would make 129 | these representations inaccurate in any respect. 
130 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # 4 | # Copyright © 2015-2021 the original authors. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # SPDX-License-Identifier: Apache-2.0 19 | # 20 | 21 | ############################################################################## 22 | # 23 | # Gradle start up script for POSIX generated by Gradle. 24 | # 25 | # Important for running: 26 | # 27 | # (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is 28 | # noncompliant, but you have some other compliant shell such as ksh or 29 | # bash, then to run this script, type that shell name before the whole 30 | # command line, like: 31 | # 32 | # ksh Gradle 33 | # 34 | # Busybox and similar reduced shells will NOT work, because this script 35 | # requires all of these POSIX shell features: 36 | # * functions; 37 | # * expansions «$var», «${var}», «${var:-default}», «${var+SET}», 38 | # «${var#prefix}», «${var%suffix}», and «$( cmd )»; 39 | # * compound commands having a testable exit status, especially «case»; 40 | # * various built-in commands including «command», «set», and «ulimit». 
41 | # 42 | # Important for patching: 43 | # 44 | # (2) This script targets any POSIX shell, so it avoids extensions provided 45 | # by Bash, Ksh, etc; in particular arrays are avoided. 46 | # 47 | # The "traditional" practice of packing multiple parameters into a 48 | # space-separated string is a well documented source of bugs and security 49 | # problems, so this is (mostly) avoided, by progressively accumulating 50 | # options in "$@", and eventually passing that to Java. 51 | # 52 | # Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS, 53 | # and GRADLE_OPTS) rely on word-splitting, this is performed explicitly; 54 | # see the in-line comments for details. 55 | # 56 | # There are tweaks for specific operating systems such as AIX, CygWin, 57 | # Darwin, MinGW, and NonStop. 58 | # 59 | # (3) This script is generated from the Groovy template 60 | # https://github.com/gradle/gradle/blob/HEAD/platforms/jvm/plugins-application/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt 61 | # within the Gradle project. 62 | # 63 | # You can find Gradle at https://github.com/gradle/gradle/. 64 | # 65 | ############################################################################## 66 | 67 | # Attempt to set APP_HOME 68 | 69 | # Resolve links: $0 may be a link 70 | app_path=$0 71 | 72 | # Need this for daisy-chained symlinks. 
73 | while 74 | APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path 75 | [ -h "$app_path" ] 76 | do 77 | ls=$( ls -ld "$app_path" ) 78 | link=${ls#*' -> '} 79 | case $link in #( 80 | /*) app_path=$link ;; #( 81 | *) app_path=$APP_HOME$link ;; 82 | esac 83 | done 84 | 85 | # This is normally unused 86 | # shellcheck disable=SC2034 87 | APP_BASE_NAME=${0##*/} 88 | # Discard cd standard output in case $CDPATH is set (https://github.com/gradle/gradle/issues/25036) 89 | APP_HOME=$( cd -P "${APP_HOME:-./}" > /dev/null && printf '%s\n' "$PWD" ) || exit 90 | 91 | # Use the maximum available, or set MAX_FD != -1 to use that value. 92 | MAX_FD=maximum 93 | 94 | warn () { 95 | echo "$*" 96 | } >&2 97 | 98 | die () { 99 | echo 100 | echo "$*" 101 | echo 102 | exit 1 103 | } >&2 104 | 105 | # OS specific support (must be 'true' or 'false'). 106 | cygwin=false 107 | msys=false 108 | darwin=false 109 | nonstop=false 110 | case "$( uname )" in #( 111 | CYGWIN* ) cygwin=true ;; #( 112 | Darwin* ) darwin=true ;; #( 113 | MSYS* | MINGW* ) msys=true ;; #( 114 | NONSTOP* ) nonstop=true ;; 115 | esac 116 | 117 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 118 | 119 | 120 | # Determine the Java command to use to start the JVM. 121 | if [ -n "$JAVA_HOME" ] ; then 122 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 123 | # IBM's JDK on AIX uses strange locations for the executables 124 | JAVACMD=$JAVA_HOME/jre/sh/java 125 | else 126 | JAVACMD=$JAVA_HOME/bin/java 127 | fi 128 | if [ ! -x "$JAVACMD" ] ; then 129 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 130 | 131 | Please set the JAVA_HOME variable in your environment to match the 132 | location of your Java installation." 133 | fi 134 | else 135 | JAVACMD=java 136 | if ! command -v java >/dev/null 2>&1 137 | then 138 | die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 
139 | 140 | Please set the JAVA_HOME variable in your environment to match the 141 | location of your Java installation." 142 | fi 143 | fi 144 | 145 | # Increase the maximum file descriptors if we can. 146 | if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then 147 | case $MAX_FD in #( 148 | max*) 149 | # In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked. 150 | # shellcheck disable=SC2039,SC3045 151 | MAX_FD=$( ulimit -H -n ) || 152 | warn "Could not query maximum file descriptor limit" 153 | esac 154 | case $MAX_FD in #( 155 | '' | soft) :;; #( 156 | *) 157 | # In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked. 158 | # shellcheck disable=SC2039,SC3045 159 | ulimit -n "$MAX_FD" || 160 | warn "Could not set maximum file descriptor limit to $MAX_FD" 161 | esac 162 | fi 163 | 164 | # Collect all arguments for the java command, stacking in reverse order: 165 | # * args from the command line 166 | # * the main class name 167 | # * -classpath 168 | # * -D...appname settings 169 | # * --module-path (only if needed) 170 | # * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables. 
171 | 172 | # For Cygwin or MSYS, switch paths to Windows format before running java 173 | if "$cygwin" || "$msys" ; then 174 | APP_HOME=$( cygpath --path --mixed "$APP_HOME" ) 175 | CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" ) 176 | 177 | JAVACMD=$( cygpath --unix "$JAVACMD" ) 178 | 179 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 180 | for arg do 181 | if 182 | case $arg in #( 183 | -*) false ;; # don't mess with options #( 184 | /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath 185 | [ -e "$t" ] ;; #( 186 | *) false ;; 187 | esac 188 | then 189 | arg=$( cygpath --path --ignore --mixed "$arg" ) 190 | fi 191 | # Roll the args list around exactly as many times as the number of 192 | # args, so each arg winds up back in the position where it started, but 193 | # possibly modified. 194 | # 195 | # NB: a `for` loop captures its iteration list before it begins, so 196 | # changing the positional parameters here affects neither the number of 197 | # iterations, nor the values presented in `arg`. 198 | shift # remove old arg 199 | set -- "$@" "$arg" # push replacement arg 200 | done 201 | fi 202 | 203 | 204 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 205 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' 206 | 207 | # Collect all arguments for the java command: 208 | # * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS are not allowed to contain shell fragments, 209 | # and any embedded shellness will be escaped. 210 | # * For example: A user cannot expect ${Hostname} to be expanded, as it is an environment variable and will be 211 | # treated as '${Hostname}' itself on the command line. 212 | 213 | set -- \ 214 | "-Dorg.gradle.appname=$APP_BASE_NAME" \ 215 | -classpath "$CLASSPATH" \ 216 | org.gradle.wrapper.GradleWrapperMain \ 217 | "$@" 218 | 219 | # Stop when "xargs" is not available. 220 | if ! 
command -v xargs >/dev/null 2>&1 221 | then 222 | die "xargs is not available" 223 | fi 224 | 225 | # Use "xargs" to parse quoted args. 226 | # 227 | # With -n1 it outputs one arg per line, with the quotes and backslashes removed. 228 | # 229 | # In Bash we could simply go: 230 | # 231 | # readarray ARGS < <( xargs -n1 <<<"$var" ) && 232 | # set -- "${ARGS[@]}" "$@" 233 | # 234 | # but POSIX shell has neither arrays nor process substitution, so instead we 235 | # post-process each arg (as a line of input to sed) to backslash-escape any 236 | # character that might be a shell metacharacter, then use eval to reverse 237 | # that process (while maintaining the separation between arguments), and wrap 238 | # the whole thing up as a single "set" statement. 239 | # 240 | # This will of course break if any of these variables contains a newline or 241 | # an unmatched quote. 242 | # 243 | 244 | eval "set -- $( 245 | printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" | 246 | xargs -n1 | 247 | sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' | 248 | tr '\n' ' ' 249 | )" '"$@"' 250 | 251 | exec "$JAVACMD" "$@" 252 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Crux 2 | 3 | Crux offers a flexible plugin-based API & implementation to extract metadata from Web pages. 4 | As of v5.0, Crux no longer extracts article information from web page text; read on for recommended alternatives. 5 | 6 | ## Usage 7 | 8 | Crux uses semantic versioning. If the API changes, then the major version will be incremented. 9 | Upgrading from one minor version to the next minor version within the same major version should 10 | not require any client code to be modified. 11 | 12 | The latest release is available via 13 | [Maven Central](https://search.maven.org/artifact/com.chimbori.crux/crux) 14 | or 15 | [GitHub Releases](https://github.com/chimbori/crux/releases). 
16 | 17 | [![Maven Central](https://maven-badges.herokuapp.com/maven-central/com.chimbori.crux/crux/badge.svg)](https://maven-badges.herokuapp.com/maven-central/com.chimbori.crux/crux) 18 | 19 | ### Get Crux via Maven 20 | 21 | ```xml 22 | <dependency> 23 | <groupId>com.chimbori.crux</groupId> 24 | <artifactId>crux</artifactId> 25 | <version>0.0.0</version> 26 | </dependency> 27 | ``` 28 | 29 | ### Get Crux via Gradle 30 | 31 | Project/`build.gradle.kts` 32 | 33 | ```kotlin 34 | allprojects { 35 | repositories { 36 | mavenCentral() 37 | } 38 | } 39 | ``` 40 | 41 | Module/`build.gradle.kts` 42 | 43 | ```kotlin 44 | dependencies { 45 | implementation("com.chimbori.crux:crux:0.0.0") // See the latest version number above. 46 | } 47 | ``` 48 | 49 | ## Sample Code 50 | 51 | ```kotlin 52 | // Create a reusable object configured with the default set of plugins. 53 | val crux = Crux() 54 | 55 | val httpURL = "https://chimbori.com/".toHttpUrl() 56 | 57 | // You can provide prefetched raw HTML content yourself, or have Crux fetch 58 | // it for you. 59 | val htmlContent = """ 60 | | 61 | | 62 | | Chimbori 63 | | 65 | | 66 | | 68 | | 69 | | 70 | |""".trimMargin() 71 | 72 | // Crux runs inside a `suspend` function as a Kotlin Coroutine. 73 | val extractedMetadata = runBlocking { 74 | crux.extractFrom(originalUrl = httpURL, parsedDoc = Jsoup.parse(htmlContent, httpURL.toString())) 75 | } 76 | 77 | // Metadata fields such as the Title and Description are available from the 78 | // returned [Resource] object as an indexed collection. 79 | assertEquals("Chimbori", extractedMetadata[TITLE]) 80 | 81 | // Well-known URLs related to this page are available either as strings or 82 | // OkHttp [HttpUrl]s. 
83 | assertEquals("https://chimbori.com/media/favicon.png", extractedMetadata[FAVICON_URL]) 84 | assertEquals("https://chimbori.com/media/favicon.png".toHttpUrl(), 85 | extractedMetadata.urls[FAVICON_URL]) 86 | 87 | // Extra markup fields like Twitter Cards metadata or Open Graph metadata are 88 | // available as metadata fields as well. 
89 | assertEquals("https://chimbori.com/media/cover-photo.png", extractedMetadata[BANNER_IMAGE_URL]) 90 | ``` 91 | 92 | ## Design & Features 93 | 94 | Crux is designed as a chain of plugins; each one performs a small specific task. 95 | 96 | Each plugin receives as input a `Resource` object, which includes a URL and all the fields 97 | populated by previous plugins in the chain. Each plugin can 98 | 99 | - extract new pieces of metadata and add them to the output, or 100 | - overwrite existing fields by setting a new value for the same key, or 101 | - remove existing fields by setting a `null` value for that key. 102 | 103 | A small set of well-known key names is defined in the API as `Fields`, but plugins and clients are 104 | not restricted to this set. You can extend Crux for your own applications by defining and using 105 | your own string keys for extracted metadata. 106 | 107 | Plugins can rewrite URLs, which are then passed on down the chain. This is how HTTP redirects 108 | (301 and 302) as well as static redirectors (such as those from Google and Facebook) are handled. 109 | 110 | Each plugin is independent of others. You can pick and choose the ones you want to use. If you use 111 | Crux in an Android app, Proguard or other minification tools can strip out the plugins you don’t 112 | use. 113 | 114 | Crux’s API includes fewer setters/getters (compared to other such libraries), to keep the method 115 | count low (this is important for Android). Its plugin-based architecture makes it cleaner & 116 | leaner, compared to other libraries not explicitly optimized for Android. 117 | 118 | ## Default Plugins 119 | 120 | ### HtmlMetadataExtractor 121 | 122 | Extracts titles, banner images, & other metadata from any web page. 123 | 124 | - Supports multiple metadata formats: OpenGraph, Twitter Cards, Schema.org. 125 | 126 | ### AmpRedirector 127 | 128 | Rewrites the URL of an AMP page to its canonical (original) URL. 
129 | 130 | ### GoogleUrlRewriter 131 | 132 | Rewrites URLs generated by the Google Redirector Service to their canonical (original) URLs. 133 | 134 | ### FacebookUrlRewriter 135 | 136 | Rewrites URLs generated by the Facebook Redirector Service to their canonical (original) URLs. 137 | 138 | ## Optional Plugins 139 | 140 | ### TrackingParameterRemover 141 | 142 | Removes URL parameters typically used by analytics providers to track users’ behavior across the 143 | Web. This plugin is optional because it may break some misconfigured URLs and cause them to 144 | return the wrong content. 145 | 146 | ## Writing a Custom Plugin 147 | 148 | ### ArticleExtractorPlugin 149 | 150 | As of v5.0, Crux no longer contains its own article extraction plugin. 151 | We recommend [dankito/Readability4J](https://github.com/dankito/Readability4J), a fork of Mozilla’s Readability.js, 152 | which is higher-quality and newer than Crux’s origin, Snacktory. 153 | We recommend that you use it instead of relying on Crux’s parser (which has now been removed). 154 | 155 | Readability4J strips out sidebars, navigation bars, and other unimportant parts of a page, and extracts the core 156 | article content. 
157 | 158 | build.gradle.kts: 159 | ```kotlinscript 160 | dependencies { 161 | implementation("net.dankito.readability4j:readability4j:1.0.8") 162 | } 163 | ``` 164 | 165 | Readability4JPlugin.kt: 166 | ```kotlin 167 | import com.chimbori.crux.api.Extractor 168 | import com.chimbori.crux.api.Fields.DURATION_MS 169 | import com.chimbori.crux.api.Fields.TITLE 170 | import com.chimbori.crux.api.Resource 171 | import com.chimbori.crux.common.estimatedReadingTimeMs 172 | import com.chimbori.crux.common.isLikelyArticle 173 | import net.dankito.readability4j.extended.Readability4JExtended 174 | import okhttp3.HttpUrl 175 | 176 | class Readability4JPlugin : Extractor { 177 | override fun canExtract(url: HttpUrl) = url.isLikelyArticle() 178 | 179 | override suspend fun extract(request: Resource): Resource? = if (request.url != null && request.document != null) { 180 | val readability4J = Readability4JExtended(request.url.toString(), request.document!!) 181 | val article = readability4J.parse() 182 | Resource( 183 | article = article.articleContent, 184 | metadata = mapOf( 185 | TITLE to article.title, 186 | DURATION_MS to article.articleContent?.text()?.estimatedReadingTimeMs() 187 | ), 188 | ) 189 | } else { 190 | null 191 | } 192 | } 193 | ``` 194 | 195 | Then add `Readability4JPlugin` to the list of Crux plugins to use it along with Crux’s default plugins. 196 | 197 | ### CustomerNumberExtractorPlugin 198 | 199 | As an example, one can write a custom plugin to extract specific fields from a URL as follows: 200 | 201 | ```kotlin 202 | // If you write a new plugin yourself, you can add any custom fields to the `Resource` object 203 | // yourself, and consume them in your own app. 204 | val customerNumberExtractorPlugin = object : Plugin { 205 | // Indicate that your plugin can handle all URLs on your site, but no others. 
206 | override fun canHandle(url: HttpUrl): Boolean = url.topPrivateDomain() == "your-website.com" 207 | 208 | // Fields in the returned [Resource] overwrite those in the input [request]. If no changes are 209 | // to be made, then return null from your plugin. Otherwise, only return those fields that are 210 | // new or changed from the input. 211 | override suspend fun handle(request: Resource) = Resource( 212 | fields = mapOf(CUSTOMER_NUMBER_FIELD to request.url?.queryParameter("customer-number")) 213 | ) 214 | 215 | val CUSTOMER_NUMBER_FIELD = "customer-number" 216 | } 217 | 218 | val cruxWithCustomPlugin = Crux(DEFAULT_PLUGINS + customerNumberExtractorPlugin) 219 | val orderDetailsUrl = "https://www.your-website.com/orders?customer-number=42".toHttpUrl() 220 | 221 | val metadata = runBlocking { 222 | cruxWithCustomPlugin.extractFrom(orderDetailsUrl, Document(orderDetailsUrl.toString())) 223 | } 224 | 225 | // Input URL was unchanged and is available in the output metadata. 226 | assertEquals(orderDetailsUrl, metadata.url) 227 | // Data extracted by the custom plugin is available as a custom field. 228 | assertEquals("42", metadata[customerNumberExtractorPlugin.CUSTOMER_NUMBER_FIELD]) 229 | ``` 230 | 231 | ## Image URL Extractor API 232 | 233 | From a single DOM Element root, the Image URL API inspects the sub-tree and returns the best 234 | possible image URL candidate available within it. It does this by scanning within the DOM tree 235 | for interesting `src` & `style` attributes. 236 | 237 | All URLs are resolved as absolute URLs, even if the HTML contained relative URLs. 238 | 239 | ```kotlin 240 | ImageUrlExtractor(url, domElement).findImage().imageUrl 241 | ``` 242 | 243 | ## Anchor Links Extractor API 244 | 245 | From a single DOM Element root, the Anchor Links API inspects the sub-tree and returns the best 246 | possible link URL candidate available within it. It does this by scanning within the DOM tree 247 | for interesting `href` attributes. 
248 | 249 | All URLs are resolved as absolute URLs, even if the HTML contained relative URLs. 250 | 251 | ```kotlin 252 | LinkUrlExtractor(url, domElement).findLink().linkUrl 253 | ``` 254 | 255 | ## URL Heuristics API 256 | 257 | This API examines a given URL (without connecting to the server), and returns 258 | heuristically-determined answers to questions such as: 259 | 260 | - Is this URL likely a video URL? 261 | - Is this URL likely an image URL? 262 | - Is this URL likely an audio URL? 263 | - Is this URL likely an executable URL? 264 | - Is this URL likely an archive URL? 265 | 266 | ```kotlin 267 | val url = "https://example.com/article.html".toHttpUrl() 268 | 269 | assertTrue(url.isLikelyArticle()) 270 | assertFalse(url.isLikelyImage()) 271 | ``` 272 | 273 | ## License 274 | 275 | Copyright 2016, Chimbori, makers of Hermit, the Lite Apps Browser. 276 | 277 | Licensed under the Apache License, Version 2.0 (the "License"); 278 | you may not use this file except in compliance with the License. 279 | You may obtain a copy of the License at 280 | 281 | http://www.apache.org/licenses/LICENSE-2.0 282 | 283 | Unless required by applicable law or agreed to in writing, software 284 | distributed under the License is distributed on an "AS IS" BASIS, 285 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 286 | See the License for the specific language governing permissions and 287 | limitations under the License. 288 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 
12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2016, Chimbori, makers of Hermit, the Lite Apps Browser. 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | --------------------------------------------------------------------------------