├── skrape-core ├── .gitignore ├── src │ ├── main │ │ └── kotlin │ │ │ └── nolambda │ │ │ └── skrape │ │ │ ├── result │ │ │ ├── SkrapeResult.kt │ │ │ ├── SimpleSkrapeResult.kt │ │ │ └── QuerySkrapeResult.kt │ │ │ ├── processor │ │ │ ├── PageAdapter.kt │ │ │ ├── formatter │ │ │ │ ├── ValueFormatter.kt │ │ │ │ ├── ValueFormatterExt.kt │ │ │ │ └── ValueFormatterManager.kt │ │ │ └── AbstractPageAdapter.kt │ │ │ ├── utils │ │ │ └── Queries.kt │ │ │ ├── transformer │ │ │ ├── PageTransformer.kt │ │ │ └── PlaceholderTransformer.kt │ │ │ ├── SkrapeLogger.kt │ │ │ ├── nodes │ │ │ ├── PageInfo.kt │ │ │ ├── NodeExt.kt │ │ │ ├── PageExt.kt │ │ │ ├── NodeBuilderExt.kt │ │ │ └── Node.kt │ │ │ ├── serialization │ │ │ ├── PageSerializer.kt │ │ │ └── JsonPageSerializer.kt │ │ │ └── Skrape.kt │ └── test │ │ └── kotlin │ │ └── nolamda │ │ └── skrape │ │ ├── PlaceholderTransformerSpec.kt │ │ ├── QuerySkapeResultSpec.kt │ │ └── JsonPageSeriliazerSpec.kt └── build.gradle ├── skrape-sample ├── .gitignore ├── build.gradle └── src │ └── main │ └── kotlin │ └── nolambda │ └── skrape │ └── example │ └── ParsingTest.kt ├── jitpack.yml ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── settings.gradle ├── setup.sh ├── .github └── workflows │ └── gradle.yml ├── skrape-jsoup ├── src │ ├── test │ │ ├── kotlin │ │ │ └── nolambda │ │ │ │ └── skrape │ │ │ │ ├── HackerNewsResponse.kt │ │ │ │ ├── JsoupValueFormatterSpec.kt │ │ │ │ ├── JsonSpec.kt │ │ │ │ └── SkrapeJsoupSpec.kt │ │ └── resources │ │ │ └── index.html │ └── main │ │ └── kotlin │ │ └── nolambda │ │ └── skrape │ │ └── processor │ │ └── jsoup │ │ ├── JsoupValueFormatter.kt │ │ └── JsoupPageAdapter.kt └── build.gradle ├── skrape-chrome ├── src │ ├── main │ │ └── kotlin │ │ │ └── nolambda │ │ │ └── skrape │ │ │ └── processor │ │ │ └── chrome │ │ │ ├── ChromeWaiter.kt │ │ │ ├── ChromeValueFormatter.kt │ │ │ ├── ChromeElement.kt │ │ │ └── ChromePageAdapter.kt │ └── test │ │ └── kotlin │ │ └── nolambda │ │ └── 
skrape │ │ └── ChromeValueFormatterSpec.kt └── build.gradle ├── specification.md ├── LICENSE ├── README.md ├── .gitignore ├── gradlew.bat └── gradlew /skrape-core/.gitignore: -------------------------------------------------------------------------------- 1 | build/* 2 | -------------------------------------------------------------------------------- /skrape-sample/.gitignore: -------------------------------------------------------------------------------- 1 | build/* 2 | -------------------------------------------------------------------------------- /jitpack.yml: -------------------------------------------------------------------------------- 1 | before_install: 2 | - ./setup.sh 3 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/esafirm/skrape/HEAD/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | include 'skrape-core' 2 | include 'skrape-sample' 3 | include 'skrape-jsoup' 4 | include 'skrape-chrome' 5 | 6 | -------------------------------------------------------------------------------- /skrape-core/src/main/kotlin/nolambda/skrape/result/SkrapeResult.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape.result 2 | 3 | interface SkrapeResult { 4 | fun json(): String 5 | } -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | rm settings.gradle 3 | echo "include 'skrape-core'" >> settings.gradle 4 | echo "include 'skrape-chrome'" >> settings.gradle 5 | echo "include 'skrape-jsoup'" >> settings.gradle 6 | 
-------------------------------------------------------------------------------- /skrape-core/src/main/kotlin/nolambda/skrape/processor/PageAdapter.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape.processor 2 | 3 | import nolambda.skrape.nodes.Page 4 | 5 | interface PageAdapter { 6 | fun adapt(page: Page): T 7 | } -------------------------------------------------------------------------------- /skrape-core/src/main/kotlin/nolambda/skrape/utils/Queries.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape.utils 2 | 3 | object Queries { 4 | fun indexOfChild(selector: String, index: Int) = 5 | "$selector:nth-child(${index})" 6 | } -------------------------------------------------------------------------------- /skrape-core/src/main/kotlin/nolambda/skrape/transformer/PageTransformer.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape.transformer 2 | 3 | import nolambda.skrape.nodes.Page 4 | 5 | interface PageTransformer { 6 | fun transform(page: Page): Page 7 | } -------------------------------------------------------------------------------- /skrape-core/src/main/kotlin/nolambda/skrape/SkrapeLogger.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape 2 | 3 | object SkrapeLogger { 4 | 5 | var enableLog = true 6 | 7 | fun log(message: String) { 8 | if (enableLog) { 9 | println(message) 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Sun Jul 09 11:06:19 WIB 2017 2 | distributionBase=GRADLE_USER_HOME 3 | distributionPath=wrapper/dists 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | 
distributionUrl=https\://services.gradle.org/distributions/gradle-6.4.1-all.zip 7 | -------------------------------------------------------------------------------- /skrape-core/src/main/kotlin/nolambda/skrape/nodes/PageInfo.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape.nodes 2 | 3 | import kotlinx.serialization.Serializable 4 | 5 | @Serializable 6 | data class PageInfo( 7 | val path: String, 8 | val baseUrl: String = "", 9 | val encoding: String = "UTF-8" 10 | ) -------------------------------------------------------------------------------- /skrape-core/src/main/kotlin/nolambda/skrape/processor/formatter/ValueFormatter.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape.processor.formatter 2 | 3 | import nolambda.skrape.nodes.Value 4 | 5 | interface ValueFormatter { 6 | /** 7 | * Check if the formatter can handle the value 8 | */ 9 | fun isForType(value: Value): Boolean 10 | fun format(value: Value, element: E): R 11 | } -------------------------------------------------------------------------------- /skrape-core/src/main/kotlin/nolambda/skrape/processor/formatter/ValueFormatterExt.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape.processor.formatter 2 | 3 | import nolambda.skrape.processor.AbstractPageAdapter 4 | import nolambda.skrape.result.SkrapeResult 5 | 6 | @Suppress("UNCHECKED_CAST") 7 | fun AbstractPageAdapter.addFormatter(formatter: ValueFormatter) { 8 | formatterManager.addFormatter(formatter) 9 | } -------------------------------------------------------------------------------- /skrape-core/src/main/kotlin/nolambda/skrape/result/SimpleSkrapeResult.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape.result 2 | 3 | import kotlinx.serialization.json.JsonElement 4 | 5 | open class SimpleSkrapeResult(private 
val jsonElement: JsonElement) : SkrapeResult { 6 | 7 | private val jsonString by lazy { jsonElement.toString() } 8 | 9 | override fun json(): String = jsonString 10 | 11 | override fun toString(): String = json() 12 | } -------------------------------------------------------------------------------- /.github/workflows/gradle.yml: -------------------------------------------------------------------------------- 1 | name: Kotlin CI 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - uses: actions/checkout@v1 11 | - name: Set up JDK 1.8 12 | uses: actions/setup-java@v1 13 | with: 14 | java-version: 1.8 15 | 16 | - name: Test 17 | run: ./gradlew test 18 | 19 | - name: Build with Gradle 20 | run: ./gradlew build 21 | -------------------------------------------------------------------------------- /skrape-core/src/main/kotlin/nolambda/skrape/serialization/PageSerializer.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape.serialization 2 | 3 | import nolambda.skrape.nodes.Page 4 | 5 | interface PageSerializer { 6 | 7 | companion object { 8 | const val KEY_PAGE_CHILDREN = "children" 9 | const val KEY_PAGE_INFO = "pageInfo" 10 | const val KEY_NAME = "name" 11 | const val KEY_TYPE= "type" 12 | const val KEY_SELECTOR = "selector" 13 | } 14 | 15 | fun serialize(page: Page): TARGET 16 | fun deserialize(target: TARGET): Page 17 | } -------------------------------------------------------------------------------- /skrape-jsoup/src/test/kotlin/nolambda/skrape/HackerNewsResponse.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape 2 | 3 | import kotlinx.serialization.SerialName 4 | import kotlinx.serialization.Serializable 5 | 6 | @Serializable 7 | data class HackerNewsResponse( 8 | @SerialName("items") val stories: List 9 | ) 10 | 11 | @Serializable 12 | data class HackerNewsStory( 13 | @SerialName("text") val title: 
String, 14 | @SerialName("detail") val detail: StoryDetail 15 | ) 16 | 17 | @Serializable 18 | data class StoryDetail( 19 | @SerialName("link") val uri: String 20 | ) -------------------------------------------------------------------------------- /skrape-core/src/main/kotlin/nolambda/skrape/nodes/NodeExt.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape.nodes 2 | 3 | /** 4 | * Evaluate all the children in this [Page] 5 | * It mainly use if the page want to be serialized or processed 6 | */ 7 | @Suppress("UNNECESSARY_SAFE_CALL") 8 | fun T.evaluate() = this.apply { 9 | if (children.isNotEmpty()) return@apply 10 | body?.invoke(this) 11 | evaluateChildren() 12 | } 13 | 14 | private fun ParentElement.evaluateChildren() { 15 | children.forEach { element -> 16 | if (element is ParentElement) { 17 | element.evaluate() 18 | } 19 | } 20 | } -------------------------------------------------------------------------------- /skrape-sample/build.gradle: -------------------------------------------------------------------------------- 1 | group 'nolambda.stream' 2 | version '1.0-SNAPSHOT' 3 | 4 | apply plugin: 'kotlin' 5 | apply plugin: 'application' 6 | 7 | mainClassName = 'nolambda.skrape.example.ParsingTestKt' 8 | 9 | defaultTasks 'run' 10 | 11 | repositories { 12 | maven { url 'https://jitpack.io' } 13 | } 14 | 15 | dependencies { 16 | implementation "org.jetbrains.kotlin:kotlin-stdlib-jdk8:$kotlin_version" 17 | implementation project(':skrape-core') 18 | implementation project(':skrape-chrome') 19 | implementation project(':skrape-jsoup') 20 | 21 | testImplementation group: 'junit', name: 'junit', version: '4.12' 22 | } 23 | -------------------------------------------------------------------------------- /skrape-chrome/src/main/kotlin/nolambda/skrape/processor/chrome/ChromeWaiter.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape.processor.chrome 2 | 3 
| import org.openqa.selenium.support.ui.WebDriverWait 4 | 5 | interface ChromeWaiter { 6 | fun until(block: () -> Boolean) 7 | } 8 | 9 | class WebChromeWaiter(private val webDriverWait: WebDriverWait) : ChromeWaiter { 10 | override fun until(block: () -> Boolean) { 11 | try { 12 | webDriverWait.until { block.invoke() } 13 | } catch (e: Exception) { 14 | e.printStackTrace() 15 | } 16 | } 17 | } 18 | 19 | object NoWait : ChromeWaiter { 20 | override fun until(block: () -> Boolean) { 21 | } 22 | } 23 | 24 | -------------------------------------------------------------------------------- /specification.md: -------------------------------------------------------------------------------- 1 | ## Specification 2 | 3 | 1. Skrape's default result is JSON 4 | 2. A Skrape query can be exported to and imported from a readable text format 5 | 6 | ## Example 7 | 8 | Skrape 9 | 10 | ```groovy 11 | page("https://example.com"){ 12 | "items" to query("td.title"){ 13 | "title" to text() 14 | "link" to attr("href") 15 | "others" to child { 16 | "liked" to text().toBoolean() 17 | "count" to attr("count").toInt() 18 | } 19 | } 20 | } 21 | ``` 22 | 23 | JSON 24 | 25 | ```json 26 | { 27 | "items": [ 28 | { 29 | "title":"Hello", 30 | "link":"https://example.com/1", 31 | "others": { 32 | "liked": true, 33 | "count": 10 34 | } 35 | } 36 | ] 37 | } 38 | ``` -------------------------------------------------------------------------------- /skrape-jsoup/src/test/kotlin/nolambda/skrape/JsoupValueFormatterSpec.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape 2 | 3 | import io.kotlintest.matchers.shouldBe 4 | import io.kotlintest.specs.StringSpec 5 | import nolambda.skrape.nodes.Value 6 | import nolambda.skrape.processor.jsoup.JsoupValueFormatter 7 | 8 | class JsoupValueFormatterSpec : StringSpec({ 9 | val formatter = JsoupValueFormatter() 10 | val value = Value( 11 | name = "value", 12 | valueType = Value.TYPE_STRING, 13 | selector = "td.a" 14 | ) 
15 |
16 |     "it will recognize type string" {
17 |         formatter.isForType(value) shouldBe true
18 |     }
19 |
20 |     "it won't recognize unknown value type" {
21 |         formatter.isForType(Value("", "", "")) shouldBe false
22 |     }
23 | })
--------------------------------------------------------------------------------
/skrape-core/src/main/kotlin/nolambda/skrape/Skrape.kt:
--------------------------------------------------------------------------------
1 | package nolambda.skrape
2 |
3 | import nolambda.skrape.nodes.Page
4 | import nolambda.skrape.processor.PageAdapter
5 | import nolambda.skrape.result.SkrapeResult
6 | import nolambda.skrape.SkrapeLogger as logger
7 |
8 | /**
9 |  * Entry point that hands a [Page] to a [PageAdapter] and returns whatever the
10 |  * adapter produces.
11 |  *
12 |  * @param parser adapter that performs the actual page processing
13 |  * @param enableLog when true, request/result messages are forwarded to `SkrapeLogger`
14 |  */
15 | class Skrape(
16 |     private val parser: PageAdapter,
17 |     private val enableLog: Boolean = false
18 | ) {
19 |
20 |     /**
21 |      * Adapts [page] with the configured parser and returns its result.
22 |      *
23 |      * @param page the page definition to process
24 |      * @param args accepted for callers' convenience; not read by this method
25 |      * @return the value returned by the parser's `adapt` call
26 |      */
27 |     fun request(page: Page, args: Map = emptyMap()): T {
28 |         // Log the page being requested. Interpolating `this` here would only
29 |         // print the Skrape instance itself, which says nothing about the request.
30 |         log { "Requesting $page" }
31 |         return parser.adapt(page).also {
32 |             log { "Result $it" }
33 |         }
34 |     }
35 |
36 |     /**
37 |      * Builds and forwards a log message only when [enableLog] is set, so the
38 |      * message lambda is never evaluated while logging is off.
39 |      */
40 |     private inline fun log(log: () -> String) {
41 |         if (enableLog) {
42 |             logger.log(log())
43 |         }
44 |     }
45 | }
46 |
--------------------------------------------------------------------------------
/skrape-chrome/src/test/kotlin/nolambda/skrape/ChromeValueFormatterSpec.kt:
--------------------------------------------------------------------------------
1 | package nolambda.skrape
2 |
3 | import io.kotlintest.matchers.shouldBe
4 | import io.kotlintest.specs.StringSpec
5 | import nolambda.skrape.nodes.Value
6 | import nolambda.skrape.processor.chrome.ChromeValueFormatter
7 | import nolambda.skrape.processor.chrome.NoWait
8 |
9 | class ChromeValueFormatterSpec : StringSpec({
10 |     val formatter = ChromeValueFormatter { NoWait }
11 |     val value = Value(
12 |         name = "value",
13 |         valueType = Value.TYPE_STRING,
14 |         selector = "td.a"
15 |     )
16 |
17 |     "it will recognize type string" {
18 |         formatter.isForType(value) shouldBe true
19 |     }
20 |
21 |     "it won't recognize unknown value type" {
22 |         formatter.isForType(Value("", "", "")) shouldBe
false 23 | } 24 | }) -------------------------------------------------------------------------------- /skrape-core/src/main/kotlin/nolambda/skrape/nodes/PageExt.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape.nodes 2 | 3 | private val URL_REGEX = Regex("^(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]") 4 | 5 | fun Page.isLocalFile(): Boolean = !pageInfo.path.matches(URL_REGEX) 6 | 7 | /** 8 | * Detect query without name 9 | * Ex: query("td") { 10 | * ... 11 | * } 12 | * 13 | * This should result to JsonArray as the parent of the result rather than JsonObject 14 | * @return True if the if the parent container is Query and has no name or an empty Container 15 | */ 16 | fun Page.isUselessContainer(): Boolean { 17 | if (children.isEmpty()) return true 18 | if (children.size > 1) { 19 | return false 20 | } 21 | val firstChild = children.first() 22 | return firstChild is Query && firstChild.name.isBlank() 23 | } -------------------------------------------------------------------------------- /skrape-jsoup/build.gradle: -------------------------------------------------------------------------------- 1 | plugins { 2 | id 'org.jetbrains.kotlin.jvm' 3 | id 'org.jetbrains.kotlin.plugin.serialization' version "$kotlin_version" 4 | } 5 | 6 | group 'nolambda.stream' 7 | version '1.0.0' 8 | 9 | repositories { 10 | mavenCentral() 11 | } 12 | 13 | compileKotlin { 14 | kotlinOptions.jvmTarget = "1.8" 15 | } 16 | compileTestKotlin { 17 | kotlinOptions.jvmTarget = "1.8" 18 | } 19 | 20 | dependencies { 21 | implementation project(':skrape-core') 22 | implementation "org.jetbrains.kotlin:kotlin-stdlib-jdk8" 23 | 24 | api 'org.jsoup:jsoup:1.10.3' 25 | 26 | /* --------------------------------------------------- */ 27 | /* > Test */ 28 | /* --------------------------------------------------- */ 29 | 30 | testImplementation 'io.kotlintest:kotlintest:2.0.3' 31 | testImplementation 'junit:junit:4.12' 32 
| } 33 | -------------------------------------------------------------------------------- /skrape-chrome/build.gradle: -------------------------------------------------------------------------------- 1 | plugins { 2 | id 'org.jetbrains.kotlin.jvm' 3 | id 'org.jetbrains.kotlin.plugin.serialization' version "$kotlin_version" 4 | } 5 | 6 | group 'nolambda.stream' 7 | version '1.0.0' 8 | 9 | repositories { 10 | mavenCentral() 11 | } 12 | 13 | dependencies { 14 | implementation project(':skrape-core') 15 | implementation 'org.jetbrains.kotlin:kotlin-stdlib-jdk8' 16 | api 'org.seleniumhq.selenium:selenium-java:3.8.1' 17 | 18 | /* --------------------------------------------------- */ 19 | /* > Test */ 20 | /* --------------------------------------------------- */ 21 | 22 | testImplementation 'io.kotlintest:kotlintest:2.0.3' 23 | testImplementation 'junit:junit:4.12' 24 | } 25 | 26 | compileKotlin { 27 | kotlinOptions.jvmTarget = "1.8" 28 | } 29 | compileTestKotlin { 30 | kotlinOptions.jvmTarget = "1.8" 31 | } -------------------------------------------------------------------------------- /skrape-core/src/main/kotlin/nolambda/skrape/processor/formatter/ValueFormatterManager.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape.processor.formatter 2 | 3 | import nolambda.skrape.nodes.Value 4 | 5 | class ValueFormatterManager : ValueFormatter { 6 | 7 | val formatter: MutableList> = mutableListOf() 8 | 9 | override fun format(value: Value, element: ELEMENT): RETURN { 10 | formatter 11 | .filter { it.isForType(value) } 12 | .forEach { return it.format(value, element) } 13 | throw IllegalStateException("Should call isForType first for checking") 14 | } 15 | 16 | override fun isForType(value: Value): Boolean = formatter.any { it.isForType(value) } 17 | 18 | fun addFormatter(formatter: ValueFormatter) { 19 | this.formatter.add(formatter) 20 | } 21 | 22 | fun removeFormatter(formatter: ValueFormatter) { 23 | 
this.formatter.remove(formatter) 24 | } 25 | } -------------------------------------------------------------------------------- /skrape-core/build.gradle: -------------------------------------------------------------------------------- 1 | plugins { 2 | id 'org.jetbrains.kotlin.jvm' 3 | id 'org.jetbrains.kotlin.plugin.serialization' version "$kotlin_version" 4 | } 5 | 6 | compileKotlin { 7 | kotlinOptions.jvmTarget = JavaVersion.VERSION_1_8 8 | } 9 | compileTestKotlin { 10 | kotlinOptions.jvmTarget = JavaVersion.VERSION_1_8 11 | } 12 | 13 | task sourcesJar(type: Jar, dependsOn: classes) { 14 | archiveClassifier.set("sources") 15 | from sourceSets.main.allSource 16 | } 17 | 18 | artifacts { 19 | archives sourcesJar 20 | } 21 | 22 | dependencies { 23 | implementation "org.jetbrains.kotlin:kotlin-stdlib-jdk8:$kotlin_version" 24 | 25 | /* JSON */ 26 | api 'org.jetbrains.kotlinx:kotlinx-serialization-core:1.0.1' 27 | api 'org.jetbrains.kotlinx:kotlinx-serialization-json:1.0.1' 28 | 29 | /* --------------------------------------------------- */ 30 | /* > Test */ 31 | /* --------------------------------------------------- */ 32 | 33 | testImplementation 'io.kotlintest:kotlintest:2.0.3' 34 | testImplementation 'junit:junit:4.12' 35 | } 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Esa Firman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission 
notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /skrape-jsoup/src/main/kotlin/nolambda/skrape/processor/jsoup/JsoupValueFormatter.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape.processor.jsoup 2 | 3 | import kotlinx.serialization.json.JsonPrimitive 4 | import nolambda.skrape.nodes.Value 5 | import nolambda.skrape.processor.formatter.ValueFormatter 6 | import org.jsoup.nodes.Element 7 | 8 | class JsoupValueFormatter : ValueFormatter { 9 | 10 | override fun isForType(value: Value): Boolean { 11 | return value.valueType.let { 12 | it == Value.TYPE_STRING || it == Value.TYPE_INT || it == Value.TYPE_BOOL 13 | } 14 | } 15 | 16 | private fun extractValue(query: String, element: Element): String { 17 | return if (query.isBlank()) { 18 | element.text() 19 | } else { 20 | element.select(query).text() 21 | } 22 | } 23 | 24 | override fun format(value: Value, element: Element): JsoupParserResult = with(value) { 25 | val text = extractValue(value.selector, element) 26 | name to when (value.valueType) { 27 | Value.TYPE_BOOL -> JsonPrimitive(text.toBoolean()) 28 | Value.TYPE_INT -> JsonPrimitive(text.toInt()) 29 | else -> JsonPrimitive(text) 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- 
/skrape-chrome/src/main/kotlin/nolambda/skrape/processor/chrome/ChromeValueFormatter.kt:
--------------------------------------------------------------------------------
1 | package nolambda.skrape.processor.chrome
2 |
3 | import kotlinx.serialization.json.JsonPrimitive
4 | import nolambda.skrape.nodes.Value
5 | import nolambda.skrape.processor.formatter.ValueFormatter
6 |
7 | /**
8 |  * [ValueFormatter] used with [ChromeElement]s. It reads the text of an element
9 |  * (optionally narrowed by the value's CSS selector) and wraps it in a
10 |  * [JsonPrimitive] according to the value's declared type.
11 |  *
12 |  * @param waiter factory for the [ChromeWaiter] passed to `findElWait` on each selector lookup
13 |  */
14 | class ChromeValueFormatter(
15 |     private val waiter: () -> ChromeWaiter
16 | ) : ValueFormatter {
17 |
18 |     /** Returns true for the string, int and bool value types; any other type is not handled here. */
19 |     override fun isForType(value: Value): Boolean {
20 |         return value.valueType.let {
21 |             it == Value.TYPE_STRING || it == Value.TYPE_INT || it == Value.TYPE_BOOL
22 |         }
23 |     }
24 |
25 |     /**
26 |      * Returns the element's own text when [query] is blank, otherwise the text of
27 |      * the first element matching [query] (looked up through `findElWait`).
28 |      * `first()` throws when the selector matches nothing.
29 |      */
30 |     private fun extractValue(query: String, element: ChromeElement): String {
31 |         return if (query.isBlank()) {
32 |             element.text()
33 |         } else {
34 |             element.findElWait(waiter(), query).first().text
35 |         }
36 |     }
37 |
38 |     /**
39 |      * Extracts the text for [value] from [element] and pairs the value's name with
40 |      * the converted JSON primitive.
41 |      *
42 |      * @throws NumberFormatException for a TYPE_INT value whose text is not a valid integer
43 |      */
44 |     override fun format(value: Value, element: ChromeElement): ChromeParserResult = with(value) {
45 |         val text = extractValue(value.selector, element)
46 |         name to when (value.valueType) {
47 |             Value.TYPE_BOOL -> JsonPrimitive(text.toBoolean())
48 |             // Integers must be parsed as numbers. This branch previously called
49 |             // toBoolean(), which emitted a JSON boolean for every int value.
50 |             Value.TYPE_INT -> JsonPrimitive(text.toInt())
51 |             else -> JsonPrimitive(text)
52 |         }
53 |     }
54 | }
--------------------------------------------------------------------------------
/skrape-chrome/src/main/kotlin/nolambda/skrape/processor/chrome/ChromeElement.kt:
--------------------------------------------------------------------------------
1 | package nolambda.skrape.processor.chrome
2 |
3 | import org.openqa.selenium.By
4 | import org.openqa.selenium.WebElement
5 | import org.openqa.selenium.chrome.ChromeDriver
6 |
7 | sealed class ChromeElement {
8 |     data class Driver(val chromeDriver: ChromeDriver) : ChromeElement()
9 |     data class Component(val webElement: WebElement) : ChromeElement()
10 | }
11 |
12 | internal fun ChromeElement.findEl(selector: String): List {
13 |     return when (this) {
14 |         is ChromeElement.Driver ->
chromeDriver.findElements(By.cssSelector(selector)) 15 | is ChromeElement.Component -> webElement.findElements(By.cssSelector(selector)) 16 | } 17 | } 18 | 19 | internal fun ChromeElement.findElWait(wait: ChromeWaiter, selector: String): List { 20 | val isDisplayed = { findEl(selector).firstOrNull()?.isDisplayed ?: false } 21 | wait.until(isDisplayed) 22 | return findEl(selector) 23 | } 24 | 25 | internal fun ChromeElement.attr(attrName: String): String { 26 | return when (this) { 27 | is ChromeElement.Component -> webElement.getAttribute(attrName) 28 | is ChromeElement.Driver -> throw IllegalStateException("Only Component can have attr") 29 | } 30 | } 31 | 32 | internal fun ChromeElement.text(): String { 33 | return when (this) { 34 | is ChromeElement.Component -> webElement.text 35 | is ChromeElement.Driver -> throw IllegalStateException("Only Component can have text") 36 | } 37 | } -------------------------------------------------------------------------------- /skrape-core/src/test/kotlin/nolamda/skrape/PlaceholderTransformerSpec.kt: -------------------------------------------------------------------------------- 1 | package nolamda.skrape 2 | 3 | import io.kotlintest.matchers.shouldBe 4 | import io.kotlintest.matchers.shouldThrow 5 | import io.kotlintest.specs.StringSpec 6 | import nolambda.skrape.nodes.* 7 | import nolambda.skrape.transformer.PlaceholderTransformer 8 | import kotlin.to 9 | 10 | class PlaceholderTransformerSpec : StringSpec({ 11 | 12 | val transformer = PlaceholderTransformer(mapOf( 13 | "ngasal" to "tweet", 14 | "COBA" to "a" 15 | )) 16 | 17 | val page = Page("https://ngasal.com/{{ngasal}}") { 18 | query("td {{ngasal}}") { 19 | "place" to text() 20 | query("td {{COBA}}") { 21 | "another" to text() 22 | "place" to attr("href") 23 | } 24 | } 25 | } 26 | 27 | val resultPage = transformer.transform(page) 28 | 29 | "it should replace path placeholder" { 30 | val expectedPath = "https://ngasal.com/tweet" 31 | resultPage.pageInfo.path shouldBe 
expectedPath 32 | } 33 | 34 | "it should replace css selector placeholder" { 35 | val expectedSelector = "td tweet" 36 | 37 | val query = resultPage.children.first() as Query 38 | query.selector shouldBe expectedSelector 39 | } 40 | 41 | "it should throw if there's unfulfilled" { 42 | val failingTransformer = PlaceholderTransformer(emptyMap()) 43 | shouldThrow { 44 | failingTransformer.transform(page) 45 | } 46 | } 47 | }) -------------------------------------------------------------------------------- /skrape-core/src/main/kotlin/nolambda/skrape/nodes/NodeBuilderExt.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape.nodes 2 | 3 | /* --------------------------------------------------- */ 4 | /* > Parent */ 5 | /* --------------------------------------------------- */ 6 | 7 | fun ParentElement.query(cssSelector: String, body: ElementBody): ParentElement = 8 | Query(selector = cssSelector, body = body).apply { 9 | postCreate(this@query, this) 10 | } 11 | 12 | fun ParentElement.container(body: ElementBody): ParentElement = 13 | Container(body = body).apply { 14 | postCreate(this@container, this) 15 | } 16 | 17 | /* --------------------------------------------------- */ 18 | /* > Child */ 19 | /* --------------------------------------------------- */ 20 | 21 | fun ParentElement.attr(attrName: String): SkrapeElemenet = 22 | Attr(attrName = attrName).apply { 23 | postCreate(this@attr, this) 24 | } 25 | 26 | private fun ParentElement.createValueElement(type: ValueType, query: String): SkrapeElemenet = 27 | Value(valueType = type, selector = query).apply { 28 | postCreate(this@createValueElement, this) 29 | } 30 | 31 | fun ParentElement.text(query: String = ""): SkrapeElemenet = createValueElement(Value.TYPE_STRING, query) 32 | fun ParentElement.bool(query: String = ""): SkrapeElemenet = createValueElement(Value.TYPE_BOOL, query) 33 | fun ParentElement.int(query: String = ""): SkrapeElemenet = 
createValueElement(Value.TYPE_INT, query) 34 | 35 | internal fun postCreate(parent: ParentElement, child: SkrapeElemenet) { 36 | parent.children.add(child) 37 | } 38 | 39 | infix fun String.to(element: SkrapeElemenet): SkrapeElemenet = element.apply { name = this@to } 40 | -------------------------------------------------------------------------------- /skrape-core/src/test/kotlin/nolamda/skrape/QuerySkapeResultSpec.kt: -------------------------------------------------------------------------------- 1 | package nolamda.skrape 2 | 3 | import io.kotlintest.matchers.shouldBe 4 | import io.kotlintest.specs.StringSpec 5 | import kotlinx.serialization.decodeFromString 6 | import kotlinx.serialization.json.* 7 | import nolambda.skrape.result.QuerySkrapeResult 8 | 9 | class QuerySkapeResultSpec : StringSpec({ 10 | val result = QuerySkrapeResult( 11 | JsonArray(listOf( 12 | JsonObject(mapOf( 13 | "title" to JsonPrimitive("How to kill a dragon"), 14 | "rating" to JsonPrimitive(5) 15 | )), 16 | JsonObject(mapOf( 17 | "title" to JsonPrimitive("How to kill a meme"), 18 | "rating" to JsonPrimitive(10) 19 | )) 20 | )) 21 | ) 22 | 23 | val toJsonObj = { string: String -> Json.decodeFromString(string) } 24 | val toJsonArray = { string: String -> Json.decodeFromString(string) } 25 | 26 | "it should have the right count" { 27 | toJsonObj(result.count).get("count")?.jsonPrimitive?.int shouldBe 2 28 | } 29 | 30 | "it should have the json result" { 31 | result.json().isNotBlank() shouldBe true 32 | } 33 | 34 | "it should get the right index" { 35 | val firstIndex = result.at(1) 36 | val obj = toJsonObj.invoke(firstIndex) 37 | obj.get("title").toString().contains("meme") shouldBe true 38 | } 39 | 40 | "it should get the right item" { 41 | val memeItem = result.find("title=meme") 42 | val obj = toJsonArray.invoke(memeItem) 43 | obj.first().jsonObject["title"].toString().contains("meme") shouldBe true 44 | } 45 | }) 
-------------------------------------------------------------------------------- /skrape-jsoup/src/test/kotlin/nolambda/skrape/JsonSpec.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape 2 | 3 | import io.kotlintest.specs.StringSpec 4 | import kotlinx.serialization.json.Json 5 | import nolambda.skrape.nodes.* 6 | import nolambda.skrape.processor.jsoup.JsoupPageAdapter 7 | import nolambda.skrape.serialization.JsonPageSerializer 8 | import nolambda.skrape.utils.Queries 9 | 10 | /** 11 | * This spec is a helper for creating json from Skrape 12 | */ 13 | class JsonSpec : StringSpec({ 14 | 15 | val page = Page("https://news.ycombinator.com/") { 16 | "items" to query("td a.storylink") { 17 | "text" to text() 18 | "link" to attr("href") 19 | } 20 | } 21 | 22 | val secondPage = Page("https://kawalcovid19.id/") { 23 | "items" to query("div.css-1ll0e4o") { 24 | "title" to text(Queries.indexOfChild("span", 2)) 25 | "count" to text(Queries.indexOfChild("span", 1)) 26 | } 27 | } 28 | 29 | val thirdPage = Page("https://twitter.com") { 30 | "title" to text("td.a") 31 | } 32 | 33 | val serializer = JsonPageSerializer( 34 | Json { 35 | prettyPrint = true 36 | encodeDefaults = true 37 | } 38 | ) 39 | 40 | "generate json" { 41 | println(serializer.serialize(page)) 42 | } 43 | 44 | "generate result json" { 45 | val skrape = Skrape(JsoupPageAdapter { 46 | proxy(null) 47 | userAgent("Mozilla/5.0 (Windows; U; WindowsNT 5.1; en-US; rv1.8.1.6) Gecko/20070725 Firefox/2.0.0.6") 48 | referrer("google.com") 49 | }, false) 50 | println(skrape.request(secondPage).json()) 51 | } 52 | 53 | "generate thrid json" { 54 | println(serializer.serialize(thirdPage)) 55 | } 56 | }) -------------------------------------------------------------------------------- /skrape-sample/src/main/kotlin/nolambda/skrape/example/ParsingTest.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape.example 
2 | 3 | import nolambda.skrape.Skrape 4 | import nolambda.skrape.nodes.* 5 | import nolambda.skrape.processor.chrome.ChromePageAdapter 6 | import nolambda.skrape.processor.jsoup.JsoupPageAdapter 7 | import org.openqa.selenium.chrome.ChromeDriver 8 | import org.openqa.selenium.chrome.ChromeOptions 9 | 10 | fun main() { 11 | val page = Page("https://news.ycombinator.com/") { 12 | "athing" to query("span.score") { 13 | "score" to text() 14 | } 15 | "items" to query("td a.storylink") { 16 | "text" to text() 17 | "link" to attr("href") 18 | } 19 | } 20 | 21 | val twitterPage = Page("https://twitter.com/lynxluna") { 22 | "bio" to text("main > div > div > div > div > div > div > div > div > div:nth-child(1) > div > div:nth-child(3) > div") 23 | } 24 | println(runChromeDriverSample(twitterPage)) 25 | println(runChromeDriverSample(page)) 26 | } 27 | 28 | private fun runJsoupSample(page: Page): String { 29 | println("Run Jsoup sample…") 30 | 31 | val mobileUa = "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) " + 32 | "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Mobile Safari/537.36" 33 | 34 | val parser = JsoupPageAdapter { 35 | this.userAgent(mobileUa) 36 | } 37 | 38 | val skrape = Skrape(parser) 39 | 40 | return skrape.request(page).json() 41 | } 42 | 43 | private val parser by lazy { 44 | ChromePageAdapter { 45 | ChromeDriver(ChromeOptions().apply { 46 | addArguments("--headless") 47 | }) 48 | } 49 | } 50 | 51 | private fun runChromeDriverSample(page: Page): String { 52 | println("Run Chrome Driver sample…") 53 | 54 | val skrape = Skrape(parser) 55 | 56 | return skrape.request(page).json() 57 | } -------------------------------------------------------------------------------- /skrape-core/src/main/kotlin/nolambda/skrape/transformer/PlaceholderTransformer.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape.transformer 2 | 3 | import nolambda.skrape.nodes.* 4 | 5 | class 
PlaceholderTransformer( 6 | private val args: Map 7 | ) : PageTransformer { 8 | 9 | companion object { 10 | /** Matches ONE `{{key}}` placeholder; the key group is lazy so that several placeholders in the same string are reported as separate matches. */ private val PLACEHOLDER_PATTERN = Regex("\\{\\{(.*?)}}") 11 | } 12 | 13 | /** Evaluates [page], then returns a copy whose path has its placeholders replaced and whose children are the transformed children of the original. */ override fun transform(page: Page): Page { 14 | page.evaluate() 15 | 16 | val pageInfo = page.pageInfo 17 | return page.copy(pageInfo = pageInfo.copy(path = pageInfo.path.replacePlaceholder())).apply { 18 | setNewChildren(transformChildren(page.children)) 19 | } 20 | } 21 | 22 | /** Rewrites only Query nodes (selector placeholders replaced, nested children transformed recursively); every other node is returned as-is. */ private fun transformChildren(children: List): List { 23 | return children.map { 24 | when (it) { 25 | is Query -> it.copy(selector = it.selector.replacePlaceholder()).also { query -> 26 | query.setNewChildren(transformChildren(it.children)) 27 | } 28 | else -> it 29 | } 30 | } 31 | } 32 | 33 | /** Replaces the receiver's current children with [newChildren]. */ private fun ParentElement.setNewChildren(newChildren: List) { 34 | children.clear() 35 | children.addAll(newChildren) 36 | } 37 | 38 | /** Substitutes every `{{key}}` in the receiver with `args[key]`. Throws IllegalArgumentException when a placeholder remains after substitution, i.e. its key is not in args. */ private fun String.replacePlaceholder(): String { 39 | val results = PLACEHOLDER_PATTERN.findAll(this) 40 | if (results.none()) return this 41 | 42 | val finalResult = results.fold(this) { acc, result -> 43 | val capturedKey = result.groupValues[1] 44 | args[capturedKey]?.let { acc.replace("{{${capturedKey}}}", it) } ?: acc 45 | } 46 | 47 | // Any placeholder still present means its key had no value in args 48 | if (PLACEHOLDER_PATTERN.containsMatchIn(finalResult)) { 49 | throw IllegalArgumentException("Unfulfilled placeholder on: $this") 50 | } 51 | 52 | return finalResult 53 | } 54 | } -------------------------------------------------------------------------------- /skrape-core/src/main/kotlin/nolambda/skrape/result/QuerySkrapeResult.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape.result 2 | 3 | import kotlinx.serialization.json.* 4 | 5 | /** 6 | * This result supports two kinds of [JsonElement] 7 | * 1. JsonObject with one array as the child 8 | * { 9 | * "items": [ 10 | * "a", "b", "c" 11 | * ] 12 | * } 13 | * 2. 
JsonArray 14 | */ 15 | class QuerySkrapeResult( 16 | private val jsonElement: JsonElement 17 | ) : SimpleSkrapeResult(jsonElement) { 18 | 19 | companion object { 20 | private const val KEY_FILTER = "filter" 21 | private const val KEY_ITEMS = "items" 22 | private const val KEY_COUNT = "count" 23 | 24 | private const val QUERY_SEPARATOR = "=" 25 | } 26 | 27 | /** The array being queried: the element itself when it is a JsonArray, or its "items" member when it is a JsonObject. */ private val items: JsonArray by lazy { 28 | when (jsonElement) { 29 | is JsonArray -> jsonElement 30 | is JsonObject -> jsonElement[KEY_ITEMS] as JsonArray 31 | else -> throw IllegalStateException("The json element must be one of JsonArray or JsonObject") 32 | } 33 | } 34 | 35 | /** String form of a JSON object holding the number of items under the "count" key. */ val count by lazy { 36 | val map = mapOf( 37 | KEY_COUNT to JsonPrimitive(items.size) 38 | ) 39 | JsonObject(map).toString() 40 | } 41 | 42 | /** String form of the item at [index]. */ fun at(index: Int) = items[index].toString() 43 | 44 | /** Filters the items with a `key=value` query and returns the string form of the matching list. Primitive items match when key is "filter" and their content contains value. Object items are judged on their FIRST entry only: its key must contain key and its primitive value must contain value; empty objects never match. All comparisons ignore case. Only the first `=` separates key from value; a query without `=` fails on the destructuring below. */ fun find(query: String): String { 45 | val (key, value) = query.split(QUERY_SEPARATOR, limit = 2) 46 | val filtered = items.filter { item -> 47 | when (item) { 48 | is JsonPrimitive -> key.equals(KEY_FILTER, ignoreCase = true) && item.contentOrNull?.contains(value, ignoreCase = true) == true 49 | is JsonObject -> { 50 | val itemKey = item.keys.firstOrNull() 51 | val itemValue = itemKey?.let { item[it] } 52 | 53 | if (itemKey != null && itemValue is JsonPrimitive) { 54 | itemKey.contains(key, ignoreCase = true) && itemValue.contentOrNull?.contains(value, ignoreCase = true) == true 55 | } else false 56 | } 57 | else -> false 58 | } 59 | } 60 | return filtered.toString() 61 | } 62 | 63 | } -------------------------------------------------------------------------------- /skrape-core/src/main/kotlin/nolambda/skrape/processor/AbstractPageAdapter.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape.processor 2 | 3 | import nolambda.skrape.SkrapeLogger 4 | import nolambda.skrape.nodes.* 5 | import nolambda.skrape.processor.formatter.ValueFormatterManager 6 | import nolambda.skrape.result.SkrapeResult 7 | 8 | abstract class AbstractPageAdapter : PageAdapter { 9 | 10 | val formatterManager: ValueFormatterManager 
by lazy { ValueFormatterManager() } 11 | 12 | /** Runs the whole pipeline for [page]: onStart, request, process children, build the result. onEnd runs in a finally block so it is also called when requesting or processing throws. */ override fun adapt(page: Page): T { 13 | onStart() 14 | return try { 15 | val requested = requestPage(page) 16 | val results = internalProcessPage(page, requested) 17 | onHandleResult(page, results) 18 | } finally { onEnd() } 19 | } 20 | 21 | /** Hook called before the page is requested; no-op by default. */ open fun onStart() {} 22 | /** Hook called after adapt finishes, whether it succeeded or threw; no-op by default. */ open fun onEnd() {} 23 | 24 | /** Evaluates the page, then processes each of its children against [element]. */ private fun internalProcessPage(page: Page, element: ELEMENT): List = with(page) { 25 | evaluate() 26 | processChildren(page, element) 27 | } 28 | 29 | abstract fun requestPage(page: Page): ELEMENT 30 | 31 | abstract fun onHandleResult(page: Page, results: List): T 32 | 33 | /** Maps every direct child of [page] through processElement using the same root [element]. */ private fun processChildren( 34 | page: Page, 35 | element: ELEMENT 36 | ): List = with(page) { 37 | children.map { 38 | processElement(it, element) 39 | } 40 | } 41 | 42 | abstract fun processQuery(query: Query, element: ELEMENT): R 43 | 44 | abstract fun processContainer(container: Container, element: ELEMENT): R 45 | 46 | abstract fun processAttr(attr: Attr, element: ELEMENT): R 47 | 48 | /** Value nodes are delegated to the registered formatters. */ private fun processValue(value: Value, element: ELEMENT): R = 49 | formatterManager.format(value, element) 50 | 51 | /** Logs the node and dispatches it to the handler for its type; any type other than Query, Value, Attr or Container raises IllegalStateException. */ protected fun processElement(skrapeElemenet: SkrapeElemenet, element: ELEMENT): R { 52 | SkrapeLogger.log("$skrapeElemenet") 53 | 54 | return when (skrapeElemenet) { 55 | is Query -> processQuery(skrapeElemenet, element) 56 | is Value -> processValue(skrapeElemenet, element) 57 | is Attr -> processAttr(skrapeElemenet, element) 58 | is Container -> processContainer(skrapeElemenet, element) 59 | else -> throw IllegalStateException("Skrape Element undefined") 60 | } 61 | } 62 | 63 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![OSS Skrape Banner](https://user-images.githubusercontent.com/1691440/126060801-fc14e96d-d800-4fa6-be92-eb49847eb112.gif) 2 | 3 | ## Skrape 
[![](https://jitpack.io/v/esafirm/skrape.svg)](https://jitpack.io/#esafirm/skrape) 4 | 5 | Turn your HTML into JSON with a graph-based Kotlin DSL 💪 6 | 7 | # Support Me! 8 | 9 | I would like to commit myself more to this repo and to OSS work in general. 10 | 11 | Would you help me achieve this goal? 12 | 13 | Buy Me a Coffee at ko-fi.com 14 | 15 | ## Getting Started 16 | 17 | Define your query in a type-safe Kotlin DSL 18 | 19 | ```kotlin 20 | Page("https://news.ycombinator.com/") { 21 | "items" to query("td a.storylink") { 22 | "text" to text() 23 | "detail" to container { 24 | "link" to attr("href") 25 | } 26 | } 27 | }.run { 28 | Skrape(JsoupPageAdapter()).request(this) 29 | } 30 | ``` 31 | and get a predictable JSON result 32 | 33 | ```javascript 34 | { 35 | "items": [ 36 | { 37 | "text": "SFO near miss could have triggered \u2018greatest aviation disaster in history'", 38 | "detail": { 39 | "link": "http://www.mercurynews.com/2017/07/10/exclusive-sfo-near-miss-might-have-triggered-greatest-aviation-disaster-in-history/" 40 | } 41 | }, 42 | { 43 | "text": "Taking control of all .io domains with a targeted registration", 44 | "detail": { 45 | "link": "https://thehackerblog.com/the-io-error-taking-control-of-all-io-domains-with-a-targeted-registration/" 46 | } 47 | } 48 | ] 49 | ... 50 | } 51 | ``` 52 | 53 | ## Binaries 54 | 55 | Add to your root `build.gradle` 56 | 57 | ```groovy 58 | allprojects { 59 | repositories { 60 | ... 61 | maven { url 'https://jitpack.io' } 62 | } 63 | } 64 | ``` 65 | 66 | Then add the dependency 67 | 68 | ```groovy 69 | dependencies { 70 | compile 'com.github.esafirm:skrape:x.y.z' 71 | } 72 | ``` 73 | 74 | Where `x.y.z` is the latest release (it can be found on the [GitHub release page](https://github.com/esafirm/skrape/releases) or on the badge above). 
75 | 76 | ## License 77 | 78 | [MIT](https://github.com/esafirm/skrape/blob/master/LICENSE) 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # All build dir 2 | */build/ 3 | 4 | # User Defined 5 | .idea/ 6 | 7 | # Created by https://www.gitignore.io/api/java,gradle,intellij 8 | 9 | ### Intellij ### 10 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 11 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 12 | 13 | # User-specific stuff: 14 | .idea/**/workspace.xml 15 | .idea/**/tasks.xml 16 | .idea/dictionaries 17 | 18 | # Sensitive or high-churn files: 19 | .idea/**/dataSources/ 20 | .idea/**/dataSources.ids 21 | .idea/**/dataSources.xml 22 | .idea/**/dataSources.local.xml 23 | .idea/**/sqlDataSources.xml 24 | .idea/**/dynamic.xml 25 | .idea/**/uiDesigner.xml 26 | 27 | # Gradle: 28 | .idea/**/gradle.xml 29 | .idea/**/libraries 30 | 31 | # CMake 32 | cmake-build-debug/ 33 | 34 | # Mongo Explorer plugin: 35 | .idea/**/mongoSettings.xml 36 | 37 | ## File-based project format: 38 | *.iws 39 | 40 | ## Plugin-specific files: 41 | 42 | # IntelliJ 43 | /out/ 44 | 45 | # mpeltonen/sbt-idea plugin 46 | .idea_modules/ 47 | 48 | # JIRA plugin 49 | atlassian-ide-plugin.xml 50 | 51 | # Cursive Clojure plugin 52 | .idea/replstate.xml 53 | 54 | # Crashlytics plugin (for Android Studio and IntelliJ) 55 | com_crashlytics_export_strings.xml 56 | crashlytics.properties 57 | crashlytics-build.properties 58 | fabric.properties 59 | 60 | ### Intellij Patch ### 61 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 62 | 63 | # *.iml 64 | # modules.xml 65 | # .idea/misc.xml 66 | # *.ipr 67 | 68 | # Sonarlint plugin 69 | .idea/sonarlint 70 | 71 | ### Java ### 72 | # Compiled class file 73 | *.class 74 | 75 | # 
Log file 76 | *.log 77 | 78 | # BlueJ files 79 | *.ctxt 80 | 81 | # Mobile Tools for Java (J2ME) 82 | .mtj.tmp/ 83 | 84 | # Package Files # 85 | *.jar 86 | *.war 87 | *.ear 88 | *.zip 89 | *.tar.gz 90 | *.rar 91 | 92 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 93 | hs_err_pid* 94 | 95 | ### Gradle ### 96 | .gradle 97 | /build/ 98 | 99 | # Ignore Gradle GUI config 100 | gradle-app.setting 101 | 102 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored) 103 | !gradle-wrapper.jar 104 | 105 | # Cache of project 106 | .gradletasknamecache 107 | 108 | # # Work around https://youtrack.jetbrains.com/issue/IDEA-116898 109 | # gradle/wrapper/gradle-wrapper.properties 110 | 111 | # End of https://www.gitignore.io/api/java,gradle,intellij 112 | -------------------------------------------------------------------------------- /skrape-jsoup/src/main/kotlin/nolambda/skrape/processor/jsoup/JsoupPageAdapter.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape.processor.jsoup 2 | 3 | import kotlinx.serialization.json.JsonArray 4 | import kotlinx.serialization.json.JsonElement 5 | import kotlinx.serialization.json.JsonObject 6 | import kotlinx.serialization.json.JsonPrimitive 7 | import nolambda.skrape.nodes.* 8 | import nolambda.skrape.processor.AbstractPageAdapter 9 | import nolambda.skrape.processor.formatter.addFormatter 10 | import nolambda.skrape.result.QuerySkrapeResult 11 | import nolambda.skrape.result.SkrapeResult 12 | import org.jsoup.Connection 13 | import org.jsoup.Jsoup 14 | import org.jsoup.nodes.Element 15 | import java.io.File 16 | 17 | typealias JsoupParserResult = Pair 18 | typealias JsoupConfig = Connection.() -> Unit 19 | 20 | class JsoupPageAdapter( 21 | private val config: JsoupConfig = {} 22 | ) : AbstractPageAdapter() { 23 | 24 | init { 25 | addFormatter(JsoupValueFormatter()) 26 | } 27 | 28 | override fun processQuery(query: Query, 
element: Element): JsoupParserResult = with(query) { 29 | val children = element.select(selector).map { jsoupElement -> 30 | JsonObject(children.map { 31 | processElement(it, jsoupElement) 32 | }.toMap()) 33 | } 34 | name to JsonArray(children) 35 | } 36 | 37 | override fun processContainer(container: Container, element: Element): JsoupParserResult = with(container) { 38 | val children = children.map { 39 | processElement(it, element) 40 | } 41 | name to JsonObject(children.toMap()) 42 | } 43 | 44 | override fun processAttr(attr: Attr, element: Element): JsoupParserResult = with(attr) { 45 | name to JsonPrimitive(element.attr(attrName)) 46 | } 47 | 48 | override fun requestPage(page: Page): Element { 49 | val (path, baseUrl, encoding) = page.pageInfo 50 | 51 | return if (page.isLocalFile()) { 52 | val file = File(path) 53 | Jsoup.parse(file, encoding, baseUrl) 54 | } else { 55 | Jsoup.connect(path) 56 | .apply(config) 57 | .get() 58 | } 59 | } 60 | 61 | override fun onHandleResult(page: Page, results: List): SkrapeResult { 62 | val json: JsonElement = if (page.isUselessContainer()) { 63 | results.map { it.second }.first() 64 | } else { 65 | JsonObject(results.toMap()) 66 | } 67 | return QuerySkrapeResult(json) 68 | } 69 | } -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | set DIRNAME=%~dp0 12 | if "%DIRNAME%" == "" set DIRNAME=. 13 | set APP_BASE_NAME=%~n0 14 | set APP_HOME=%DIRNAME% 15 | 16 | @rem Add default JVM options here. 
You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 17 | set DEFAULT_JVM_OPTS= 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windows variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | 53 | :win9xME_args 54 | @rem Slurp the command line arguments. 55 | set CMD_LINE_ARGS= 56 | set _SKIP=2 57 | 58 | :win9xME_args_slurp 59 | if "x%~1" == "x" goto execute 60 | 61 | set CMD_LINE_ARGS=%* 62 | 63 | :execute 64 | @rem Setup the command line 65 | 66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 67 | 68 | @rem Execute Gradle 69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 70 | 71 | :end 72 | @rem End local scope for the variables with windows NT shell 73 | if "%ERRORLEVEL%"=="0" goto mainEnd 74 | 75 | :fail 76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 77 | rem the _cmd.exe /c_ return code! 
78 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 79 | exit /b 1 80 | 81 | :mainEnd 82 | if "%OS%"=="Windows_NT" endlocal 83 | 84 | :omega 85 | -------------------------------------------------------------------------------- /skrape-jsoup/src/test/kotlin/nolambda/skrape/SkrapeJsoupSpec.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape 2 | 3 | import io.kotlintest.matchers.beGreaterThan 4 | import io.kotlintest.matchers.shouldBe 5 | import io.kotlintest.matchers.shouldNotBe 6 | import io.kotlintest.specs.StringSpec 7 | import kotlinx.serialization.decodeFromString 8 | import kotlinx.serialization.json.Json 9 | import kotlinx.serialization.json.JsonArray 10 | import nolambda.skrape.nodes.* 11 | import nolambda.skrape.processor.jsoup.JsoupPageAdapter 12 | import nolambda.skrape.result.SkrapeResult 13 | import java.io.File 14 | 15 | typealias SimpleSkrape = Skrape 16 | 17 | class SkrapeJsoupSpec : StringSpec() { 18 | init { 19 | val skrape = Skrape(JsoupPageAdapter(), enableLog = true) 20 | 21 | "it parsing from local file" { 22 | val result = requestWithFile(skrape, ::createFirstPage) 23 | val json = Json { ignoreUnknownKeys = true } 24 | val response = json.decodeFromString(result) 25 | 26 | result shouldNotBe null 27 | response shouldNotBe null 28 | response.stories[0] shouldNotBe null 29 | } 30 | 31 | "it parsing from url" { 32 | requestWithUrl(skrape) shouldNotBe null 33 | } 34 | 35 | "it support un-named query" { 36 | val result = requestWithFile(skrape, ::createSecondPagee) 37 | val array = Json.decodeFromString(result) 38 | 39 | array.size shouldBe beGreaterThan(1) 40 | } 41 | } 42 | } 43 | 44 | fun createFirstPage(file: File): Page { 45 | return Page(file) { 46 | "items" to query("td a.storylink") { 47 | "text" to text() 48 | "detail" to container { 49 | "link" to attr("href") 50 | } 51 | } 52 | } 53 | } 54 | 55 | fun createSecondPagee(file: File): Page { 56 | return Page(file) { 57 | query("td 
a.storylink") { 58 | "text" to text() 59 | } 60 | } 61 | } 62 | 63 | fun requestWithFile(skrape: SimpleSkrape, pageCreator: (File) -> Page): String { 64 | val classLoader = ClassLoader.getSystemClassLoader() 65 | val file = File(classLoader.getResource("index.html").file) 66 | 67 | return pageCreator(file).run { 68 | skrape.request(this).json() 69 | } 70 | } 71 | 72 | 73 | fun requestWithUrl(skrape: SimpleSkrape): String { 74 | return Page("https://news.ycombinator.com/") { 75 | "athing" to query("span.score") { 76 | "score" to text() 77 | } 78 | "items" to query("td a.storylink") { 79 | "text" to text() 80 | "link" to attr("href") 81 | } 82 | }.run { 83 | skrape.request(this) 84 | }.json() 85 | } -------------------------------------------------------------------------------- /skrape-core/src/main/kotlin/nolambda/skrape/nodes/Node.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape.nodes 2 | 3 | import kotlinx.serialization.SerialName 4 | import kotlinx.serialization.Serializable 5 | import kotlinx.serialization.Transient 6 | import java.io.File 7 | 8 | object ElementName { 9 | const val ELEMENT_PAGE = "page" 10 | const val ELEMENT_QUERY = "query" 11 | const val ELEMENT_CONTAINER = "container" 12 | const val ELEMENT_ATTR = "attr" 13 | const val ELEMENT_VALUE = "value" 14 | } 15 | 16 | interface Node { 17 | var name: String 18 | } 19 | 20 | @Serializable 21 | sealed class SkrapeElemenet : Node 22 | 23 | @Serializable 24 | sealed class ParentElement : SkrapeElemenet() { 25 | abstract val body: ElementBody 26 | val children = arrayListOf() 27 | } 28 | 29 | typealias ElementBody = ParentElement.() -> Unit 30 | 31 | /* --------------------------------------------------- */ 32 | /* > Parent Elements */ 33 | /* --------------------------------------------------- */ 34 | 35 | @Serializable 36 | @SerialName(ElementName.ELEMENT_PAGE) 37 | data class Page( 38 | val pageInfo: PageInfo, 39 | override var name: String 
= "", 40 | @Transient override val body: ElementBody = {} 41 | ) : ParentElement() { 42 | 43 | constructor(path: String, baseUrl: String = "", body: ElementBody) : this(PageInfo(path, baseUrl), body = body) 44 | constructor(file: File, baseUrl: String = "", body: ElementBody) : this(file.path, baseUrl, body) 45 | } 46 | 47 | @Serializable 48 | @SerialName(ElementName.ELEMENT_QUERY) 49 | data class Query( 50 | val selector: String, 51 | override var name: String = "", 52 | @Transient override val body: ElementBody = {} 53 | ) : ParentElement() 54 | 55 | @Serializable 56 | @SerialName(ElementName.ELEMENT_CONTAINER) 57 | data class Container( 58 | override var name: String = "", 59 | @Transient override val body: ElementBody = {} 60 | ) : ParentElement() 61 | 62 | /* --------------------------------------------------- */ 63 | /* > Child Elements */ 64 | /* --------------------------------------------------- */ 65 | 66 | /** 67 | * Attr fetch the [attrName] on the element 68 | */ 69 | @Serializable 70 | @SerialName(ElementName.ELEMENT_ATTR) 71 | data class Attr( 72 | override var name: String = "", 73 | val attrName: String 74 | ) : SkrapeElemenet() 75 | 76 | typealias ValueType = String 77 | 78 | /** 79 | * Value should process [query] inside the parent element if exist 80 | * if not, it will fetch text from the parent element instead 81 | * after that it will convert the data to expected type 82 | */ 83 | @Serializable 84 | @SerialName(ElementName.ELEMENT_VALUE) 85 | data class Value( 86 | override var name: String = "", 87 | val valueType: ValueType = TYPE_STRING, 88 | val selector: String = "" 89 | ) : SkrapeElemenet() { 90 | 91 | companion object { 92 | const val TYPE_STRING: ValueType = "string" 93 | const val TYPE_BOOL: ValueType = "bool" 94 | const val TYPE_INT: ValueType = "int" 95 | } 96 | } 97 | -------------------------------------------------------------------------------- 
/skrape-chrome/src/main/kotlin/nolambda/skrape/processor/chrome/ChromePageAdapter.kt: -------------------------------------------------------------------------------- 1 | package nolambda.skrape.processor.chrome 2 | 3 | import kotlinx.serialization.json.JsonArray 4 | import kotlinx.serialization.json.JsonElement 5 | import kotlinx.serialization.json.JsonObject 6 | import kotlinx.serialization.json.JsonPrimitive 7 | import nolambda.skrape.nodes.Attr 8 | import nolambda.skrape.nodes.Container 9 | import nolambda.skrape.nodes.Page 10 | import nolambda.skrape.nodes.Query 11 | import nolambda.skrape.processor.AbstractPageAdapter 12 | import nolambda.skrape.processor.formatter.addFormatter 13 | import nolambda.skrape.result.QuerySkrapeResult 14 | import nolambda.skrape.result.SkrapeResult 15 | import org.openqa.selenium.chrome.ChromeDriver 16 | import org.openqa.selenium.support.ui.WebDriverWait 17 | 18 | typealias ChromeParserResult = Pair 19 | 20 | class ChromePageAdapter( 21 | private val waitTimeInSecond: Long = DEFAULT_WAIT_TIME, 22 | private val driverFactory: () -> ChromeDriver = { ChromeDriver() } 23 | ) : AbstractPageAdapter() { 24 | 25 | companion object { 26 | private const val DEFAULT_WAIT_TIME = 3L 27 | const val NO_WAIT_TIME = 0L 28 | } 29 | 30 | private var driver: ChromeDriver? = null 31 | private var waiter: ChromeWaiter? = null 32 | 33 | init { 34 | addFormatter(ChromeValueFormatter { waiter!! }) 35 | } 36 | 37 | private fun createWaiter(): ChromeWaiter { 38 | return if (waitTimeInSecond == NO_WAIT_TIME) { 39 | NoWait 40 | } else { 41 | WebChromeWaiter(WebDriverWait(driver, waitTimeInSecond)) 42 | } 43 | } 44 | 45 | private fun getDriver(): ChromeDriver { 46 | if (driver == null) { 47 | driver = driverFactory() 48 | } 49 | if (waiter == null) { 50 | waiter = createWaiter() 51 | } 52 | return driver!! 
53 | } 54 | 55 | override fun requestPage(page: Page): ChromeElement { 56 | val currentDriver = getDriver().apply { 57 | get(page.pageInfo.path) 58 | } 59 | return ChromeElement.Driver(currentDriver) 60 | } 61 | 62 | override fun onHandleResult(page: Page, results: List): SkrapeResult { 63 | return QuerySkrapeResult(JsonObject(results.toMap())) 64 | } 65 | 66 | override fun processQuery(query: Query, element: ChromeElement): ChromeParserResult = with(query) { 67 | val children = element.findElWait(checkNotNull(waiter), selector).map { webEl -> 68 | JsonObject(children.map { 69 | processElement(it, ChromeElement.Component(webEl)) 70 | }.toMap()) 71 | } 72 | name to JsonArray(children) 73 | } 74 | 75 | override fun processContainer(container: Container, element: ChromeElement): ChromeParserResult = with(container) { 76 | val children = children.map { 77 | processElement(it, element) 78 | } 79 | name to JsonObject(children.toMap()) 80 | } 81 | 82 | override fun processAttr(attr: Attr, element: ChromeElement): ChromeParserResult = with(attr) { 83 | name to JsonPrimitive(element.attr(attrName)) 84 | } 85 | 86 | override fun onEnd() { 87 | driver?.quit() 88 | driver = null 89 | waiter = null 90 | } 91 | } -------------------------------------------------------------------------------- /skrape-core/src/main/kotlin/nolambda/skrape/serialization/JsonPageSerializer.kt: --------------------------------------------------------------------------------
package nolambda.skrape.serialization

import kotlinx.serialization.decodeFromString
import kotlinx.serialization.encodeToString
import kotlinx.serialization.json.*
import nolambda.skrape.nodes.*

/**
 * [PageSerializer] that reads and writes a [Page] tree as JSON text.
 *
 * Serialization delegates entirely to kotlinx.serialization. Deserialization walks the
 * JSON tree by hand: every element's [PageSerializer.KEY_TYPE] is mapped to a node class,
 * parent nodes ([Query], [Container]) are rebuilt from their selector/name/children
 * fields, and every other node is decoded by the [json] instance.
 *
 * Malformed input is reported with an [IllegalArgumentException] that names the missing
 * or mistyped key, instead of surfacing as a bare NullPointerException/ClassCastException.
 *
 * @param json the [Json] instance used for encoding and decoding; the default encodes
 * default values and ignores unknown keys.
 */
class JsonPageSerializer(
    private val json: Json = Json {
        encodeDefaults = true
        ignoreUnknownKeys = true
    }
) : PageSerializer {

    /** Evaluates [page] and encodes the result to a JSON string. */
    override fun serialize(page: Page): String {
        return json.encodeToString(page.evaluate())
    }

    /**
     * Rebuilds a [Page] from the JSON text in [target].
     *
     * @throws IllegalArgumentException if a required key is missing, the children entry
     * is not an array, or an element declares an unknown type.
     */
    override fun deserialize(target: String): Page {
        val jsonObject = json.decodeFromString<JsonObject>(target)
        // NOTE(review): PageInfo is inferred from the `pageInfo` parameter of Page — confirm.
        val pageInfo = json.decodeFromJsonElement<PageInfo>(
            jsonObject.requireField(PageSerializer.KEY_PAGE_INFO)
        )

        return Page(pageInfo = pageInfo, name = jsonObject.getName()) {
            children.addAll(createChildrenFromJsonArray(jsonObject.getChildArray()))
        }.evaluate()
    }

    /** Maps the serialized type tag to the node class it represents. */
    private fun mapTypeToClass(type: String) = when (type) {
        ElementName.ELEMENT_QUERY -> Query::class.java
        ElementName.ELEMENT_CONTAINER -> Container::class.java
        ElementName.ELEMENT_ATTR -> Attr::class.java
        ElementName.ELEMENT_VALUE -> Value::class.java
        else -> throw IllegalArgumentException("Not a valid page JSON! Unknown element type: $type")
    }

    /**
     * Builds a single node from [element]: parent types are constructed by
     * [createParentElement], all other types are decoded by [json].
     */
    private fun createElementFromJson(element: JsonElement): SkrapeElemenet {
        val type = element.jsonObject.requireField(PageSerializer.KEY_TYPE).jsonPrimitive.content
        val clazz = mapTypeToClass(type)
        return if (ParentElement::class.java.isAssignableFrom(clazz)) {
            createParentElement(clazz, element)
        } else {
            json.decodeFromJsonElement(element)
        }
    }

    /** Constructs a [Query] or [Container] from [content], including its nested children. */
    private fun createParentElement(clazz: Class<*>, content: JsonElement): SkrapeElemenet {
        return when (clazz) {
            Query::class.java -> Query(
                content.jsonObject.requireField(PageSerializer.KEY_SELECTOR).jsonPrimitive.content,
                content.getName(),
                content.createBody()
            )
            Container::class.java -> Container(
                content.getName(),
                content.createBody()
            )
            else -> throw IllegalArgumentException("Not a valid parent class!")
        }
    }

    /** Converts every entry of [children] into a node, preserving order. */
    private fun createChildrenFromJsonArray(children: JsonArray): List<SkrapeElemenet> {
        return children.map { child: JsonElement ->
            createElementFromJson(child)
        }
    }

    /** Returns the value stored under [key], or fails with a message naming the key. */
    private fun JsonObject.requireField(key: String): JsonElement {
        return this[key]
            ?: throw IllegalArgumentException("Not a valid page JSON! Missing \"$key\" in: $this")
    }

    /** Returns the children array of this object; fails if it is absent or not an array. */
    private fun JsonElement.getChildArray(): JsonArray {
        return when (this) {
            is JsonObject -> this.requireField(PageSerializer.KEY_PAGE_CHILDREN) as? JsonArray
                ?: throw IllegalArgumentException(
                    "Not a valid page JSON! \"${PageSerializer.KEY_PAGE_CHILDREN}\" is not an array in: $this"
                )
            else -> throw IllegalStateException("Element is not an object: $this")
        }
    }

    /** Returns the name field of this object as a string. */
    private fun JsonElement.getName() = when (this) {
        is JsonObject -> this.requireField(PageSerializer.KEY_NAME).jsonPrimitive.content
        else -> throw IllegalStateException("Element is not an object: $this")
    }

    /**
     * Returns an [ElementBody] that, when invoked, adds this element's deserialized
     * children to the receiver's `children` collection.
     */
    private fun JsonElement.createBody(): ElementBody {
        // Captured under a distinct name so it does not shadow the class-level `json`.
        val source = this
        return { children.addAll(createChildrenFromJsonArray(source.getChildArray())) }
    }
}
-------------------------------------------------------------------------------- /skrape-core/src/test/kotlin/nolamda/skrape/JsonPageSeriliazerSpec.kt: -------------------------------------------------------------------------------- 1 | package nolamda.skrape 2 | 3 | import io.kotlintest.matchers.shouldBe 4 | import io.kotlintest.specs.StringSpec 5 | import kotlinx.serialization.json.Json 6 | import nolambda.skrape.nodes.* 7 | import nolambda.skrape.serialization.JsonPageSerializer 8 | 9 | class JsonPageSeriliazerSpec : StringSpec({ 10 | 11 | val page = Page("https://news.ycombinator.com/") { 12 | "athing" to query("span.score") { 13 | "score" to text() 14 | "info" to container { 15 | "coolness" to attr("alt") 16 | } 17 | } 18 | "items" to query("td a.storylink") { 19 | "text" to text() 20 | "link" to attr("href") 21 | } 22 | } 23 | 24 | val pageString = """ 25 | { 26 | "type": "page", 27 | "pageInfo": { 28 | "path": "https://news.ycombinator.com/", 29 | "baseUrl": "", 30 | "encoding": "UTF-8" 31 | }, 32 | "name": "", 33 | "children": [ 34 | { 35 | "type": "query", 36 | "selector": "span.score", 37 | "name": "athing", 38 | "children": [ 39 | { 40 | "type": "value", 41 | "name": "score", 42 | "valueType": "string", 43 | "selector": "" 44 | }, 45 | { 46 | "type": "container", 47 | "name": "info", 48 | "children": [ 49 | { 50 | "type": "attr", 51 | "name": "coolness", 52 | "attrName": "alt" 53 | } 54 | ] 55 | } 56 | ] 57 | }, 58 | { 59 | "type": "query", 60 | "selector": "td a.storylink", 61 | "name": "items", 62 | "children": [ 63 | { 64 | "type": "value", 65 | "name": "text", 66 | 
"valueType": "string", 67 | "selector": "" 68 | }, 69 | { 70 | "type": "attr", 71 | "name": "link", 72 | "attrName": "href" 73 | } 74 | ] 75 | } 76 | ] 77 | } 78 | """.trimIndent() 79 | 80 | val serializer = JsonPageSerializer(Json { 81 | prettyPrint = true 82 | encodeDefaults = true 83 | }) 84 | 85 | "it serialize to string" { 86 | val result = serializer.serialize(page) 87 | result.contains(Regex("ycombinator")) shouldBe true 88 | } 89 | 90 | "it deserialize to page" { 91 | val result = serializer.deserialize(pageString) 92 | 93 | result.children.size shouldBe 2 94 | result.pageInfo.path shouldBe "https://news.ycombinator.com/" 95 | 96 | (result.children.first() as ParentElement).children.size shouldBe 2 97 | 98 | val value = (result.children.first() as ParentElement).children.first() as Value 99 | value.valueType shouldBe Value.TYPE_STRING 100 | 101 | val secondResult = serializer.deserialize(serializer.serialize(result)) 102 | 103 | secondResult.children.size shouldBe 2 104 | secondResult.pageInfo.path shouldBe "https://news.ycombinator.com/" 105 | } 106 | }) -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | ############################################################################## 4 | ## 5 | ## Gradle start up script for UN*X 6 | ## 7 | ############################################################################## 8 | 9 | # Attempt to set APP_HOME 10 | # Resolve links: $0 may be a link 11 | PRG="$0" 12 | # Need this for relative symlinks. 
13 | while [ -h "$PRG" ] ; do 14 | ls=`ls -ld "$PRG"` 15 | link=`expr "$ls" : '.*-> \(.*\)$'` 16 | if expr "$link" : '/.*' > /dev/null; then 17 | PRG="$link" 18 | else 19 | PRG=`dirname "$PRG"`"/$link" 20 | fi 21 | done 22 | SAVED="`pwd`" 23 | cd "`dirname \"$PRG\"`/" >/dev/null 24 | APP_HOME="`pwd -P`" 25 | cd "$SAVED" >/dev/null 26 | 27 | APP_NAME="Gradle" 28 | APP_BASE_NAME=`basename "$0"` 29 | 30 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 31 | DEFAULT_JVM_OPTS="" 32 | 33 | # Use the maximum available, or set MAX_FD != -1 to use that value. 34 | MAX_FD="maximum" 35 | 36 | warn ( ) { 37 | echo "$*" 38 | } 39 | 40 | die ( ) { 41 | echo 42 | echo "$*" 43 | echo 44 | exit 1 45 | } 46 | 47 | # OS specific support (must be 'true' or 'false'). 48 | cygwin=false 49 | msys=false 50 | darwin=false 51 | nonstop=false 52 | case "`uname`" in 53 | CYGWIN* ) 54 | cygwin=true 55 | ;; 56 | Darwin* ) 57 | darwin=true 58 | ;; 59 | MINGW* ) 60 | msys=true 61 | ;; 62 | NONSTOP* ) 63 | nonstop=true 64 | ;; 65 | esac 66 | 67 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 68 | 69 | # Determine the Java command to use to start the JVM. 70 | if [ -n "$JAVA_HOME" ] ; then 71 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 72 | # IBM's JDK on AIX uses strange locations for the executables 73 | JAVACMD="$JAVA_HOME/jre/sh/java" 74 | else 75 | JAVACMD="$JAVA_HOME/bin/java" 76 | fi 77 | if [ ! -x "$JAVACMD" ] ; then 78 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 79 | 80 | Please set the JAVA_HOME variable in your environment to match the 81 | location of your Java installation." 82 | fi 83 | else 84 | JAVACMD="java" 85 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 86 | 87 | Please set the JAVA_HOME variable in your environment to match the 88 | location of your Java installation." 
89 | fi 90 | 91 | # Increase the maximum file descriptors if we can. 92 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then 93 | MAX_FD_LIMIT=`ulimit -H -n` 94 | if [ $? -eq 0 ] ; then 95 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 96 | MAX_FD="$MAX_FD_LIMIT" 97 | fi 98 | ulimit -n $MAX_FD 99 | if [ $? -ne 0 ] ; then 100 | warn "Could not set maximum file descriptor limit: $MAX_FD" 101 | fi 102 | else 103 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 104 | fi 105 | fi 106 | 107 | # For Darwin, add options to specify how the application appears in the dock 108 | if $darwin; then 109 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 110 | fi 111 | 112 | # For Cygwin, switch paths to Windows format before running java 113 | if $cygwin ; then 114 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 115 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 116 | JAVACMD=`cygpath --unix "$JAVACMD"` 117 | 118 | # We build the pattern for arguments to be converted via cygpath 119 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 120 | SEP="" 121 | for dir in $ROOTDIRSRAW ; do 122 | ROOTDIRS="$ROOTDIRS$SEP$dir" 123 | SEP="|" 124 | done 125 | OURCYGPATTERN="(^($ROOTDIRS))" 126 | # Add a user-defined pattern to the cygpath arguments 127 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 128 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 129 | fi 130 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 131 | i=0 132 | for arg in "$@" ; do 133 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 134 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 135 | 136 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 137 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 138 | else 139 | eval `echo args$i`="\"$arg\"" 140 | fi 141 | i=$((i+1)) 142 | done 143 | case $i in 144 | (0) set -- ;; 145 | (1) 
set -- "$args0" ;; 146 | (2) set -- "$args0" "$args1" ;; 147 | (3) set -- "$args0" "$args1" "$args2" ;; 148 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;; 149 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 150 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 151 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 152 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 153 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 154 | esac 155 | fi 156 | 157 | # Escape application args 158 | save ( ) { 159 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done 160 | echo " " 161 | } 162 | APP_ARGS=$(save "$@") 163 | 164 | # Collect all arguments for the java command, following the shell quoting and substitution rules 165 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" 166 | 167 | # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong 168 | if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then 169 | cd "$(dirname "$0")" 170 | fi 171 | 172 | exec "$JAVACMD" "$@" 173 | -------------------------------------------------------------------------------- /skrape-jsoup/src/test/resources/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Hacker News
6 | 12 | 136 |
7 | 11 |
Hacker News 8 | new | comments | show | ask | jobs | submit 9 | login 10 |
13 | 14 | 16 | 17 | 18 | 20 | 21 | 22 | 24 | 25 | 26 | 28 | 29 | 30 | 32 | 33 | 34 | 36 | 37 | 38 | 40 | 41 | 42 | 44 | 45 | 46 | 48 | 49 | 50 | 52 | 53 | 54 | 56 | 57 | 58 | 60 | 61 | 62 | 64 | 65 | 66 | 68 | 69 | 70 | 72 | 73 | 74 | 76 | 77 | 78 | 80 | 81 | 82 | 84 | 85 | 86 | 88 | 89 | 90 | 92 | 93 | 94 | 96 | 97 | 98 | 100 | 101 | 102 | 104 | 105 | 106 | 108 | 109 | 110 | 112 | 113 | 114 | 116 | 117 | 118 | 120 | 121 | 122 | 124 | 125 | 126 | 128 | 129 | 130 | 132 | 133 | 134 |
1. SFO near miss could have triggered ‘greatest aviation disaster in history' (mercurynews.com)
15 | 55 points by milesf 49 minutes ago | hide | 26 comments
2. Taking control of all .io domains with a targeted registration (thehackerblog.com)
19 | 1057 points by koenrh 13 hours ago | hide | 181 comments
3. Using Tesseract OCR with Python (pyimagesearch.com)
23 | 37 points by jonbaer 3 hours ago | hide | 6 comments
4. China Tells Carriers to Block Access to Personal VPNs by February (bloomberg.com)
27 | 37 points by valentinebm 3 hours ago | hide | 19 comments
5. Joe Hruska, founder and CEO of RescueTime (YC W08), has died (rescuetime.com)
31 | 344 points by robby1066 11 hours ago | hide | 35 comments
6. How To Go Viral By Using Fake Reddit Likes (hack-pr.com)
35 | 198 points by scribu 6 hours ago | hide | 69 comments
7. Snap falls to IPO price (usatoday.com)
39 | 226 points by prostoalex 8 hours ago | hide | 190 comments
8. Elon Musk confirms that he just bought back X.com, the domain he owned in 1999 (techcrunch.com)
43 | 100 points by janober 3 hours ago | hide | 46 comments
9. MIOpen: AMD's Machine Intelligence Library (github.com)
47 | 30 points by jonbaer 3 hours ago | hide | 2 comments
10. How Rust is tested (brson.github.io)
51 | 227 points by brson 12 hours ago | hide | 67 comments
11. Bloom Filters by Example (llimllib.github.io)
55 | 96 points by gvenzl 7 hours ago | hide | 13 comments
12. Certigrad: bug-free machine learning on stochastic computation graphs (github.com)
59 | 74 points by kg9000 8 hours ago | hide | 16 comments
13. Memory Usage Inside the CLR (mattwarren.org)
63 | 59 points by matthewwarren 7 hours ago | hide | 6 comments
14. Show HN: Seashells – Pipe output from CLI apps to the web in real time (seashells.io)
67 | 276 points by anishathalye 13 hours ago | hide | 54 comments
15. Evolution of the Lego Logo (logodesignlove.com)
71 | 58 points by NaOH 7 hours ago | hide | 11 comments
16. Iterators and Streams in Rust and Haskell (fpcomplete.com)
75 | 114 points by psibi 10 hours ago | hide | 17 comments
17. Unikernels are secure (unikernel.org)
79 | 167 points by ingve 13 hours ago | hide | 109 comments
18. Automated Verification of a Type-Safe Operating System [pdf] (microsoft.com)
83 | 107 points by muraiki 12 hours ago | hide | 20 comments
19. Build a Serverless Web Applicaion (amazon.com)
87 | 49 points by munns 3 hours ago | hide | 23 comments
20. Sexual Harassment in Silicon Valley (elaineou.com)
91 | 50 points by anthuswilliams 8 hours ago | hide | 28 comments
21. OpenBSD and the modern laptop (bsdly.blogspot.com)
95 | 33 points by protomyth 7 hours ago | hide | 2 comments
22. Project Everest: Efficient, verified components for the HTTPS ecosystem (project-everest.github.io)
99 | 60 points by EvgeniyZh 11 hours ago | hide | 3 comments
23. FarmLogs (YC W12) is looking for someone to lead our Android/iOS teams (farmlogs.com)
103 | 2 hours ago | hide
24. Semantic Segmentation Using Fully Convolutional Networks Over the Years (meetshah1995.github.io)
107 | 63 points by stared 11 hours ago | hide | 2 comments
25. Higher Levels of CO2 May Diminish Decision Making Performance (2013) [pdf] (lbl.gov)
111 | 55 points by bryanrasmussen 11 hours ago | hide | 15 comments
26. Librarian: Get links to references and Bibtex for papers on arXiv (fermatslibrary.com)
115 | 59 points by mgdo 12 hours ago | hide | 15 comments
27. Return to abort() – Using code introspection to prevent stack-smashing (github.com)
119 | 37 points by cjd 10 hours ago | hide | 16 comments
28. Urban Climates (thebritishgeographer.weebly.com)
123 | 34 points by raattgift 9 hours ago | hide | 4 comments
29. The Berkeley Revolution: A digital archive (berkeley.edu)
127 | 26 points by smacktoward 8 hours ago | hide | 7 comments
30. Launch HN: Guilded (YC S17) – Power-Ups for Gaming Teams (guilded.gg)
131 | 65 points by iEchoic 12 hours ago | hide | 46 comments
More
135 |

Guidelines 137 | | FAQ 138 | | Support 139 | | API 140 | | Security 141 | | Lists 142 | | Bookmarklet 143 | | DMCA 144 | | Apply to YC 145 | | Contact

Search: 146 |
147 |
148 | --------------------------------------------------------------------------------