├── .editorconfig ├── .gitignore ├── .idea ├── .gitignore ├── codeStyles │ ├── Project.xml │ └── codeStyleConfig.xml ├── compiler.xml ├── encodings.xml ├── inspectionProfiles │ └── Project_Default.xml └── misc.xml ├── .travis.yml ├── CHANGES.txt ├── LICENSE ├── README.md ├── config ├── checkstyle.xml └── intellij │ ├── gradle-template.xml │ └── reset.sh ├── crawler4j-examples ├── crawler4j-examples-base │ ├── .gitignore │ ├── build.gradle │ └── src │ │ ├── main │ │ └── resources │ │ │ └── logback.xml │ │ └── test │ │ └── java │ │ └── edu │ │ └── uci │ │ └── ics │ │ └── crawler4j │ │ └── examples │ │ ├── basic │ │ ├── BasicCrawlController.java │ │ └── BasicCrawler.java │ │ ├── imagecrawler │ │ ├── ImageCrawlController.java │ │ └── ImageCrawler.java │ │ ├── localdata │ │ ├── CrawlStat.java │ │ ├── LocalDataCollectorController.java │ │ └── LocalDataCollectorCrawler.java │ │ ├── multiple │ │ ├── BasicCrawler.java │ │ └── MultipleCrawlerController.java │ │ ├── shutdown │ │ ├── BasicCrawler.java │ │ └── ControllerWithShutdown.java │ │ └── statushandler │ │ ├── StatusHandlerCrawlController.java │ │ └── StatusHandlerCrawler.java └── crawler4j-examples-postgres │ ├── README.md │ ├── build.gradle │ └── src │ ├── main │ └── java │ │ └── edu │ │ └── uci │ │ └── ics │ │ └── crawler4j │ │ └── examples │ │ ├── SampleLauncher.java │ │ ├── crawler │ │ ├── PostgresCrawlerFactory.java │ │ └── PostgresWebCrawler.java │ │ └── db │ │ ├── PostgresDBService.java │ │ └── impl │ │ └── PostgresDBServiceImpl.java │ └── test │ ├── java │ └── edu │ │ └── uci │ │ └── ics │ │ └── crawler4j │ │ └── examples │ │ └── PgsqlTest.java │ └── resources │ ├── db │ └── migration │ │ └── V1__initial_schema.sql │ ├── docker-compose.yml │ └── logback.xml ├── crawler4j ├── build.gradle └── src │ ├── main │ └── java │ │ └── edu │ │ └── uci │ │ └── ics │ │ └── crawler4j │ │ ├── crawler │ │ ├── Configurable.java │ │ ├── CrawlConfig.java │ │ ├── CrawlController.java │ │ ├── Page.java │ │ ├── WebCrawler.java │ │ ├── authentication │ │ │ ├── AuthInfo.java │ │ │ ├── BasicAuthInfo.java │ │ │ ├── FormAuthInfo.java │ │ │ └── NtAuthInfo.java │ │ └── exceptions │ │ │ ├── ContentFetchException.java │ │ │ ├── PageBiggerThanMaxSizeException.java │ │ │ └── ParseException.java │ │ ├── fetcher │ │ ├── BasicAuthHttpRequestInterceptor.java │ │ ├── IdleConnectionMonitorThread.java │ │ ├── PageFetchResult.java │ │ ├── PageFetcher.java │ │ ├── SniPoolingHttpClientConnectionManager.java │ │ └── SniSSLConnectionSocketFactory.java │ │ ├── frontier │ │ ├── Counters.java │ │ ├── DocIDServer.java │ │ ├── Frontier.java │ │ ├── InProcessPagesDB.java │ │ ├── WebURLTupleBinding.java │ │ └── WorkQueues.java │ │ ├── parser │ │ ├── AllTagMapper.java │ │ ├── BinaryParseData.java │ │ ├── CssParseData.java │ │ ├── ExtractedUrlAnchorPair.java │ │ ├── HtmlContentHandler.java │ │ ├── HtmlParseData.java │ │ ├── HtmlParser.java │ │ ├── NotAllowedContentException.java │ │ ├── ParseData.java │ │ ├── Parser.java │ │ ├── TextParseData.java │ │ └── TikaHtmlParser.java │ │ ├── robotstxt │ │ ├── HostDirectives.java │ │ ├── PathRule.java │ │ ├── RobotstxtConfig.java │ │ ├── RobotstxtParser.java │ │ ├── RobotstxtServer.java │ │ └── UserAgentDirectives.java │ │ ├── url │ │ ├── TLDList.java │ │ ├── URLCanonicalizer.java │ │ ├── UrlResolver.java │ │ └── WebURL.java │ │ └── util │ │ ├── IO.java │ │ ├── Net.java │ │ └── Util.java │ └── test │ ├── groovy │ └── edu │ │ └── uci │ │ └── ics │ │ └── crawler4j │ │ ├── auth │ │ ├── BasicAuthTest.groovy │ │ └── FormAuthInfoTest.groovy │ │ ├── config │ │ └── CustomDnsResolverTest.groovy │ │ ├── crawler │ │ ├── CrawlerWithJSTest.groovy │ │ ├── NoFollowTest.groovy │ │ ├── NoIndexTest.groovy │ │ ├── OnRedirectedToInvalidTest.groovy │ │ ├── PageTest.groovy │ │ ├── RedirectHandlerTest.groovy │ │ ├── TimeoutTest.groovy │ │ └── WebCrawlerTest.groovy │ │ ├── parser │ │ ├── CssParseDataTest.groovy │ │ └── HtmlParserTest.groovy │ │ ├── robotstxt │ │ └── RobotstxtParserTest.groovy │ │ ├── url │ │ ├── PublicSuffixTest.groovy │ │ └── TLDListOnlineTest.groovy │ │ └── util │ │ └── NetTest.groovy │ ├── java │ └── edu │ │ └── uci │ │ └── ics │ │ └── crawler4j │ │ └── tests │ │ ├── URLCanonicalizerTest.java │ │ └── fetcher │ │ ├── PageFetcherHtmlOnly.java │ │ └── PageFetcherHtmlTest.java │ └── resources │ ├── css │ ├── absolute.css │ ├── quotes.css │ └── relative.css │ ├── html │ └── wiki.c2.com.html │ ├── logback.xml │ ├── public_suffix_list.dat │ └── robotstxt │ └── he.wikipedia.org_robots.txt ├── gradle.properties ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat └── settings.gradle /.editorconfig: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/.editorconfig -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/.gitignore -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/.idea/.gitignore -------------------------------------------------------------------------------- /.idea/codeStyles/Project.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/.idea/codeStyles/Project.xml -------------------------------------------------------------------------------- /.idea/codeStyles/codeStyleConfig.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/.idea/codeStyles/codeStyleConfig.xml -------------------------------------------------------------------------------- /.idea/compiler.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/.idea/compiler.xml -------------------------------------------------------------------------------- /.idea/encodings.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/.idea/encodings.xml -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/.idea/inspectionProfiles/Project_Default.xml -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/.idea/misc.xml -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/.travis.yml -------------------------------------------------------------------------------- /CHANGES.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/CHANGES.txt -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/README.md -------------------------------------------------------------------------------- /config/checkstyle.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/config/checkstyle.xml -------------------------------------------------------------------------------- /config/intellij/gradle-template.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/config/intellij/gradle-template.xml -------------------------------------------------------------------------------- /config/intellij/reset.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/config/intellij/reset.sh -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-base/.gitignore: -------------------------------------------------------------------------------- 1 | /frontier/ 2 | -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-base/build.gradle: -------------------------------------------------------------------------------- 1 | dependencies { 2 | compile project(':crawler4j') 3 | } 4 | -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-base/src/main/resources/logback.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j-examples/crawler4j-examples-base/src/main/resources/logback.xml -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/basic/BasicCrawlController.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/basic/BasicCrawlController.java -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/basic/BasicCrawler.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/basic/BasicCrawler.java -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/imagecrawler/ImageCrawlController.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/imagecrawler/ImageCrawlController.java -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/imagecrawler/ImageCrawler.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/imagecrawler/ImageCrawler.java -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/localdata/CrawlStat.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/localdata/CrawlStat.java -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/localdata/LocalDataCollectorController.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/localdata/LocalDataCollectorController.java -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/localdata/LocalDataCollectorCrawler.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/localdata/LocalDataCollectorCrawler.java -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/multiple/BasicCrawler.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/multiple/BasicCrawler.java -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/multiple/MultipleCrawlerController.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/multiple/MultipleCrawlerController.java -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/shutdown/BasicCrawler.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/shutdown/BasicCrawler.java -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/shutdown/ControllerWithShutdown.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/shutdown/ControllerWithShutdown.java -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/statushandler/StatusHandlerCrawlController.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/statushandler/StatusHandlerCrawlController.java -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/statushandler/StatusHandlerCrawler.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/statushandler/StatusHandlerCrawler.java -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-postgres/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j-examples/crawler4j-examples-postgres/README.md -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-postgres/build.gradle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j-examples/crawler4j-examples-postgres/build.gradle -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-postgres/src/main/java/edu/uci/ics/crawler4j/examples/SampleLauncher.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j-examples/crawler4j-examples-postgres/src/main/java/edu/uci/ics/crawler4j/examples/SampleLauncher.java -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-postgres/src/main/java/edu/uci/ics/crawler4j/examples/crawler/PostgresCrawlerFactory.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j-examples/crawler4j-examples-postgres/src/main/java/edu/uci/ics/crawler4j/examples/crawler/PostgresCrawlerFactory.java -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-postgres/src/main/java/edu/uci/ics/crawler4j/examples/crawler/PostgresWebCrawler.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j-examples/crawler4j-examples-postgres/src/main/java/edu/uci/ics/crawler4j/examples/crawler/PostgresWebCrawler.java -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-postgres/src/main/java/edu/uci/ics/crawler4j/examples/db/PostgresDBService.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j-examples/crawler4j-examples-postgres/src/main/java/edu/uci/ics/crawler4j/examples/db/PostgresDBService.java -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-postgres/src/main/java/edu/uci/ics/crawler4j/examples/db/impl/PostgresDBServiceImpl.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j-examples/crawler4j-examples-postgres/src/main/java/edu/uci/ics/crawler4j/examples/db/impl/PostgresDBServiceImpl.java -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-postgres/src/test/java/edu/uci/ics/crawler4j/examples/PgsqlTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j-examples/crawler4j-examples-postgres/src/test/java/edu/uci/ics/crawler4j/examples/PgsqlTest.java -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-postgres/src/test/resources/db/migration/V1__initial_schema.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j-examples/crawler4j-examples-postgres/src/test/resources/db/migration/V1__initial_schema.sql -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-postgres/src/test/resources/docker-compose.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j-examples/crawler4j-examples-postgres/src/test/resources/docker-compose.yml -------------------------------------------------------------------------------- /crawler4j-examples/crawler4j-examples-postgres/src/test/resources/logback.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j-examples/crawler4j-examples-postgres/src/test/resources/logback.xml -------------------------------------------------------------------------------- /crawler4j/build.gradle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/build.gradle -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/Configurable.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/Configurable.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/Page.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/Page.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/AuthInfo.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/AuthInfo.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/BasicAuthInfo.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/BasicAuthInfo.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/FormAuthInfo.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/FormAuthInfo.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/NtAuthInfo.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/authentication/NtAuthInfo.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/exceptions/ContentFetchException.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/exceptions/ContentFetchException.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/exceptions/PageBiggerThanMaxSizeException.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/exceptions/PageBiggerThanMaxSizeException.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/exceptions/ParseException.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/exceptions/ParseException.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/BasicAuthHttpRequestInterceptor.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/BasicAuthHttpRequestInterceptor.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/IdleConnectionMonitorThread.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/IdleConnectionMonitorThread.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetchResult.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetchResult.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/SniPoolingHttpClientConnectionManager.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/SniPoolingHttpClientConnectionManager.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/SniSSLConnectionSocketFactory.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/SniSSLConnectionSocketFactory.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/Counters.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/Counters.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/DocIDServer.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/DocIDServer.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/InProcessPagesDB.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/InProcessPagesDB.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/WebURLTupleBinding.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/WebURLTupleBinding.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/WorkQueues.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/WorkQueues.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/AllTagMapper.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/AllTagMapper.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/BinaryParseData.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/BinaryParseData.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/CssParseData.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/CssParseData.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/ExtractedUrlAnchorPair.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/ExtractedUrlAnchorPair.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlParseData.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlParseData.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlParser.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlParser.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/NotAllowedContentException.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/NotAllowedContentException.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/ParseData.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/ParseData.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/TextParseData.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/TextParseData.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/TikaHtmlParser.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/TikaHtmlParser.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/robotstxt/HostDirectives.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/robotstxt/HostDirectives.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/robotstxt/PathRule.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/robotstxt/PathRule.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/robotstxt/RobotstxtConfig.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/robotstxt/RobotstxtConfig.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/robotstxt/RobotstxtParser.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/robotstxt/RobotstxtParser.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/robotstxt/RobotstxtServer.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/robotstxt/RobotstxtServer.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/robotstxt/UserAgentDirectives.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/robotstxt/UserAgentDirectives.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/url/TLDList.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/url/TLDList.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/url/URLCanonicalizer.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/url/URLCanonicalizer.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/url/UrlResolver.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/url/UrlResolver.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/url/WebURL.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/url/WebURL.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/util/IO.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/util/IO.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/util/Net.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/util/Net.java -------------------------------------------------------------------------------- /crawler4j/src/main/java/edu/uci/ics/crawler4j/util/Util.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/main/java/edu/uci/ics/crawler4j/util/Util.java -------------------------------------------------------------------------------- /crawler4j/src/test/groovy/edu/uci/ics/crawler4j/auth/BasicAuthTest.groovy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/groovy/edu/uci/ics/crawler4j/auth/BasicAuthTest.groovy -------------------------------------------------------------------------------- /crawler4j/src/test/groovy/edu/uci/ics/crawler4j/auth/FormAuthInfoTest.groovy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/groovy/edu/uci/ics/crawler4j/auth/FormAuthInfoTest.groovy -------------------------------------------------------------------------------- /crawler4j/src/test/groovy/edu/uci/ics/crawler4j/config/CustomDnsResolverTest.groovy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/groovy/edu/uci/ics/crawler4j/config/CustomDnsResolverTest.groovy -------------------------------------------------------------------------------- /crawler4j/src/test/groovy/edu/uci/ics/crawler4j/crawler/CrawlerWithJSTest.groovy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/groovy/edu/uci/ics/crawler4j/crawler/CrawlerWithJSTest.groovy -------------------------------------------------------------------------------- /crawler4j/src/test/groovy/edu/uci/ics/crawler4j/crawler/NoFollowTest.groovy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/groovy/edu/uci/ics/crawler4j/crawler/NoFollowTest.groovy -------------------------------------------------------------------------------- /crawler4j/src/test/groovy/edu/uci/ics/crawler4j/crawler/NoIndexTest.groovy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/groovy/edu/uci/ics/crawler4j/crawler/NoIndexTest.groovy -------------------------------------------------------------------------------- /crawler4j/src/test/groovy/edu/uci/ics/crawler4j/crawler/OnRedirectedToInvalidTest.groovy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/groovy/edu/uci/ics/crawler4j/crawler/OnRedirectedToInvalidTest.groovy -------------------------------------------------------------------------------- /crawler4j/src/test/groovy/edu/uci/ics/crawler4j/crawler/PageTest.groovy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/groovy/edu/uci/ics/crawler4j/crawler/PageTest.groovy -------------------------------------------------------------------------------- /crawler4j/src/test/groovy/edu/uci/ics/crawler4j/crawler/RedirectHandlerTest.groovy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/groovy/edu/uci/ics/crawler4j/crawler/RedirectHandlerTest.groovy -------------------------------------------------------------------------------- /crawler4j/src/test/groovy/edu/uci/ics/crawler4j/crawler/TimeoutTest.groovy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/groovy/edu/uci/ics/crawler4j/crawler/TimeoutTest.groovy -------------------------------------------------------------------------------- /crawler4j/src/test/groovy/edu/uci/ics/crawler4j/crawler/WebCrawlerTest.groovy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/groovy/edu/uci/ics/crawler4j/crawler/WebCrawlerTest.groovy -------------------------------------------------------------------------------- /crawler4j/src/test/groovy/edu/uci/ics/crawler4j/parser/CssParseDataTest.groovy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/groovy/edu/uci/ics/crawler4j/parser/CssParseDataTest.groovy -------------------------------------------------------------------------------- /crawler4j/src/test/groovy/edu/uci/ics/crawler4j/parser/HtmlParserTest.groovy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/groovy/edu/uci/ics/crawler4j/parser/HtmlParserTest.groovy -------------------------------------------------------------------------------- /crawler4j/src/test/groovy/edu/uci/ics/crawler4j/robotstxt/RobotstxtParserTest.groovy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/groovy/edu/uci/ics/crawler4j/robotstxt/RobotstxtParserTest.groovy -------------------------------------------------------------------------------- /crawler4j/src/test/groovy/edu/uci/ics/crawler4j/url/PublicSuffixTest.groovy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/groovy/edu/uci/ics/crawler4j/url/PublicSuffixTest.groovy -------------------------------------------------------------------------------- /crawler4j/src/test/groovy/edu/uci/ics/crawler4j/url/TLDListOnlineTest.groovy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/groovy/edu/uci/ics/crawler4j/url/TLDListOnlineTest.groovy -------------------------------------------------------------------------------- /crawler4j/src/test/groovy/edu/uci/ics/crawler4j/util/NetTest.groovy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/groovy/edu/uci/ics/crawler4j/util/NetTest.groovy -------------------------------------------------------------------------------- /crawler4j/src/test/java/edu/uci/ics/crawler4j/tests/URLCanonicalizerTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/java/edu/uci/ics/crawler4j/tests/URLCanonicalizerTest.java -------------------------------------------------------------------------------- /crawler4j/src/test/java/edu/uci/ics/crawler4j/tests/fetcher/PageFetcherHtmlOnly.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/java/edu/uci/ics/crawler4j/tests/fetcher/PageFetcherHtmlOnly.java -------------------------------------------------------------------------------- /crawler4j/src/test/java/edu/uci/ics/crawler4j/tests/fetcher/PageFetcherHtmlTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/java/edu/uci/ics/crawler4j/tests/fetcher/PageFetcherHtmlTest.java -------------------------------------------------------------------------------- /crawler4j/src/test/resources/css/absolute.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/resources/css/absolute.css -------------------------------------------------------------------------------- /crawler4j/src/test/resources/css/quotes.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/resources/css/quotes.css -------------------------------------------------------------------------------- /crawler4j/src/test/resources/css/relative.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/resources/css/relative.css -------------------------------------------------------------------------------- /crawler4j/src/test/resources/html/wiki.c2.com.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/resources/html/wiki.c2.com.html -------------------------------------------------------------------------------- /crawler4j/src/test/resources/logback.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/resources/logback.xml -------------------------------------------------------------------------------- /crawler4j/src/test/resources/public_suffix_list.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/resources/public_suffix_list.dat -------------------------------------------------------------------------------- /crawler4j/src/test/resources/robotstxt/he.wikipedia.org_robots.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/crawler4j/src/test/resources/robotstxt/he.wikipedia.org_robots.txt -------------------------------------------------------------------------------- /gradle.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/gradle.properties -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/gradle/wrapper/gradle-wrapper.properties -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/gradlew -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/gradlew.bat -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yasserg/crawler4j/HEAD/settings.gradle --------------------------------------------------------------------------------