├── .gitignore ├── LICENSE ├── README.md ├── cdp ├── README.md ├── pom.xml ├── resources │ └── org │ │ └── netpreserve │ │ └── warcaroo │ │ └── cdp │ │ └── forceload.js ├── src │ └── org │ │ └── netpreserve │ │ └── warcaroo │ │ ├── cdp │ │ ├── BrowserProcess.java │ │ ├── IdleMonitor.java │ │ ├── NavigationException.java │ │ ├── NavigationFailedException.java │ │ ├── NavigationHandler.java │ │ ├── NavigationTimedOutException.java │ │ ├── Navigator.java │ │ ├── NetworkManager.java │ │ ├── RequestHandler.java │ │ ├── ResourceFetched.java │ │ ├── ResourceRecorder.java │ │ ├── domains │ │ │ ├── Browser.java │ │ │ ├── Emulation.java │ │ │ ├── Fetch.java │ │ │ ├── IO.java │ │ │ ├── Network.java │ │ │ ├── Page.java │ │ │ ├── Runtime.java │ │ │ └── Target.java │ │ └── protocol │ │ │ ├── CDPBase.java │ │ │ ├── CDPClient.java │ │ │ ├── CDPClosedException.java │ │ │ ├── CDPException.java │ │ │ ├── CDPSession.java │ │ │ ├── CDPTimeoutException.java │ │ │ ├── RPC.java │ │ │ └── Unwrap.java │ │ └── util │ │ ├── BareMediaType.java │ │ ├── LogUtils.java │ │ └── Url.java ├── test-resources │ └── simplelogger.properties └── test │ └── org │ └── netpreserve │ └── warcaroo │ └── cdp │ ├── NavigatorTest.java │ ├── ResourceRecorderInterceptorTest.java │ └── ResourceRecorderTest.java ├── crawler ├── config.yaml ├── pom.xml ├── resources │ ├── META-INF │ │ └── resources │ │ │ ├── frontier.html │ │ │ ├── hosts.html │ │ │ ├── main.html │ │ │ ├── pages.html │ │ │ ├── redoc.html │ │ │ ├── resources.html │ │ │ ├── scalar.html │ │ │ ├── settings.html │ │ │ ├── tables.js │ │ │ ├── warcaroo.css │ │ │ └── warcaroo.svg │ ├── logback.xml │ └── org │ │ └── netpreserve │ │ └── warcaroo │ │ ├── config │ │ ├── defaults.yaml │ │ └── example.yaml │ │ └── schema.sql ├── src │ └── org │ │ └── netpreserve │ │ └── warcaroo │ │ ├── BrowserManager.java │ │ ├── CrawlLimitException.java │ │ ├── Database.java │ │ ├── Domain.java │ │ ├── Frontier.java │ │ ├── FrontierUrl.java │ │ ├── Host.java │ │ ├── Job.java │ │ ├── Page.java │ │ ├── Progress.java │ │ ├── ProgressTracker.java │ │ ├── Replay.java │ │ ├── Resource.java │ │ ├── RobotsTxt.java │ │ ├── RobotsTxtChecker.java │ │ ├── Scope.java │ │ ├── Storage.java │ │ ├── UrlMatcher.java │ │ ├── Visit.java │ │ ├── WarcRotator.java │ │ ├── Warcaroo.java │ │ ├── WarcarooException.java │ │ ├── Worker.java │ │ ├── config │ │ ├── BrowserConfig.java │ │ ├── CrawlConfig.java │ │ ├── JobConfig.java │ │ ├── LimitsConfig.java │ │ ├── LocalLimitsConfig.java │ │ ├── ResourcesConfig.java │ │ ├── ScopeConfig.java │ │ ├── ScopeType.java │ │ ├── SeedConfig.java │ │ ├── SheetConfig.java │ │ └── StorageConfig.java │ │ ├── db │ │ ├── DomainDAO.java │ │ ├── FrontierDAO.java │ │ ├── HostDAO.java │ │ ├── PageDAO.java │ │ ├── ProgressDAO.java │ │ ├── ResourceDAO.java │ │ └── RobotsTxtDAO.java │ │ ├── robotstxt │ │ ├── Matcher.java │ │ ├── MatchingStrategy.java │ │ ├── ParseException.java │ │ ├── ParseHandler.java │ │ ├── Parser.java │ │ ├── RobotsContents.java │ │ ├── RobotsLongestMatchStrategy.java │ │ ├── RobotsMatcher.java │ │ ├── RobotsParseHandler.java │ │ └── RobotsParser.java │ │ ├── util │ │ ├── LogHighlighter.java │ │ ├── MustUpdate.java │ │ ├── NamedThreadFactory.java │ │ └── jackson │ │ │ ├── ByteSizeDeserializer.java │ │ │ ├── DurationDeserializer.java │ │ │ └── ShellCommandDeserializer.java │ │ └── webapp │ │ ├── OpenAPI.java │ │ ├── Query.java │ │ ├── QueryMapper.java │ │ ├── Route.java │ │ └── Webapp.java └── test │ └── org │ └── netpreserve │ └── warcaroo │ ├── BrowserManagerTest.java │ ├── DatabaseTest.java │ ├── FrontierTest.java │ ├── HostDAOTest.java │ ├── InMemoryDatabaseTestExtension.java │ ├── PageDAOTest.java │ ├── QueryMapperTest.java │ ├── UrlMatcherTest.java │ └── config │ └── JobConfigTest.java ├── pom.xml ├── renderer ├── pom.xml └── src │ └── org │ └── netpreserve │ └── warcaroo │ ├── Index.java │ └── Renderer.java └── roo.svg /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/README.md -------------------------------------------------------------------------------- /cdp/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/README.md -------------------------------------------------------------------------------- /cdp/pom.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/pom.xml -------------------------------------------------------------------------------- /cdp/resources/org/netpreserve/warcaroo/cdp/forceload.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/resources/org/netpreserve/warcaroo/cdp/forceload.js -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/BrowserProcess.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/BrowserProcess.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/IdleMonitor.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/IdleMonitor.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/NavigationException.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/NavigationException.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/NavigationFailedException.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/NavigationFailedException.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/NavigationHandler.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/NavigationHandler.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/NavigationTimedOutException.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/NavigationTimedOutException.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/Navigator.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/Navigator.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/NetworkManager.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/NetworkManager.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/RequestHandler.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/RequestHandler.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/ResourceFetched.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/ResourceFetched.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/ResourceRecorder.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/ResourceRecorder.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/domains/Browser.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/domains/Browser.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/domains/Emulation.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/domains/Emulation.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/domains/Fetch.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/domains/Fetch.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/domains/IO.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/domains/IO.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/domains/Network.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/domains/Network.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/domains/Page.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/domains/Page.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/domains/Runtime.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/domains/Runtime.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/domains/Target.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/domains/Target.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/protocol/CDPBase.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/protocol/CDPBase.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/protocol/CDPClient.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/protocol/CDPClient.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/protocol/CDPClosedException.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/protocol/CDPClosedException.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/protocol/CDPException.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/protocol/CDPException.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/protocol/CDPSession.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/protocol/CDPSession.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/protocol/CDPTimeoutException.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/protocol/CDPTimeoutException.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/protocol/RPC.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/protocol/RPC.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/cdp/protocol/Unwrap.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/cdp/protocol/Unwrap.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/util/BareMediaType.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/util/BareMediaType.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/util/LogUtils.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/util/LogUtils.java -------------------------------------------------------------------------------- /cdp/src/org/netpreserve/warcaroo/util/Url.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/src/org/netpreserve/warcaroo/util/Url.java -------------------------------------------------------------------------------- /cdp/test-resources/simplelogger.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/test-resources/simplelogger.properties -------------------------------------------------------------------------------- /cdp/test/org/netpreserve/warcaroo/cdp/NavigatorTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/test/org/netpreserve/warcaroo/cdp/NavigatorTest.java -------------------------------------------------------------------------------- /cdp/test/org/netpreserve/warcaroo/cdp/ResourceRecorderInterceptorTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/test/org/netpreserve/warcaroo/cdp/ResourceRecorderInterceptorTest.java -------------------------------------------------------------------------------- /cdp/test/org/netpreserve/warcaroo/cdp/ResourceRecorderTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/cdp/test/org/netpreserve/warcaroo/cdp/ResourceRecorderTest.java -------------------------------------------------------------------------------- /crawler/config.yaml: -------------------------------------------------------------------------------- 1 | seeds: 2 | - "https://www-test.nla.gov.au/xinq/" -------------------------------------------------------------------------------- /crawler/pom.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/pom.xml -------------------------------------------------------------------------------- /crawler/resources/META-INF/resources/frontier.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/resources/META-INF/resources/frontier.html -------------------------------------------------------------------------------- /crawler/resources/META-INF/resources/hosts.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/resources/META-INF/resources/hosts.html -------------------------------------------------------------------------------- /crawler/resources/META-INF/resources/main.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/resources/META-INF/resources/main.html -------------------------------------------------------------------------------- /crawler/resources/META-INF/resources/pages.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/resources/META-INF/resources/pages.html -------------------------------------------------------------------------------- /crawler/resources/META-INF/resources/redoc.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/resources/META-INF/resources/redoc.html -------------------------------------------------------------------------------- /crawler/resources/META-INF/resources/resources.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/resources/META-INF/resources/resources.html -------------------------------------------------------------------------------- /crawler/resources/META-INF/resources/scalar.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/resources/META-INF/resources/scalar.html -------------------------------------------------------------------------------- /crawler/resources/META-INF/resources/settings.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/resources/META-INF/resources/settings.html -------------------------------------------------------------------------------- /crawler/resources/META-INF/resources/tables.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/resources/META-INF/resources/tables.js -------------------------------------------------------------------------------- /crawler/resources/META-INF/resources/warcaroo.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/resources/META-INF/resources/warcaroo.css -------------------------------------------------------------------------------- /crawler/resources/META-INF/resources/warcaroo.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/resources/META-INF/resources/warcaroo.svg -------------------------------------------------------------------------------- /crawler/resources/logback.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/resources/logback.xml -------------------------------------------------------------------------------- /crawler/resources/org/netpreserve/warcaroo/config/defaults.yaml: -------------------------------------------------------------------------------- 1 | seeds: [] 2 | scopeType: prefix 3 | crawl: 4 | userAgent: warcaroo 5 | delay: 2000 -------------------------------------------------------------------------------- /crawler/resources/org/netpreserve/warcaroo/config/example.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/resources/org/netpreserve/warcaroo/config/example.yaml -------------------------------------------------------------------------------- /crawler/resources/org/netpreserve/warcaroo/schema.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/resources/org/netpreserve/warcaroo/schema.sql -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/BrowserManager.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/BrowserManager.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/CrawlLimitException.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/CrawlLimitException.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/Database.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/Database.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/Domain.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/Domain.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/Frontier.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/Frontier.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/FrontierUrl.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/FrontierUrl.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/Host.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/Host.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/Job.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/Job.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/Page.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/Page.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/Progress.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/Progress.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/ProgressTracker.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/ProgressTracker.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/Replay.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/Replay.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/Resource.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/Resource.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/RobotsTxt.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/RobotsTxt.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/RobotsTxtChecker.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/RobotsTxtChecker.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/Scope.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/Scope.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/Storage.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/Storage.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/UrlMatcher.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/UrlMatcher.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/Visit.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/Visit.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/WarcRotator.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/WarcRotator.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/Warcaroo.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/Warcaroo.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/WarcarooException.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/WarcarooException.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/Worker.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/Worker.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/config/BrowserConfig.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/config/BrowserConfig.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/config/CrawlConfig.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/config/CrawlConfig.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/config/JobConfig.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/config/JobConfig.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/config/LimitsConfig.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/config/LimitsConfig.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/config/LocalLimitsConfig.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/config/LocalLimitsConfig.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/config/ResourcesConfig.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/config/ResourcesConfig.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/config/ScopeConfig.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/config/ScopeConfig.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/config/ScopeType.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/config/ScopeType.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/config/SeedConfig.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/config/SeedConfig.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/config/SheetConfig.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/config/SheetConfig.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/config/StorageConfig.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/config/StorageConfig.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/db/DomainDAO.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/db/DomainDAO.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/db/FrontierDAO.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/db/FrontierDAO.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/db/HostDAO.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/db/HostDAO.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/db/PageDAO.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/db/PageDAO.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/db/ProgressDAO.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/db/ProgressDAO.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/db/ResourceDAO.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/db/ResourceDAO.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/db/RobotsTxtDAO.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/db/RobotsTxtDAO.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/robotstxt/Matcher.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/robotstxt/Matcher.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/robotstxt/MatchingStrategy.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/robotstxt/MatchingStrategy.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/robotstxt/ParseException.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/robotstxt/ParseException.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/robotstxt/ParseHandler.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/robotstxt/ParseHandler.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/robotstxt/Parser.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/robotstxt/Parser.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/robotstxt/RobotsContents.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/robotstxt/RobotsContents.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/robotstxt/RobotsLongestMatchStrategy.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/robotstxt/RobotsLongestMatchStrategy.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/robotstxt/RobotsMatcher.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/robotstxt/RobotsMatcher.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/robotstxt/RobotsParseHandler.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/robotstxt/RobotsParseHandler.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/robotstxt/RobotsParser.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/robotstxt/RobotsParser.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/util/LogHighlighter.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/util/LogHighlighter.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/util/MustUpdate.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/util/MustUpdate.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/util/NamedThreadFactory.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/util/NamedThreadFactory.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/util/jackson/ByteSizeDeserializer.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/util/jackson/ByteSizeDeserializer.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/util/jackson/DurationDeserializer.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/util/jackson/DurationDeserializer.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/util/jackson/ShellCommandDeserializer.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/util/jackson/ShellCommandDeserializer.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/webapp/OpenAPI.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/webapp/OpenAPI.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/webapp/Query.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/webapp/Query.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/webapp/QueryMapper.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/webapp/QueryMapper.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/webapp/Route.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/webapp/Route.java -------------------------------------------------------------------------------- /crawler/src/org/netpreserve/warcaroo/webapp/Webapp.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/src/org/netpreserve/warcaroo/webapp/Webapp.java -------------------------------------------------------------------------------- /crawler/test/org/netpreserve/warcaroo/BrowserManagerTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/test/org/netpreserve/warcaroo/BrowserManagerTest.java -------------------------------------------------------------------------------- /crawler/test/org/netpreserve/warcaroo/DatabaseTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/test/org/netpreserve/warcaroo/DatabaseTest.java -------------------------------------------------------------------------------- /crawler/test/org/netpreserve/warcaroo/FrontierTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/test/org/netpreserve/warcaroo/FrontierTest.java -------------------------------------------------------------------------------- /crawler/test/org/netpreserve/warcaroo/HostDAOTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/test/org/netpreserve/warcaroo/HostDAOTest.java -------------------------------------------------------------------------------- /crawler/test/org/netpreserve/warcaroo/InMemoryDatabaseTestExtension.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/test/org/netpreserve/warcaroo/InMemoryDatabaseTestExtension.java -------------------------------------------------------------------------------- /crawler/test/org/netpreserve/warcaroo/PageDAOTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/test/org/netpreserve/warcaroo/PageDAOTest.java -------------------------------------------------------------------------------- /crawler/test/org/netpreserve/warcaroo/QueryMapperTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/test/org/netpreserve/warcaroo/QueryMapperTest.java -------------------------------------------------------------------------------- /crawler/test/org/netpreserve/warcaroo/UrlMatcherTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/test/org/netpreserve/warcaroo/UrlMatcherTest.java -------------------------------------------------------------------------------- /crawler/test/org/netpreserve/warcaroo/config/JobConfigTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/crawler/test/org/netpreserve/warcaroo/config/JobConfigTest.java -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/pom.xml -------------------------------------------------------------------------------- /renderer/pom.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/renderer/pom.xml -------------------------------------------------------------------------------- /renderer/src/org/netpreserve/warcaroo/Index.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/renderer/src/org/netpreserve/warcaroo/Index.java -------------------------------------------------------------------------------- /renderer/src/org/netpreserve/warcaroo/Renderer.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/renderer/src/org/netpreserve/warcaroo/Renderer.java -------------------------------------------------------------------------------- /roo.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/warcaroo/HEAD/roo.svg --------------------------------------------------------------------------------