├── .gitignore
├── Examples
│   ├── HeadlessBrowserExample
│   │   └── HeadlessBrowserExample.swift
│   ├── ImageScraperExample
│   │   └── ImageScraperExample.swift
│   ├── PagingScraperExample
│   │   └── PagingScraperExample.swift
│   └── ScraperExample
│       └── ScraperExample.swift
├── LICENSE
├── Package.resolved
├── Package.swift
├── README.md
├── Sources
│   ├── ActoCrawler
│   │   ├── CrawlError.swift
│   │   ├── CrawlEvent.swift
│   │   ├── Crawler.htmlScraper.swift
│   │   ├── Crawler.swift
│   │   ├── CrawlerConfig.swift
│   │   ├── Domain.swift
│   │   ├── DomainQueueTable.swift
│   │   ├── Internal
│   │   │   ├── CrawlQueue.swift
│   │   │   ├── Environment.swift
│   │   │   ├── Logic.swift
│   │   │   └── Regex.swift
│   │   ├── NetworkSession.swift
│   │   ├── Request.swift
│   │   └── Response.swift
│   ├── ActoCrawlerPlaywright
│   │   ├── Crawler.withPlaywright.swift
│   │   ├── PlaywrightActor.swift
│   │   └── _exported.swift
│   ├── AsyncChannel
│   │   ├── AsyncChannel.swift
│   │   ├── Locking.swift
│   │   └── UnsafeResumption.swift
│   └── PythonKitAsync
│       ├── Bundle.swift
│       ├── asPyAsync.swift
│       └── pythonkit-async.py
└── Tests
    └── ActoCrawlerTests
        ├── CrawlerTests.swift
        └── ReadMeExample.swift

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Xcode
2 | #
3 | # gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore
4 | 
5 | ## User settings
6 | xcuserdata/
7 | 
8 | ## compatibility with Xcode 8 and earlier (ignoring not required starting Xcode 9)
9 | *.xcscmblueprint
10 | *.xccheckout
11 | 
12 | ## compatibility with Xcode 3 and earlier (ignoring not required starting Xcode 4)
13 | build/
14 | DerivedData/
15 | *.moved-aside
16 | *.pbxuser
17 | !default.pbxuser
18 | *.mode1v3
19 | !default.mode1v3
20 | *.mode2v3
21 | !default.mode2v3
22 | *.perspectivev3
23 | !default.perspectivev3
24 | 
25 | ## Obj-C/Swift specific
26 | *.hmap
27 | 
28 | ## App packaging
29 | *.ipa
30 | *.dSYM.zip
31 | *.dSYM
32 | 
33 | ## Playgrounds
34 | timeline.xctimeline
35 | playground.xcworkspace
36 | 
37 | # Swift Package Manager
38 | #
39 | # Add this line if you want to avoid checking in source code from Swift Package Manager dependencies.
40 | # Packages/
41 | # Package.pins
42 | # Package.resolved
43 | # *.xcodeproj
44 | #
45 | # Xcode automatically generates this directory with a .xcworkspacedata file and xcuserdata
46 | # hence it is not needed unless you have added a package configuration file to your project
47 | # .swiftpm
48 | 
49 | .build/
50 | 
51 | # CocoaPods
52 | #
53 | # We recommend against adding the Pods directory to your .gitignore. However
54 | # you should judge for yourself, the pros and cons are mentioned at:
55 | # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
56 | #
57 | # Pods/
58 | #
59 | # Add this line if you want to avoid checking in source code from the Xcode workspace
60 | # *.xcworkspace
61 | 
62 | # Carthage
63 | #
64 | # Add this line if you want to avoid checking in source code from Carthage dependencies.
65 | # Carthage/Checkouts
66 | 
67 | Carthage/Build/
68 | 
69 | # Accio dependency management
70 | Dependencies/
71 | .accio/
72 | 
73 | # fastlane
74 | #
75 | # It is recommended to not store the screenshots in the git repo.
76 | # Instead, use fastlane to re-generate the screenshots whenever they are needed.
77 | # For more information about the recommended setup visit:
78 | # https://docs.fastlane.tools/best-practices/source-control/#source-control
79 | 
80 | fastlane/report.xml
81 | fastlane/Preview.html
82 | fastlane/screenshots/**/*.png
83 | fastlane/test_output
84 | 
85 | # Code Injection
86 | #
87 | # After new code Injection tools there's a generated folder /iOSInjectionProject
88 | # https://github.com/johnno1962/injectionforxcode
89 | 
90 | iOSInjectionProject/
91 | 

--------------------------------------------------------------------------------
/Examples/HeadlessBrowserExample/HeadlessBrowserExample.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | import ActoCrawler
3 | import ActoCrawlerPlaywright
4 | 
5 | /// [playwright-python](https://playwright.dev/python/docs/intro) (headless browser) example.
6 | @main
7 | struct HeadlessBrowserExample
8 | {
9 |     static func main() async
10 |     {
11 |         struct Output: Sendable
12 |         {
13 |             let screenshotPath: String
14 |         }
15 | 
16 |         let home = NSHomeDirectory()
17 | 
18 |         let crawler = await Crawler.withPlaywright(
19 |             pythonPackagePaths: [
20 |                 // NOTE: Change path to your own settings.
21 |                 "\(home)/.pyenv/versions/miniforge3-4.10.3-10/envs/ml/lib/python3.9/site-packages"
22 |             ],
23 |             config: CrawlerConfig(
24 |                 maxTotalRequests: 8,
25 |                 domainQueueTable: [
26 |                     ".*": .init(maxConcurrency: 5, delay: 0)
27 |                 ]
28 |             ),
29 |             crawl: { request, playwright, browser in
30 |                 // NOTE:
31 |                 // `playwright` is a `PythonObject` that can inter-op with Python using `@dynamicMemberLookup`.
32 |                 // For playwright-python APIs, see documentation:
33 |                 // https://playwright.dev/python/docs/intro
34 | 
35 |                 let context = await browser.new_context().asPyAsync()
36 |                 let page = await context.new_page().asPyAsync()
37 | 
38 |                 // Visit URL.
39 |                 await page.goto(request.url.absoluteString).asPyAsync()
40 | 
41 |                 // Take screenshot.
42 |                 let screenshotPath = "screenshots/example-\(request.order).png"
43 |                 await page.screenshot(path: screenshotPath).asPyAsync()
44 | 
45 |                 // Extract next URL links.
46 |                 // https://playwright.dev/python/docs/evaluating
47 |                 let linkObjects = await page
48 |                     .evaluate("() => Array.from(document.links).map(item => item.href)")
49 |                     .asPyAsync()
50 | 
51 |                 let nextUserRequests: [UserRequest<Void>]
52 |                 if let links: [String] = Array(linkObjects) {
53 |                     nextUserRequests = links
54 |                         .compactMap { URL(string: $0).map(UserRequest.init(url:)) }
55 |                         .shuffled()
56 |                 }
57 |                 else {
58 |                     nextUserRequests = []
59 |                 }
60 | 
61 |                 await page.close().asPyAsync()
62 |                 await context.close().asPyAsync()
63 | 
64 |                 return (nextUserRequests, Output(screenshotPath: screenshotPath))
65 |             }
66 |         )
67 | 
68 |         // Initial crawls.
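        // Each seed request below starts at depth = 1; links returned from `crawl`
        // are visited afterwards with increasing depth, within `config`'s limits.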
69 | crawler.visit(requests: [ 70 | .init(url: URL(string: "https://en.wikipedia.org")!), 71 | .init(url: URL(string: "https://ja.wikipedia.org")!), 72 | .init(url: URL(string: "https://zh.wikipedia.org")!), 73 | ]) 74 | 75 | for await event in crawler.events { 76 | switch event { 77 | case let .willCrawl(req): 78 | print("Crawl : 🕸️ [\(req.order)] [d=\(req.depth)] \(req.url)") 79 | case let .didCrawl(req, .success(output)): 80 | print("Output: ✅ [\(req.order)] [d=\(req.depth)] \(req.url), screenshotPath = \(output.screenshotPath)") 81 | case let .didCrawl(req, .failure(error)): 82 | print("Output: ❌ [\(req.order)] [d=\(req.depth)] \(req.url), error = \(error)") 83 | } 84 | } 85 | 86 | print("Output Done") 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /Examples/ImageScraperExample/ImageScraperExample.swift: -------------------------------------------------------------------------------- 1 | @preconcurrency import Foundation 2 | import ActoCrawler 3 | 4 | /// Image scraper example using 2 crawlers: `htmlCrawler` and `imageDownloader`. 5 | @main 6 | struct ImageScraperExample 7 | { 8 | static func main() async 9 | { 10 | struct HtmlCrawlerOutput: Sendable 11 | { 12 | let nextLinksCount: Int 13 | } 14 | 15 | struct ImageDownloaderOutput: Sendable 16 | { 17 | let savedFileURL: URL 18 | } 19 | 20 | let imageDownloader = await Crawler.withNetworkSession( 21 | config: CrawlerConfig( 22 | maxTotalRequests: 10, 23 | domainQueueTable: [ 24 | ".*": .init(maxConcurrency: 10, delay: 0.1) 25 | ] 26 | ), 27 | crawl: { request, urlSession in 28 | let fileURL = try await urlSession.downloadImage(url: request.url) 29 | return ( 30 | [] /* no next URLs */, 31 | ImageDownloaderOutput(savedFileURL: fileURL) 32 | ) 33 | } 34 | ) 35 | 36 | let htmlCrawler = await Crawler.htmlScraper( 37 | config: CrawlerConfig( 38 | maxTotalRequests: 10, 39 | // domainFilteringPolicy: .disallowedDomains(["wiki*"]), 40 | // domainFilteringPolicy: .allowedDomains(["wiki*"]), 41 | domainQueueTable: [ 42 | ".*": .init(maxConcurrency: 10, delay: 0.1) 43 | ] 44 | ), 45 | scrapeHTML: { response in 46 | let html = response.data 47 | let links = try html.select("a").map { try $0.attr("href") } 48 | let nextRequests = links 49 | .compactMap(URL.init(string:)) 50 | .filter { $0.scheme != nil } 51 | .map { UserRequest(url: $0) } 52 | 53 | // Send `imageURLs` to `imageDownloader`. 54 | // NOTE: `imageDownloader` queues are managed separately from `htmlCrawler`. 55 | let imageURLs = try html.select("img").map { try $0.attr("src") } 56 | .compactMap(URL.init) 57 | .filter { $0.scheme?.hasPrefix("http") == true } 58 | 59 | for imageURL in imageURLs { 60 | let request = UserRequest(url: imageURL) 61 | imageDownloader.visit(request: request) 62 | } 63 | 64 | return ( 65 | nextRequests, 66 | HtmlCrawlerOutput(nextLinksCount: imageURLs.count) 67 | ) 68 | } 69 | ) 70 | 71 | htmlCrawler.visit(url: URL(string: "https://www.wikipedia.org")!) 
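        // NOTE: Both crawlers complete independently; the task group below drains
        // their two event streams concurrently.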
72 | 73 | await withThrowingTaskGroup(of: Void.self, returning: Void.self) { group in 74 | group.addTask { 75 | for await event in imageDownloader.events { 76 | switch event { 77 | case let .willCrawl(req): 78 | print("🖼️ Image Crawl : 🕸️ [\(req.order)] [d=\(req.depth)] \(req.url)") 79 | case let .didCrawl(req, .success(output)): 80 | print("🖼️ Image Output: ✅ [\(req.order)] [d=\(req.depth)] \(req.url), savedFileURL = \(output.savedFileURL)") 81 | case let .didCrawl(req, .failure(error)): 82 | print("🖼️ Image Output: ❌ [\(req.order)] [d=\(req.depth)] \(req.url), error = \(error)") 83 | } 84 | } 85 | 86 | print("Image Output Done") 87 | } 88 | group.addTask { 89 | for await event in htmlCrawler.events { 90 | switch event { 91 | case let .willCrawl(req): 92 | print("🌐 HTML Crawl : 🕸️ [\(req.order)] [d=\(req.depth)] \(req.url)") 93 | case let .didCrawl(req, .success): 94 | print("🌐 HTML Output: ✅ [\(req.order)] [d=\(req.depth)] \(req.url)") 95 | case let .didCrawl(req, .failure(error)): 96 | print("🌐 HTML Output: ❌ [\(req.order)] [d=\(req.depth)] \(req.url), error = \(error)") 97 | } 98 | } 99 | 100 | print("🌐 HTML Output Done") 101 | } 102 | } 103 | 104 | print("Image directory:", savingSubDirectory()) 105 | } 106 | } 107 | 108 | // MARK: - Private 109 | 110 | extension NetworkSession 111 | { 112 | fileprivate func downloadImage(url: URL) async throws -> URL 113 | { 114 | let (data, _) = try await self.data(for: URLRequest(url: url)) 115 | 116 | let filename = url.lastPathComponent 117 | let dirURL = savingSubDirectory() 118 | return try saveData(data, dirURL: dirURL, filename: filename) 119 | } 120 | } 121 | 122 | private func saveData(_ data: Data, dirURL: URL, filename: String) throws -> URL { 123 | let fileURL = dirURL.appendingPathComponent(filename) 124 | try data.write(to: fileURL) 125 | return fileURL 126 | } 127 | 128 | private func savingSubDirectory() -> URL { 129 | let dirURL = URL(fileURLWithPath: NSTemporaryDirectory()) 130 | .appendingPathComponent("ActoCrawlerExample") 131 | try? FileManager.default.createDirectory( 132 | at: dirURL, 133 | withIntermediateDirectories: true, 134 | attributes: nil 135 | ) 136 | return dirURL 137 | } 138 | -------------------------------------------------------------------------------- /Examples/PagingScraperExample/PagingScraperExample.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | import ActoCrawler 3 | 4 | /// Pagination-based scraping example using `URLInfo`. 5 | @main 6 | struct PagingScraperExample 7 | { 8 | static func main() async 9 | { 10 | /// Additive information (page type) attached to requesting URL to track for determining next crawlings. 
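        /// For example, a list page is visited as `.page(n)` while each post link found
        /// on it is visited as `.post`, so `scrapeHTML` below can branch on the page type.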
11 |         enum URLInfo
12 |         {
13 |             case page(UInt64)
14 |             case post
15 |         }
16 | 
17 |         struct Output: Sendable
18 |         {
19 |             let message: String
20 |         }
21 | 
22 |         let htmlCrawler = await Crawler.htmlScraper(
23 |             config: CrawlerConfig(
24 |                 maxTotalRequests: 50,
25 |                 domainQueueTable: [
26 |                     ".*news.ycombinator.com.*": .init(maxConcurrency: 3, delay: 0.3)
27 |                 ]
28 |             ),
29 |             scrapeHTML: { response in
30 |                 let html = response.data
31 | 
32 |                 switch response.urlInfo {
33 |                 case let .page(page):
34 |                     var nextURLs: [UserRequest<URLInfo>]
35 |                     nextURLs = try html.select("table.itemlist tr.athing")
36 |                         .map { "https://news.ycombinator.com/item?id=\($0.id())" }
37 |                         .compactMap(URL.init)
38 |                         .map { UserRequest(url: $0, urlInfo: .post) }
39 | 
40 |                     if page < 100 {
41 |                         if let nextPageURL = URL(string: "https://news.ycombinator.com/news?p=\(page + 1)")
42 |                         {
43 |                             let nextPageRequest = UserRequest(url: nextPageURL, urlInfo: .page(page + 1))
44 |                             nextURLs.append(nextPageRequest)
45 |                         }
46 |                     }
47 | 
48 |                     return (nextURLs, Output(message: "Crawled page = \(page)."))
49 | 
50 |                 case .post:
51 |                     let title = try html.title()
52 |                     return ([], Output(message: "Crawled post, title = \(title)"))
53 |                 }
54 |             }
55 |         )
56 | 
57 |         htmlCrawler.visit(url: URL(string: "https://news.ycombinator.com/news")!, urlInfo: .page(1))
58 | 
59 |         for await event in htmlCrawler.events {
60 |             switch event {
61 |             case let .willCrawl(req):
62 |                 print("Crawl : 🕸️ [\(req.order)] [d=\(req.depth)] \(req.url)")
63 |             case let .didCrawl(req, .success(output)):
64 |                 print("Output: ✅ [\(req.order)] [d=\(req.depth)] \(req.url), message = \(output.message)")
65 |             case let .didCrawl(req, .failure(error)):
66 |                 print("Output: ❌ [\(req.order)] [d=\(req.depth)] \(req.url), error = \(error)")
67 |             }
68 |         }
69 | 
70 |         print("Output Done")
71 |     }
72 | }
73 | 

--------------------------------------------------------------------------------
/Examples/ScraperExample/ScraperExample.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | import ActoCrawler
3 | 
4 | /// Basic HTML scraping example using `Crawler.htmlScraper`.
5 | @main
6 | struct ScraperExample
7 | {
8 |     static func main() async
9 |     {
10 |         struct Output: Sendable
11 |         {
12 |             let nextLinksCount: Int
13 |         }
14 | 
15 |         let htmlCrawler = await Crawler.htmlScraper(
16 |             config: CrawlerConfig(
17 |                 maxTotalRequests: 10
18 |             ),
19 |             scrapeHTML: { response in
20 |                 let html = response.data
21 |                 let links = try html.select("a").map { try $0.attr("href") }
22 | 
23 |                 let nextRequests = links
24 |                     .compactMap(URL.init(string:))
25 |                     .filter { $0.scheme != nil }
26 |                     .map { UserRequest(url: $0) }
27 | 
28 |                 return (nextRequests, Output(nextLinksCount: nextRequests.count))
29 |             }
30 |         )
31 | 
32 |         htmlCrawler.visit(url: URL(string: "https://www.wikipedia.org")!)
33 | 34 | for await event in htmlCrawler.events { 35 | switch event { 36 | case let .willCrawl(req): 37 | print("Crawl : 🕸️ [\(req.order)] [d=\(req.depth)] \(req.url)") 38 | case let .didCrawl(req, .success(output)): 39 | print("Output: ✅ [\(req.order)] [d=\(req.depth)] \(req.url), nextLinksCount = \(output.nextLinksCount)") 40 | case let .didCrawl(req, .failure(error)): 41 | print("Output: ❌ [\(req.order)] [d=\(req.depth)] \(req.url), error = \(error)") 42 | } 43 | } 44 | 45 | print("Output Done") 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Yasuhiro Inami 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Package.resolved: -------------------------------------------------------------------------------- 1 | { 2 | "pins" : [ 3 | { 4 | "identity" : "actomaton", 5 | "kind" : "remoteSourceControl", 6 | "location" : "https://github.com/inamiy/Actomaton.git", 7 | "state" : { 8 | "branch" : "main", 9 | "revision" : "18b559dc4f6b89676e6a71448257df2af6887196" 10 | } 11 | }, 12 | { 13 | "identity" : "pythonkit", 14 | "kind" : "remoteSourceControl", 15 | "location" : "https://github.com/pvieito/PythonKit.git", 16 | "state" : { 17 | "branch" : "master", 18 | "revision" : "81f621d094a7c8923207efe5178f50dba1b56c39" 19 | } 20 | }, 21 | { 22 | "identity" : "swift-case-paths", 23 | "kind" : "remoteSourceControl", 24 | "location" : "https://github.com/pointfreeco/swift-case-paths", 25 | "state" : { 26 | "revision" : "ce9c0d897db8a840c39de64caaa9b60119cf4be8", 27 | "version" : "0.8.1" 28 | } 29 | }, 30 | { 31 | "identity" : "swift-collections", 32 | "kind" : "remoteSourceControl", 33 | "location" : "https://github.com/apple/swift-collections.git", 34 | "state" : { 35 | "revision" : "48254824bb4248676bf7ce56014ff57b142b77eb", 36 | "version" : "1.0.2" 37 | } 38 | }, 39 | { 40 | "identity" : "swift-custom-dump", 41 | "kind" : "remoteSourceControl", 42 | "location" : "https://github.com/pointfreeco/swift-custom-dump", 43 | "state" : { 44 | "revision" : "c4f78db9b90ca57b7b6abc2223e235242739ea3c", 45 | "version" : "0.4.0" 46 | } 47 | }, 48 | { 49 | "identity" : "swiftsoup", 50 | "kind" : "remoteSourceControl", 51 | "location" : "https://github.com/scinfu/SwiftSoup.git", 52 | "state" : { 53 | "revision" : "41e7c263fb8c277e980ebcb9b0b5f6031d3d4886", 54 | "version" : "2.4.2" 55 | } 56 | }, 57 | { 58 | "identity" : "xctest-dynamic-overlay", 59 | "kind" : "remoteSourceControl", 60 | "location" : "https://github.com/pointfreeco/xctest-dynamic-overlay", 61 | "state" : { 62 | "revision" : "50a70a9d3583fe228ce672e8923010c8df2deddd", 63 | "version" : "0.2.1" 64 | } 65 | } 66 | ], 67 | "version" : 2 68 | } 69 | -------------------------------------------------------------------------------- /Package.swift: -------------------------------------------------------------------------------- 1 | // swift-tools-version: 5.6 2 | // The swift-tools-version declares the minimum version of Swift required to build this package. 
3 | 4 | import PackageDescription 5 | 6 | let package = Package( 7 | name: "ActoCrawler", 8 | platforms: [.macOS(.v12)], 9 | products: [ 10 | .library( 11 | name: "ActoCrawler", 12 | targets: ["ActoCrawler"]), 13 | .library( 14 | name: "ActoCrawlerPlaywright", 15 | targets: ["ActoCrawlerPlaywright"]), 16 | .executable( 17 | name: "ScraperExample", 18 | targets: ["ScraperExample"]), 19 | .executable( 20 | name: "ImageScraperExample", 21 | targets: ["ImageScraperExample"]), 22 | .executable( 23 | name: "PagingScraperExample", 24 | targets: ["PagingScraperExample"]), 25 | .executable( 26 | name: "HeadlessBrowserExample", 27 | targets: ["HeadlessBrowserExample"]), 28 | ], 29 | dependencies: [ 30 | .package(url: "https://github.com/inamiy/Actomaton.git", branch: "main"), 31 | .package(url: "https://github.com/apple/swift-collections.git", from: "1.0.0"), 32 | // .package(url: "https://github.com/apple/swift-async-algorithms.git", branch: "main"), 33 | .package(url: "https://github.com/scinfu/SwiftSoup.git", from: "2.4.2"), 34 | .package(url: "https://github.com/pvieito/PythonKit.git", branch: "master"), 35 | ], 36 | targets: [ 37 | .target( 38 | name: "AsyncChannel"), 39 | .target( 40 | name: "PythonKitAsync", 41 | dependencies: [ 42 | .product(name: "PythonKit", package: "PythonKit") 43 | ], 44 | resources: [.copy("pythonkit-async.py")] 45 | ), 46 | .target( 47 | name: "ActoCrawler", 48 | dependencies: [ 49 | "AsyncChannel", 50 | .product(name: "Actomaton", package: "Actomaton"), 51 | .product(name: "Collections", package: "swift-collections"), 52 | // .product(name: "AsyncAlgorithms", package: "swift-async-algorithms"), 53 | .product(name: "SwiftSoup", package: "SwiftSoup"), 54 | ], 55 | swiftSettings: [ 56 | .unsafeFlags([ 57 | "-Xfrontend", "-warn-concurrency", 58 | "-Xfrontend", "-enable-actor-data-race-checks", 59 | ]) 60 | ] 61 | ), 62 | .target( 63 | name: "ActoCrawlerPlaywright", 64 | dependencies: [ 65 | "ActoCrawler", "PythonKitAsync" 66 | ], 67 | swiftSettings: [ 68 | .unsafeFlags([ 69 | "-Xfrontend", "-warn-concurrency", 70 | "-Xfrontend", "-enable-actor-data-race-checks", 71 | ]) 72 | ] 73 | ), 74 | .testTarget( 75 | name: "ActoCrawlerTests", 76 | dependencies: ["ActoCrawler"]), 77 | .executableTarget( 78 | name: "ScraperExample", 79 | dependencies: ["ActoCrawler"], 80 | path: "Examples/ScraperExample"), 81 | .executableTarget( 82 | name: "ImageScraperExample", 83 | dependencies: ["ActoCrawler"], 84 | path: "Examples/ImageScraperExample"), 85 | .executableTarget( 86 | name: "PagingScraperExample", 87 | dependencies: ["ActoCrawler"], 88 | path: "Examples/PagingScraperExample"), 89 | .executableTarget( 90 | name: "HeadlessBrowserExample", 91 | dependencies: ["ActoCrawlerPlaywright"], 92 | path: "Examples/HeadlessBrowserExample"), 93 | ] 94 | ) 95 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🕸️ ActoCrawler 2 | 3 | **ActoCrawler** is a Swift Concurrency-powered crawler engine on top of [Actomaton](https://github.com/inamiy/Actomaton), with flexible customizability to create various HTML scrapers, image scrapers, etc. 
4 | 
5 | ## Example
6 | 
7 | - [Examples](Examples)
8 | 
9 | ```swift
10 | struct Output: Sendable
11 | {
12 |     let nextLinksCount: Int
13 | }
14 | 
15 | let htmlCrawler = await Crawler.htmlScraper(
16 |     config: CrawlerConfig(
17 |         maxDepths: 10,
18 |         maxTotalRequests: 100,
19 |         timeoutPerRequest: 5,
20 |         userAgent: "ActoCrawler",
21 |         domainFilteringPolicy: .disallowedDomains([".*google.com*" /* ... */]),
22 |         domainQueueTable: [
23 |             ".*example1.com*": .init(maxConcurrency: 1, delay: 0),
24 |             ".*example2.com*": .init(maxConcurrency: 5, delay: 0.1 ... 0.5)
25 |         ]
26 |     ),
27 |     scrapeHTML: { response in
28 |         let html = response.data
29 |         let links = try html.select("a").map { try $0.attr("href") }
30 | 
31 |         let nextRequests = links
32 |             .compactMap(URL.init(string:))
33 |             .map { UserRequest(url: $0) }
34 | 
35 |         return (nextRequests, Output(nextLinksCount: nextRequests.count))
36 |     }
37 | )
38 | 
39 | // Visit initial page.
40 | htmlCrawler.visit(url: URL(string: "https://www.wikipedia.org")!)
41 | 
42 | // Observe crawl events.
43 | for await event in htmlCrawler.events {
44 |     switch event {
45 |     case let .willCrawl(req):
46 |         print("Crawl : 🕸️ [\(req.order)] [d=\(req.depth)] \(req.url)")
47 |     case let .didCrawl(req, .success(output)):
48 |         print("Output: ✅ [\(req.order)] [d=\(req.depth)] \(req.url), nextLinksCount = \(output.nextLinksCount)")
49 |     case let .didCrawl(req, .failure(error)):
50 |         print("Output: ❌ [\(req.order)] [d=\(req.depth)] \(req.url), error = \(error)")
51 |     }
52 | }
53 | 
54 | print("Output Done")
55 | ```
56 | 
57 | ## Acknowledgements
58 | 
59 | - [mattsse/voyager](https://github.com/mattsse/voyager)
60 | 
61 | ## License
62 | 
63 | [MIT](LICENSE)
64 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawler/CrawlError.swift:
--------------------------------------------------------------------------------
1 | @preconcurrency import Foundation
2 | 
3 | /// Crawler error type.
4 | public enum CrawlError: Error
5 | {
6 |     /// Failed to convert `URLResponse` into `HTTPURLResponse`.
7 |     case invalidHTTPResponse(URLResponse)
8 | 
9 |     /// Failed to convert `Data` into crawler's preferred format.
10 |     case invalidData
11 | 
12 |     /// Error when ``CrawlerConfig/domainFilteringPolicy`` did not allow URL to pass.
13 |     case domainNotAllowed(Domain)
14 | 
15 |     /// Crawling failed during ``Crawler/init(config:dependency:crawl:)``'s `crawl` method.
16 |     case crawlFailed(Error)
17 | }
18 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawler/CrawlEvent.swift:
--------------------------------------------------------------------------------
1 | /// Crawler output event to be delivered to ``Crawler/events``.
2 | public enum CrawlEvent<Output, URLInfo>: Sendable
3 |     where Output: Sendable, URLInfo: Sendable
4 | {
5 |     case willCrawl(Request<URLInfo>)
6 |     case didCrawl(Request<URLInfo>, Result<Output, CrawlError>)
7 | }
8 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawler/Crawler.htmlScraper.swift:
--------------------------------------------------------------------------------
1 | @preconcurrency import Foundation
2 | @preconcurrency import SwiftSoup
3 | 
4 | extension Crawler
5 | {
6 |     /// Helper initializer using ``NetworkSession`` as network request, and [SwiftSoup](https://github.com/scinfu/SwiftSoup) as HTML scraper.
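    ///
    /// A minimal usage sketch (the URL and `Int` output below are illustrative placeholders):
    ///
    /// ```swift
    /// let crawler: Crawler<Int, Void> = await Crawler.htmlScraper(
    ///     config: CrawlerConfig(maxTotalRequests: 10),
    ///     scrapeHTML: { response in
    ///         let links = try response.data.select("a").map { try $0.attr("href") }
    ///         let nextRequests = links.compactMap(URL.init(string:)).map { UserRequest(url: $0) }
    ///         return (nextRequests, nextRequests.count)
    ///     }
    /// )
    /// crawler.visit(url: URL(string: "https://example.com")!)
    /// ```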
7 |     ///
8 |     /// - Parameters:
9 |     ///   - scrapeHTML:
10 |     ///       Receives `Response` that contains HTML `Document` to be scraped,
11 |     ///       and returns array of next `UserRequest`s as well as `Output` for current request output.
12 |     ///       If `Error` is thrown inside this closure, it will be observed as a failure of ``Crawler/events``.
13 |     public static func htmlScraper(
14 |         config: CrawlerConfig,
15 |         scrapeHTML: @escaping @Sendable (Response<Document, URLInfo>) async throws -> ([UserRequest<URLInfo>], Output)
16 |     ) async -> Crawler
17 |         where Output: Sendable
18 |     {
19 |         await Crawler.withNetworkSession(
20 |             config: config,
21 |             crawl: { request, urlSession in
22 |                 // Network request.
23 |                 let urlRequest: URLRequest = URLRequest(url: request.url, timeoutInterval: config.timeoutPerRequest)
24 |                 let (data, httpResponse) = try await urlSession.data(for: urlRequest)
25 | 
26 |                 guard let html = String(data: data, encoding: .utf8) else {
27 |                     throw CrawlError.invalidData
28 |                 }
29 | 
30 |                 // SwiftSoup HTML parsing.
31 |                 let doc = try SwiftSoup.parse(html)
32 | 
33 |                 let response = Response(
34 |                     request: Request(
35 |                         url: request.url,
36 |                         urlInfo: request.urlInfo,
37 |                         order: request.order,
38 |                         depth: request.depth
39 |                     ),
40 |                     data: doc,
41 |                     httpResponse: httpResponse
42 |                 )
43 | 
44 |                 return try await scrapeHTML(response)
45 |             }
46 |         )
47 |     }
48 | }
49 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawler/Crawler.swift:
--------------------------------------------------------------------------------
1 | @preconcurrency import Foundation
2 | import Actomaton
3 | import AsyncChannel
4 | 
5 | /// Swift Concurrency-powered crawler engine on top of [Actomaton](https://github.com/inamiy/Actomaton).
6 | ///
7 | /// Initializers:
8 | /// - ``Crawler/init(config:dependency:crawl:)`` is the designated initializer for arbitrary effectful crawling logic.
9 | /// - ``Crawler/withNetworkSession(config:crawl:)`` is a helper initializer that uses ``NetworkSession`` as dependency.
10 | /// - ``Crawler/htmlScraper(config:scrapeHTML:)`` is a helper initializer to scrape HTML using [SwiftSoup](https://github.com/scinfu/SwiftSoup).
11 | public struct Crawler<Output, URLInfo>: Sendable
12 |     where Output: Sendable, URLInfo: Sendable
13 | {
14 |     private let actomaton: Actomaton<Action<Output, URLInfo>, State>
15 |     private let environment: Environment<Output, URLInfo>
16 | 
17 |     /// Designated initializer for arbitrary crawling logic.
18 |     ///
19 |     /// - Parameters:
20 |     ///   - dependency: ``Crawler``-retained reference that is passed on every `crawl`.
21 |     ///   - crawl:
22 |     ///       Receives `Request` to perform some async operations (e.g. network requesting and parsing),
23 |     ///       and returns array of next `UserRequest`s as well as `Output` for current request output.
24 |     ///       If `Error` is thrown inside this closure, it will be observed as a failure of ``Crawler/events``.
25 |     public init<Dependency>(
26 |         config: CrawlerConfig,
27 |         dependency: Dependency,
28 |         crawl: @escaping @Sendable (Request<URLInfo>, Dependency) async throws -> ([UserRequest<URLInfo>], Output)
29 |     )
30 |         where Dependency: Sendable
31 |     {
32 |         let environment = Environment(config: config, dependency: dependency, crawl: crawl)
33 | 
34 |         self.actomaton = Actomaton(
35 |             state: State(),
36 |             reducer: reducer(),
37 |             environment: environment
38 |         )
39 |         self.environment = environment
40 |     }
41 | 
42 |     /// Helper initializer that adds ``NetworkSession`` as dependency.
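    ///
    /// A hedged usage sketch (the endpoint and `Int` output are illustrative only):
    ///
    /// ```swift
    /// let crawler: Crawler<Int, Void> = await Crawler.withNetworkSession(
    ///     config: CrawlerConfig(maxTotalRequests: 5),
    ///     crawl: { request, session in
    ///         let (data, _) = try await session.data(for: URLRequest(url: request.url))
    ///         return ([], data.count) // no follow-up requests; output = byte count
    ///     }
    /// )
    /// ```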
43 |     public static func withNetworkSession(
44 |         config: CrawlerConfig,
45 |         crawl: @escaping @Sendable (Request<URLInfo>, NetworkSession) async throws -> ([UserRequest<URLInfo>], Output)
46 |     ) async -> Crawler
47 |     {
48 |         let configuration: URLSessionConfiguration = {
49 |             let configuration = URLSessionConfiguration.default
50 |             configuration.httpAdditionalHeaders = ["User-Agent": config.userAgent]
51 |             return configuration
52 |         }()
53 | 
54 |         return .init(config: config, dependency: await NetworkSession(configuration: configuration), crawl: crawl)
55 |     }
56 | 
57 |     /// Crawler output event `AsyncSequence`.
58 |     /// - Todo: `any AsyncSequence`.
59 |     public var events: AsyncChannel<CrawlEvent<Output, URLInfo>>
60 |     {
61 |         self.environment.events
62 |     }
63 | 
64 |     /// Visits `url` as depth = 1 without `urlInfo`.
65 |     public func visit(url: URL) where URLInfo == Void
66 |     {
67 |         self.visit(
68 |             requests: [UserRequest(url: url)]
69 |         )
70 |     }
71 | 
72 |     /// Visits `url` as depth = 1 with `urlInfo` as additive information.
73 |     public func visit(url: URL, urlInfo: URLInfo)
74 |     {
75 |         self.visit(
76 |             request: UserRequest(url: url, urlInfo: urlInfo)
77 |         )
78 |     }
79 | 
80 |     /// Visits `request` as depth = 1.
81 |     public func visit(request: UserRequest<URLInfo>)
82 |     {
83 |         self.visit(requests: [request])
84 |     }
85 | 
86 |     /// Visits multiple `requests` as depth = 1.
87 |     public func visit(requests: [UserRequest<URLInfo>])
88 |     {
89 |         Task { [actomaton] in
90 |             await actomaton.send(.visit(requests))
91 |         }
92 |     }
93 | }
94 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawler/CrawlerConfig.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | 
3 | /// ``Crawler`` configuration.
4 | public struct CrawlerConfig: Hashable, Sendable
5 | {
6 |     /// Maximum depth of crawling.
7 |     public let maxDepths: UInt64
8 | 
9 |     /// Maximum total requests for crawling.
10 |     public let maxTotalRequests: UInt64
11 | 
12 |     /// Per request timeout (in seconds).
13 |     public let timeoutPerRequest: TimeInterval
14 | 
15 |     /// User-Agent attached to request header.
16 |     public let userAgent: String
17 | 
18 |     /// Domain filtering policy.
19 |     public let domainFilteringPolicy: DomainFilteringPolicy
20 | 
21 |     /// Domain-to-Queue mapping table where `maxConcurrency` and `delay` are configurable per `domain`.
22 |     public let domainQueueTable: DomainQueueTable
23 | 
24 |     // TODO:
25 |     // public let respectsRobotsTxt: Bool
26 | 
27 |     public init(
28 |         maxDepths: UInt64 = .max,
29 |         maxTotalRequests: UInt64 = .max,
30 |         timeoutPerRequest: TimeInterval = .greatestFiniteMagnitude,
31 |         userAgent: String = "ActoCrawler",
32 |         domainFilteringPolicy: DomainFilteringPolicy = .allDomains,
33 |         domainQueueTable: DomainQueueTable = [:]
34 |         // respectsRobotsTxt: Bool = true
35 |     )
36 |     {
37 |         self.maxDepths = maxDepths
38 |         self.maxTotalRequests = maxTotalRequests
39 |         self.timeoutPerRequest = timeoutPerRequest
40 |         self.userAgent = userAgent
41 |         self.domainQueueTable = domainQueueTable
42 |         self.domainFilteringPolicy = domainFilteringPolicy
43 |         // self.respectsRobotsTxt = respectsRobotsTxt
44 |     }
45 | }
46 | 
47 | // MARK: - DomainFilteringPolicy
48 | 
49 | public enum DomainFilteringPolicy: Hashable, Sendable
50 | {
51 |     /// All domains policy.
52 |     case allDomains
53 | 
54 |     /// Allowed domains only policy.
55 |     case allowedDomains(Set<Domain>)
56 | 
57 |     /// Policy that filters out (ignores) disallowed domains.
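    /// e.g. `.disallowedDomains([".*doubleclick.*"])` (an illustrative pattern) skips every
    /// host matching it; entries are regular expressions, checked via `isRegexMatched`.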
58 |     case disallowedDomains(Set<Domain>)
59 | 
60 |     func isDomainAllowed(for domain: Domain) -> Bool
61 |     {
62 |         switch self {
63 |         case .allDomains:
64 |             return true
65 | 
66 |         case let .allowedDomains(domains):
67 |             return domains.contains(where: { isRegexMatched(domain, pattern: $0) })
68 | 
69 |         case let .disallowedDomains(domains):
70 |             return !domains.contains(where: { isRegexMatched(domain, pattern: $0) })
71 |         }
72 |     }
73 | }
74 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawler/Domain.swift:
--------------------------------------------------------------------------------
1 | /// Web domain name, with possible regular expression pattern.
2 | public typealias Domain = String
3 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawler/DomainQueueTable.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | @preconcurrency import Collections
3 | 
4 | /// Domain-to-Queue mapping table where `maxConcurrency` and random `delay` are configurable per `domain`.
5 | /// - Note: `domain` can use regular expressions.
6 | ///
7 | /// For example:
8 | ///
9 | /// ```swift
10 | /// let domainQueueTable: DomainQueueTable = [
11 | ///     ".*google.*": .init(maxConcurrency: 5, delay: 0.1),           // fixed delay
12 | ///     ".*wikipedia.*": .init(maxConcurrency: 3, delay: 0.1 ... 0.3), // random delay in range
13 | ///     ".*": .init(maxConcurrency: .max, delay: 0)
14 | /// ]
15 | /// let config = CrawlerConfig(..., domainQueueTable: domainQueueTable)
16 | /// ```
17 | public struct DomainQueueTable: Hashable, Sendable
18 | {
19 |     let dictionary: OrderedDictionary<Key, Value>
20 | 
21 |     func buildQueue(url: URL) -> CrawlQueue
22 |     {
23 |         guard let host = url.host else { return .default }
24 | 
25 |         for (pattern, values) in self.dictionary {
26 |             guard isRegexMatched(host, pattern: pattern) else { continue }
27 | 
28 |             return CrawlQueue(
29 |                 domain: pattern,
30 |                 maxConcurrency: values.maxConcurrency,
31 |                 delay: .random(values.delay)
32 |             )
33 |         }
34 | 
35 |         return .default
36 |     }
37 | 
38 |     // MARK: Key/Value
39 | 
40 |     public typealias Key = Domain
41 | 
42 |     public struct Value: Hashable, Sendable
43 |     {
44 |         let maxConcurrency: Int
45 |         let delay: ClosedRange<TimeInterval>
46 | 
47 |         public init(maxConcurrency: Int, delay: ClosedRange<TimeInterval>)
48 |         {
49 |             self.maxConcurrency = maxConcurrency
50 |             self.delay = delay
51 |         }
52 | 
53 |         public init(maxConcurrency: Int, delay: TimeInterval)
54 |         {
55 |             self.maxConcurrency = maxConcurrency
56 |             self.delay = delay ... delay
57 |         }
58 |     }
59 | }
60 | 
61 | extension DomainQueueTable: ExpressibleByDictionaryLiteral
62 | {
63 |     public init(dictionaryLiteral elements: (Key, Value)...)
64 |     {
65 |         self.dictionary = .init(uniqueKeysWithValues: elements)
66 |     }
67 | }
68 | 
69 | extension DomainQueueTable: Sequence
70 | {
71 |     public func makeIterator() -> AnyIterator<(key: Key, value: Value)>
72 |     {
73 |         return AnyIterator(dictionary.makeIterator())
74 |     }
75 | }
76 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawler/Internal/CrawlQueue.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | import Actomaton
3 | 
4 | /// Crawler `EffectQueue` to run on Actomaton.
5 | /// - Note: This hashable identity is distinguishable per `domain`.
6 | struct CrawlQueue: EffectQueueProtocol
7 | {
8 |     private let domain: Domain?
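    /// Maximum number of crawl effects allowed to run concurrently on this queue;
    /// used as `maxCount` of `.runOldest` in `effectQueuePolicy` below.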
9 |     private let maxConcurrency: Int
10 |     let effectQueueDelay: EffectQueueDelay
11 | 
12 |     private init(
13 |         domain: Domain?,
14 |         maxConcurrency: Int,
15 |         delay: EffectQueueDelay
16 |     )
17 |     {
18 |         self.domain = domain
19 |         self.maxConcurrency = maxConcurrency
20 |         self.effectQueueDelay = delay
21 |     }
22 | 
23 |     init(
24 |         domain: Domain,
25 |         maxConcurrency: Int,
26 |         delay: EffectQueueDelay
27 |     )
28 |     {
29 |         self.domain = domain
30 |         self.maxConcurrency = maxConcurrency
31 |         self.effectQueueDelay = delay
32 |     }
33 | 
34 |     static var `default`: CrawlQueue
35 |     {
36 |         CrawlQueue(domain: nil, maxConcurrency: .max, delay: .constant(0))
37 |     }
38 | 
39 |     var effectQueuePolicy: EffectQueuePolicy
40 |     {
41 |         .runOldest(maxCount: self.maxConcurrency, .suspendNew)
42 |     }
43 | 
44 |     func hash(into hasher: inout Hasher)
45 |     {
46 |         hasher.combine(self.domain)
47 |     }
48 | }
49 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawler/Internal/Environment.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | import AsyncChannel
3 | 
4 | /// Effectful environment for making an arbitrary crawler.
5 | struct Environment<Output, URLInfo>: Sendable
6 |     where Output: Sendable, URLInfo: Sendable
7 | {
8 |     let config: CrawlerConfig
9 | 
10 |     /// Receives `Request` to perform some async operations (e.g. network requesting and parsing),
11 |     /// and returns array of next `Request`s as well as `Output`.
12 |     let crawl: @Sendable (Request<URLInfo>) async throws -> ([UserRequest<URLInfo>], Output)
13 | 
14 |     /// Crawler output event `AsyncSequence`.
15 |     /// - Todo: `any AsyncSequence`.
16 |     let events: AsyncChannel<CrawlEvent<Output, URLInfo>> = .init()
17 | 
18 |     init<Dependency>(
19 |         config: CrawlerConfig,
20 |         dependency: Dependency,
21 |         crawl: @escaping @Sendable (Request<URLInfo>, Dependency) async throws -> ([UserRequest<URLInfo>], Output)
22 |     )
23 |         where Dependency: Sendable
24 |     {
25 |         self.config = config
26 |         self.crawl = { request in
27 |             try await crawl(request, dependency)
28 |         }
29 |     }
30 | }
31 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawler/Internal/Logic.swift:
--------------------------------------------------------------------------------
1 | @preconcurrency import Foundation
2 | import Actomaton
3 | import ActomatonDebugging
4 | 
5 | // MARK: - Action
6 | 
7 | enum Action<Output, URLInfo>: Sendable
8 |     where Output: Sendable, URLInfo: Sendable
9 | {
10 |     case visit([UserRequest<URLInfo>])
11 |     case _visit(Request<URLInfo>)
12 |     case _didVisit(Request<URLInfo>, nextRequests: [UserRequest<URLInfo>], output: Output)
13 |     case _didFailVisit(Request<URLInfo>, CrawlError)
14 | }
15 | 
16 | // MARK: - State
17 | 
18 | struct State: Sendable
19 | {
20 |     var waitingURLs: Set<URL> = []
21 | 
22 |     /// Total count of "waiting" + "visited" + "failed" URLs.
23 |     var totalVisitCount: UInt64 = 0
24 | }
25 | 
26 | // MARK: - Reducer
27 | 
28 | func reducer<Output, URLInfo>() -> Reducer<Action<Output, URLInfo>, State, Environment<Output, URLInfo>>
29 |     where Output: Sendable, URLInfo: Sendable
30 | {
31 |     typealias Eff = Effect<Action<Output, URLInfo>>
32 | 
33 |     return Reducer { action, state, env in
34 |         /// Common logic to update `state`, output to channel & dispatch next visits.
35 |         func didFinish(
36 |             request: Request<URLInfo>?,
37 |             nextRequests: [UserRequest<URLInfo>],
38 |             outputResult: Result<Output, CrawlError>?
39 |         ) -> Eff
40 |         {
41 |             // Remove from `waitingURLs`.
42 |             if let request = request {
43 |                 state.waitingURLs.remove(request.url)
44 |             }
45 | 
46 |             // Limit `nextRequests` by checking `config.maxTotalRequests`.
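            // e.g. with `maxTotalRequests = 100` and `totalVisitCount = 98`,
            // only the first 2 of `nextRequests` survive the `prefix` below.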
47 | let totalVisitCount = state.totalVisitCount 48 | let remainingVisitCount = max(env.config.maxTotalRequests - totalVisitCount, 0) 49 | let nextRequests = nextRequests.prefix(Int(clamping: remainingVisitCount)) 50 | 51 | // Insert next "waiting"s. 52 | for nextRequest in nextRequests { 53 | state.waitingURLs.insert(nextRequest.url) 54 | } 55 | 56 | let isFinished = state.waitingURLs.isEmpty && nextRequests.isEmpty 57 | 58 | /// AsyncChannel effect. 59 | let sendToChannel = Eff.fireAndForget { 60 | // NOTE: `outputResult = nil` is passed only on initial crawl. 61 | if let request = request, let outputResult = outputResult { 62 | await env.events.send(.didCrawl(request, outputResult)) 63 | } 64 | if isFinished { 65 | env.events.finish() 66 | } 67 | } 68 | 69 | let depth = request?.depth ?? 0 70 | 71 | /// nextCrawls effect. 72 | let nextCrawls = nextRequests.isEmpty || depth >= env.config.maxDepths 73 | ? .empty 74 | : Eff.combine( // Visit next with incrementing `depth`. 75 | nextRequests.enumerated() 76 | .map { i, userReq in 77 | let request = Request( 78 | url: userReq.url, 79 | urlInfo: userReq.urlInfo, 80 | order: totalVisitCount + UInt64(i), 81 | depth: depth + 1 82 | ) 83 | return .nextAction(._visit(request)) 84 | } 85 | ) 86 | 87 | state.totalVisitCount += UInt64(nextRequests.count) 88 | 89 | return sendToChannel + nextCrawls 90 | } 91 | 92 | // Reducer pattern-matching. 93 | switch action { 94 | case let .visit(requests): 95 | // NOTE: 96 | // This is a fake `didFinish` to reuse calculation of `state.waitingURLs` etc by only sending `nextRequests`. 97 | return didFinish( 98 | request: nil, 99 | nextRequests: requests, 100 | outputResult: nil 101 | ) 102 | 103 | case let ._visit(request): 104 | let host = request.url.host ?? "" 105 | 106 | let isAllowed = env.config.domainFilteringPolicy.isDomainAllowed(for: host) 107 | guard isAllowed else { 108 | return .nextAction( 109 | ._didFailVisit(request, CrawlError.domainNotAllowed(host)) 110 | ) 111 | } 112 | 113 | let queue = env.config.domainQueueTable.buildQueue(url: request.url) 114 | 115 | return Effect(queue: queue) { 116 | // Check if `queue` has already force-cancelled this effect. 117 | // This is important when using `EffectQueue` with delay. 118 | try Task.checkCancellation() 119 | 120 | await env.events.send(.willCrawl(request)) 121 | 122 | do { 123 | let (nextRequests, output) = try await env.crawl(request) 124 | return ._didVisit(request, nextRequests: nextRequests, output: output) 125 | } 126 | catch { 127 | return ._didFailVisit(request, CrawlError.crawlFailed(error)) 128 | } 129 | } 130 | 131 | case let ._didVisit(request, nextRequests, output): 132 | return didFinish( 133 | request: request, 134 | nextRequests: nextRequests, 135 | outputResult: .success(output) 136 | ) 137 | 138 | case let ._didFailVisit(request, error): 139 | return didFinish( 140 | request: request, 141 | nextRequests: [], 142 | outputResult: .failure(error) 143 | ) 144 | } 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /Sources/ActoCrawler/Internal/Regex.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | 3 | func isRegexMatched(_ string: String, pattern: String) -> Bool 4 | { 5 | let matches = try? NSRegularExpression(pattern: pattern) 6 | .matches(in: string, range: .init(location: 0, length: string.utf16.count)) 7 | 8 | return !(matches ?? 
[]).isEmpty
9 | }
10 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawler/NetworkSession.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | 
3 | /// `URLSession` wrapped by `Actor`, used in ``Crawler/withNetworkSession(config:crawl:)``.
4 | public actor NetworkSession
5 | {
6 |     private let urlSession: URLSession
7 | 
8 |     public init(configuration: URLSessionConfiguration) async
9 |     {
10 |         self.urlSession = URLSession(configuration: configuration)
11 |     }
12 | 
13 |     public func data(for request: URLRequest) async throws -> (Data, HTTPURLResponse)
14 |     {
15 |         let (data, urlResponse) = try await urlSession.data(for: request)
16 | 
17 |         if let urlResponse = urlResponse as? HTTPURLResponse {
18 |             return (data, urlResponse)
19 |         }
20 |         else {
21 |             throw CrawlError.invalidHTTPResponse(urlResponse)
22 |         }
23 |     }
24 | }
25 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawler/Request.swift:
--------------------------------------------------------------------------------
1 | @preconcurrency import Foundation
2 | 
3 | // MARK: - UserRequest
4 | 
5 | /// User-defined requesting `URL` with additional `URLInfo`.
6 | public struct UserRequest<URLInfo>: Sendable where URLInfo: Sendable
7 | {
8 |     /// Requesting `URL`.
9 |     public var url: URL
10 | 
11 |     /// Additional info that is attached next to requesting `URL`.
12 |     ///
13 |     /// For example, URL page number can be passed as `URLInfo` in ``Crawler/visit(requests:)``
14 |     /// or ``Crawler/init(config:dependency:crawl:)``'s `crawl` return value so that next request can be determined by its page number increment.
15 |     public var urlInfo: URLInfo
16 | 
17 |     public init(url: URL, urlInfo: URLInfo)
18 |     {
19 |         self.url = url
20 |         self.urlInfo = urlInfo
21 |     }
22 | 
23 |     public init(url: URL) where URLInfo == Void
24 |     {
25 |         self.url = url
26 |         self.urlInfo = ()
27 |     }
28 | }
29 | 
30 | // MARK: - Request
31 | 
32 | /// ``UserRequest`` + ActoCrawler-additions, i.e. ``order`` + ``depth``.
33 | @dynamicMemberLookup
34 | public struct Request<URLInfo>: Sendable where URLInfo: Sendable
35 | {
36 |     /// - Note: Accessible via `@dynamicMemberLookup`.
37 |     private var userRequest: UserRequest<URLInfo>
38 | 
39 |     /// Request order number.
40 |     public let order: UInt64
41 | 
42 |     /// Request crawling depth.
43 |     public let depth: UInt64
44 | 
45 |     public init(url: URL, urlInfo: URLInfo, order: UInt64, depth: UInt64)
46 |     {
47 |         self.userRequest = .init(url: url, urlInfo: urlInfo)
48 |         self.order = order
49 |         self.depth = depth
50 |     }
51 | 
52 |     public init(url: URL, order: UInt64, depth: UInt64) where URLInfo == Void
53 |     {
54 |         self.userRequest = .init(url: url)
55 |         self.order = order
56 |         self.depth = depth
57 |     }
58 | 
59 |     public subscript<T>(dynamicMember keyPath: WritableKeyPath<UserRequest<URLInfo>, T>) -> T
60 |     {
61 |         get {
62 |             self.userRequest[keyPath: keyPath]
63 |         }
64 |         set {
65 |             self.userRequest[keyPath: keyPath] = newValue
66 |         }
67 |     }
68 | }
69 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawler/Response.swift:
--------------------------------------------------------------------------------
1 | @preconcurrency import Foundation
2 | 
3 | /// HTTP Response for ``Request`` with `URLSession`'s results and additional `URLInfo`.
4 | @dynamicMemberLookup
5 | public struct Response<Data, URLInfo>: Sendable
6 |     where Data: Sendable, URLInfo: Sendable
7 | {
8 |     /// - Note: Accessible via `@dynamicMemberLookup`.
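    /// e.g. `response.url` and `response.urlInfo` forward to this underlying request.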
9 |     private var request: Request<URLInfo>
10 | 
11 |     // URLSession response.
12 |     public var data: Data
13 |     public var httpResponse: HTTPURLResponse
14 | 
15 |     public init(
16 |         request: Request<URLInfo>,
17 |         data: Data,
18 |         httpResponse: HTTPURLResponse
19 |     )
20 |     {
21 |         self.request = request
22 |         self.data = data
23 |         self.httpResponse = httpResponse
24 |     }
25 | 
26 |     public subscript<T>(dynamicMember keyPath: WritableKeyPath<Request<URLInfo>, T>) -> T
27 |     {
28 |         get {
29 |             self.request[keyPath: keyPath]
30 |         }
31 |         set {
32 |             self.request[keyPath: keyPath] = newValue
33 |         }
34 |     }
35 | }
36 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawlerPlaywright/Crawler.withPlaywright.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | import ActoCrawler
3 | 
4 | extension Crawler
5 | {
6 |     /// Helper initializer that adds [playwright-python](https://playwright.dev/python/docs/intro) (headless browser) as ActoCrawler's dependency.
7 |     ///
8 |     /// As written in the documentation, make sure to set up the Python environment before calling this method:
9 |     ///
10 |     /// 1. `pip install playwright`
11 |     /// 2. `playwright install`
12 |     ///
13 |     /// - Parameters:
14 |     ///   - pythonPackagePaths:
15 |     ///       Python library paths for interacting with `playwright-python`. Use `pip show playwright` to find its location.
16 |     ///
17 |     ///   - browser:
18 |     ///       Creates a new `Browser` object from `playwright` to reuse during crawling iterations.
19 |     ///       If `nil`, Chromium is launched in non-headless mode.
20 |     ///
21 |     ///       An example of this closure is:
22 |     ///       ```
23 |     ///       let browser = { await $0.chromium.launch(headless: false).asPyAsync() }
24 |     ///       ```
25 |     ///
26 |     ///   - crawl:
27 |     ///       Crawling function that receives
28 |     ///       [Playwright](https://github.com/microsoft/playwright-python/blob/v1.22.0/playwright/async_api/_generated.py#L12153)
29 |     ///       and [Browser](https://github.com/microsoft/playwright-python/blob/v1.22.0/playwright/async_api/_generated.py#L11134)
30 |     ///       as `PythonObject`s to inter-op with Python.
31 |     public static func withPlaywright(
32 |         pythonPackagePaths: [String],
33 |         config: CrawlerConfig,
34 |         browser: (@Sendable (_ playwright: PythonObject) async -> PythonObject)? = nil,
35 |         crawl: @escaping @CrawlActor @Sendable (
36 |             Request<URLInfo>,
37 |             _ playwright: PythonObject,
38 |             _ browser: PythonObject
39 |         ) async throws -> ([UserRequest<URLInfo>], Output)
40 |     ) async -> Crawler
41 |     {
42 |         let playwrightActor = await PlaywrightActor(
43 |             pythonPackagePaths: pythonPackagePaths,
44 |             prepare: browser ?? { await $0.chromium.launch(headless: false).asPyAsync() }
45 |         )
46 | 
47 |         return Crawler(
48 |             config: config,
49 |             dependency: playwrightActor,
50 |             crawl: { request, playwrightActor in
51 |                 try await playwrightActor.runCrawl {
52 |                     try await crawl(request, $0, $1)
53 |                 }
54 |             }
55 |         )
56 |     }
57 | }
58 | 
59 | // MARK: - Private
60 | 
61 | /// Global actor for cooperative Playwright crawling to avoid `EXC_BAD_ACCESS`.
62 | @globalActor
63 | internal actor CrawlActor
64 | {
65 |     static let shared: CrawlActor = CrawlActor()
66 | }
67 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawlerPlaywright/PlaywrightActor.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | import ActoCrawler
3 | 
4 | /// [playwright-python](https://playwright.dev/python/docs/intro) (headless browser) Actor wrapper.
5 | /// - Note: This will be used as a dependency of ActoCrawler, and stored throughout its lifetime.
6 | internal actor PlaywrightActor
7 | {
8 |     /// Root of `playwright/async_api`.
9 |     /// - [async_playwright](https://github.com/microsoft/playwright-python/blob/v1.22.0/playwright/async_api/__init__.py#L85)
10 |     /// - [PlaywrightContextManager](https://github.com/microsoft/playwright-python/blob/v1.22.0/playwright/async_api/_context_manager.py#L25)
11 |     private let playwrightContextManager: PythonObject
12 | 
13 |     /// Python [Playwright](https://github.com/microsoft/playwright-python/blob/v1.22.0/playwright/async_api/_generated.py#L12153) object.
14 |     internal let playwright: PythonObject
15 | 
16 |     /// Python object that is prepared via `init`'s `prepare`.
17 |     /// For example, preparing [Browser](https://github.com/microsoft/playwright-python/blob/v1.22.0/playwright/async_api/_generated.py#L11134)
18 |     /// is often useful so that it is not launched multiple times and the same reference keeps being reused.
19 |     internal let preparedObject: PythonObject
20 | 
21 |     /// - Parameters:
22 |     ///   - pythonPackagePaths:
23 |     ///       Python library paths for interacting with `playwright-python`. Use `pip show playwright` to find its location.
24 |     ///   - prepare:
25 |     ///       Async closure for setting up `preparedObject`, which is usually a reusable `Browser`.
26 |     internal init(
27 |         pythonPackagePaths: [String],
28 |         prepare: @Sendable (_ playwright: PythonObject) async -> PythonObject
29 |     ) async
30 |     {
31 |         // Set PATH.
32 |         let sys = Python.import("sys")
33 |         for path in pythonPackagePaths {
34 |             sys.path.append(path)
35 |         }
36 |         sys.path.append(PythonKitAsync.bundleResourcePath) // For importing `pythonkit-async.py`.
37 | 
38 |         let playwrightModule = Python.import("playwright.async_api")
39 |         self.playwrightContextManager = playwrightModule.async_playwright()
40 |         self.playwright = await self.playwrightContextManager.start().asPyAsync()
41 |         self.preparedObject = await prepare(self.playwright)
42 |     }
43 | 
44 |     deinit
45 |     {
46 |         Task.detached { [playwrightContextManager] in
47 |             await playwrightContextManager.__aexit__().asPyAsync()
48 |         }
49 |     }
50 | 
51 |     internal func runCrawl<Res>(
52 |         _ crawl: @Sendable (
53 |             _ playwright: PythonObject,
54 |             _ setupObject: PythonObject
55 |         ) async throws -> Res
56 |     ) async rethrows -> Res
57 |     {
58 |         try await crawl(self.playwright, self.preparedObject)
59 |     }
60 | }
61 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawlerPlaywright/_exported.swift:
--------------------------------------------------------------------------------
1 | @_exported import PythonKit
2 | @_exported import PythonKitAsync
3 | 

--------------------------------------------------------------------------------
/Sources/AsyncChannel/AsyncChannel.swift:
--------------------------------------------------------------------------------
1 | //===----------------------------------------------------------------------===//
2 | //
3 | // This source file is part of the Swift Async Algorithms open source project
4 | //
5 | // Copyright (c) 2022 Apple Inc. and the Swift project authors
6 | // Licensed under Apache License v2.0 with Runtime Library Exception
7 | //
8 | // See https://swift.org/LICENSE.txt for license information
9 | //
10 | //===----------------------------------------------------------------------===//
11 | 
12 | /// A channel for sending elements from one task to another with back pressure.
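///
/// For example (a sketch assuming an async context; `Int` is an arbitrary element type):
///
/// ```swift
/// let channel = AsyncChannel<Int>()
/// Task {
///     for n in 0..<3 { await channel.send(n) } // each send suspends until consumed
///     channel.finish()
/// }
/// for await n in channel { print(n) } // prints 0, 1, 2
/// ```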
13 | ///
14 | /// The `AsyncChannel` class is intended to be used as a communication type between tasks,
15 | /// particularly when one task produces values and another task consumes those values. The back
16 | /// pressure applied by `send(_:)` and `finish()` via the suspension/resume ensures that
17 | /// the production of values does not exceed the consumption of values from iteration. Each of these
18 | /// methods suspends after enqueuing the event and is resumed when the next call to `next()`
19 | /// on the `Iterator` is made.
20 | public final class AsyncChannel<Element: Sendable>: AsyncSequence, Sendable {
21 |   /// The iterator for a `AsyncChannel` instance.
22 |   public struct Iterator: AsyncIteratorProtocol, Sendable {
23 |     let channel: AsyncChannel<Element>
24 |     var active: Bool = true
25 | 
26 |     init(_ channel: AsyncChannel<Element>) {
27 |       self.channel = channel
28 |     }
29 | 
30 |     /// Await the next sent element or finish.
31 |     public mutating func next() async -> Element? {
32 |       guard active else {
33 |         return nil
34 |       }
35 |       let generation = channel.establish()
36 |       let value: Element? = await withTaskCancellationHandler { [channel] in
37 |         channel.cancel(generation)
38 |       } operation: {
39 |         await channel.next(generation)
40 |       }
41 | 
42 |       if let value = value {
43 |         return value
44 |       } else {
45 |         active = false
46 |         return nil
47 |       }
48 |     }
49 |   }
50 | 
51 |   struct Awaiting: Hashable {
52 |     var generation: Int
53 |     var continuation: UnsafeContinuation<Element?, Never>?
54 |     let cancelled: Bool
55 | 
56 |     init(generation: Int, continuation: UnsafeContinuation<Element?, Never>) {
57 |       self.generation = generation
58 |       self.continuation = continuation
59 |       cancelled = false
60 |     }
61 | 
62 |     init(placeholder generation: Int) {
63 |       self.generation = generation
64 |       self.continuation = nil
65 |       cancelled = false
66 |     }
67 | 
68 |     init(cancelled generation: Int) {
69 |       self.generation = generation
70 |       self.continuation = nil
71 |       cancelled = true
72 |     }
73 | 
74 |     func hash(into hasher: inout Hasher) {
75 |       hasher.combine(generation)
76 |     }
77 | 
78 |     static func == (_ lhs: Awaiting, _ rhs: Awaiting) -> Bool {
79 |       return lhs.generation == rhs.generation
80 |     }
81 |   }
82 | 
83 |   enum Emission {
84 |     case idle
85 |     case pending([UnsafeContinuation<UnsafeContinuation<Element?, Never>?, Never>])
86 |     case awaiting(Set<Awaiting>)
87 | 
88 |     mutating func cancel(_ generation: Int) -> UnsafeContinuation<Element?, Never>? {
89 |       switch self {
90 |       case .awaiting(var awaiting):
91 |         let continuation = awaiting.remove(Awaiting(placeholder: generation))?.continuation
92 |         if awaiting.isEmpty {
93 |           self = .idle
94 |         } else {
95 |           self = .awaiting(awaiting)
96 |         }
97 |         return continuation
98 |       case .idle:
99 |         self = .awaiting([Awaiting(cancelled: generation)])
100 |         return nil
101 |       default:
102 |         return nil
103 |       }
104 |     }
105 |   }
106 | 
107 |   struct State {
108 |     var emission: Emission = .idle
109 |     var generation = 0
110 |     var terminal = false
111 |   }
112 | 
113 |   let state = ManagedCriticalState(State())
114 | 
115 |   /// Create a new `AsyncChannel` given an element type.
116 |   public init(element elementType: Element.Type = Element.self) { }
117 | 
118 |   func establish() -> Int {
119 |     state.withCriticalRegion { state in
120 |       defer { state.generation &+= 1 }
121 |       return state.generation
122 |     }
123 |   }
124 | 
125 |   func cancel(_ generation: Int) {
126 |     state.withCriticalRegion { state in
127 |       state.emission.cancel(generation)
128 |     }?.resume(returning: nil)
129 |   }
130 | 
131 |   func next(_ generation: Int) async -> Element? {
132 |     return await withUnsafeContinuation { continuation in
133 |       var cancelled = false
134 |       var terminal = false
135 |       state.withCriticalRegion { state -> UnsafeResumption<UnsafeContinuation<Element?, Never>?, Never>? in
136 |         if state.terminal {
137 |           terminal = true
138 |           return nil
139 |         }
140 |         switch state.emission {
141 |         case .idle:
142 |           state.emission = .awaiting([Awaiting(generation: generation, continuation: continuation)])
143 |           return nil
144 |         case .pending(var sends):
145 |           let send = sends.removeFirst()
146 |           if sends.count == 0 {
147 |             state.emission = .idle
148 |           } else {
149 |             state.emission = .pending(sends)
150 |           }
151 |           return UnsafeResumption(continuation: send, success: continuation)
152 |         case .awaiting(var nexts):
153 |           if nexts.update(with: Awaiting(generation: generation, continuation: continuation)) != nil {
154 |             nexts.remove(Awaiting(placeholder: generation))
155 |             cancelled = true
156 |           }
157 |           if nexts.isEmpty {
158 |             state.emission = .idle
159 |           } else {
160 |             state.emission = .awaiting(nexts)
161 |           }
162 |           return nil
163 |         }
164 |       }?.resume()
165 |       if cancelled || terminal {
166 |         continuation.resume(returning: nil)
167 |       }
168 |     }
169 |   }
170 | 
171 |   func finishAll() {
172 |     let (sends, nexts) = state.withCriticalRegion { state -> ([UnsafeContinuation<UnsafeContinuation<Element?, Never>?, Never>], Set<Awaiting>) in
173 |       if state.terminal {
174 |         return ([], [])
175 |       }
176 |       state.terminal = true
177 |       switch state.emission {
178 |       case .idle:
179 |         return ([], [])
180 |       case .pending(let nexts):
181 |         state.emission = .idle
182 |         return (nexts, [])
183 |       case .awaiting(let nexts):
184 |         state.emission = .idle
185 |         return ([], nexts)
186 |       }
187 |     }
188 |     for send in sends {
189 |       send.resume(returning: nil)
190 |     }
191 |     for next in nexts {
192 |       next.continuation?.resume(returning: nil)
193 |     }
194 |   }
195 | 
196 |   func _send(_ element: Element) async {
197 |     await withTaskCancellationHandler {
198 |       finishAll()
199 |     } operation: {
200 |       let continuation: UnsafeContinuation<Element?, Never>? = await withUnsafeContinuation { continuation in
201 |         state.withCriticalRegion { state -> UnsafeResumption<UnsafeContinuation<Element?, Never>?, Never>? in
202 |           if state.terminal {
203 |             return UnsafeResumption(continuation: continuation, success: nil)
204 |           }
205 |           switch state.emission {
206 |           case .idle:
207 |             state.emission = .pending([continuation])
208 |             return nil
209 |           case .pending(var sends):
210 |             sends.append(continuation)
211 |             state.emission = .pending(sends)
212 |             return nil
213 |           case .awaiting(var nexts):
214 |             let next = nexts.removeFirst().continuation
215 |             if nexts.count == 0 {
216 |               state.emission = .idle
217 |             } else {
218 |               state.emission = .awaiting(nexts)
219 |             }
220 |             return UnsafeResumption(continuation: continuation, success: next)
221 |           }
222 |         }?.resume()
223 |       }
224 |       continuation?.resume(returning: element)
225 |     }
226 |   }
227 | 
228 |   /// Send an element to an awaiting iteration. This function will resume when the next call to `next()` is made.
229 |   /// If the channel is already finished then this returns immediately.
230 |   public func send(_ element: Element) async {
231 |     await _send(element)
232 |   }
233 | 
234 |   /// Send a finish to all awaiting iterations.
235 |   public func finish() {
236 |     finishAll()
237 |   }
238 | 
239 |   /// Create an `Iterator` for iteration of an `AsyncChannel`.
240 |   public func makeAsyncIterator() -> Iterator {
241 |     return Iterator(self)
242 |   }
243 | }
244 | 
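To make the back-pressure contract described in the doc comment concrete, here is a minimal producer/consumer sketch (an editorial illustration, not part of the package source; `channel` is hypothetical and the code assumes an enclosing async context):

    let channel = AsyncChannel<Int>()

    // Producer: each `send` suspends until the consumer pulls the value.
    Task {
        for value in 0 ..< 3 {
            await channel.send(value)
        }
        channel.finish()
    }

    // Consumer: iteration yields 0, 1, 2, then ends once `finish()` is observed.
    for await value in channel {
        print(value)
    }

Because `send(_:)` suspends until a matching `next()` arrives, the producer can never run arbitrarily far ahead of the consumer.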
--------------------------------------------------------------------------------
/Sources/AsyncChannel/Locking.swift:
--------------------------------------------------------------------------------
1 | //===----------------------------------------------------------------------===//
2 | //
3 | // This source file is part of the Swift Async Algorithms open source project
4 | //
5 | // Copyright (c) 2022 Apple Inc. and the Swift project authors
6 | // Licensed under Apache License v2.0 with Runtime Library Exception
7 | //
8 | // See https://swift.org/LICENSE.txt for license information
9 | //
10 | //===----------------------------------------------------------------------===//
11 | 
12 | #if canImport(Darwin)
13 | @_implementationOnly import Darwin
14 | #elseif canImport(Glibc)
15 | @_implementationOnly import Glibc
16 | #elseif canImport(WinSDK)
17 | @_implementationOnly import WinSDK
18 | #endif
19 | 
20 | internal struct Lock {
21 | #if canImport(Darwin)
22 |   typealias Primitive = os_unfair_lock
23 | #elseif canImport(Glibc)
24 |   typealias Primitive = pthread_mutex_t
25 | #elseif canImport(WinSDK)
26 |   typealias Primitive = SRWLOCK
27 | #endif
28 | 
29 |   typealias PlatformLock = UnsafeMutablePointer<Primitive>
30 |   let platformLock: PlatformLock
31 | 
32 |   private init(_ platformLock: PlatformLock) {
33 |     self.platformLock = platformLock
34 |   }
35 | 
36 |   fileprivate static func initialize(_ platformLock: PlatformLock) {
37 | #if canImport(Darwin)
38 |     platformLock.initialize(to: os_unfair_lock())
39 | #elseif canImport(Glibc)
40 |     pthread_mutex_init(platformLock, nil)
41 | #elseif canImport(WinSDK)
42 |     InitializeSRWLock(platformLock)
43 | #endif
44 |   }
45 | 
46 |   fileprivate static func deinitialize(_ platformLock: PlatformLock) {
47 | #if canImport(Glibc)
48 |     pthread_mutex_destroy(platformLock) // os_unfair_lock and SRWLOCK need no explicit destruction.
49 | #endif
50 |     platformLock.deinitialize(count: 1)
51 |   }
52 | 
53 |   fileprivate static func lock(_ platformLock: PlatformLock) {
54 | #if canImport(Darwin)
55 |     os_unfair_lock_lock(platformLock)
56 | #elseif canImport(Glibc)
57 |     pthread_mutex_lock(platformLock)
58 | #elseif canImport(WinSDK)
59 |     AcquireSRWLockExclusive(platformLock)
60 | #endif
61 |   }
62 | 
63 |   fileprivate static func unlock(_ platformLock: PlatformLock) {
64 | #if canImport(Darwin)
65 |     os_unfair_lock_unlock(platformLock)
66 | #elseif canImport(Glibc)
67 |     pthread_mutex_unlock(platformLock)
68 | #elseif canImport(WinSDK)
69 |     ReleaseSRWLockExclusive(platformLock)
70 | #endif
71 |   }
72 | 
73 |   static func allocate() -> Lock {
74 |     let platformLock = PlatformLock.allocate(capacity: 1)
75 |     initialize(platformLock)
76 |     return Lock(platformLock)
77 |   }
78 | 
79 |   func deinitialize() {
80 |     Lock.deinitialize(platformLock)
81 |   }
82 | 
83 |   func lock() {
84 |     Lock.lock(platformLock)
85 |   }
86 | 
87 |   func unlock() {
88 |     Lock.unlock(platformLock)
89 |   }
90 | }
91 | 
92 | struct ManagedCriticalState<State> {
93 |   private final class LockedBuffer: ManagedBuffer<State, Lock.Primitive> {
94 |     deinit {
95 |       withUnsafeMutablePointerToElements { Lock.deinitialize($0) }
96 |     }
97 |   }
98 | 
99 |   private let buffer: ManagedBuffer<State, Lock.Primitive>
100 | 
101 |   init(_ initial: State) {
102 |     buffer = LockedBuffer.create(minimumCapacity: 1) { buffer in
103 |       buffer.withUnsafeMutablePointerToElements { Lock.initialize($0) }
104 |       return initial
105 |     }
106 |   }
107 | 
108 |   func withCriticalRegion<R>(_ critical: (inout State) throws -> R) rethrows -> R {
109 |     try buffer.withUnsafeMutablePointers { header, lock in
110 |       Lock.lock(lock)
111 |       defer { Lock.unlock(lock) }
112 |       return try critical(&header.pointee)
113 |     }
114 |   }
115 | }
116 | 
117 | extension ManagedCriticalState: @unchecked Sendable where State: Sendable { }
118 | 
--------------------------------------------------------------------------------
/Sources/AsyncChannel/UnsafeResumption.swift:
--------------------------------------------------------------------------------
1 | struct UnsafeResumption<Success, Failure: Error> {
2 |   let continuation: UnsafeContinuation<Success, Failure>
3 |   let result: Result<Success, Failure>
4 | 
5 |   init(continuation: UnsafeContinuation<Success, Failure>, result: Result<Success, Failure>) {
6 |     self.continuation = continuation
7 |     self.result = result
8 |   }
9 | 
10 |   init(continuation: UnsafeContinuation<Success, Failure>, success: Success) {
11 |     self.init(continuation: continuation, result: .success(success))
12 |   }
13 | 
14 |   init(continuation: UnsafeContinuation<Success, Failure>, failure: Failure) {
15 |     self.init(continuation: continuation, result: .failure(failure))
16 |   }
17 | 
18 |   func resume() {
19 |     continuation.resume(with: result)
20 |   }
21 | }
22 | 
23 | extension UnsafeResumption where Failure == Error {
24 |   init(continuation: UnsafeContinuation<Success, Error>, catching body: () throws -> Success) {
25 |     self.init(continuation: continuation, result: Result(catching: body))
26 |   }
27 | }
28 | 
29 | extension UnsafeResumption where Success == Void {
30 |   init(continuation: UnsafeContinuation<Success, Failure>) {
31 |     self.init(continuation: continuation, result: .success(()))
32 |   }
33 | }
34 | 
35 | extension UnsafeResumption: Sendable where Success: Sendable { }
36 | 
37 | 
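Taken together, these two helpers enable a common locking pattern: decide who to resume while holding the lock, then perform the resumption only after the critical region has been exited. A minimal sketch of that pattern (the `Gate` type is hypothetical and not part of the package; since both helpers are internal, such code would have to live in the same module):

    // One-shot gate: tasks suspend in `wait()` until someone calls `signalAll()`.
    struct Gate {
        let state = ManagedCriticalState<[UnsafeContinuation<Void, Never>]>([])

        func wait() async {
            await withUnsafeContinuation { continuation in
                state.withCriticalRegion { waiters in
                    waiters.append(continuation)
                }
            }
        }

        func signalAll() {
            let resumptions = state.withCriticalRegion { waiters -> [UnsafeResumption<Void, Never>] in
                defer { waiters.removeAll() }
                return waiters.map { UnsafeResumption(continuation: $0) }
            }
            // Resuming outside `withCriticalRegion` avoids running arbitrary
            // continuation code while the lock is still held.
            for resumption in resumptions {
                resumption.resume()
            }
        }
    }

This is exactly how `AsyncChannel` uses `UnsafeResumption`: the value returned from `withCriticalRegion` is resumed with `?.resume()` after the lock is released.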
--------------------------------------------------------------------------------
/Sources/PythonKitAsync/Bundle.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | 
3 | /// Path to the bundle resource directory that contains `pythonkit-async.py`.
4 | public let bundleResourcePath: String = {
5 |     BundleToken.bundle.resourcePath!
6 | }()
7 | 
8 | // MARK: - Private
9 | 
10 | private final class BundleToken
11 | {
12 |     static let bundle: Bundle = {
13 |         #if SWIFT_PACKAGE
14 |         return Bundle.module
15 |         #else
16 |         return Bundle(for: BundleToken.self)
17 |         #endif
18 |     }()
19 | }
20 | 
--------------------------------------------------------------------------------
/Sources/PythonKitAsync/asPyAsync.swift:
--------------------------------------------------------------------------------
1 | import PythonKit
2 | 
3 | // NOTE: Assumes `bundleResourcePath` is already on Python's `sys.path` so that `pythonkit-async.py` can be imported.
4 | private let pythonKitAsync = Python.import("pythonkit-async")
5 | 
6 | extension PythonObject
7 | {
8 |     /// Converts `self`, a Python coroutine (`async def`) object, into a Swift async call.
9 |     /// - Important: `self` must be a Python coroutine object to run properly. Otherwise, the async-returned value will be `self` itself.
10 |     @discardableResult
11 |     public func asPyAsync() async -> PythonObject
12 |     {
13 |         let pyObj: PythonObject = await withCheckedContinuation { continuation in
14 |             // NOTE: Uses `pythonkit-async.py`'s `coroutine_to_callback`.
15 |             pythonKitAsync.coroutine_to_callback(self, PythonFunction { (arg: PythonObject) in
16 |                 continuation.resume(returning: arg)
17 |                 return 0 // Dummy return value for the Python-side callback.
18 |             })
19 |         }
20 | 
21 |         // NOTE: Required so that other concurrently scheduled coroutines get a chance to run.
22 |         await Task.yield()
23 | 
24 |         return pyObj
25 |     }
26 | }
27 | 
--------------------------------------------------------------------------------
/Sources/PythonKitAsync/pythonkit-async.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | 
3 | async def coroutine_wrapper(coroutine, callback):
4 |     val = await coroutine
5 |     callback(val)
6 | 
7 | def coroutine_to_callback(coroutine, callback):
8 |     if asyncio.iscoroutine(coroutine):
9 |         loop = asyncio.get_event_loop()
10 |         loop.run_until_complete(coroutine_wrapper(coroutine, callback))  # Blocks until the coroutine completes.
11 |     else:
12 |         callback(coroutine)  # Calls back immediately with a non-coroutine object.
13 | 
--------------------------------------------------------------------------------
/Tests/ActoCrawlerTests/CrawlerTests.swift:
--------------------------------------------------------------------------------
1 | import XCTest
2 | import class Foundation.Bundle
3 | 
4 | final class CrawlerTests: XCTestCase
5 | {
6 |     // TBD
7 | }
8 | 
--------------------------------------------------------------------------------
/Tests/ActoCrawlerTests/ReadMeExample.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | import ActoCrawler
3 | 
4 | /// - Note: Exists only to compile-check the README example; `@main` is intentionally commented out.
5 | /// @main
6 | struct ReadMeExample
7 | {
8 |     static func main() async
9 |     {
10 |         struct Output: Sendable
11 |         {
12 |             let nextLinksCount: Int
13 |         }
14 | 
15 |         let htmlCrawler = await Crawler.htmlScraper(
16 |             config: CrawlerConfig(
17 |                 maxDepths: 10,
18 |                 maxTotalRequests: 100,
19 |                 timeoutPerRequest: 5,
20 |                 userAgent: "ActoCrawler",
21 |                 domainFilteringPolicy: .disallowedDomains([".*google.com*" /* ... */]),
22 |                 domainQueueTable: [
23 |                     ".*example1.com*": .init(maxConcurrency: 1, delay: 0),
24 |                     ".*example2.com*": .init(maxConcurrency: 5, delay: 0.1 ... 0.5)
25 |                 ]
26 |             ),
27 |             scrapeHTML: { response in
28 |                 let html = response.data
29 |                 let links = try html.select("a").map { try $0.attr("href") }
30 | 
31 |                 let nextRequests = links
32 |                     .compactMap(URL.init(string:))
33 |                     .map { UserRequest(url: $0) }
34 | 
35 |                 return (nextRequests, Output(nextLinksCount: nextRequests.count))
36 |             }
37 |         )
38 | 
39 |         // Visit initial page.
40 |         htmlCrawler.visit(url: URL(string: "https://www.wikipedia.org")!)
41 | 
42 |         // Observe crawl events.
43 |         for await event in htmlCrawler.events {
44 |             switch event {
45 |             case let .willCrawl(req):
46 |                 print("Crawl : 🕸️ [\(req.order)] [d=\(req.depth)] \(req.url)")
47 |             case let .didCrawl(req, .success(output)):
48 |                 print("Output: ✅ [\(req.order)] [d=\(req.depth)] \(req.url), nextLinksCount = \(output.nextLinksCount)")
49 |             case let .didCrawl(req, .failure(error)):
50 |                 print("Output: ❌ [\(req.order)] [d=\(req.depth)] \(req.url), error = \(error)")
51 |             }
52 |         }
53 | 
54 |         print("Output Done")
55 |     }
56 | }
57 | 
--------------------------------------------------------------------------------