├── .gitignore
├── Examples
│   ├── HeadlessBrowserExample
│   │   └── HeadlessBrowserExample.swift
│   ├── ImageScraperExample
│   │   └── ImageScraperExample.swift
│   ├── PagingScraperExample
│   │   └── PagingScraperExample.swift
│   └── ScraperExample
│       └── ScraperExample.swift
├── LICENSE
├── Package.resolved
├── Package.swift
├── README.md
├── Sources
│   ├── ActoCrawler
│   │   ├── CrawlError.swift
│   │   ├── CrawlEvent.swift
│   │   ├── Crawler.htmlScraper.swift
│   │   ├── Crawler.swift
│   │   ├── CrawlerConfig.swift
│   │   ├── Domain.swift
│   │   ├── DomainQueueTable.swift
│   │   ├── Internal
│   │   │   ├── CrawlQueue.swift
│   │   │   ├── Environment.swift
│   │   │   ├── Logic.swift
│   │   │   └── Regex.swift
│   │   ├── NetworkSession.swift
│   │   ├── Request.swift
│   │   └── Response.swift
│   ├── ActoCrawlerPlaywright
│   │   ├── Crawler.withPlaywright.swift
│   │   ├── PlaywrightActor.swift
│   │   └── _exported.swift
│   ├── AsyncChannel
│   │   ├── AsyncChannel.swift
│   │   ├── Locking.swift
│   │   └── UnsafeResumption.swift
│   └── PythonKitAsync
│       ├── Bundle.swift
│       ├── asPyAsync.swift
│       └── pythonkit-async.py
└── Tests
    └── ActoCrawlerTests
        ├── CrawlerTests.swift
        └── ReadMeExample.swift

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Xcode
2 | #
3 | # gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore
4 | 
5 | ## User settings
6 | xcuserdata/
7 | 
8 | ## compatibility with Xcode 8 and earlier (ignoring not required starting Xcode 9)
9 | *.xcscmblueprint
10 | *.xccheckout
11 | 
12 | ## compatibility with Xcode 3 and earlier (ignoring not required starting Xcode 4)
13 | build/
14 | DerivedData/
15 | *.moved-aside
16 | *.pbxuser
17 | !default.pbxuser
18 | *.mode1v3
19 | !default.mode1v3
20 | *.mode2v3
21 | !default.mode2v3
22 | *.perspectivev3
23 | !default.perspectivev3
24 | 
25 | ## Obj-C/Swift specific
26 | *.hmap
27 | 
28 | ## App packaging
29 | *.ipa
30 | *.dSYM.zip
31 | *.dSYM
32 | 
33 | ## Playgrounds
34 | timeline.xctimeline
35 | playground.xcworkspace
36 | 
37 | # Swift Package Manager
38 | #
39 | # Add this line if you want to avoid checking in source code from Swift Package Manager dependencies.
40 | # Packages/
41 | # Package.pins
42 | # Package.resolved
43 | # *.xcodeproj
44 | #
45 | # Xcode automatically generates this directory with a .xcworkspacedata file and xcuserdata
46 | # hence it is not needed unless you have added a package configuration file to your project
47 | # .swiftpm
48 | 
49 | .build/
50 | 
51 | # CocoaPods
52 | #
53 | # We recommend against adding the Pods directory to your .gitignore. However
54 | # you should judge for yourself, the pros and cons are mentioned at:
55 | # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
56 | #
57 | # Pods/
58 | #
59 | # Add this line if you want to avoid checking in source code from the Xcode workspace
60 | # *.xcworkspace
61 | 
62 | # Carthage
63 | #
64 | # Add this line if you want to avoid checking in source code from Carthage dependencies.
65 | # Carthage/Checkouts
66 | 
67 | Carthage/Build/
68 | 
69 | # Accio dependency management
70 | Dependencies/
71 | .accio/
72 | 
73 | # fastlane
74 | #
75 | # It is recommended to not store the screenshots in the git repo.
76 | # Instead, use fastlane to re-generate the screenshots whenever they are needed.
77 | # For more information about the recommended setup visit:
78 | # https://docs.fastlane.tools/best-practices/source-control/#source-control
79 | 
80 | fastlane/report.xml
81 | fastlane/Preview.html
82 | fastlane/screenshots/**/*.png
83 | fastlane/test_output
84 | 
85 | # Code Injection
86 | #
87 | # After new code Injection tools there's a generated folder /iOSInjectionProject
88 | # https://github.com/johnno1962/injectionforxcode
89 | 
90 | iOSInjectionProject/
91 | 

--------------------------------------------------------------------------------
/Examples/HeadlessBrowserExample/HeadlessBrowserExample.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | import ActoCrawler
3 | import ActoCrawlerPlaywright
4 | 
5 | /// [playwright-python](https://playwright.dev/python/docs/intro) (headless browser) example.
6 | @main
7 | struct HeadlessBrowserExample
8 | {
9 |     static func main() async
10 |     {
11 |         struct Output: Sendable
12 |         {
13 |             let screenshotPath: String
14 |         }
15 | 
16 |         let home = NSHomeDirectory()
17 | 
18 |         let crawler = await Crawler.withPlaywright(
19 |             pythonPackagePaths: [
20 |                 // NOTE: Change path to your own settings.
21 |                 "\(home)/.pyenv/versions/miniforge3-4.10.3-10/envs/ml/lib/python3.9/site-packages"
22 |             ],
23 |             config: CrawlerConfig(
24 |                 maxTotalRequests: 8,
25 |                 domainQueueTable: [
26 |                     ".*": .init(maxConcurrency: 5, delay: 0)
27 |                 ]
28 |             ),
29 |             crawl: { request, playwright, browser in
30 |                 // NOTE:
31 |                 // `playwright` is a `PythonObject` that can inter-op with Python using `@dynamicMemberLookup`.
32 |                 // For playwright-python APIs, see documentation:
33 |                 // https://playwright.dev/python/docs/intro
34 | 
35 |                 let context = await browser.new_context().asPyAsync()
36 |                 let page = await context.new_page().asPyAsync()
37 | 
38 |                 // Visit URL.
39 |                 await page.goto(request.url.absoluteString).asPyAsync()
40 | 
41 |                 // Take screenshot.
42 |                 let screenshotPath = "screenshots/example-\(request.order).png"
43 |                 await page.screenshot(path: screenshotPath).asPyAsync()
44 | 
45 |                 // Extract next URL links.
46 |                 // https://playwright.dev/python/docs/evaluating
47 |                 let linkObjects = await page
48 |                     .evaluate("() => Array.from(document.links).map(item => item.href)")
49 |                     .asPyAsync()
50 | 
51 |                 let nextUserRequests: [UserRequest<Void>]
52 |                 if let links: [String] = Array(linkObjects) {
53 |                     nextUserRequests = links
54 |                         .compactMap { URL(string: $0).map(UserRequest.init(url:)) }
55 |                         .shuffled()
56 |                 }
57 |                 else {
58 |                     nextUserRequests = []
59 |                 }
60 | 
61 |                 await page.close().asPyAsync()
62 |                 await context.close().asPyAsync()
63 | 
64 |                 return (nextUserRequests, Output(screenshotPath: screenshotPath))
65 |             }
66 |         )
67 | 
68 |         // Initial crawls.
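        // Each seed request below starts at depth = 1; links returned from `crawl`
        // are visited afterwards with increasing depth, within `config`'s limits.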
69 | crawler.visit(requests: [ 70 | .init(url: URL(string: "https://en.wikipedia.org")!), 71 | .init(url: URL(string: "https://ja.wikipedia.org")!), 72 | .init(url: URL(string: "https://zh.wikipedia.org")!), 73 | ]) 74 | 75 | for await event in crawler.events { 76 | switch event { 77 | case let .willCrawl(req): 78 | print("Crawl : 🕸️ [\(req.order)] [d=\(req.depth)] \(req.url)") 79 | case let .didCrawl(req, .success(output)): 80 | print("Output: ✅ [\(req.order)] [d=\(req.depth)] \(req.url), screenshotPath = \(output.screenshotPath)") 81 | case let .didCrawl(req, .failure(error)): 82 | print("Output: ❌ [\(req.order)] [d=\(req.depth)] \(req.url), error = \(error)") 83 | } 84 | } 85 | 86 | print("Output Done") 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /Examples/ImageScraperExample/ImageScraperExample.swift: -------------------------------------------------------------------------------- 1 | @preconcurrency import Foundation 2 | import ActoCrawler 3 | 4 | /// Image scraper example using 2 crawlers: `htmlCrawler` and `imageDownloader`. 5 | @main 6 | struct ImageScraperExample 7 | { 8 | static func main() async 9 | { 10 | struct HtmlCrawlerOutput: Sendable 11 | { 12 | let nextLinksCount: Int 13 | } 14 | 15 | struct ImageDownloaderOutput: Sendable 16 | { 17 | let savedFileURL: URL 18 | } 19 | 20 | let imageDownloader = await Crawler.withNetworkSession( 21 | config: CrawlerConfig( 22 | maxTotalRequests: 10, 23 | domainQueueTable: [ 24 | ".*": .init(maxConcurrency: 10, delay: 0.1) 25 | ] 26 | ), 27 | crawl: { request, urlSession in 28 | let fileURL = try await urlSession.downloadImage(url: request.url) 29 | return ( 30 | [] /* no next URLs */, 31 | ImageDownloaderOutput(savedFileURL: fileURL) 32 | ) 33 | } 34 | ) 35 | 36 | let htmlCrawler = await Crawler.htmlScraper( 37 | config: CrawlerConfig( 38 | maxTotalRequests: 10, 39 | // domainFilteringPolicy: .disallowedDomains(["wiki*"]), 40 | // domainFilteringPolicy: .allowedDomains(["wiki*"]), 41 | domainQueueTable: [ 42 | ".*": .init(maxConcurrency: 10, delay: 0.1) 43 | ] 44 | ), 45 | scrapeHTML: { response in 46 | let html = response.data 47 | let links = try html.select("a").map { try $0.attr("href") } 48 | let nextRequests = links 49 | .compactMap(URL.init(string:)) 50 | .filter { $0.scheme != nil } 51 | .map { UserRequest(url: $0) } 52 | 53 | // Send `imageURLs` to `imageDownloader`. 54 | // NOTE: `imageDownloader` queues are managed separately from `htmlCrawler`. 55 | let imageURLs = try html.select("img").map { try $0.attr("src") } 56 | .compactMap(URL.init) 57 | .filter { $0.scheme?.hasPrefix("http") == true } 58 | 59 | for imageURL in imageURLs { 60 | let request = UserRequest(url: imageURL) 61 | imageDownloader.visit(request: request) 62 | } 63 | 64 | return ( 65 | nextRequests, 66 | HtmlCrawlerOutput(nextLinksCount: imageURLs.count) 67 | ) 68 | } 69 | ) 70 | 71 | htmlCrawler.visit(url: URL(string: "https://www.wikipedia.org")!) 
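        // NOTE: Both crawlers complete independently; the task group below drains
        // their two event streams concurrently.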
72 | 73 | await withThrowingTaskGroup(of: Void.self, returning: Void.self) { group in 74 | group.addTask { 75 | for await event in imageDownloader.events { 76 | switch event { 77 | case let .willCrawl(req): 78 | print("🖼️ Image Crawl : 🕸️ [\(req.order)] [d=\(req.depth)] \(req.url)") 79 | case let .didCrawl(req, .success(output)): 80 | print("🖼️ Image Output: ✅ [\(req.order)] [d=\(req.depth)] \(req.url), savedFileURL = \(output.savedFileURL)") 81 | case let .didCrawl(req, .failure(error)): 82 | print("🖼️ Image Output: ❌ [\(req.order)] [d=\(req.depth)] \(req.url), error = \(error)") 83 | } 84 | } 85 | 86 | print("Image Output Done") 87 | } 88 | group.addTask { 89 | for await event in htmlCrawler.events { 90 | switch event { 91 | case let .willCrawl(req): 92 | print("🌐 HTML Crawl : 🕸️ [\(req.order)] [d=\(req.depth)] \(req.url)") 93 | case let .didCrawl(req, .success): 94 | print("🌐 HTML Output: ✅ [\(req.order)] [d=\(req.depth)] \(req.url)") 95 | case let .didCrawl(req, .failure(error)): 96 | print("🌐 HTML Output: ❌ [\(req.order)] [d=\(req.depth)] \(req.url), error = \(error)") 97 | } 98 | } 99 | 100 | print("🌐 HTML Output Done") 101 | } 102 | } 103 | 104 | print("Image directory:", savingSubDirectory()) 105 | } 106 | } 107 | 108 | // MARK: - Private 109 | 110 | extension NetworkSession 111 | { 112 | fileprivate func downloadImage(url: URL) async throws -> URL 113 | { 114 | let (data, _) = try await self.data(for: URLRequest(url: url)) 115 | 116 | let filename = url.lastPathComponent 117 | let dirURL = savingSubDirectory() 118 | return try saveData(data, dirURL: dirURL, filename: filename) 119 | } 120 | } 121 | 122 | private func saveData(_ data: Data, dirURL: URL, filename: String) throws -> URL { 123 | let fileURL = dirURL.appendingPathComponent(filename) 124 | try data.write(to: fileURL) 125 | return fileURL 126 | } 127 | 128 | private func savingSubDirectory() -> URL { 129 | let dirURL = URL(fileURLWithPath: NSTemporaryDirectory()) 130 | .appendingPathComponent("ActoCrawlerExample") 131 | try? FileManager.default.createDirectory( 132 | at: dirURL, 133 | withIntermediateDirectories: true, 134 | attributes: nil 135 | ) 136 | return dirURL 137 | } 138 | -------------------------------------------------------------------------------- /Examples/PagingScraperExample/PagingScraperExample.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | import ActoCrawler 3 | 4 | /// Pagination-based scraping example using `URLInfo`. 5 | @main 6 | struct PagingScraperExample 7 | { 8 | static func main() async 9 | { 10 | /// Additive information (page type) attached to requesting URL to track for determining next crawlings. 
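        /// For example, a list page is visited as `.page(n)` while each post link found
        /// on it is visited as `.post`, so `scrapeHTML` below can branch on the page type.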
11 |         enum URLInfo
12 |         {
13 |             case page(UInt64)
14 |             case post
15 |         }
16 | 
17 |         struct Output: Sendable
18 |         {
19 |             let message: String
20 |         }
21 | 
22 |         let htmlCrawler = await Crawler.htmlScraper(
23 |             config: CrawlerConfig(
24 |                 maxTotalRequests: 50,
25 |                 domainQueueTable: [
26 |                     ".*news.ycombinator.com.*": .init(maxConcurrency: 3, delay: 0.3)
27 |                 ]
28 |             ),
29 |             scrapeHTML: { response in
30 |                 let html = response.data
31 | 
32 |                 switch response.urlInfo {
33 |                 case let .page(page):
34 |                     var nextURLs: [UserRequest<URLInfo>]
35 |                     nextURLs = try html.select("table.itemlist tr.athing")
36 |                         .map { "https://news.ycombinator.com/item?id=\($0.id())" }
37 |                         .compactMap(URL.init)
38 |                         .map { UserRequest(url: $0, urlInfo: .post) }
39 | 
40 |                     if page < 100 {
41 |                         if let nextPageURL = URL(string: "https://news.ycombinator.com/news?p=\(page + 1)")
42 |                         {
43 |                             let nextPageRequest = UserRequest(url: nextPageURL, urlInfo: .page(page + 1))
44 |                             nextURLs.append(nextPageRequest)
45 |                         }
46 |                     }
47 | 
48 |                     return (nextURLs, Output(message: "Crawled page = \(page)."))
49 | 
50 |                 case .post:
51 |                     let title = try html.title()
52 |                     return ([], Output(message: "Crawled post, title = \(title)"))
53 |                 }
54 |             }
55 |         )
56 | 
57 |         htmlCrawler.visit(url: URL(string: "https://news.ycombinator.com/news")!, urlInfo: .page(1))
58 | 
59 |         for await event in htmlCrawler.events {
60 |             switch event {
61 |             case let .willCrawl(req):
62 |                 print("Crawl : 🕸️ [\(req.order)] [d=\(req.depth)] \(req.url)")
63 |             case let .didCrawl(req, .success(output)):
64 |                 print("Output: ✅ [\(req.order)] [d=\(req.depth)] \(req.url), message = \(output.message)")
65 |             case let .didCrawl(req, .failure(error)):
66 |                 print("Output: ❌ [\(req.order)] [d=\(req.depth)] \(req.url), error = \(error)")
67 |             }
68 |         }
69 | 
70 |         print("Output Done")
71 |     }
72 | }
73 | 

--------------------------------------------------------------------------------
/Examples/ScraperExample/ScraperExample.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | import ActoCrawler
3 | 
4 | /// Basic HTML scraping example using `Crawler.htmlScraper`.
5 | @main
6 | struct ScraperExample
7 | {
8 |     static func main() async
9 |     {
10 |         struct Output: Sendable
11 |         {
12 |             let nextLinksCount: Int
13 |         }
14 | 
15 |         let htmlCrawler = await Crawler.htmlScraper(
16 |             config: CrawlerConfig(
17 |                 maxTotalRequests: 10
18 |             ),
19 |             scrapeHTML: { response in
20 |                 let html = response.data
21 |                 let links = try html.select("a").map { try $0.attr("href") }
22 | 
23 |                 let nextRequests = links
24 |                     .compactMap(URL.init(string:))
25 |                     .filter { $0.scheme != nil }
26 |                     .map { UserRequest(url: $0) }
27 | 
28 |                 return (nextRequests, Output(nextLinksCount: nextRequests.count))
29 |             }
30 |         )
31 | 
32 |         htmlCrawler.visit(url: URL(string: "https://www.wikipedia.org")!)
33 | 34 | for await event in htmlCrawler.events { 35 | switch event { 36 | case let .willCrawl(req): 37 | print("Crawl : 🕸️ [\(req.order)] [d=\(req.depth)] \(req.url)") 38 | case let .didCrawl(req, .success(output)): 39 | print("Output: ✅ [\(req.order)] [d=\(req.depth)] \(req.url), nextLinksCount = \(output.nextLinksCount)") 40 | case let .didCrawl(req, .failure(error)): 41 | print("Output: ❌ [\(req.order)] [d=\(req.depth)] \(req.url), error = \(error)") 42 | } 43 | } 44 | 45 | print("Output Done") 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Yasuhiro Inami 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Package.resolved: -------------------------------------------------------------------------------- 1 | { 2 | "pins" : [ 3 | { 4 | "identity" : "actomaton", 5 | "kind" : "remoteSourceControl", 6 | "location" : "https://github.com/inamiy/Actomaton.git", 7 | "state" : { 8 | "branch" : "main", 9 | "revision" : "18b559dc4f6b89676e6a71448257df2af6887196" 10 | } 11 | }, 12 | { 13 | "identity" : "pythonkit", 14 | "kind" : "remoteSourceControl", 15 | "location" : "https://github.com/pvieito/PythonKit.git", 16 | "state" : { 17 | "branch" : "master", 18 | "revision" : "81f621d094a7c8923207efe5178f50dba1b56c39" 19 | } 20 | }, 21 | { 22 | "identity" : "swift-case-paths", 23 | "kind" : "remoteSourceControl", 24 | "location" : "https://github.com/pointfreeco/swift-case-paths", 25 | "state" : { 26 | "revision" : "ce9c0d897db8a840c39de64caaa9b60119cf4be8", 27 | "version" : "0.8.1" 28 | } 29 | }, 30 | { 31 | "identity" : "swift-collections", 32 | "kind" : "remoteSourceControl", 33 | "location" : "https://github.com/apple/swift-collections.git", 34 | "state" : { 35 | "revision" : "48254824bb4248676bf7ce56014ff57b142b77eb", 36 | "version" : "1.0.2" 37 | } 38 | }, 39 | { 40 | "identity" : "swift-custom-dump", 41 | "kind" : "remoteSourceControl", 42 | "location" : "https://github.com/pointfreeco/swift-custom-dump", 43 | "state" : { 44 | "revision" : "c4f78db9b90ca57b7b6abc2223e235242739ea3c", 45 | "version" : "0.4.0" 46 | } 47 | }, 48 | { 49 | "identity" : "swiftsoup", 50 | "kind" : "remoteSourceControl", 51 | "location" : "https://github.com/scinfu/SwiftSoup.git", 52 | "state" : { 53 | "revision" : "41e7c263fb8c277e980ebcb9b0b5f6031d3d4886", 54 | "version" : "2.4.2" 55 | } 56 | }, 57 | { 58 | "identity" : "xctest-dynamic-overlay", 59 | "kind" : "remoteSourceControl", 60 | "location" : "https://github.com/pointfreeco/xctest-dynamic-overlay", 61 | "state" : { 62 | "revision" : "50a70a9d3583fe228ce672e8923010c8df2deddd", 63 | "version" : "0.2.1" 64 | } 65 | } 66 | ], 67 | "version" : 2 68 | } 69 | -------------------------------------------------------------------------------- /Package.swift: -------------------------------------------------------------------------------- 1 | // swift-tools-version: 5.6 2 | // The swift-tools-version declares the minimum version of Swift required to build this package. 
3 | 4 | import PackageDescription 5 | 6 | let package = Package( 7 | name: "ActoCrawler", 8 | platforms: [.macOS(.v12)], 9 | products: [ 10 | .library( 11 | name: "ActoCrawler", 12 | targets: ["ActoCrawler"]), 13 | .library( 14 | name: "ActoCrawlerPlaywright", 15 | targets: ["ActoCrawlerPlaywright"]), 16 | .executable( 17 | name: "ScraperExample", 18 | targets: ["ScraperExample"]), 19 | .executable( 20 | name: "ImageScraperExample", 21 | targets: ["ImageScraperExample"]), 22 | .executable( 23 | name: "PagingScraperExample", 24 | targets: ["PagingScraperExample"]), 25 | .executable( 26 | name: "HeadlessBrowserExample", 27 | targets: ["HeadlessBrowserExample"]), 28 | ], 29 | dependencies: [ 30 | .package(url: "https://github.com/inamiy/Actomaton.git", branch: "main"), 31 | .package(url: "https://github.com/apple/swift-collections.git", from: "1.0.0"), 32 | // .package(url: "https://github.com/apple/swift-async-algorithms.git", branch: "main"), 33 | .package(url: "https://github.com/scinfu/SwiftSoup.git", from: "2.4.2"), 34 | .package(url: "https://github.com/pvieito/PythonKit.git", branch: "master"), 35 | ], 36 | targets: [ 37 | .target( 38 | name: "AsyncChannel"), 39 | .target( 40 | name: "PythonKitAsync", 41 | dependencies: [ 42 | .product(name: "PythonKit", package: "PythonKit") 43 | ], 44 | resources: [.copy("pythonkit-async.py")] 45 | ), 46 | .target( 47 | name: "ActoCrawler", 48 | dependencies: [ 49 | "AsyncChannel", 50 | .product(name: "Actomaton", package: "Actomaton"), 51 | .product(name: "Collections", package: "swift-collections"), 52 | // .product(name: "AsyncAlgorithms", package: "swift-async-algorithms"), 53 | .product(name: "SwiftSoup", package: "SwiftSoup"), 54 | ], 55 | swiftSettings: [ 56 | .unsafeFlags([ 57 | "-Xfrontend", "-warn-concurrency", 58 | "-Xfrontend", "-enable-actor-data-race-checks", 59 | ]) 60 | ] 61 | ), 62 | .target( 63 | name: "ActoCrawlerPlaywright", 64 | dependencies: [ 65 | "ActoCrawler", "PythonKitAsync" 66 | ], 67 | swiftSettings: [ 68 | .unsafeFlags([ 69 | "-Xfrontend", "-warn-concurrency", 70 | "-Xfrontend", "-enable-actor-data-race-checks", 71 | ]) 72 | ] 73 | ), 74 | .testTarget( 75 | name: "ActoCrawlerTests", 76 | dependencies: ["ActoCrawler"]), 77 | .executableTarget( 78 | name: "ScraperExample", 79 | dependencies: ["ActoCrawler"], 80 | path: "Examples/ScraperExample"), 81 | .executableTarget( 82 | name: "ImageScraperExample", 83 | dependencies: ["ActoCrawler"], 84 | path: "Examples/ImageScraperExample"), 85 | .executableTarget( 86 | name: "PagingScraperExample", 87 | dependencies: ["ActoCrawler"], 88 | path: "Examples/PagingScraperExample"), 89 | .executableTarget( 90 | name: "HeadlessBrowserExample", 91 | dependencies: ["ActoCrawlerPlaywright"], 92 | path: "Examples/HeadlessBrowserExample"), 93 | ] 94 | ) 95 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🕸️ ActoCrawler 2 | 3 | **ActoCrawler** is a Swift Concurrency-powered crawler engine on top of [Actomaton](https://github.com/inamiy/Actomaton), with flexible customizability to create various HTML scrapers, image scrapers, etc. 
4 | 
5 | ## Example
6 | 
7 | - [Examples](Examples)
8 | 
9 | ```swift
10 | struct Output: Sendable
11 | {
12 |     let nextLinksCount: Int
13 | }
14 | 
15 | let htmlCrawler = await Crawler.htmlScraper(
16 |     config: CrawlerConfig(
17 |         maxDepths: 10,
18 |         maxTotalRequests: 100,
19 |         timeoutPerRequest: 5,
20 |         userAgent: "ActoCrawler",
21 |         domainFilteringPolicy: .disallowedDomains([".*google.com*" /* ... */]),
22 |         domainQueueTable: [
23 |             ".*example1.com*": .init(maxConcurrency: 1, delay: 0),
24 |             ".*example2.com*": .init(maxConcurrency: 5, delay: 0.1 ... 0.5)
25 |         ]
26 |     ),
27 |     scrapeHTML: { response in
28 |         let html = response.data
29 |         let links = try html.select("a").map { try $0.attr("href") }
30 | 
31 |         let nextRequests = links
32 |             .compactMap(URL.init(string:))
33 |             .map { UserRequest(url: $0) }
34 | 
35 |         return (nextRequests, Output(nextLinksCount: nextRequests.count))
36 |     }
37 | )
38 | 
39 | // Visit initial page.
40 | htmlCrawler.visit(url: URL(string: "https://www.wikipedia.org")!)
41 | 
42 | // Observe crawl events.
43 | for await event in htmlCrawler.events {
44 |     switch event {
45 |     case let .willCrawl(req):
46 |         print("Crawl : 🕸️ [\(req.order)] [d=\(req.depth)] \(req.url)")
47 |     case let .didCrawl(req, .success(output)):
48 |         print("Output: ✅ [\(req.order)] [d=\(req.depth)] \(req.url), nextLinksCount = \(output.nextLinksCount)")
49 |     case let .didCrawl(req, .failure(error)):
50 |         print("Output: ❌ [\(req.order)] [d=\(req.depth)] \(req.url), error = \(error)")
51 |     }
52 | }
53 | 
54 | print("Output Done")
55 | ```
56 | 
57 | ## Acknowledgements
58 | 
59 | - [mattsse/voyager](https://github.com/mattsse/voyager)
60 | 
61 | ## License
62 | 
63 | [MIT](LICENSE)
64 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawler/CrawlError.swift:
--------------------------------------------------------------------------------
1 | @preconcurrency import Foundation
2 | 
3 | /// Crawler error type.
4 | public enum CrawlError: Error
5 | {
6 |     /// Failed to convert `URLResponse` into `HTTPURLResponse`.
7 |     case invalidHTTPResponse(URLResponse)
8 | 
9 |     /// Failed to convert `Data` into crawler's preferred format.
10 |     case invalidData
11 | 
12 |     /// Error when ``CrawlerConfig/domainFilteringPolicy`` did not allow URL to pass.
13 |     case domainNotAllowed(Domain)
14 | 
15 |     /// Crawling failed during ``Crawler/init(config:dependency:crawl:)``'s `crawl` method.
16 |     case crawlFailed(Error)
17 | }
18 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawler/CrawlEvent.swift:
--------------------------------------------------------------------------------
1 | /// Crawler output event to be delivered to ``Crawler/events``.
2 | public enum CrawlEvent<Output, URLInfo>: Sendable
3 |     where Output: Sendable, URLInfo: Sendable
4 | {
5 |     case willCrawl(Request<URLInfo>)
6 |     case didCrawl(Request<URLInfo>, Result<Output, CrawlError>)
7 | }
8 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawler/Crawler.htmlScraper.swift:
--------------------------------------------------------------------------------
1 | @preconcurrency import Foundation
2 | @preconcurrency import SwiftSoup
3 | 
4 | extension Crawler
5 | {
6 |     /// Helper initializer using ``NetworkSession`` as network request, and [SwiftSoup](https://github.com/scinfu/SwiftSoup) as HTML scraper.
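    ///
    /// A minimal usage sketch (the URL and `Int` output below are illustrative placeholders):
    ///
    /// ```swift
    /// let crawler: Crawler<Int, Void> = await Crawler.htmlScraper(
    ///     config: CrawlerConfig(maxTotalRequests: 10),
    ///     scrapeHTML: { response in
    ///         let links = try response.data.select("a").map { try $0.attr("href") }
    ///         let nextRequests = links.compactMap(URL.init(string:)).map { UserRequest(url: $0) }
    ///         return (nextRequests, nextRequests.count)
    ///     }
    /// )
    /// crawler.visit(url: URL(string: "https://example.com")!)
    /// ```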
7 |     ///
8 |     /// - Parameters:
9 |     ///   - scrapeHTML:
10 |     ///       Receives `Response` that contains HTML `Document` to be scraped,
11 |     ///       and returns array of next `UserRequest`s as well as `Output` for current request output.
12 |     ///       If `Error` is thrown inside this closure, it will be observed as a failure of ``Crawler/events``.
13 |     public static func htmlScraper(
14 |         config: CrawlerConfig,
15 |         scrapeHTML: @escaping @Sendable (Response<Document, URLInfo>) async throws -> ([UserRequest<URLInfo>], Output)
16 |     ) async -> Crawler
17 |         where Output: Sendable
18 |     {
19 |         await Crawler.withNetworkSession(
20 |             config: config,
21 |             crawl: { request, urlSession in
22 |                 // Network request.
23 |                 let urlRequest: URLRequest = URLRequest(url: request.url, timeoutInterval: config.timeoutPerRequest)
24 |                 let (data, httpResponse) = try await urlSession.data(for: urlRequest)
25 | 
26 |                 guard let html = String(data: data, encoding: .utf8) else {
27 |                     throw CrawlError.invalidData
28 |                 }
29 | 
30 |                 // SwiftSoup HTML parsing.
31 |                 let doc = try SwiftSoup.parse(html)
32 | 
33 |                 let response = Response(
34 |                     request: Request(
35 |                         url: request.url,
36 |                         urlInfo: request.urlInfo,
37 |                         order: request.order,
38 |                         depth: request.depth
39 |                     ),
40 |                     data: doc,
41 |                     httpResponse: httpResponse
42 |                 )
43 | 
44 |                 return try await scrapeHTML(response)
45 |             }
46 |         )
47 |     }
48 | }
49 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawler/Crawler.swift:
--------------------------------------------------------------------------------
1 | @preconcurrency import Foundation
2 | import Actomaton
3 | import AsyncChannel
4 | 
5 | /// Swift Concurrency-powered crawler engine on top of [Actomaton](https://github.com/inamiy/Actomaton).
6 | ///
7 | /// Initializers:
8 | /// - ``Crawler/init(config:dependency:crawl:)`` is the designated initializer for arbitrary effectful crawling logic.
9 | /// - ``Crawler/withNetworkSession(config:crawl:)`` is a helper initializer that uses ``NetworkSession`` as dependency.
10 | /// - ``Crawler/htmlScraper(config:scrapeHTML:)`` is a helper initializer to scrape HTML using [SwiftSoup](https://github.com/scinfu/SwiftSoup).
11 | public struct Crawler<Output, URLInfo>: Sendable
12 |     where Output: Sendable, URLInfo: Sendable
13 | {
14 |     private let actomaton: Actomaton<Action<Output, URLInfo>, State>
15 |     private let environment: Environment<Output, URLInfo>
16 | 
17 |     /// Designated initializer for arbitrary crawling logic.
18 |     ///
19 |     /// - Parameters:
20 |     ///   - dependency: ``Crawler``-retained reference that is passed on every `crawl`.
21 |     ///   - crawl:
22 |     ///       Receives `Request` to perform some async operations (e.g. network requesting and parsing),
23 |     ///       and returns array of next `UserRequest`s as well as `Output` for current request output.
24 |     ///       If `Error` is thrown inside this closure, it will be observed as a failure of ``Crawler/events``.
25 |     public init<Dependency>(
26 |         config: CrawlerConfig,
27 |         dependency: Dependency,
28 |         crawl: @escaping @Sendable (Request<URLInfo>, Dependency) async throws -> ([UserRequest<URLInfo>], Output)
29 |     )
30 |         where Dependency: Sendable
31 |     {
32 |         let environment = Environment(config: config, dependency: dependency, crawl: crawl)
33 | 
34 |         self.actomaton = Actomaton(
35 |             state: State(),
36 |             reducer: reducer(),
37 |             environment: environment
38 |         )
39 |         self.environment = environment
40 |     }
41 | 
42 |     /// Helper initializer that adds ``NetworkSession`` as dependency.
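    ///
    /// A hedged usage sketch (the endpoint and `Int` output are illustrative only):
    ///
    /// ```swift
    /// let crawler: Crawler<Int, Void> = await Crawler.withNetworkSession(
    ///     config: CrawlerConfig(maxTotalRequests: 5),
    ///     crawl: { request, session in
    ///         let (data, _) = try await session.data(for: URLRequest(url: request.url))
    ///         return ([], data.count) // no follow-up requests; output = byte count
    ///     }
    /// )
    /// ```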
43 |     public static func withNetworkSession(
44 |         config: CrawlerConfig,
45 |         crawl: @escaping @Sendable (Request<URLInfo>, NetworkSession) async throws -> ([UserRequest<URLInfo>], Output)
46 |     ) async -> Crawler
47 |     {
48 |         let configuration: URLSessionConfiguration = {
49 |             let configuration = URLSessionConfiguration.default
50 |             configuration.httpAdditionalHeaders = ["User-Agent": config.userAgent]
51 |             return configuration
52 |         }()
53 | 
54 |         return .init(config: config, dependency: await NetworkSession(configuration: configuration), crawl: crawl)
55 |     }
56 | 
57 |     /// Crawler output event `AsyncSequence`.
58 |     /// - Todo: `any AsyncSequence`.
59 |     public var events: AsyncChannel<CrawlEvent<Output, URLInfo>>
60 |     {
61 |         self.environment.events
62 |     }
63 | 
64 |     /// Visits `url` as depth = 1 without `urlInfo`.
65 |     public func visit(url: URL) where URLInfo == Void
66 |     {
67 |         self.visit(
68 |             requests: [UserRequest(url: url)]
69 |         )
70 |     }
71 | 
72 |     /// Visits `url` as depth = 1 with `urlInfo` as additive information.
73 |     public func visit(url: URL, urlInfo: URLInfo)
74 |     {
75 |         self.visit(
76 |             request: UserRequest(url: url, urlInfo: urlInfo)
77 |         )
78 |     }
79 | 
80 |     /// Visits `request` as depth = 1.
81 |     public func visit(request: UserRequest<URLInfo>)
82 |     {
83 |         self.visit(requests: [request])
84 |     }
85 | 
86 |     /// Visits multiple `requests` as depth = 1.
87 |     public func visit(requests: [UserRequest<URLInfo>])
88 |     {
89 |         Task { [actomaton] in
90 |             await actomaton.send(.visit(requests))
91 |         }
92 |     }
93 | }
94 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawler/CrawlerConfig.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | 
3 | /// ``Crawler`` configuration.
4 | public struct CrawlerConfig: Hashable, Sendable
5 | {
6 |     /// Maximum depth of crawling.
7 |     public let maxDepths: UInt64
8 | 
9 |     /// Maximum total requests for crawling.
10 |     public let maxTotalRequests: UInt64
11 | 
12 |     /// Per request timeout (in seconds).
13 |     public let timeoutPerRequest: TimeInterval
14 | 
15 |     /// User-Agent attached to request header.
16 |     public let userAgent: String
17 | 
18 |     /// Domain filtering policy.
19 |     public let domainFilteringPolicy: DomainFilteringPolicy
20 | 
21 |     /// Domain-to-Queue mapping table where `maxConcurrency` and `delay` are configurable per `domain`.
22 |     public let domainQueueTable: DomainQueueTable
23 | 
24 |     // TODO:
25 |     // public let respectsRobotsTxt: Bool
26 | 
27 |     public init(
28 |         maxDepths: UInt64 = .max,
29 |         maxTotalRequests: UInt64 = .max,
30 |         timeoutPerRequest: TimeInterval = .greatestFiniteMagnitude,
31 |         userAgent: String = "ActoCrawler",
32 |         domainFilteringPolicy: DomainFilteringPolicy = .allDomains,
33 |         domainQueueTable: DomainQueueTable = [:]
34 |         // respectsRobotsTxt: Bool = true
35 |     )
36 |     {
37 |         self.maxDepths = maxDepths
38 |         self.maxTotalRequests = maxTotalRequests
39 |         self.timeoutPerRequest = timeoutPerRequest
40 |         self.userAgent = userAgent
41 |         self.domainQueueTable = domainQueueTable
42 |         self.domainFilteringPolicy = domainFilteringPolicy
43 |         // self.respectsRobotsTxt = respectsRobotsTxt
44 |     }
45 | }
46 | 
47 | // MARK: - DomainFilteringPolicy
48 | 
49 | public enum DomainFilteringPolicy: Hashable, Sendable
50 | {
51 |     /// All domains policy.
52 |     case allDomains
53 | 
54 |     /// Allowed domains only policy.
55 |     case allowedDomains(Set<Domain>)
56 | 
57 |     /// Policy that filters out (ignores) disallowed domains.
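    /// e.g. `.disallowedDomains([".*doubleclick.*"])` (an illustrative pattern) skips every
    /// host matching it; entries are regular expressions, checked via `isRegexMatched`.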
58 |     case disallowedDomains(Set<Domain>)
59 | 
60 |     func isDomainAllowed(for domain: Domain) -> Bool
61 |     {
62 |         switch self {
63 |         case .allDomains:
64 |             return true
65 | 
66 |         case let .allowedDomains(domains):
67 |             return domains.contains(where: { isRegexMatched(domain, pattern: $0) })
68 | 
69 |         case let .disallowedDomains(domains):
70 |             return !domains.contains(where: { isRegexMatched(domain, pattern: $0) })
71 |         }
72 |     }
73 | }
74 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawler/Domain.swift:
--------------------------------------------------------------------------------
1 | /// Web domain name, with possible regular expression pattern.
2 | public typealias Domain = String
3 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawler/DomainQueueTable.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | @preconcurrency import Collections
3 | 
4 | /// Domain-to-Queue mapping table where `maxConcurrency` and random `delay` are configurable per `domain`.
5 | /// - Note: `domain` can use regular expressions.
6 | ///
7 | /// For example:
8 | ///
9 | /// ```swift
10 | /// let domainQueueTable: DomainQueueTable = [
11 | ///     ".*google.*": .init(maxConcurrency: 5, delay: 0.1),           // fixed delay
12 | ///     ".*wikipedia.*": .init(maxConcurrency: 3, delay: 0.1 ... 0.3), // random delay in range
13 | ///     ".*": .init(maxConcurrency: .max, delay: 0)
14 | /// ]
15 | /// let config = CrawlerConfig(..., domainQueueTable: domainQueueTable)
16 | /// ```
17 | public struct DomainQueueTable: Hashable, Sendable
18 | {
19 |     let dictionary: OrderedDictionary<Key, Value>
20 | 
21 |     func buildQueue(url: URL) -> CrawlQueue
22 |     {
23 |         guard let host = url.host else { return .default }
24 | 
25 |         for (pattern, values) in self.dictionary {
26 |             guard isRegexMatched(host, pattern: pattern) else { continue }
27 | 
28 |             return CrawlQueue(
29 |                 domain: pattern,
30 |                 maxConcurrency: values.maxConcurrency,
31 |                 delay: .random(values.delay)
32 |             )
33 |         }
34 | 
35 |         return .default
36 |     }
37 | 
38 |     // MARK: Key/Value
39 | 
40 |     public typealias Key = Domain
41 | 
42 |     public struct Value: Hashable, Sendable
43 |     {
44 |         let maxConcurrency: Int
45 |         let delay: ClosedRange<TimeInterval>
46 | 
47 |         public init(maxConcurrency: Int, delay: ClosedRange<TimeInterval>)
48 |         {
49 |             self.maxConcurrency = maxConcurrency
50 |             self.delay = delay
51 |         }
52 | 
53 |         public init(maxConcurrency: Int, delay: TimeInterval)
54 |         {
55 |             self.maxConcurrency = maxConcurrency
56 |             self.delay = delay ... delay
57 |         }
58 |     }
59 | }
60 | 
61 | extension DomainQueueTable: ExpressibleByDictionaryLiteral
62 | {
63 |     public init(dictionaryLiteral elements: (Key, Value)...)
64 |     {
65 |         self.dictionary = .init(uniqueKeysWithValues: elements)
66 |     }
67 | }
68 | 
69 | extension DomainQueueTable: Sequence
70 | {
71 |     public func makeIterator() -> AnyIterator<(key: Key, value: Value)>
72 |     {
73 |         return AnyIterator(dictionary.makeIterator())
74 |     }
75 | }
76 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawler/Internal/CrawlQueue.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | import Actomaton
3 | 
4 | /// Crawler `EffectQueue` to run on Actomaton.
5 | /// - Note: This hashable identity is distinguishable per `domain`.
6 | struct CrawlQueue: EffectQueueProtocol
7 | {
8 |     private let domain: Domain?
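    /// Maximum number of crawl effects allowed to run concurrently on this queue;
    /// used as `maxCount` of `.runOldest` in `effectQueuePolicy` below.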
9 |     private let maxConcurrency: Int
10 |     let effectQueueDelay: EffectQueueDelay
11 | 
12 |     private init(
13 |         domain: Domain?,
14 |         maxConcurrency: Int,
15 |         delay: EffectQueueDelay
16 |     )
17 |     {
18 |         self.domain = domain
19 |         self.maxConcurrency = maxConcurrency
20 |         self.effectQueueDelay = delay
21 |     }
22 | 
23 |     init(
24 |         domain: Domain,
25 |         maxConcurrency: Int,
26 |         delay: EffectQueueDelay
27 |     )
28 |     {
29 |         self.domain = domain
30 |         self.maxConcurrency = maxConcurrency
31 |         self.effectQueueDelay = delay
32 |     }
33 | 
34 |     static var `default`: CrawlQueue
35 |     {
36 |         CrawlQueue(domain: nil, maxConcurrency: .max, delay: .constant(0))
37 |     }
38 | 
39 |     var effectQueuePolicy: EffectQueuePolicy
40 |     {
41 |         .runOldest(maxCount: self.maxConcurrency, .suspendNew)
42 |     }
43 | 
44 |     func hash(into hasher: inout Hasher)
45 |     {
46 |         hasher.combine(self.domain)
47 |     }
48 | }
49 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawler/Internal/Environment.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | import AsyncChannel
3 | 
4 | /// Effectful environment for making an arbitrary crawler.
5 | struct Environment<Output, URLInfo>: Sendable
6 |     where Output: Sendable, URLInfo: Sendable
7 | {
8 |     let config: CrawlerConfig
9 | 
10 |     /// Receives `Request` to perform some async operations (e.g. network requesting and parsing),
11 |     /// and returns array of next `Request`s as well as `Output`.
12 |     let crawl: @Sendable (Request<URLInfo>) async throws -> ([UserRequest<URLInfo>], Output)
13 | 
14 |     /// Crawler output event `AsyncSequence`.
15 |     /// - Todo: `any AsyncSequence`.
16 |     let events: AsyncChannel<CrawlEvent<Output, URLInfo>> = .init()
17 | 
18 |     init<Dependency>(
19 |         config: CrawlerConfig,
20 |         dependency: Dependency,
21 |         crawl: @escaping @Sendable (Request<URLInfo>, Dependency) async throws -> ([UserRequest<URLInfo>], Output)
22 |     )
23 |         where Dependency: Sendable
24 |     {
25 |         self.config = config
26 |         self.crawl = { request in
27 |             try await crawl(request, dependency)
28 |         }
29 |     }
30 | }
31 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawler/Internal/Logic.swift:
--------------------------------------------------------------------------------
1 | @preconcurrency import Foundation
2 | import Actomaton
3 | import ActomatonDebugging
4 | 
5 | // MARK: - Action
6 | 
7 | enum Action<Output, URLInfo>: Sendable
8 |     where Output: Sendable, URLInfo: Sendable
9 | {
10 |     case visit([UserRequest<URLInfo>])
11 |     case _visit(Request<URLInfo>)
12 |     case _didVisit(Request<URLInfo>, nextRequests: [UserRequest<URLInfo>], output: Output)
13 |     case _didFailVisit(Request<URLInfo>, CrawlError)
14 | }
15 | 
16 | // MARK: - State
17 | 
18 | struct State: Sendable
19 | {
20 |     var waitingURLs: Set<URL> = []
21 | 
22 |     /// Total count of "waiting" + "visited" + "failed" URLs.
23 |     var totalVisitCount: UInt64 = 0
24 | }
25 | 
26 | // MARK: - Reducer
27 | 
28 | func reducer<Output, URLInfo>() -> Reducer<Action<Output, URLInfo>, State, Environment<Output, URLInfo>>
29 |     where Output: Sendable, URLInfo: Sendable
30 | {
31 |     typealias Eff = Effect<Action<Output, URLInfo>>
32 | 
33 |     return Reducer { action, state, env in
34 |         /// Common logic to update `state`, output to channel & dispatch next visits.
35 |         func didFinish(
36 |             request: Request<URLInfo>?,
37 |             nextRequests: [UserRequest<URLInfo>],
38 |             outputResult: Result<Output, CrawlError>?
39 |         ) -> Eff
40 |         {
41 |             // Remove from `waitingURLs`.
42 |             if let request = request {
43 |                 state.waitingURLs.remove(request.url)
44 |             }
45 | 
46 |             // Limit `nextRequests` by checking `config.maxTotalRequests`.
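            // e.g. with `maxTotalRequests = 100` and `totalVisitCount = 98`,
            // only the first 2 of `nextRequests` survive the `prefix` below.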
47 | let totalVisitCount = state.totalVisitCount 48 | let remainingVisitCount = max(env.config.maxTotalRequests - totalVisitCount, 0) 49 | let nextRequests = nextRequests.prefix(Int(clamping: remainingVisitCount)) 50 | 51 | // Insert next "waiting"s. 52 | for nextRequest in nextRequests { 53 | state.waitingURLs.insert(nextRequest.url) 54 | } 55 | 56 | let isFinished = state.waitingURLs.isEmpty && nextRequests.isEmpty 57 | 58 | /// AsyncChannel effect. 59 | let sendToChannel = Eff.fireAndForget { 60 | // NOTE: `outputResult = nil` is passed only on initial crawl. 61 | if let request = request, let outputResult = outputResult { 62 | await env.events.send(.didCrawl(request, outputResult)) 63 | } 64 | if isFinished { 65 | env.events.finish() 66 | } 67 | } 68 | 69 | let depth = request?.depth ?? 0 70 | 71 | /// nextCrawls effect. 72 | let nextCrawls = nextRequests.isEmpty || depth >= env.config.maxDepths 73 | ? .empty 74 | : Eff.combine( // Visit next with incrementing `depth`. 75 | nextRequests.enumerated() 76 | .map { i, userReq in 77 | let request = Request( 78 | url: userReq.url, 79 | urlInfo: userReq.urlInfo, 80 | order: totalVisitCount + UInt64(i), 81 | depth: depth + 1 82 | ) 83 | return .nextAction(._visit(request)) 84 | } 85 | ) 86 | 87 | state.totalVisitCount += UInt64(nextRequests.count) 88 | 89 | return sendToChannel + nextCrawls 90 | } 91 | 92 | // Reducer pattern-matching. 93 | switch action { 94 | case let .visit(requests): 95 | // NOTE: 96 | // This is a fake `didFinish` to reuse calculation of `state.waitingURLs` etc by only sending `nextRequests`. 97 | return didFinish( 98 | request: nil, 99 | nextRequests: requests, 100 | outputResult: nil 101 | ) 102 | 103 | case let ._visit(request): 104 | let host = request.url.host ?? "" 105 | 106 | let isAllowed = env.config.domainFilteringPolicy.isDomainAllowed(for: host) 107 | guard isAllowed else { 108 | return .nextAction( 109 | ._didFailVisit(request, CrawlError.domainNotAllowed(host)) 110 | ) 111 | } 112 | 113 | let queue = env.config.domainQueueTable.buildQueue(url: request.url) 114 | 115 | return Effect(queue: queue) { 116 | // Check if `queue` has already force-cancelled this effect. 117 | // This is important when using `EffectQueue` with delay. 118 | try Task.checkCancellation() 119 | 120 | await env.events.send(.willCrawl(request)) 121 | 122 | do { 123 | let (nextRequests, output) = try await env.crawl(request) 124 | return ._didVisit(request, nextRequests: nextRequests, output: output) 125 | } 126 | catch { 127 | return ._didFailVisit(request, CrawlError.crawlFailed(error)) 128 | } 129 | } 130 | 131 | case let ._didVisit(request, nextRequests, output): 132 | return didFinish( 133 | request: request, 134 | nextRequests: nextRequests, 135 | outputResult: .success(output) 136 | ) 137 | 138 | case let ._didFailVisit(request, error): 139 | return didFinish( 140 | request: request, 141 | nextRequests: [], 142 | outputResult: .failure(error) 143 | ) 144 | } 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /Sources/ActoCrawler/Internal/Regex.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | 3 | func isRegexMatched(_ string: String, pattern: String) -> Bool 4 | { 5 | let matches = try? NSRegularExpression(pattern: pattern) 6 | .matches(in: string, range: .init(location: 0, length: string.utf16.count)) 7 | 8 | return !(matches ?? 
[]).isEmpty
9 | }
10 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawler/NetworkSession.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | 
3 | /// `URLSession` wrapped by `Actor`, used in ``Crawler/withNetworkSession(config:crawl:)``.
4 | public actor NetworkSession
5 | {
6 |     private let urlSession: URLSession
7 | 
8 |     public init(configuration: URLSessionConfiguration) async
9 |     {
10 |         self.urlSession = URLSession(configuration: configuration)
11 |     }
12 | 
13 |     public func data(for request: URLRequest) async throws -> (Data, HTTPURLResponse)
14 |     {
15 |         let (data, urlResponse) = try await urlSession.data(for: request)
16 | 
17 |         if let urlResponse = urlResponse as? HTTPURLResponse {
18 |             return (data, urlResponse)
19 |         }
20 |         else {
21 |             throw CrawlError.invalidHTTPResponse(urlResponse)
22 |         }
23 |     }
24 | }
25 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawler/Request.swift:
--------------------------------------------------------------------------------
1 | @preconcurrency import Foundation
2 | 
3 | // MARK: - UserRequest
4 | 
5 | /// User-defined requesting `URL` with additional `URLInfo`.
6 | public struct UserRequest<URLInfo>: Sendable where URLInfo: Sendable
7 | {
8 |     /// Requesting `URL`.
9 |     public var url: URL
10 | 
11 |     /// Additional info that is attached next to requesting `URL`.
12 |     ///
13 |     /// For example, URL page number can be passed as `URLInfo` in ``Crawler/visit(requests:)``
14 |     /// or ``Crawler/init(config:dependency:crawl:)``'s `crawl` return value so that next request can be determined by its page number increment.
15 |     public var urlInfo: URLInfo
16 | 
17 |     public init(url: URL, urlInfo: URLInfo)
18 |     {
19 |         self.url = url
20 |         self.urlInfo = urlInfo
21 |     }
22 | 
23 |     public init(url: URL) where URLInfo == Void
24 |     {
25 |         self.url = url
26 |         self.urlInfo = ()
27 |     }
28 | }
29 | 
30 | // MARK: - Request
31 | 
32 | /// ``UserRequest`` + ActoCrawler-additions, i.e. ``order`` + ``depth``.
33 | @dynamicMemberLookup
34 | public struct Request<URLInfo>: Sendable where URLInfo: Sendable
35 | {
36 |     /// - Note: Accessible via `@dynamicMemberLookup`.
37 |     private var userRequest: UserRequest<URLInfo>
38 | 
39 |     /// Request order number.
40 |     public let order: UInt64
41 | 
42 |     /// Request crawling depth.
43 |     public let depth: UInt64
44 | 
45 |     public init(url: URL, urlInfo: URLInfo, order: UInt64, depth: UInt64)
46 |     {
47 |         self.userRequest = .init(url: url, urlInfo: urlInfo)
48 |         self.order = order
49 |         self.depth = depth
50 |     }
51 | 
52 |     public init(url: URL, order: UInt64, depth: UInt64) where URLInfo == Void
53 |     {
54 |         self.userRequest = .init(url: url)
55 |         self.order = order
56 |         self.depth = depth
57 |     }
58 | 
59 |     public subscript<T>(dynamicMember keyPath: WritableKeyPath<UserRequest<URLInfo>, T>) -> T
60 |     {
61 |         get {
62 |             self.userRequest[keyPath: keyPath]
63 |         }
64 |         set {
65 |             self.userRequest[keyPath: keyPath] = newValue
66 |         }
67 |     }
68 | }
69 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawler/Response.swift:
--------------------------------------------------------------------------------
1 | @preconcurrency import Foundation
2 | 
3 | /// HTTP Response for ``Request`` with `URLSession`'s results and additional `URLInfo`.
4 | @dynamicMemberLookup
5 | public struct Response<Data, URLInfo>: Sendable
6 |     where Data: Sendable, URLInfo: Sendable
7 | {
8 |     /// - Note: Accessible via `@dynamicMemberLookup`.
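    /// e.g. `response.url` and `response.urlInfo` forward to this underlying request.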
9 |     private var request: Request<URLInfo>
10 | 
11 |     // URLSession response.
12 |     public var data: Data
13 |     public var httpResponse: HTTPURLResponse
14 | 
15 |     public init(
16 |         request: Request<URLInfo>,
17 |         data: Data,
18 |         httpResponse: HTTPURLResponse
19 |     )
20 |     {
21 |         self.request = request
22 |         self.data = data
23 |         self.httpResponse = httpResponse
24 |     }
25 | 
26 |     public subscript<T>(dynamicMember keyPath: WritableKeyPath<Request<URLInfo>, T>) -> T
27 |     {
28 |         get {
29 |             self.request[keyPath: keyPath]
30 |         }
31 |         set {
32 |             self.request[keyPath: keyPath] = newValue
33 |         }
34 |     }
35 | }
36 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawlerPlaywright/Crawler.withPlaywright.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | import ActoCrawler
3 | 
4 | extension Crawler
5 | {
6 |     /// Helper initializer that adds [playwright-python](https://playwright.dev/python/docs/intro) (headless browser) as ActoCrawler's dependency.
7 |     ///
8 |     /// As written in the documentation, make sure to set up the Python environment before calling this method:
9 |     ///
10 |     /// 1. `pip install playwright`
11 |     /// 2. `playwright install`
12 |     ///
13 |     /// - Parameters:
14 |     ///   - pythonPackagePaths:
15 |     ///       Python library paths for interacting with `playwright-python`. Use `pip show playwright` to find its location.
16 |     ///
17 |     ///   - browser:
18 |     ///       Creates a new `Browser` object from `playwright` to reuse during crawling iterations.
19 |     ///       If `nil`, Chromium is launched in non-headless mode.
20 |     ///
21 |     ///       An example of this closure is:
22 |     ///       ```
23 |     ///       let browser = { await $0.chromium.launch(headless: false).asPyAsync() }
24 |     ///       ```
25 |     ///
26 |     ///   - crawl:
27 |     ///       Crawling function that receives
28 |     ///       [Playwright](https://github.com/microsoft/playwright-python/blob/v1.22.0/playwright/async_api/_generated.py#L12153)
29 |     ///       and [Browser](https://github.com/microsoft/playwright-python/blob/v1.22.0/playwright/async_api/_generated.py#L11134)
30 |     ///       as `PythonObject`s to inter-op with Python.
31 |     public static func withPlaywright(
32 |         pythonPackagePaths: [String],
33 |         config: CrawlerConfig,
34 |         browser: (@Sendable (_ playwright: PythonObject) async -> PythonObject)? = nil,
35 |         crawl: @escaping @CrawlActor @Sendable (
36 |             Request<URLInfo>,
37 |             _ playwright: PythonObject,
38 |             _ browser: PythonObject
39 |         ) async throws -> ([UserRequest<URLInfo>], Output)
40 |     ) async -> Crawler
41 |     {
42 |         let playwrightActor = await PlaywrightActor(
43 |             pythonPackagePaths: pythonPackagePaths,
44 |             prepare: browser ?? { await $0.chromium.launch(headless: false).asPyAsync() }
45 |         )
46 | 
47 |         return Crawler(
48 |             config: config,
49 |             dependency: playwrightActor,
50 |             crawl: { request, playwrightActor in
51 |                 try await playwrightActor.runCrawl {
52 |                     try await crawl(request, $0, $1)
53 |                 }
54 |             }
55 |         )
56 |     }
57 | }
58 | 
59 | // MARK: - Private
60 | 
61 | /// Global actor for cooperative Playwright crawling to avoid `EXC_BAD_ACCESS`.
62 | @globalActor
63 | internal actor CrawlActor
64 | {
65 |     static let shared: CrawlActor = CrawlActor()
66 | }
67 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawlerPlaywright/PlaywrightActor.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | import ActoCrawler
3 | 
4 | /// [playwright-python](https://playwright.dev/python/docs/intro) (headless browser) Actor wrapper.
5 | /// - Note: This will be used as a dependency of ActoCrawler, and stored throughout its lifetime.
6 | internal actor PlaywrightActor
7 | {
8 |     /// Root of `playwright/async_api`.
9 |     /// - [async_playwright](https://github.com/microsoft/playwright-python/blob/v1.22.0/playwright/async_api/__init__.py#L85)
10 |     /// - [PlaywrightContextManager](https://github.com/microsoft/playwright-python/blob/v1.22.0/playwright/async_api/_context_manager.py#L25)
11 |     private let playwrightContextManager: PythonObject
12 | 
13 |     /// Python [Playwright](https://github.com/microsoft/playwright-python/blob/v1.22.0/playwright/async_api/_generated.py#L12153) object.
14 |     internal let playwright: PythonObject
15 | 
16 |     /// Python object that is prepared via `init`'s `prepare`.
17 |     /// For example, preparing [Browser](https://github.com/microsoft/playwright-python/blob/v1.22.0/playwright/async_api/_generated.py#L11134)
18 |     /// is often useful so that it is not launched multiple times and the same reference keeps being reused.
19 |     internal let preparedObject: PythonObject
20 | 
21 |     /// - Parameters:
22 |     ///   - pythonPackagePaths:
23 |     ///       Python library paths for interacting with `playwright-python`. Use `pip show playwright` to find its location.
24 |     ///   - prepare:
25 |     ///       Async closure for setting up `preparedObject`, which is usually a reusable `Browser`.
26 |     internal init(
27 |         pythonPackagePaths: [String],
28 |         prepare: @Sendable (_ playwright: PythonObject) async -> PythonObject
29 |     ) async
30 |     {
31 |         // Set PATH.
32 |         let sys = Python.import("sys")
33 |         for path in pythonPackagePaths {
34 |             sys.path.append(path)
35 |         }
36 |         sys.path.append(PythonKitAsync.bundleResourcePath) // For importing `pythonkit-async.py`.
37 | 
38 |         let playwrightModule = Python.import("playwright.async_api")
39 |         self.playwrightContextManager = playwrightModule.async_playwright()
40 |         self.playwright = await self.playwrightContextManager.start().asPyAsync()
41 |         self.preparedObject = await prepare(self.playwright)
42 |     }
43 | 
44 |     deinit
45 |     {
46 |         Task.detached { [playwrightContextManager] in
47 |             await playwrightContextManager.__aexit__().asPyAsync()
48 |         }
49 |     }
50 | 
51 |     internal func runCrawl<Res>(
52 |         _ crawl: @Sendable (
53 |             _ playwright: PythonObject,
54 |             _ setupObject: PythonObject
55 |         ) async throws -> Res
56 |     ) async rethrows -> Res
57 |     {
58 |         try await crawl(self.playwright, self.preparedObject)
59 |     }
60 | }
61 | 

--------------------------------------------------------------------------------
/Sources/ActoCrawlerPlaywright/_exported.swift:
--------------------------------------------------------------------------------
1 | @_exported import PythonKit
2 | @_exported import PythonKitAsync
3 | 

--------------------------------------------------------------------------------
/Sources/AsyncChannel/AsyncChannel.swift:
--------------------------------------------------------------------------------
1 | //===----------------------------------------------------------------------===//
2 | //
3 | // This source file is part of the Swift Async Algorithms open source project
4 | //
5 | // Copyright (c) 2022 Apple Inc. and the Swift project authors
6 | // Licensed under Apache License v2.0 with Runtime Library Exception
7 | //
8 | // See https://swift.org/LICENSE.txt for license information
9 | //
10 | //===----------------------------------------------------------------------===//
11 | 
12 | /// A channel for sending elements from one task to another with back pressure.
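///
/// For example (a sketch assuming an async context; `Int` is an arbitrary element type):
///
/// ```swift
/// let channel = AsyncChannel<Int>()
/// Task {
///     for n in 0..<3 { await channel.send(n) } // each send suspends until consumed
///     channel.finish()
/// }
/// for await n in channel { print(n) } // prints 0, 1, 2
/// ```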
13 | ///
14 | /// The `AsyncChannel` class is intended to be used as a communication type between tasks,
15 | /// particularly when one task produces values and another task consumes those values. The back
16 | /// pressure applied by `send(_:)` and `finish()` via the suspension/resume ensures that
17 | /// the production of values does not exceed the consumption of values from iteration. Each of these
18 | /// methods suspends after enqueuing the event and is resumed when the next call to `next()`
19 | /// on the `Iterator` is made.
20 | public final class AsyncChannel<Element: Sendable>: AsyncSequence, Sendable {
21 |   /// The iterator for a `AsyncChannel` instance.
22 |   public struct Iterator: AsyncIteratorProtocol, Sendable {
23 |     let channel: AsyncChannel<Element>
24 |     var active: Bool = true
25 | 
26 |     init(_ channel: AsyncChannel<Element>) {
27 |       self.channel = channel
28 |     }
29 | 
30 |     /// Await the next sent element or finish.
31 |     public mutating func next() async -> Element? {
32 |       guard active else {
33 |         return nil
34 |       }
35 |       let generation = channel.establish()
36 |       let value: Element? = await withTaskCancellationHandler { [channel] in
37 |         channel.cancel(generation)
38 |       } operation: {
39 |         await channel.next(generation)
40 |       }
41 | 
42 |       if let value = value {
43 |         return value
44 |       } else {
45 |         active = false
46 |         return nil
47 |       }
48 |     }
49 |   }
50 | 
51 |   struct Awaiting: Hashable {
52 |     var generation: Int
53 |     var continuation: UnsafeContinuation<Element?, Never>?
54 |     let cancelled: Bool
55 | 
56 |     init(generation: Int, continuation: UnsafeContinuation<Element?, Never>) {
57 |       self.generation = generation
58 |       self.continuation = continuation
59 |       cancelled = false
60 |     }
61 | 
62 |     init(placeholder generation: Int) {
63 |       self.generation = generation
64 |       self.continuation = nil
65 |       cancelled = false
66 |     }
67 | 
68 |     init(cancelled generation: Int) {
69 |       self.generation = generation
70 |       self.continuation = nil
71 |       cancelled = true
72 |     }
73 | 
74 |     func hash(into hasher: inout Hasher) {
75 |       hasher.combine(generation)
76 |     }
77 | 
78 |     static func == (_ lhs: Awaiting, _ rhs: Awaiting) -> Bool {
79 |       return lhs.generation == rhs.generation
80 |     }
81 |   }
82 | 
83 |   enum Emission {
84 |     case idle
85 |     case pending([UnsafeContinuation<UnsafeContinuation<Element?, Never>?, Never>])
86 |     case awaiting(Set<Awaiting>)
87 | 
88 |     mutating func cancel(_ generation: Int) -> UnsafeContinuation<Element?, Never>? {
89 |       switch self {
90 |       case .awaiting(var awaiting):
91 |         let continuation = awaiting.remove(Awaiting(placeholder: generation))?.continuation
92 |         if awaiting.isEmpty {
93 |           self = .idle
94 |         } else {
95 |           self = .awaiting(awaiting)
96 |         }
97 |         return continuation
98 |       case .idle:
99 |         self = .awaiting([Awaiting(cancelled: generation)])
100 |         return nil
101 |       default:
102 |         return nil
103 |       }
104 |     }
105 |   }
106 | 
107 |   struct State {
108 |     var emission: Emission = .idle
109 |     var generation = 0
110 |     var terminal = false
111 |   }
112 | 
113 |   let state = ManagedCriticalState(State())
114 | 
115 |   /// Create a new `AsyncChannel` given an element type.
116 |   public init(element elementType: Element.Type = Element.self) { }
117 | 
118 |   func establish() -> Int {
119 |     state.withCriticalRegion { state in
120 |       defer { state.generation &+= 1 }
121 |       return state.generation
122 |     }
123 |   }
124 | 
125 |   func cancel(_ generation: Int) {
126 |     state.withCriticalRegion { state in
127 |       state.emission.cancel(generation)
128 |     }?.resume(returning: nil)
129 |   }
130 | 
131 |   func next(_ generation: Int) async -> Element? {
132 |     return await withUnsafeContinuation { continuation in
133 |       var cancelled = false
134 |       var terminal = false
135 |       state.withCriticalRegion { state -> UnsafeResumption<UnsafeContinuation<Element?, Never>?, Never>? in
136 |         if state.terminal {
137 |           terminal = true
138 |           return nil
139 |         }
140 |         switch state.emission {
141 |         case .idle:
142 |           state.emission = .awaiting([Awaiting(generation: generation, continuation: continuation)])
143 |           return nil
144 |         case .pending(var sends):
145 |           let send = sends.removeFirst()
146 |           if sends.count == 0 {
147 |             state.emission = .idle
148 |           } else {
149 |             state.emission = .pending(sends)
150 |           }
151 |           return UnsafeResumption(continuation: send, success: continuation)
152 |         case .awaiting(var nexts):
153 |           if nexts.update(with: Awaiting(generation: generation, continuation: continuation)) != nil {
154 |             nexts.remove(Awaiting(placeholder: generation))
155 |             cancelled = true
156 |           }
157 |           if nexts.isEmpty {
158 |             state.emission = .idle
159 |           } else {
160 |             state.emission = .awaiting(nexts)
161 |           }
162 |           return nil
163 |         }
164 |       }?.resume()
165 |       if cancelled || terminal {
166 |         continuation.resume(returning: nil)
167 |       }
168 |     }
169 |   }
170 | 
171 |   func finishAll() {
172 |     let (sends, nexts) = state.withCriticalRegion { state -> ([UnsafeContinuation<UnsafeContinuation<Element?, Never>?, Never>], Set<Awaiting>) in
173 |       if state.terminal {
174 |         return ([], [])
175 |       }
176 |       state.terminal = true
177 |       switch state.emission {
178 |       case .idle:
179 |         return ([], [])
180 |       case .pending(let nexts):
181 |         state.emission = .idle
182 |         return (nexts, [])
183 |       case .awaiting(let nexts):
184 |         state.emission = .idle
185 |         return ([], nexts)
186 |       }
187 |     }
188 |     for send in sends {
189 |       send.resume(returning: nil)
190 |     }
191 |     for next in nexts {
192 |       next.continuation?.resume(returning: nil)
193 |     }
194 |   }
195 | 
196 |   func _send(_ element: Element) async {
197 |     await withTaskCancellationHandler {
198 |       finishAll()
199 |     } operation: {
200 |       let continuation: UnsafeContinuation<Element?, Never>? = await withUnsafeContinuation { continuation in
201 |         state.withCriticalRegion { state -> UnsafeResumption<UnsafeContinuation<Element?, Never>?, Never>? in
202 |           if state.terminal {
203 |             return UnsafeResumption(continuation: continuation, success: nil)
204 |           }
205 |           switch state.emission {
206 |           case .idle:
207 |             state.emission = .pending([continuation])
208 |             return nil
209 |           case .pending(var sends):
210 |             sends.append(continuation)
211 |             state.emission = .pending(sends)
212 |             return nil
213 |           case .awaiting(var nexts):
214 |             let next = nexts.removeFirst().continuation
215 |             if nexts.count == 0 {
216 |               state.emission = .idle
217 |             } else {
218 |               state.emission = .awaiting(nexts)
219 |             }
220 |             return UnsafeResumption(continuation: continuation, success: next)
221 |           }
222 |         }?.resume()
223 |       }
224 |       continuation?.resume(returning: element)
225 |     }
226 |   }
227 | 
228 |   /// Send an element to an awaiting iteration. This function will resume when the next call to `next()` is made.
229 |   /// If the channel is already finished then this returns immediately.
230 |   public func send(_ element: Element) async {
231 |     await _send(element)
232 |   }
233 | 
234 |   /// Send a finish to all awaiting iterations.
235 |   public func finish() {
236 |     finishAll()
237 |   }
238 | 
239 |   /// Create an `Iterator` for iteration of an `AsyncChannel`.
240 |   public func makeAsyncIterator() -> Iterator {
241 |     return Iterator(self)
242 |   }
243 | }
244 | 
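To make the back-pressure contract described in the doc comment concrete, here is a minimal producer/consumer sketch (an editorial illustration, not part of the package source; `channel` is hypothetical and the code assumes an enclosing async context):

    let channel = AsyncChannel<Int>()

    // Producer: each `send` suspends until the consumer pulls the value.
    Task {
        for value in 0 ..< 3 {
            await channel.send(value)
        }
        channel.finish()
    }

    // Consumer: iteration yields 0, 1, 2, then ends once `finish()` is observed.
    for await value in channel {
        print(value)
    }

Because `send(_:)` suspends until a matching `next()` arrives, the producer can never run arbitrarily far ahead of the consumer.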
--------------------------------------------------------------------------------
/Sources/AsyncChannel/Locking.swift:
--------------------------------------------------------------------------------
1 | //===----------------------------------------------------------------------===//
2 | //
3 | // This source file is part of the Swift Async Algorithms open source project
4 | //
5 | // Copyright (c) 2022 Apple Inc. and the Swift project authors
6 | // Licensed under Apache License v2.0 with Runtime Library Exception
7 | //
8 | // See https://swift.org/LICENSE.txt for license information
9 | //
10 | //===----------------------------------------------------------------------===//
11 | 
12 | #if canImport(Darwin)
13 | @_implementationOnly import Darwin
14 | #elseif canImport(Glibc)
15 | @_implementationOnly import Glibc
16 | #elseif canImport(WinSDK)
17 | @_implementationOnly import WinSDK
18 | #endif
19 | 
20 | internal struct Lock {
21 | #if canImport(Darwin)
22 |   typealias Primitive = os_unfair_lock
23 | #elseif canImport(Glibc)
24 |   typealias Primitive = pthread_mutex_t
25 | #elseif canImport(WinSDK)
26 |   typealias Primitive = SRWLOCK
27 | #endif
28 | 
29 |   typealias PlatformLock = UnsafeMutablePointer<Primitive>
30 |   let platformLock: PlatformLock
31 | 
32 |   private init(_ platformLock: PlatformLock) {
33 |     self.platformLock = platformLock
34 |   }
35 | 
36 |   fileprivate static func initialize(_ platformLock: PlatformLock) {
37 | #if canImport(Darwin)
38 |     platformLock.initialize(to: os_unfair_lock())
39 | #elseif canImport(Glibc)
40 |     pthread_mutex_init(platformLock, nil)
41 | #elseif canImport(WinSDK)
42 |     InitializeSRWLock(platformLock)
43 | #endif
44 |   }
45 | 
46 |   fileprivate static func deinitialize(_ platformLock: PlatformLock) {
47 | #if canImport(Glibc)
48 |     pthread_mutex_destroy(platformLock) // os_unfair_lock and SRWLOCK need no explicit destruction.
49 | #endif
50 |     platformLock.deinitialize(count: 1)
51 |   }
52 | 
53 |   fileprivate static func lock(_ platformLock: PlatformLock) {
54 | #if canImport(Darwin)
55 |     os_unfair_lock_lock(platformLock)
56 | #elseif canImport(Glibc)
57 |     pthread_mutex_lock(platformLock)
58 | #elseif canImport(WinSDK)
59 |     AcquireSRWLockExclusive(platformLock)
60 | #endif
61 |   }
62 | 
63 |   fileprivate static func unlock(_ platformLock: PlatformLock) {
64 | #if canImport(Darwin)
65 |     os_unfair_lock_unlock(platformLock)
66 | #elseif canImport(Glibc)
67 |     pthread_mutex_unlock(platformLock)
68 | #elseif canImport(WinSDK)
69 |     ReleaseSRWLockExclusive(platformLock)
70 | #endif
71 |   }
72 | 
73 |   static func allocate() -> Lock {
74 |     let platformLock = PlatformLock.allocate(capacity: 1)
75 |     initialize(platformLock)
76 |     return Lock(platformLock)
77 |   }
78 | 
79 |   func deinitialize() {
80 |     Lock.deinitialize(platformLock)
81 |   }
82 | 
83 |   func lock() {
84 |     Lock.lock(platformLock)
85 |   }
86 | 
87 |   func unlock() {
88 |     Lock.unlock(platformLock)
89 |   }
90 | }
91 | 
92 | struct ManagedCriticalState<State> {
93 |   private final class LockedBuffer: ManagedBuffer<State, Lock.Primitive> {
94 |     deinit {
95 |       withUnsafeMutablePointerToElements { Lock.deinitialize($0) }
96 |     }
97 |   }
98 | 
99 |   private let buffer: ManagedBuffer<State, Lock.Primitive>
100 | 
101 |   init(_ initial: State) {
102 |     buffer = LockedBuffer.create(minimumCapacity: 1) { buffer in
103 |       buffer.withUnsafeMutablePointerToElements { Lock.initialize($0) }
104 |       return initial
105 |     }
106 |   }
107 | 
108 |   func withCriticalRegion<R>(_ critical: (inout State) throws -> R) rethrows -> R {
109 |     try buffer.withUnsafeMutablePointers { header, lock in
110 |       Lock.lock(lock)
111 |       defer { Lock.unlock(lock) }
112 |       return try critical(&header.pointee)
113 |     }
114 |   }
115 | }
116 | 
117 | extension ManagedCriticalState: @unchecked Sendable where State: Sendable { }
118 | 
--------------------------------------------------------------------------------
/Sources/AsyncChannel/UnsafeResumption.swift:
--------------------------------------------------------------------------------
1 | struct UnsafeResumption<Success, Failure: Error> {
2 |   let continuation: UnsafeContinuation<Success, Failure>
3 |   let result: Result<Success, Failure>
4 | 
5 |   init(continuation: UnsafeContinuation<Success, Failure>, result: Result<Success, Failure>) {
6 |     self.continuation = continuation
7 |     self.result = result
8 |   }
9 | 
10 |   init(continuation: UnsafeContinuation<Success, Failure>, success: Success) {
11 |     self.init(continuation: continuation, result: .success(success))
12 |   }
13 | 
14 |   init(continuation: UnsafeContinuation<Success, Failure>, failure: Failure) {
15 |     self.init(continuation: continuation, result: .failure(failure))
16 |   }
17 | 
18 |   func resume() {
19 |     continuation.resume(with: result)
20 |   }
21 | }
22 | 
23 | extension UnsafeResumption where Failure == Error {
24 |   init(continuation: UnsafeContinuation<Success, Error>, catching body: () throws -> Success) {
25 |     self.init(continuation: continuation, result: Result(catching: body))
26 |   }
27 | }
28 | 
29 | extension UnsafeResumption where Success == Void {
30 |   init(continuation: UnsafeContinuation<Success, Failure>) {
31 |     self.init(continuation: continuation, result: .success(()))
32 |   }
33 | }
34 | 
35 | extension UnsafeResumption: Sendable where Success: Sendable { }
36 | 
37 | 
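Taken together, these two helpers enable a common locking pattern: decide who to resume while holding the lock, then perform the resumption only after the critical region has been exited. A minimal sketch of that pattern (the `Gate` type is hypothetical and not part of the package; since both helpers are internal, such code would have to live in the same module):

    // One-shot gate: tasks suspend in `wait()` until someone calls `signalAll()`.
    struct Gate {
        let state = ManagedCriticalState<[UnsafeContinuation<Void, Never>]>([])

        func wait() async {
            await withUnsafeContinuation { continuation in
                state.withCriticalRegion { waiters in
                    waiters.append(continuation)
                }
            }
        }

        func signalAll() {
            let resumptions = state.withCriticalRegion { waiters -> [UnsafeResumption<Void, Never>] in
                defer { waiters.removeAll() }
                return waiters.map { UnsafeResumption(continuation: $0) }
            }
            // Resuming outside `withCriticalRegion` avoids running arbitrary
            // continuation code while the lock is still held.
            for resumption in resumptions {
                resumption.resume()
            }
        }
    }

This is exactly how `AsyncChannel` uses `UnsafeResumption`: the value returned from `withCriticalRegion` is resumed with `?.resume()` after the lock is released.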
--------------------------------------------------------------------------------
/Sources/PythonKitAsync/Bundle.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | 
3 | /// Path to the bundle resource directory that contains `pythonkit-async.py`.
4 | public let bundleResourcePath: String = {
5 |     BundleToken.bundle.resourcePath!
6 | }()
7 | 
8 | // MARK: - Private
9 | 
10 | private final class BundleToken
11 | {
12 |     static let bundle: Bundle = {
13 |         #if SWIFT_PACKAGE
14 |         return Bundle.module
15 |         #else
16 |         return Bundle(for: BundleToken.self)
17 |         #endif
18 |     }()
19 | }
20 | 
--------------------------------------------------------------------------------
/Sources/PythonKitAsync/asPyAsync.swift:
--------------------------------------------------------------------------------
1 | import PythonKit
2 | 
3 | // NOTE: Assumes `bundleResourcePath` is already on Python's `sys.path` so that `pythonkit-async.py` can be imported.
4 | private let pythonKitAsync = Python.import("pythonkit-async")
5 | 
6 | extension PythonObject
7 | {
8 |     /// Converts `self`, a Python coroutine (`async def`) object, into a Swift async call.
9 |     /// - Important: `self` must be a Python coroutine object to run properly. Otherwise, the async-returned value will be `self` itself.
10 |     @discardableResult
11 |     public func asPyAsync() async -> PythonObject
12 |     {
13 |         let pyObj: PythonObject = await withCheckedContinuation { continuation in
14 |             // NOTE: Uses `pythonkit-async.py`'s `coroutine_to_callback`.
15 |             pythonKitAsync.coroutine_to_callback(self, PythonFunction { (arg: PythonObject) in
16 |                 continuation.resume(returning: arg)
17 |                 return 0 // Dummy return value for the Python-side callback.
18 |             })
19 |         }
20 | 
21 |         // NOTE: Required so that other concurrently scheduled coroutines get a chance to run.
22 |         await Task.yield()
23 | 
24 |         return pyObj
25 |     }
26 | }
27 | 
--------------------------------------------------------------------------------
/Sources/PythonKitAsync/pythonkit-async.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | 
3 | async def coroutine_wrapper(coroutine, callback):
4 |     val = await coroutine
5 |     callback(val)
6 | 
7 | def coroutine_to_callback(coroutine, callback):
8 |     if asyncio.iscoroutine(coroutine):
9 |         loop = asyncio.get_event_loop()
10 |         loop.run_until_complete(coroutine_wrapper(coroutine, callback))  # Blocks until the coroutine completes.
11 |     else:
12 |         callback(coroutine)  # Calls back immediately with a non-coroutine object.
13 | 
--------------------------------------------------------------------------------
/Tests/ActoCrawlerTests/CrawlerTests.swift:
--------------------------------------------------------------------------------
1 | import XCTest
2 | import class Foundation.Bundle
3 | 
4 | final class CrawlerTests: XCTestCase
5 | {
6 |     // TBD
7 | }
8 | 
--------------------------------------------------------------------------------
/Tests/ActoCrawlerTests/ReadMeExample.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | import ActoCrawler
3 | 
4 | /// - Note: Exists only to compile-check the README example; `@main` is intentionally commented out.
5 | /// @main
6 | struct ReadMeExample
7 | {
8 |     static func main() async
9 |     {
10 |         struct Output: Sendable
11 |         {
12 |             let nextLinksCount: Int
13 |         }
14 | 
15 |         let htmlCrawler = await Crawler.htmlScraper(
16 |             config: CrawlerConfig(
17 |                 maxDepths: 10,
18 |                 maxTotalRequests: 100,
19 |                 timeoutPerRequest: 5,
20 |                 userAgent: "ActoCrawler",
21 |                 domainFilteringPolicy: .disallowedDomains([".*google.com*" /* ... */]),
22 |                 domainQueueTable: [
23 |                     ".*example1.com*": .init(maxConcurrency: 1, delay: 0),
24 |                     ".*example2.com*": .init(maxConcurrency: 5, delay: 0.1 ... 0.5)
25 |                 ]
26 |             ),
27 |             scrapeHTML: { response in
28 |                 let html = response.data
29 |                 let links = try html.select("a").map { try $0.attr("href") }
30 | 
31 |                 let nextRequests = links
32 |                     .compactMap(URL.init(string:))
33 |                     .map { UserRequest(url: $0) }
34 | 
35 |                 return (nextRequests, Output(nextLinksCount: nextRequests.count))
36 |             }
37 |         )
38 | 
39 |         // Visit initial page.
40 |         htmlCrawler.visit(url: URL(string: "https://www.wikipedia.org")!)
41 | 
42 |         // Observe crawl events.
43 |         for await event in htmlCrawler.events {
44 |             switch event {
45 |             case let .willCrawl(req):
46 |                 print("Crawl : 🕸️ [\(req.order)] [d=\(req.depth)] \(req.url)")
47 |             case let .didCrawl(req, .success(output)):
48 |                 print("Output: ✅ [\(req.order)] [d=\(req.depth)] \(req.url), nextLinksCount = \(output.nextLinksCount)")
49 |             case let .didCrawl(req, .failure(error)):
50 |                 print("Output: ❌ [\(req.order)] [d=\(req.depth)] \(req.url), error = \(error)")
51 |             }
52 |         }
53 | 
54 |         print("Output Done")
55 |     }
56 | }
57 | 
--------------------------------------------------------------------------------