├── .gitignore
├── LICENSE
├── Package.resolved
├── Package.swift
├── README.md
└── Sources
    ├── AppleComScraper.swift
    ├── Identifiable.swift
    ├── Session+JSON.swift
    ├── Session.swift
    ├── URLExtensions.swift
    ├── main.swift
    ├── meta.swift
    ├── root.swift
    ├── setup.swift
    └── webvtt.swift
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Created by https://www.gitignore.io/api/macos,xcode,swiftpackagemanager
3 |
4 | ### macOS ###
5 | *.DS_Store
6 | .AppleDouble
7 | .LSOverride
8 |
9 | # Icon must end with two \r
10 | Icon
11 |
12 | # Thumbnails
13 | ._*
14 |
15 | # Files that might appear in the root of a volume
16 | .DocumentRevisions-V100
17 | .fseventsd
18 | .Spotlight-V100
19 | .TemporaryItems
20 | .Trashes
21 | .VolumeIcon.icns
22 | .com.apple.timemachine.donotpresent
23 |
24 | # Directories potentially created on remote AFP share
25 | .AppleDB
26 | .AppleDesktop
27 | Network Trash Folder
28 | Temporary Items
29 | .apdisk
30 |
31 | ### SwiftPackageManager ###
32 | Packages
33 | .build
34 | xcuserdata
35 | *.xcodeproj
36 | DerivedData/
37 |
38 | ### Xcode ###
39 | # Xcode
40 | #
41 | # gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore
42 |
43 | ## Build generated
44 | build/
45 |
46 | ## Various settings
47 | *.pbxuser
48 | !default.pbxuser
49 | *.mode1v3
50 | !default.mode1v3
51 | *.mode2v3
52 | !default.mode2v3
53 | *.perspectivev3
54 | !default.perspectivev3
55 | xcuserdata/
56 |
57 | ## Other
58 | *.moved-aside
59 | *.xccheckout
60 | *.xcscmblueprint
61 |
62 | ### Xcode Patch ###
63 | *.xcodeproj/*
64 | !*.xcodeproj/project.pbxproj
65 | !*.xcodeproj/xcshareddata/
66 | !*.xcworkspace/contents.xcworkspacedata
67 | /*.gcno
68 |
69 | # End of https://www.gitignore.io/api/macos,xcode,swiftpackagemanager
70 |
71 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Michael Pederson
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Package.resolved:
--------------------------------------------------------------------------------
1 | {
2 | "object": {
3 | "pins": [
4 | {
5 | "package": "FileUtils",
6 | "repositoryURL": "https://github.com/oarrabi/FileUtils.git",
7 | "state": {
8 | "branch": null,
9 | "revision": "f4a5dcbfe912ee5076b6417c90aac5bfa7b12f55",
10 | "version": "0.1.1"
11 | }
12 | },
13 | {
14 | "package": "Guaka",
15 | "repositoryURL": "https://github.com/oarrabi/Guaka.git",
16 | "state": {
17 | "branch": null,
18 | "revision": "9dfb2ea95b8ab9e799d0f65a79a2316679a758e0",
19 | "version": "0.1.3"
20 | }
21 | },
22 | {
23 | "package": "Kanna",
24 | "repositoryURL": "https://github.com/tid-kijyun/Kanna.git",
25 | "state": {
26 | "branch": null,
27 | "revision": "44b169e1698d596f2eed698d8a67558fb0542b2a",
28 | "version": "4.0.0"
29 | }
30 | },
31 | {
32 | "package": "Regex",
33 | "repositoryURL": "https://github.com/sharplet/Regex.git",
34 | "state": {
35 | "branch": null,
36 | "revision": "3e671ed911b467c0d9c05e56f03d9e5bcb535f39",
37 | "version": "1.1.0"
38 | }
39 | },
40 | {
41 | "package": "StringScanner",
42 | "repositoryURL": "https://github.com/oarrabi/StringScanner",
43 | "state": {
44 | "branch": null,
45 | "revision": "246c697efe2f57d9042f58b1b53ace4fddb1efc4",
46 | "version": "0.2.0"
47 | }
48 | },
49 | {
50 | "package": "HTMLEntities",
51 | "repositoryURL": "https://github.com/IBM-Swift/swift-html-entities.git",
52 | "state": {
53 | "branch": null,
54 | "revision": "9112c12bc5cc2b20fcaf1393c65a9cbbfda5a155",
55 | "version": "3.0.10"
56 | }
57 | },
58 | {
59 | "package": "SwiftClibxml2",
60 | "repositoryURL": "https://github.com/tid-kijyun/SwiftClibxml2.git",
61 | "state": {
62 | "branch": null,
63 | "revision": "c4e67cc970273fc2bee978d12e422974ff184de7",
64 | "version": "1.0.2"
65 | }
66 | },
67 | {
68 | "package": "Yams",
69 | "repositoryURL": "https://github.com/jpsim/Yams.git",
70 | "state": {
71 | "branch": null,
72 | "revision": "618582e09699b577fa183bab7d88e3ee7d9a1d19",
73 | "version": "1.0.0"
74 | }
75 | }
76 | ]
77 | },
78 | "version": 1
79 | }
80 |
--------------------------------------------------------------------------------
/Package.swift:
--------------------------------------------------------------------------------
1 | // swift-tools-version:4.1
2 |
3 | import PackageDescription
4 |
// Manifest for the `jonyfive` package: one target of the same name, built from
// the `Sources` directory, depending on the six remote packages listed below.
let package = Package(
    name: "jonyfive",
    dependencies: [
        .package(url: "https://github.com/IBM-Swift/swift-html-entities.git", from: "3.0.0"),
        .package(url: "https://github.com/jpsim/Yams.git", from: "1.0.0"),
        .package(url: "https://github.com/nsomar/FileUtils", from: "0.0.0"),
        .package(url: "https://github.com/nsomar/Guaka", from: "0.0.0"),
        .package(url: "https://github.com/sharplet/Regex.git", from: "1.0.0"),
        .package(url: "https://github.com/tid-kijyun/Kanna.git", from: "4.0.0"),
    ],
    targets: [
        .target(
            name: "jonyfive",
            dependencies: [
                "HTMLEntities",
                "Yams",
                "FileUtils",
                "Guaka",
                "Regex",
                "Kanna",
            ],
            // Source files sit directly in `Sources/`, so the path is given explicitly.
            path: "Sources"
        )
    ]
)
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ⚠️ Archived
2 | This tool is currently broken. For starters, the XPaths are hopelessly out of date.
3 |
4 | This tool was originally slapped together very hastily, just to the point of working. It supported [`my fork`](https://github.com/rlwimi/wwdc-session-transcripts) of [`ASCIIwwdc/wwdc-session-transcripts`](https://github.com/ASCIIwwdc/wwdc-session-transcripts) while that repo seemed abandoned; both the tool and the fork existed in support of [`rlwimi/major-input`](https://github.com/rlwimi/major-input).
5 |
6 | I still see value in a tool like this. If I revive the project, I imagine moving to [`apple/swift-argument-parser`](https://github.com/apple/swift-argument-parser) and [Foundation's XML/XPath support](https://developer.apple.com/documentation/foundation/archives_and_serialization/xml_processing_and_modeling).
7 |
8 | ---
9 |
10 | # jonyfive
11 |
12 | CLI tool for doing useful things after scraping WWDC session metadata
13 |
14 | ## Usage
15 |
16 | Once you are up and running, use `-h`/`--help` flags for more information about the interface and functionality.
17 |
18 | ## Getting Started
19 |
20 | 1. Follow the installation instructions for dependency [Kanna](https://github.com/tid-kijyun/Kanna), specifically the instructions for Swift 4 via Swift Package Manager.
21 | 1. Run `swift package update`.
22 | 1. Optionally run `swift package generate-xcodeproj` if you're interested in working with the implementation in Xcode.
23 | 1. Run `swift build`.
24 | 1. Find executable at `.build/debug/jonyfive`.
25 |
26 | ## Warning
27 |
28 | Web scraping is fragile by nature, and small changes to the page HTML naming or structure can break this tool. You may be the first to notice such breakage; if so, please file an issue.
29 |
30 | ## Thanks
31 |
32 | Based on [Guaka](https://github.com/nsomar/Guaka) and [Kanna](https://github.com/tid-kijyun/Kanna).
33 |
34 | ## License
35 |
36 | This tool is provided under the terms of the [LICENSE](https://github.com/rlwimi/jonyfive/blob/master/LICENSE).
37 |
38 | All content copyright © 2010 – 2017 Apple Inc. All rights reserved.
39 |
--------------------------------------------------------------------------------
/Sources/AppleComScraper.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | import Kanna
3 | import Regex
4 |
/// Root of Apple's developer site. Relative paths scraped from pages are resolved against it.
fileprivate var baseUrl: URL {
    let developerSite = URL(string: "https://developer.apple.com")
    return developerSite!
}
8 |
/// Builds the site-relative path of the videos listing page for `year`.
///
/// - Parameter year: Conference year, e.g. `2017`.
/// - Returns: A path of the form `/videos/wwdc<year>`.
fileprivate func wwdcVideosUrlPath(for year: Int) -> String {
    let prefix = "/videos/wwdc"
    return prefix + String(year)
}
12 |
/// Builds the URL of the Keynote's special event page at www.apple.com.
///
/// Beginning in 2016, the Keynote has its own special event page there, and the 2017 Keynote
/// does not have an entry on the page with the 2017 sessions.
///
/// - Parameter year: Conference year, e.g. `2017`.
fileprivate func wwdcKeynoteUrl(for year: Int) -> URL {
    let address = "https://www.apple.com/apple-events/june-" + String(year) + "/"
    return URL(string: address)!
}
18 |
/// Builds the URL of the videos listing page for `year`, relative to `baseUrl`.
fileprivate func wwdcVideosUrl(for year: Int) -> URL {
    let listingPath = wwdcVideosUrlPath(for: year)
    let listingUrl = URL(string: listingPath, relativeTo: baseUrl)
    return listingUrl!
}
22 |
/// Scrapes keynotes and regular sessions for the supported years (2012 through 2017).
///
/// - Parameters:
///   - filterYear: When non-nil, only this year is scraped.
///   - filterSession: When non-nil, forwarded to the keynote and per-year scrapers as a
///     session filter.
/// - Returns: Keynote sessions first, followed by each year's sessions in ascending year order.
func scrapeSessions(filterBy filterYear: Int? = nil, session filterSession: String? = nil) -> [Session] {
    var collected = scrapeKeynotes(filterBy: filterYear, session: filterSession)

    for year in 2012...2017 {
        if let wantedYear = filterYear, wantedYear != year {
            continue
        }
        collected += scrapeSessions(from: year, filterBy: filterSession)
    }
    return collected
}
36 |
/// Scrapes the keynotes that live on their own event page (currently only 2017).
///
/// - Parameters:
///   - filterYear: When non-nil, only this year's keynote is scraped.
///   - filterSession: When non-nil, a keynote is scraped only if the filter is "101",
///     the number assigned to keynote sessions.
/// - Returns: The keynote sessions that could be scraped; possibly empty.
fileprivate func scrapeKeynotes(filterBy filterYear: Int? = nil, session filterSession: String? = nil) -> [Session] {
    // A session filter other than the keynote's number excludes every keynote.
    if let wantedSession = filterSession, wantedSession != "101" {
        return []
    }

    var keynotes: [Session] = []
    for year in 2017...2017 {
        if let wantedYear = filterYear, wantedYear != year {
            continue
        }
        if let keynote = scrapeKeynote(from: year) {
            keynotes.append(keynote)
        }
    }
    return keynotes
}
52 |
/// Scrapes a single year's keynote from its event page.
///
/// Reads the event page HTML, extracts the metadata URL from it, loads the metadata, and
/// builds a `Session` from the result. Each failure is reported only when `verboseEnabled`.
///
/// - Parameter year: Conference year of the keynote.
/// - Returns: The keynote session, or `nil` if any step fails.
fileprivate func scrapeKeynote(from year: Int) -> Session? {
    let pageUrl = wwdcKeynoteUrl(for: year)

    if verboseEnabled { print("Scraping \(year) Keynote...") }

    guard let html = try? String(contentsOf: pageUrl) else {
        if verboseEnabled { print("could not read from \(pageUrl)") }
        return nil
    }
    guard let metadataUrl = keynoteMetadataUrl(from: html) else {
        if verboseEnabled { print("could not find metadata URL") }
        return nil
    }
    guard let metadata = keynoteMetadata(from: metadataUrl) else {
        if verboseEnabled { print("could not read metadata") }
        return nil
    }

    // The image is optional; a keynote without one is still returned.
    let imageUrl = keynoteImageUrl(from: html)
    return makeKeynoteSession(
        for: year,
        download: metadata.download,
        image: imageUrl,
        webVtt: metadata.webVtt
    )
}
70 |
/// Extracts the keynote metadata URL from the event page's inline JavaScript.
///
/// Looks for an assignment of the form `var urljson_path = '<url>';` and returns the quoted
/// value as a URL.
///
/// - Parameter html: Full HTML of the keynote event page.
/// - Returns: The metadata URL, or `nil` when the assignment is absent or its value is not a
///   valid URL.
fileprivate func keynoteMetadataUrl(from html: String) -> URL? {
    // `[^']+` stops at the first closing quote. A greedy `.+` would run on to the last `';`
    // on the line when several quoted statements share it (e.g. minified script).
    guard
        let match = Regex("var urljson_path = '([^']+)';").firstMatch(in: html),
        let firstCapture = match.captures.first,
        let path = firstCapture
    else {
        return nil
    }
    return URL(string: path)
}
77 |
/// Extracts the keynote's image URL from the event page HTML.
///
/// - Parameter html: Full HTML of the keynote event page.
/// - Returns: The image URL, or `nil` when no matching tag is found or its value is not a
///   valid URL.
fileprivate func keynoteImageUrl(from html: String) -> URL? {
    // NOTE(review): the pattern here was empty, so it had no capture group and this function
    // could never return a URL. The Open Graph image meta tag below is an assumption about
    // what was intended — confirm against the live page markup.
    guard
        let match = Regex("<meta property=\"og:image\" content=\"([^\"]+)\"").firstMatch(in: html),
        let firstCapture = match.captures.first,
        let address = firstCapture
    else {
        return nil
    }
    return URL(string: address)
}
84 |
/// Loads the keynote's JSON metadata and pulls out the download and caption URLs.
///
/// Reads `videoSrc.nonhls` for the download and `videoCC` for the WebVTT captions. The `hls`
/// entry of `videoSrc` is currently not read.
///
/// - Parameter url: Location of the JSON metadata.
/// - Returns: The download and WebVTT URLs, or `nil` if the data cannot be fetched, is not a
///   JSON object, or lacks either value.
fileprivate func keynoteMetadata(from url: URL) -> (download: URL, webVtt: URL)? {
    guard let payload = try? Data(contentsOf: url) else {
        return nil
    }
    guard let json = (try? JSONSerialization.jsonObject(with: payload, options: [])) as? [String: Any] else {
        return nil
    }
    guard
        let videos = json["videoSrc"] as? [String: Any],
        let downloadAddress = videos["nonhls"] as? String,
        let download = URL(string: downloadAddress)
    else {
        return nil
    }
    guard
        let captionsAddress = json["videoCC"] as? String,
        let webVtt = URL(string: captionsAddress)
    else {
        return nil
    }
    return (download, webVtt)
}
101 |
/// Builds the `Session` value that represents a year's keynote.
///
/// The keynote is numbered "101", placed in the featured track, tagged with all four focuses,
/// and uses the same URL for both the HD and SD downloads.
///
/// - Parameters:
///   - year: Conference year.
///   - download: Video download URL, used for both HD and SD.
///   - image: Optional image URL.
///   - webVtt: WebVTT captions URL.
fileprivate func makeKeynoteSession(for year: Int, download: URL, image: URL?, webVtt: URL) -> Session {
    let allFocuses: [Focus] = [.iOS, .macOS, .tvOS, .watchOS]
    let keynote = Session(
        conference: .wwdc,
        description: "WWDC \(year) Keynote",
        downloadHD: download,
        downloadSD: download,
        duration: nil,
        focuses: allFocuses,
        image: image,
        number: "101",
        title: "Keynote",
        track: .featured,
        webVtt: webVtt,
        year: "\(year)"
    )
    return keynote
}
118 |
/// Scrapes every session listed on the WWDC videos page for `year`.
///
/// - Parameters:
///   - year: Conference year whose listing page is fetched.
///   - filterSession: When non-nil, only the session with this number is scraped.
/// - Returns: The sessions found across all track groups, or an empty array when the listing
///   page cannot be read.
fileprivate func scrapeSessions(from year: Int, filterBy filterSession: String? = nil) -> [Session] {
    let listingUrl = wwdcVideosUrl(for: year)
    guard let yearDoc = try? HTML(url: listingUrl, encoding: .utf8) else {
        if verboseEnabled { print("could not read URL for year \(year)") }
        return []
    }

    if verboseEnabled { print("Scraping \(listingUrl)") }

    var sessions: [Session] = []

    yearDoc.xpath("//li[contains(@class, 'collection-focus-group')]").forEach { li in
        guard case .NodeSet(let nodes) = li.xpath("child::*") else {
            return
        }
        // Forward the filter explicitly so the per-track scraper honors this function's
        // argument instead of reading the global of the same name.
        sessions.append(contentsOf: scrapeSessions(from: year, inTrackWith: nodes, filterBy: filterSession))
    }
    return sessions
}

/// Scrapes the sessions of a single track group.
///
/// - Parameters:
///   - year: Conference year the nodes belong to.
///   - nodes: Child nodes of a track's list item. The first node is read as the track header
///     and the last as the container of the session anchors.
///   - sessionFilter: When non-nil, only the session with this number is scraped.
/// - Returns: The sessions that could be fully scraped. Sessions whose page, details, or video
///   resources cannot be read are skipped, as is the whole group when its track name is not
///   a known `Track`.
fileprivate func scrapeSessions(
    from year: Int,
    inTrackWith nodes: XMLNodeSet,
    filterBy sessionFilter: String? = nil
) -> [Session] {
    guard
        let header = nodes.first,
        let items = nodes.last,
        let trackName = header.content?.trimmingCharacters(in: .whitespacesAndNewlines)
    else {
        if verboseEnabled { print("could not parse node set: \(String(describing: nodes.toHTML))") }
        return []
    }
    if verboseEnabled { print("Scanning sessions in track: \(trackName)") }

    // Scraped text is untrusted: an unknown track name must not crash the whole run.
    guard let track = Track(rawValue: trackName) else {
        if verboseEnabled { print("skipping unrecognized track: \(trackName)") }
        return []
    }

    let yearString = String(year)
    var sessions: [Session] = []
    var sessionImages: [Identifier: URL] = [:]

    items.xpath(".//a").forEach { anchor in
        let number = scrapeSessionNumber(from: anchor)
        let identifier = Session.makeIdentifier(conference: .wwdc, year: yearString, number: number)

        // Sessions before 2015 do not have an image. If this is an image anchor, dig the URL,
        // cache it, and move on to the next anchor.
        if let imageUrl = scrapeSessionImage(from: anchor) {
            sessionImages[identifier] = imageUrl
            return
        }

        // `anchor` is the title link, which any image link precedes.
        let imageUrl = sessionImages[identifier]

        let title = scrapeSessionTitle(from: anchor)
        let webpageUrl = scrapeSessionPageUrl(from: anchor)

        if let filter = sessionFilter, number != filter {
            return
        }

        guard let sessionDoc = try? HTML(url: webpageUrl, encoding: .utf8) else {
            if verboseEnabled { print("could not read session page: \(webpageUrl.absoluteString)") }
            return
        }

        if verboseEnabled { print("Scraping \(year) session #\(number)...", terminator: "") }

        guard let (description, focuses) = scrapeSessionDetails(from: sessionDoc) else {
            if verboseEnabled { print("could not find the description of \(year) session #\(number)") }
            return
        }

        guard let (sdVideoUrl, hdVideoUrl) = scrapeSessionResources(from: sessionDoc) else {
            if verboseEnabled { print("could not find any resources for \(year) session #\(number)") }
            return
        }

        let session = Session(
            conference: .wwdc,
            description: description,
            downloadHD: hdVideoUrl,
            downloadSD: sdVideoUrl,
            duration: nil,
            // Focus names that do not map to a known `Focus` are dropped.
            focuses: focuses.components(separatedBy: ", ").compactMap(Focus.init(rawValue:)),
            image: imageUrl,
            number: number,
            title: title,
            track: track,
            year: yearString
        )

        if verboseEnabled { print("done.") }

        sessions.append(session)
    }
    return sessions
}
212 |
/// Derives a session's number from its anchor: the last path component of the linked page URL.
fileprivate func scrapeSessionNumber(from anchor: Kanna.XMLElement) -> String {
    let pageUrl = scrapeSessionPageUrl(from: anchor)
    return pageUrl.lastPathComponent
}
216 |
/// Reads an image URL from an anchor whose first child element carries a `src` attribute.
///
/// - Returns: The URL in `src`, or `nil` when the anchor has no child element, the child has
///   no `src`, or the value is not a valid URL.
fileprivate func scrapeSessionImage(from anchor: Kanna.XMLElement) -> URL? {
    if let firstChild = anchor.xpath("child::*").first, let source = firstChild["src"] {
        return URL(string: source)
    }
    return nil
}
227 |
/// Returns the anchor's text with surrounding whitespace and newlines removed, or an empty
/// string when the anchor has no text.
fileprivate func scrapeSessionTitle(from anchor: Kanna.XMLElement) -> String {
    guard let rawTitle = anchor.text else {
        return ""
    }
    return rawTitle.trimmingCharacters(in: .whitespacesAndNewlines)
}
231 |
/// Resolves the anchor's `href` against `baseUrl`.
///
/// - Returns: The resolved URL, or `baseUrl` itself when the anchor has no `href` or the
///   value cannot be turned into a URL.
fileprivate func scrapeSessionPageUrl(from anchor: Kanna.XMLElement) -> URL {
    let resolved = anchor["href"].flatMap { URL(string: $0, relativeTo: baseUrl) }
    return resolved ?? baseUrl
}
241 |
/// Reads a session's description and focus list from its page.
///
/// Only the first qualifying "details" list item is examined. Its first paragraph is the
/// description; its second paragraph is split on " - " and the last piece is returned as the
/// focuses string.
///
/// - Parameter doc: Parsed session page.
/// - Returns: The description and raw focuses text, or `nil` when no qualifying item exists
///   or the first one lacks two paragraphs with text.
fileprivate func scrapeSessionDetails(from doc: HTMLDocument) -> (description: String, focuses: String)? {
    let candidates = doc.xpath("//li[contains(@data-supplement-id, 'details')]")

    for candidate in candidates {
        // The tab element shares the supplement id; it is told apart by a class attribute
        // that lacks "supplement details". Items without a class attribute are kept.
        let isContent = candidate["class"].map { $0.range(of: "supplement details") != nil } ?? true
        guard isContent else {
            continue
        }

        let paragraphs = candidate.xpath(".//p")
        if paragraphs.count < 2 {
            return nil
        }

        guard let description = paragraphs[0].text, let tagsLine = paragraphs[1].text else {
            return nil
        }

        guard let focuses = tagsLine.components(separatedBy: " - ").last else {
            return nil
        }
        return (description, focuses)
    }

    return nil
}
272 |
/// Finds a session's SD and HD video download URLs on its page.
///
/// Anchors are matched by their exact inner HTML, "SD Video" and "HD Video". Other resources
/// are not handled yet, and a list item that offers only one of the two videos is passed over.
///
/// - Parameter doc: Parsed session page.
/// - Returns: Both video URLs from the first list item that provides them, or `nil`.
fileprivate func scrapeSessionResources(from doc: HTMLDocument) -> (sdVideoUrl: URL, hdVideoUrl: URL)? {
    for item in doc.xpath("//li[contains(@data-supplement-id, 'details')]") {
        // Skip the tab element, it's the tab's content we want.
        let isContent = item["class"].map { $0.range(of: "supplement details") != nil } ?? true
        guard isContent else {
            continue
        }

        var standardDefinition: URL?
        var highDefinition: URL?

        for anchor in item.xpath(".//a") {
            guard let label = anchor.innerHTML else {
                continue
            }
            guard label == "HD Video" || label == "SD Video" else {
                continue
            }
            guard let href = anchor["href"], let videoUrl = URL(string: href) else {
                continue
            }
            if label == "HD Video" {
                highDefinition = videoUrl
            } else {
                standardDefinition = videoUrl
            }
        }

        if let sd = standardDefinition, let hd = highDefinition {
            return (sd, hd)
        }
    }
    return nil
}
311 |
--------------------------------------------------------------------------------
/Sources/Identifiable.swift:
--------------------------------------------------------------------------------
/// A string-backed identifier. Equality and hashing are both defined by `rawValue`.
struct Identifier: RawRepresentable, IdentifierProtocol {
    let rawValue: String

    init(rawValue: String) {
        self.rawValue = rawValue
    }

    static func == (lhs: Identifier, rhs: Identifier) -> Bool {
        return lhs.rawValue == rhs.rawValue
    }

    var hashValue: Int {
        return rawValue.hashValue
    }
}

/// Requirement placed on the identifier type of an `Identifiable`; adds nothing beyond `Hashable`.
protocol IdentifierProtocol: Hashable {}
24 |
/// A hashable type that exposes an identifier.
protocol Identifiable: Hashable {
    /// Concrete identifier type.
    associatedtype IdentifierType: IdentifierProtocol

    /// The value's identifier.
    var identifier: IdentifierType { get }
}

extension Identifiable {
    /// Default hash: the hash of `identifier`.
    var hashValue: Int {
        let key = identifier
        return key.hashValue
    }
}
35 |
--------------------------------------------------------------------------------
/Sources/Session+JSON.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 |
extension Session {
    /// Dictionary form of the session, as handed to `JSONSerialization` for the JSON output.
    ///
    /// `duration`, `image`, and `year` are included only when a value is available; `year`
    /// is emitted as an integer and omitted when the stored string is not numeric.
    var dictionary: [String: Any] {
        var json: [String: Any] = [
            "description": description,
            "download_hd": downloadHD.absoluteString,
            "download_sd": downloadSD.absoluteString,
            "focus": focuses.map({ $0.rawValue }),
            "id": number,
            "track": track.rawValue,
            "title": title,
        ]
        if let seconds = duration {
            json["duration"] = seconds
        }
        if let imageUrl = image {
            json["image"] = imageUrl.absoluteString
        }
        if let numericYear = Int(year) {
            json["year"] = numericYear
        }
        return json
    }
}
19 |
--------------------------------------------------------------------------------
/Sources/Session.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 |
/// Platform focus of a session. Raw values equal the case names.
enum Focus: String {
    case iOS, macOS, tvOS, watchOS
}
9 |
/// Track a session belongs to. Raw values are the track names as they appear on the
/// videos listing pages.
enum Track: String {
    // Tracks in use from 2015 on.
    case appFrameworks = "App Frameworks", systemFrameworks = "System Frameworks"
    case developerTools = "Developer Tools", featured = "Featured"
    case graphicsAndGames = "Graphics and Games", design = "Design"
    case media = "Media", distribution = "Distribution"

    // pre-2015 tracks
    case appServices = "App Services", coreOS = "Core OS"
    case essentials = "Essentials", general = "General"
    case graphicsMediaAndGames = "Graphics, Media & Games", safariAndWeb = "Safari & Web"
    case frameworks = "Frameworks", services = "Services"
    case specialEvents = "Special Events", tools = "Tools"
}
32 |
/// Conference a session was presented at. The raw value is used when building identifiers.
enum Conference: String {
    /// Apple's Worldwide Developers Conference.
    case wwdc = "WWDC"
}
36 |
/// Metadata scraped for one conference session.
struct Session: Identifiable {
    let identifier: Identifier
    let conference: Conference
    let description: String
    let downloadHD: URL
    let downloadSD: URL
    let duration: Int?
    let focuses: [Focus]
    let image: URL?
    let number: String
    let title: String
    let track: Track
    let year: String
    let webVtt: URL

    /// Creates a session. The identifier is derived from `conference`, `year`, and `number`.
    ///
    /// - Parameter webVtt: Captions URL. When omitted, it is derived from `downloadSD` by
    ///   dropping the query and file extension and pointing at
    ///   `subtitles/eng/<basename>.vtt` next to the video.
    init(
        conference: Conference,
        description: String,
        downloadHD: URL,
        downloadSD: URL,
        duration: Int?,
        focuses: [Focus],
        image: URL?,
        number: String,
        title: String,
        track: Track,
        webVtt: URL? = nil,
        year: String) {

        self.conference = conference
        self.year = year
        self.number = number
        self.identifier = Session.makeIdentifier(conference: conference, year: year, number: number)

        self.title = title
        self.description = description
        self.track = track
        self.focuses = focuses
        self.duration = duration
        self.image = image

        self.downloadHD = downloadHD
        self.downloadSD = downloadSD
        self.webVtt = webVtt ?? Session.conventionalWebVtt(for: downloadSD)
    }

    /// Builds the captions URL expected to sit beside `video`.
    private static func conventionalWebVtt(for video: URL) -> URL {
        let extensionless = video.deletingQuery.deletingPathExtension()
        let basename = extensionless.lastPathComponent
        return extensionless
            .deletingLastPathComponent()
            .appendingPathComponent("subtitles/eng/\(basename).vtt")
    }
}
91 |
extension Session {
    /// Builds a session identifier of the form `<conference>-<year>-<number>`.
    static func makeIdentifier(conference: Conference, year: String, number: String) -> Identifier {
        let rawValue = "\(conference.rawValue)-\(year)-\(number)"
        return Identifier(rawValue: rawValue)
    }
}
97 |
extension Session: Equatable {

    /// Compares year, number, description, both downloads, duration, focuses, image, title,
    /// and track. `conference`, `identifier`, and `webVtt` do not take part in the comparison.
    static func == (lhs: Session, rhs: Session) -> Bool {
        guard lhs.year == rhs.year, lhs.number == rhs.number else {
            return false
        }
        guard lhs.description == rhs.description else {
            return false
        }
        guard lhs.downloadHD == rhs.downloadHD, lhs.downloadSD == rhs.downloadSD else {
            return false
        }
        guard lhs.duration == rhs.duration, lhs.focuses == rhs.focuses else {
            return false
        }
        guard lhs.image == rhs.image, lhs.title == rhs.title else {
            return false
        }
        return lhs.track == rhs.track
    }
}
113 |
extension Session: Hashable {
    /// Hash of the session's identifier.
    var hashValue: Int {
        let key = identifier
        return key.hashValue
    }
}
119 |
--------------------------------------------------------------------------------
/Sources/URLExtensions.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 |
public extension URL {
    /// A copy of this URL without its query string.
    ///
    /// If the URL cannot be broken into components, or the components cannot be reassembled,
    /// the URL itself is returned unchanged.
    var deletingQuery: URL {
        var parts = URLComponents(url: self, resolvingAgainstBaseURL: true)
        parts?.query = nil
        return parts?.url ?? self
    }
}
16 |
--------------------------------------------------------------------------------
/Sources/main.swift:
--------------------------------------------------------------------------------
1 | import Guaka
2 |
// Attach the subcommands to the root command before running it.
setupCommands()

// Hand control to the root command.
rootCommand.execute()
6 |
--------------------------------------------------------------------------------
/Sources/meta.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | import Guaka
3 | import HTMLEntities
4 | import Yams
5 |
/// The `meta` subcommand; set up by `configuration(command:)` and run by `execute(flags:args:)`.
var metaCommand = Command(usage: "meta", configuration: configuration, run: execute)

/// `-f`/`--format` flag selecting the output format. Optional and not inherited by subcommands.
fileprivate let outputFormatOption = Flag(
    shortName: "f",
    longName: "format",
    type: String.self,
    description: "output format: json (default, for Major Input) or yaml (for ASCIIwwdc)",
    required: false,
    inheritable: false
)

/// Output formats accepted by the `--format` flag; raw values are the accepted flag values.
fileprivate enum Format: String {
    /// Format used by Major Input. Includes all scraped information.
    case json
    /// Format used by ASCIIwwdc.com. Includes limited information.
    case yaml
}

/// Format chosen for this run; stays `.json` unless the pre-run hook overrides it.
fileprivate var format: Format = .json
25 |
/// Configures the `meta` command: help messages, the format flag, and a pre-run hook that
/// validates the requested format.
///
/// The hook leaves `format` untouched when the flag is absent, and aborts the run (returns
/// `false`) after printing a message when the value is not a known `Format`.
private func configuration(command: Command) {
    command.shortMessage = "Collect session information in a file."
    command.longMessage = "Collect meta information on each session, write it to a file."

    command.add(flags: [outputFormatOption])

    command.preRun = { flags, args in
        if let requested = flags.getString(name: outputFormatOption.longName) {
            if let parsed = Format(rawValue: requested) {
                format = parsed
            } else {
                print("Invalid output format \"\(requested)\".")
                return false
            }
        }
        return true
    }
}
44 |
/// Runs the `meta` command: scrapes sessions using the global year and session filters, then
/// writes them in the selected format.
private func execute(flags: Flags, args: [String]) {
    let scraped = scrapeSessions(filterBy: filterYear, session: filterSession)

    let write: ([Session]) -> Void
    switch format {
    case .json:
        write = outputJson
    case .yaml:
        write = outputYaml
    }
    write(scraped)
}
54 |
/// Writes the sessions as pretty-printed JSON.
///
/// The destination is `outputPath` when set, otherwise `./sessions.json`. The file is written
/// atomically, matching the YAML output. Failures are always reported on standard error;
/// previously they were dropped silently unless verbose mode was on.
///
/// - Parameter sessions: Sessions to serialize via their `dictionary` representation.
private func outputJson(for sessions: [Session]) {
    let destination = outputPath ?? URL(fileURLWithPath: "./sessions.json")
    do {
        let dictionaries = sessions.map({ $0.dictionary })
        let data = try JSONSerialization.data(withJSONObject: dictionaries, options: .prettyPrinted)
        try data.write(to: destination, options: .atomic)
    } catch {
        let message = "could not write JSON to \(destination.path): \(error)\n"
        FileHandle.standardError.write(message.data(using: .utf8) ?? Data())
    }
}
65 |
/// Writes the sessions as YAML in the ASCIIwwdc layout.
///
/// The destination is `outputPath` when set, otherwise `./sessions.yml`. Failures are always
/// reported on standard error; previously they were dropped silently unless verbose mode
/// was on.
///
/// - Parameter sessions: Sessions to serialize via `asciiWwdcYamlObject`.
private func outputYaml(for sessions: [Session]) {
    let destination = outputPath ?? URL(fileURLWithPath: "./sessions.yml")
    do {
        let yaml = try dump(object: sessions.asciiWwdcYamlObject, width: -1)
        try yaml.write(to: destination, atomically: true, encoding: .utf8)
    } catch {
        let message = "could not write YAML to \(destination.path): \(error)\n"
        FileHandle.standardError.write(message.data(using: .utf8) ?? Data())
    }
}
74 |
extension Array where Element == Session {
    /// Provides an object emittable as YAML of the form expected by ASCIIwwdc.
    ///
    /// Each session becomes an entry keyed by its number, holding `:title`, `:track`, and
    /// `:description`. A later session with the same number replaces an earlier one.
    var asciiWwdcYamlObject: [Node: NodeRepresentable] {
        return reduce(into: [Node: NodeRepresentable]()) { mapping, session in
            let entry: [String: Node] = [
                ":title": Node(session.title.asciiwwdcEscaped),
                ":track": Node(session.track.rawValue),
                ":description": Node(session.description.asciiwwdcEscaped)
            ]
            mapping[Node(session.number)] = entry
        }
    }
}
91 |
private extension String {
    /// Mimic escaping found in existing asciiwwdc.com YAML.
    ///
    /// Yams emits colon-containing strings in single quotes, but asciiwwdc.com may not expect this.
    /// asciiwwdc.com may expect HTML entities and not unicode escape sequences.
    var asciiwwdcEscaped: String {
        // NOTE(review): the colon replacement used to map ":" to ":" — a no-op. A decimal
        // HTML entity is assumed to be the intended replacement; confirm against existing
        // asciiwwdc.com data. It runs after `htmlEscape` so the entity's "&" is not escaped
        // a second time.
        return self
            .htmlEscape(allowUnsafeSymbols: false, decimal: true, encodeEverything: false, useNamedReferences: true)
            .replacingOccurrences(of: ":", with: "&#58;")
    }
}
103 |
--------------------------------------------------------------------------------
/Sources/root.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | import Guaka
3 |
/// The top-level `jonyfive` command; subcommands are attached to it in `setupCommands()`.
var rootCommand = Command(usage: "jonyfive", configuration: configuration, run: execute)

/// `-v`/`--verbose`: boolean flag, off by default, inherited by subcommands.
private let verboseOption = Flag(
    shortName: "v",
    longName: "verbose",
    value: false,
    description: "show work along the way",
    inheritable: true
)

/// `-y`/`--year`: optional integer year filter, inherited by subcommands.
private let yearOption = Flag(
    shortName: "y",
    longName: "year",
    type: Int.self,
    description: "filter by year",
    required: false,
    inheritable: true
)

/// `-s`/`--session`: optional session filter, inherited by subcommands.
private let sessionOption = Flag(
    shortName: "s",
    longName: "session",
    type: String.self,
    description: "filter by session",
    required: false,
    inheritable: true
)

/// `-o`/`--output`: optional output path, inherited by subcommands.
private let outputPathOption = Flag(
    shortName: "o",
    longName: "output",
    type: String.self,
    description: "output path",
    required: false,
    inheritable: true
)

// Parsed flag values, written by the inheritable pre-run hook and read by the subcommands.
var verboseEnabled = false
var filterYear: Int?
var filterSession: String?
var outputPath: URL?
45 |
/// Configures the root command: its help text, the shared flags, and an inheritable pre-run
/// hook that copies flag values into the file's global settings.
///
/// The hook aborts (returns `false`) after printing a message when the year is outside
/// 2012 through 2017, or when a session filter is given without a year filter.
private func configuration(command: Command) {
    command.longMessage = "Collect public information available at Apple's developer site and act on it in various ways."
    command.add(flags: [verboseOption, yearOption, sessionOption, outputPathOption])

    command.inheritablePreRun = { flags, args in

        if let enabled = flags.getBool(name: verboseOption.longName) {
            verboseEnabled = enabled
        }

        if let year = flags.getInt(name: yearOption.longName) {
            // TODO: can we hook into validation to fail the command?
            guard (2012...2017).contains(year) else {
                print("Year not supported: \(year)")
                return false
            }
            filterYear = year
        }

        if let session = flags.getString(name: sessionOption.longName) {
            // Relies on the year flag having been processed just above.
            guard filterYear != nil else {
                print("Session filtering requires year filtering. Use `--year` flag to select a year.")
                return false
            }
            filterSession = session
        }

        if let output = flags.getString(name: outputPathOption.longName) {
            outputPath = URL(fileURLWithPath: output)
        }

        return true
    }
}
81 |
/// Run handler of the root command. Intentionally empty: the root command performs no work
/// of its own in this file; the subcommands do.
private func execute(flags: Flags, args: [String]) {
}
84 |
--------------------------------------------------------------------------------
/Sources/setup.swift:
--------------------------------------------------------------------------------
1 | import Guaka
2 |
3 | // Generated, dont update
/// Attaches the `meta` and `webvtt` subcommands to the root command.
func setupCommands() {
    rootCommand.add(subCommand: metaCommand)
    rootCommand.add(subCommand: webvttCommand)
    // Command adding placeholder, edit this line
}
9 |
--------------------------------------------------------------------------------
/Sources/webvtt.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | import Guaka
3 | import FileUtils
4 |
5 | var webvttCommand = Command(
6 | usage: "webvtt", configuration: configuration, run: execute)
7 |
/// `--playlist` / `-p`: optional, non-inheritable Boolean flag selecting how WebVTT cues are
/// acquired. `true` concatenates the HLS subtitles playlist files; `false` downloads the file
/// directly. When the flag is absent, `preRun` treats it as `false`.
fileprivate let methodOption = Flag(
  shortName: "p",
  longName: "playlist",
  type: Bool.self,
  description: "WebVTT cues acquisition method. Default is false for direct download.\n\t[true] Concatenate HLS subtitles playlists.\n\t[false] Download file from the URL with expected format.",
  required: false,
  inheritable: false
)
16 |
/// `--fallback` / `-f`: optional, non-inheritable Boolean flag controlling whether the other
/// acquisition method is attempted when the selected one fails. When the flag is absent, `preRun`
/// treats it as `true`.
fileprivate let fallbackOption = Flag(
  shortName: "f",
  longName: "fallback",
  type: Bool.self,
  description: "On failure of acquisition method, fall back to other methods. Default is true.",
  required: false,
  inheritable: false
)
25 |
/// A single way of acquiring a session's WebVTT cues.
fileprivate enum AcquisitionMethod {
  /// Fetch the complete WebVTT file from its likely location. The full file is commonly found at
  /// a URL following a particular format, though this does not work for all sessions.
  case directDownload
  /// Read the session's HLS master playlist, read its subtitles media playlist, fetch each file
  /// in the sequence, and concatenate the files. Post-processing then eliminates the artifacts
  /// of concatenation.
  case subtitlesPlaylist
}
36 |
/// Which acquisition methods to try for a session, and in what order.
///
/// The two acquisition methods yield different result sets. Timing can differ by milliseconds,
/// which is largely inconsequential. Transcription content may also differ: most notably, the
/// direct download transcript typically includes a closing caption ("Thank you", or
/// "[Applause]") that the streaming captions do not include.
fileprivate enum AcquisitionStrategy {
  /// Concatenate the HLS subtitles playlist files; never fall back.
  case onlySubtitlesPlaylist
  /// Download the file directly; never fall back.
  case onlyDirectDownload
  /// Try subtitles playlist concatenation first, then fall back to direct download if necessary.
  case preferSubtitlesPlaylist
  /// Try direct download first, then fall back to subtitles playlist concatenation if necessary.
  case preferDirectDownload

  /// The methods to attempt, in priority order.
  var methods: [AcquisitionMethod] {
    switch self {
    case .onlySubtitlesPlaylist: return [.subtitlesPlaylist]
    case .onlyDirectDownload: return [.directDownload]
    case .preferSubtitlesPlaylist: return [.subtitlesPlaylist, .directDownload]
    case .preferDirectDownload: return [.directDownload, .subtitlesPlaylist]
    }
  }
}
66 |
/// Strategy read by `execute` when acquiring cues. The `preRun` closure installed in
/// `configuration` overwrites this initial value based on the `--playlist` and `--fallback` flags.
fileprivate var acquisitionStrategy: AcquisitionStrategy = .preferSubtitlesPlaylist
68 |
/// Configures the `webvtt` command: sets its help messages, registers its flags, and installs a
/// `preRun` closure that maps the `--playlist` / `--fallback` flag values onto
/// `acquisitionStrategy`.
private func configuration(command: Command) {
  command.shortMessage = "download WebVTT files"
  command.longMessage = "Download each session's WebVTT file, write to disk."

  command.add(flags: [methodOption, fallbackOption])

  command.preRun = { flags, _ in
    // Absent flags fall back to: direct download, with fallback enabled.
    let playlistFirst = flags.getBool(name: methodOption.longName) ?? false
    let allowsFallback = flags.getBool(name: fallbackOption.longName) ?? true

    switch (playlistFirst, allowsFallback) {
    case (true, true):
      acquisitionStrategy = .preferSubtitlesPlaylist
    case (true, false):
      acquisitionStrategy = .onlySubtitlesPlaylist
    case (false, true):
      acquisitionStrategy = .preferDirectDownload
    case (false, false):
      acquisitionStrategy = .onlyDirectDownload
    }

    // Always let the command proceed.
    return true
  }
}
91 |
/// Run handler for the `webvtt` command. Scrapes the sessions matching the year/session filters,
/// creates the output directory plus one subdirectory per session year, then acquires,
/// normalizes, and writes each session's WebVTT text. Sessions for which no text could be
/// acquired are skipped.
private func execute(flags: Flags, args: [String]) {
  let sessions = scrapeSessions(filterBy: filterYear, session: filterSession)

  Directory.create(atPath: path)

  // One subdirectory per distinct year among the scraped sessions.
  for year in Set(sessions.map { String($0.year) }) {
    Directory.create(atPath: [path, year].joined(separator: "/"))
  }

  for session in sessions {
    if verboseEnabled { print("##### \(session.year) session #\(session.number) #####") }

    guard let rawText = acquireWebVttText(for: session, using: acquisitionStrategy.methods) else {
      continue
    }
    write(normalize(rawText), for: session)
  }
}
111 |
/// Root output directory: the path of `outputPath` when one is set, otherwise the current
/// directory (".").
fileprivate var path: String {
  guard let directory = outputPath else {
    return "."
  }
  return directory.path
}
115 |
/// Output file path for a session: `<root>/<year>/<number>.vtt`.
fileprivate func path(for session: Session) -> String {
  let yearDirectory = String(session.year)
  let fileName = "\(session.number).vtt"
  return [path, yearDirectory, fileName].joined(separator: "/")
}
119 |
/// Tries each acquisition method in order and returns the text produced by the first one that
/// succeeds, or `nil` when every method fails (or `methods` is empty).
fileprivate func acquireWebVttText(for session: Session, using methods: [AcquisitionMethod]) -> String? {
  for method in methods {
    guard let text = acquireWebVttText(for: session, using: method) else {
      continue
    }
    return text
  }
  return nil
}
128 |
/// Acquires a session's WebVTT text with a single method: a direct download of the session's
/// `webVtt` URL, or concatenation of its subtitles playlist files. Returns `nil` on failure.
fileprivate func acquireWebVttText(for session: Session, using method: AcquisitionMethod) -> String? {
  let acquired: String?
  switch method {
  case .directDownload:
    acquired = webVttText(from: session.webVtt)
  case .subtitlesPlaylist:
    acquired = concatenateSubtitlesPlaylistFiles(for: session)
  }
  return acquired
}
137 |
/// Fetches the text at `url` and returns it if it looks like a WebVTT document.
///
/// - Parameter url: Location of the candidate WebVTT file.
/// - Returns: The fetched text, or `nil` when the fetch throws or the text does not contain the
///   "WEBVTT" marker.
fileprivate func webVttText(from url: URL) -> String? {
  if verboseEnabled { print("Fetching WebVTT from \(url.absoluteString)") }

  // A definitely-initialized constant replaces the former implicitly unwrapped optional, so the
  // compiler proves the value exists on every path that reaches the marker check.
  let vttText: String
  do {
    vttText = try String(contentsOf: url)
  } catch {
    if verboseEnabled { print("Could not fetch WebVTT at \(url.absoluteString)") }
    return nil
  }

  // Reject responses that lack the WebVTT marker.
  guard vttText.range(of: "WEBVTT") != nil else {
    if verboseEnabled { print("Received non-WebVTT response") }
    return nil
  }

  return vttText
}
155 |
/// Builds a session's WebVTT text by fetching its English subtitles media playlist and
/// concatenating every `.webvtt` file the playlist lists, in playlist order.
///
/// - Parameter session: The session whose subtitles are fetched. The playlist location is derived
///   from the session's `downloadSD` URL (query removed, last path component replaced).
/// - Returns: The concatenated text with surrounding whitespace trimmed, or `nil` when the
///   playlist cannot be fetched, does not start with the `#EXTM3U` signature, any listed file
///   cannot be fetched, or the result is empty.
fileprivate func concatenateSubtitlesPlaylistFiles(for session: Session) -> String? {
  if verboseEnabled { print("Concatenating subtitles media playlist files") }

  let queryless = session.downloadSD.deletingQuery
  let baseUrl = queryless.deletingLastPathComponent()
  let m3u8Url = baseUrl.appendingPathComponent("subtitles/eng/prog_index.m3u8")

  let m3u8Text: String
  do {
    m3u8Text = try String(contentsOf: m3u8Url, encoding: .utf8)
  } catch {
    if verboseEnabled { print("Could not fetch subtitles media playlist: \(error)") }
    // Fail explicitly instead of falling through with an empty playlist.
    return nil
  }

  // The first whitespace-delimited token of a playlist must carry the `#EXTM3U` signature.
  let signature = m3u8Text.components(separatedBy: .whitespacesAndNewlines).first ?? ""
  guard signature.range(of: "#EXTM3U") != nil else {
    if verboseEnabled { print("Subtitles media playlist unavailable") }
    return nil
  }

  // Playlist lines starting with `#` are tags or comments, never segment URIs.
  let files = m3u8Text
    .components(separatedBy: .newlines)
    .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
    .filter { !$0.hasPrefix("#") && $0.range(of: ".webvtt") != nil }

  var vttText = ""
  for file in files {
    let fileUrl = baseUrl.appendingPathComponent("subtitles/eng/\(file)")
    do {
      let fileText = try String(contentsOf: fileUrl, encoding: .utf8)
      vttText.append(fileText)
      // Guarantee a line break between segments so the next segment's `WEBVTT` header can never
      // fuse with the last cue line of this one.
      if let lastScalar = fileText.unicodeScalars.last, lastScalar != "\n" {
        vttText.append("\n")
      }
    } catch {
      if verboseEnabled { print("Could not fetch subtitles sequence file: \(fileUrl)") }
      return nil
    }
  }

  vttText = vttText.trimmingCharacters(in: .whitespacesAndNewlines)
  return vttText.isEmpty ? nil : vttText
}
202 |
/// Cleans up artifacts left by concatenating subtitle media playlist files. The steps run in a
/// fixed order: CRLF to LF conversion, removal of repeated `WEBVTT` signatures, removal of
/// `X-TIMESTAMP-MAP` headers, and removal of repeated cues.
fileprivate func normalize(_ vttText: String) -> String {
  let unixLineEndings = removeCarriageReturns(from: vttText)
  let singleSignature = removeRedundantFileSignatures(from: unixLineEndings)
  let withoutTimestampHeaders = removeTimestampHeaders(from: singleSignature)
  return removeRedundantCues(from: withoutTimestampHeaders)
}
211 |
/// Converts every CRLF line break to LF. Line breaks within cue text are CRLF while the rest of
/// the file uses LF; a uniform LF file simplifies all later processing. A carriage return that
/// is not followed by a line feed is left in place.
fileprivate func removeCarriageReturns(from text: String) -> String {
  let segments = text.components(separatedBy: "\r\n")
  return segments.joined(separator: "\n")
}
217 |
/// A WebVTT file begins with the file signature "WEBVTT" to identify it as such. Concatenating
/// sequence files introduces additional signature lines; this removes every signature line and
/// prepends a single `WEBVTT` line.
///
/// Only genuine signature lines are removed: a line that (ignoring a byte order mark) is exactly
/// `WEBVTT`, or `WEBVTT` followed by a space or tab and optional header text. Cue text that
/// merely mentions "WEBVTT" is preserved.
///
/// - Parameter webVttText: WebVTT text, possibly containing several signature lines.
/// - Returns: The text with one leading `WEBVTT` line and no other signature lines.
fileprivate func removeRedundantFileSignatures(from webVttText: String) -> String {
  let byteOrderMark = CharacterSet(charactersIn: "\u{FEFF}")
  let remainingLines = webVttText
    .components(separatedBy: .newlines)
    .filter { line in
      let candidate = line.trimmingCharacters(in: byteOrderMark)
      let isSignature = candidate == "WEBVTT"
        || candidate.hasPrefix("WEBVTT ")
        || candidate.hasPrefix("WEBVTT\t")
      return !isSignature
    }
  return "WEBVTT\n" + remainingLines.joined(separator: "\n")
}
226 |
/// The `X-TIMESTAMP-MAP` header synchronizes timestamps between audio and video. In a monolithic
/// WebVTT file, synchronization is unnecessary, so every line containing that header is removed.
///
/// Removing header lines can leave runs of blank lines behind. Each run is collapsed to a single
/// blank line, so that cues stay separated by exactly one blank line however many headers were
/// adjacent.
///
/// - Parameter webVttText: WebVTT text using LF line breaks.
/// - Returns: The text without `X-TIMESTAMP-MAP` lines and without consecutive blank lines.
fileprivate func removeTimestampHeaders(from webVttText: String) -> String {
  var text = webVttText
    .components(separatedBy: .newlines)
    .filter { $0.range(of: "X-TIMESTAMP-MAP") == nil }
    .joined(separator: "\n")

  // A single replacement pass leaves three newlines behind for every run of four or more, so
  // repeat until no run of three remains.
  while text.range(of: "\n\n\n") != nil {
    text = text.replacingOccurrences(of: "\n\n\n", with: "\n\n")
  }
  return text
}
239 |
240 | /// Cues are intended to be unique, and concatenation introduces redundant cues.
241 | /// - note: Handles one cue repeated once. Does not handle multiple repetition or repetition of
242 | /// multiple cues.
243 | fileprivate func removeRedundantCues(from webVttText: String) -> String {
244 | // Hypothesis is that splitting on "\n\n" effectively chunks by cue, which supports a minimal
245 | // comparison of consecutive elements. Line-by-line processing would be significantly more
246 | // complex with multiple comparisons per element and handling false matching of empty lines.
247 | let lines = webVttText.components(separatedBy: "\n\n")
248 |
249 | guard lines.count > 1 else {
250 | return webVttText
251 | }
252 |
253 | let firstElements = lines[0..