├── .gitignore ├── LICENSE ├── Package.resolved ├── Package.swift ├── README.md └── Sources ├── AppleComScraper.swift ├── Identifiable.swift ├── Session+JSON.swift ├── Session.swift ├── URLExtensions.swift ├── main.swift ├── meta.swift ├── root.swift ├── setup.swift └── webvtt.swift /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/macos,xcode,swiftpackagemanager 3 | 4 | ### macOS ### 5 | *.DS_Store 6 | .AppleDouble 7 | .LSOverride 8 | 9 | # Icon must end with two \r 10 | Icon 11 | 12 | # Thumbnails 13 | ._* 14 | 15 | # Files that might appear in the root of a volume 16 | .DocumentRevisions-V100 17 | .fseventsd 18 | .Spotlight-V100 19 | .TemporaryItems 20 | .Trashes 21 | .VolumeIcon.icns 22 | .com.apple.timemachine.donotpresent 23 | 24 | # Directories potentially created on remote AFP share 25 | .AppleDB 26 | .AppleDesktop 27 | Network Trash Folder 28 | Temporary Items 29 | .apdisk 30 | 31 | ### SwiftPackageManager ### 32 | Packages 33 | .build 34 | xcuserdata 35 | *.xcodeproj 36 | DerivedData/ 37 | 38 | ### Xcode ### 39 | # Xcode 40 | # 41 | # gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore 42 | 43 | ## Build generated 44 | build/ 45 | 46 | ## Various settings 47 | *.pbxuser 48 | !default.pbxuser 49 | *.mode1v3 50 | !default.mode1v3 51 | *.mode2v3 52 | !default.mode2v3 53 | *.perspectivev3 54 | !default.perspectivev3 55 | xcuserdata/ 56 | 57 | ## Other 58 | *.moved-aside 59 | *.xccheckout 60 | *.xcscmblueprint 61 | 62 | ### Xcode Patch ### 63 | *.xcodeproj/* 64 | !*.xcodeproj/project.pbxproj 65 | !*.xcodeproj/xcshareddata/ 66 | !*.xcworkspace/contents.xcworkspacedata 67 | /*.gcno 68 | 69 | # End of https://www.gitignore.io/api/macos,xcode,swiftpackagemanager 70 | 71 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Michael Pederson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Package.resolved: -------------------------------------------------------------------------------- 1 | { 2 | "object": { 3 | "pins": [ 4 | { 5 | "package": "FileUtils", 6 | "repositoryURL": "https://github.com/oarrabi/FileUtils.git", 7 | "state": { 8 | "branch": null, 9 | "revision": "f4a5dcbfe912ee5076b6417c90aac5bfa7b12f55", 10 | "version": "0.1.1" 11 | } 12 | }, 13 | { 14 | "package": "Guaka", 15 | "repositoryURL": "https://github.com/oarrabi/Guaka.git", 16 | "state": { 17 | "branch": null, 18 | "revision": "9dfb2ea95b8ab9e799d0f65a79a2316679a758e0", 19 | "version": "0.1.3" 20 | } 21 | }, 22 | { 23 | "package": "Kanna", 24 | "repositoryURL": "https://github.com/tid-kijyun/Kanna.git", 25 | "state": { 26 | "branch": null, 27 | "revision": "44b169e1698d596f2eed698d8a67558fb0542b2a", 28 | "version": "4.0.0" 29 | } 30 | }, 31 | { 32 | "package": "Regex", 33 | "repositoryURL": "https://github.com/sharplet/Regex.git", 34 | "state": { 35 | "branch": null, 36 | "revision": "3e671ed911b467c0d9c05e56f03d9e5bcb535f39", 37 | "version": "1.1.0" 38 | } 39 | }, 40 | { 41 | "package": "StringScanner", 42 | "repositoryURL": "https://github.com/oarrabi/StringScanner", 43 | "state": { 44 | "branch": null, 45 | "revision": "246c697efe2f57d9042f58b1b53ace4fddb1efc4", 46 | "version": "0.2.0" 47 | } 48 | }, 49 | { 50 | "package": "HTMLEntities", 51 | "repositoryURL": "https://github.com/IBM-Swift/swift-html-entities.git", 52 | "state": { 53 | "branch": null, 54 | "revision": "9112c12bc5cc2b20fcaf1393c65a9cbbfda5a155", 55 | "version": "3.0.10" 56 | } 57 | }, 58 | { 59 | "package": "SwiftClibxml2", 60 | "repositoryURL": "https://github.com/tid-kijyun/SwiftClibxml2.git", 61 | "state": { 62 | "branch": null, 63 | "revision": "c4e67cc970273fc2bee978d12e422974ff184de7", 64 | "version": "1.0.2" 65 | } 66 | }, 67 | { 68 | "package": "Yams", 69 | "repositoryURL": "https://github.com/jpsim/Yams.git", 70 | 
"state": { 71 | "branch": null, 72 | "revision": "618582e09699b577fa183bab7d88e3ee7d9a1d19", 73 | "version": "1.0.0" 74 | } 75 | } 76 | ] 77 | }, 78 | "version": 1 79 | } 80 | -------------------------------------------------------------------------------- /Package.swift: -------------------------------------------------------------------------------- 1 | // swift-tools-version:4.1 2 | 3 | import PackageDescription 4 | 5 | let package = Package( 6 | name: "jonyfive", 7 | dependencies: [ 8 | .package(url: "https://github.com/IBM-Swift/swift-html-entities.git", from: "3.0.0"), 9 | .package(url: "https://github.com/jpsim/Yams.git", from: "1.0.0"), 10 | .package(url: "https://github.com/nsomar/FileUtils", from: "0.0.0"), 11 | .package(url: "https://github.com/nsomar/Guaka", from: "0.0.0"), 12 | .package(url: "https://github.com/sharplet/Regex.git", from: "1.0.0"), 13 | .package(url: "https://github.com/tid-kijyun/Kanna.git", from: "4.0.0"), 14 | ], 15 | targets: [ 16 | .target( 17 | name: "jonyfive", 18 | dependencies: [ 19 | "HTMLEntities", 20 | "Yams", 21 | "FileUtils", 22 | "Guaka", 23 | "Regex", 24 | "Kanna", 25 | ], 26 | path: "Sources" 27 | ) 28 | ] 29 | ) 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ⚠️ Archived 2 | This tool is currently broken. For starters, the XPaths are hopelessly out of date. 3 | 4 | This tool was originally very hastily slapped together to the point of working, in support of [`my fork`](https://github.com/rlwimi/wwdc-session-transcripts) of [`ASCIIwwdc/wwdc-session-transcripts`](https://github.com/ASCIIwwdc/wwdc-session-transcripts) while that repo seemed abandoned, both in support of [`rlwimi/major-input`](https://github.com/rlwimi/major-input). 5 | 6 | I still see value in a tool like this. 
If I revive the project, I imagine moving to [`apple/swift-argument-parser`](https://github.com/apple/swift-argument-parser) and [Foundation's XML/XPath support](https://developer.apple.com/documentation/foundation/archives_and_serialization/xml_processing_and_modeling). 7 | 8 | --- 9 | 10 | # jonyfive 11 | 12 | CLI tool for doing useful things after scraping WWDC session metadata. 13 | 14 | ## Usage 15 | 16 | Once you are up and running, use the `-h`/`--help` flags for more information about the interface and functionality. 17 | 18 | ## Getting Started 19 | 20 | 1. Follow the installation instructions for the [Kanna](https://github.com/tid-kijyun/Kanna) dependency, specifically the instructions for Swift 4 via Swift Package Manager. 21 | 1. Run `swift package update`. 22 | 1. Optionally run `swift package generate-xcodeproj` if you're interested in working with the implementation in Xcode. 23 | 1. Run `swift build`. 24 | 1. Find the executable at `.build/debug/jonyfive`. 25 | 26 | ## Warning 27 | 28 | Web scraping is fragile by nature, and small changes to the page HTML naming or structure can break this tool. You may be the first to notice such breakage; if so, please file an issue. 29 | 30 | ## Thanks 31 | 32 | Based on [Guaka](https://github.com/nsomar/Guaka) and [Kanna](https://github.com/tid-kijyun/Kanna). 33 | 34 | ## License 35 | 36 | This tool is provided under the terms of the [LICENSE](https://github.com/rlwimi/jonyfive/blob/master/LICENSE). 37 | 38 | All content copyright © 2010 – 2017 Apple Inc. All rights reserved. 39 | -------------------------------------------------------------------------------- /Sources/AppleComScraper.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | import Kanna 3 | import Regex 4 | 5 | fileprivate var baseUrl: URL { 6 | return URL(string: "https://developer.apple.com")! 
7 | } 8 | 9 | fileprivate func wwdcVideosUrlPath(for year: Int) -> String { 10 | return "/videos/wwdc\(year)" 11 | } 12 | 13 | /// Beginning in 2016, the Keynote has its own special event page at www.apple.com. The 2017 Keynote 14 | /// does not have an entry on the page with the 2017 sessions. 15 | fileprivate func wwdcKeynoteUrl(for year: Int) -> URL { 16 | return URL(string: "https://www.apple.com/apple-events/june-\(year)/")! 17 | } 18 | 19 | fileprivate func wwdcVideosUrl(for year: Int) -> URL { 20 | return URL(string: wwdcVideosUrlPath(for: year), relativeTo: baseUrl)! 21 | } 22 | 23 | func scrapeSessions(filterBy filterYear: Int? = nil, session filterSession: String? = nil) -> [Session] { 24 | var sessions: [Session] = [] 25 | 26 | sessions.append(contentsOf: scrapeKeynotes(filterBy: filterYear, session: filterSession)) 27 | 28 | (2012...2017).forEach { year in 29 | if let filter = filterYear, year != filter { 30 | return 31 | } 32 | sessions.append(contentsOf: scrapeSessions(from: year, filterBy: filterSession)) 33 | } 34 | return sessions 35 | } 36 | 37 | fileprivate func scrapeKeynotes(filterBy filterYear: Int? = nil, session filterSession: String? = nil) -> [Session] { 38 | var sessions: [Session] = [] 39 | (2017...2017).forEach { year in 40 | if let filter = filterYear, year != filter { 41 | return 42 | } 43 | if let session = filterSession, session != "101" { 44 | return 45 | } 46 | if let session = scrapeKeynote(from: year) { 47 | sessions.append(session) 48 | } 49 | } 50 | return sessions 51 | } 52 | 53 | fileprivate func scrapeKeynote(from year: Int) -> Session? { 54 | if verboseEnabled { print("Scraping \(year) Keynote...") } 55 | guard let html = try? 
String(contentsOf: wwdcKeynoteUrl(for: year)) else { 56 | if verboseEnabled { print("could not read from \(wwdcKeynoteUrl(for: year))") } 57 | return nil 58 | } 59 | guard let metadataUrl = keynoteMetadataUrl(from: html) else { 60 | if verboseEnabled { print("could not find metadata URL") } 61 | return nil 62 | } 63 | guard let metadata = keynoteMetadata(from: metadataUrl) else { 64 | if verboseEnabled { print("could not read metadata") } 65 | return nil 66 | } 67 | let session = makeKeynoteSession(for: year, download: metadata.download, image: keynoteImageUrl(from: html), webVtt: metadata.webVtt) 68 | return session 69 | } 70 | 71 | fileprivate func keynoteMetadataUrl(from html: String) -> URL? { 72 | return Regex("var urljson_path = '(.+)';") 73 | .firstMatch(in: html)? 74 | .captures.first? 75 | .flatMap { URL(string: $0) } 76 | } 77 | 78 | fileprivate func keynoteImageUrl(from html: String) -> URL? { 79 | return Regex("") 80 | .firstMatch(in: html)? 81 | .captures.first? 82 | .flatMap { URL(string: $0) } 83 | } 84 | 85 | fileprivate func keynoteMetadata(from url: URL) -> (download: URL, webVtt: URL)? { 86 | guard 87 | let metadata = try? Data(contentsOf: url), 88 | let json = try? JSONSerialization.jsonObject(with: metadata, options: []) as? [String: Any], 89 | let videos = json?["videoSrc"] as? [String: Any], 90 | // let hlsUrl = videos["hls"] as? String, 91 | // let hls = URL(string: hlsUrl), 92 | let downloadUrl = videos["nonhls"] as? String, 93 | let download = URL(string: downloadUrl), 94 | let webVttUrl = json?["videoCC"] as? 
String, 95 | let webVtt = URL(string: webVttUrl) 96 | else { 97 | return nil 98 | } 99 | return (download, webVtt) 100 | } 101 | /// Builds the Session for a year's Keynote; the single download URL is used for both the HD and SD slots. 102 | fileprivate func makeKeynoteSession(for year: Int, download: URL, image: URL?, webVtt: URL) -> Session { 103 | return Session( 104 | conference: Conference.wwdc, 105 | description: "WWDC \(year) Keynote", 106 | downloadHD: download, 107 | downloadSD: download, 108 | duration: nil, 109 | focuses: [.iOS, .macOS, .tvOS, .watchOS], 110 | image: image, 111 | number: "101", 112 | title: "Keynote", 113 | track: Track.featured, 114 | webVtt: webVtt, 115 | year: String(year) 116 | ) 117 | } 118 | /// Scrapes every track group on the given year's WWDC videos page, optionally keeping only the session whose number equals `filterSession`. 119 | fileprivate func scrapeSessions(from year: Int, filterBy filterSession: String? = nil) -> [Session] { 120 | guard let yearDoc = try? HTML(url: wwdcVideosUrl(for: year), encoding: .utf8) else { 121 | if verboseEnabled { print("could not read URL for year \(year)") } 122 | return [] 123 | } 124 | 125 | if verboseEnabled { print("Scraping \(wwdcVideosUrl(for: year))") } 126 | 127 | var sessions: [Session] = [] 128 | 129 | yearDoc.xpath("//li[contains(@class, 'collection-focus-group')]").forEach { li in 130 | guard case .NodeSet(let nodes) = li.xpath("child::*") else { 131 | return 132 | } 133 | sessions.append(contentsOf: scrapeSessions(from: year, inTrackWith: nodes, filterBy: filterSession)) 134 | } 135 | return sessions 136 | } 137 | /// Scrapes the sessions of one track group (`nodes`: header first, session list last). Returns [] when the group cannot be parsed or its title is not a known `Track`; `filterSession`, when non-nil, keeps only the session with that number. 138 | fileprivate func scrapeSessions(from year: Int, inTrackWith nodes: XMLNodeSet, filterBy filterSession: String? = nil) -> [Session] { 139 | guard 140 | let header = nodes.first, 141 | let items = nodes.last, 142 | let track = header.content?.trimmingCharacters(in: .whitespacesAndNewlines) 143 | else { 144 | if verboseEnabled { print("could not parse node set: \(String(describing: nodes.toHTML))") } 145 | return [] 146 | } 147 | if verboseEnabled { print("Scanning sessions in track: \(track)") } 148 | guard let knownTrack = Track(rawValue: track) else { if verboseEnabled { print("unrecognized track: \(track)") }; return [] } // skip unknown tracks instead of crashing on a force unwrap 149 | var sessions: [Session] = [] 150 | 151 | var sessionImages: [Identifier: URL] = [:] 152 | 153 | items.xpath(".//a").forEach { anchor in 154 | let number = 
scrapeSessionNumber(from: anchor) 155 | let identifier = Session.makeIdentifier(conference: .wwdc, year: String(year), number: number) 156 | 157 | // Sessions before 2015 do not have an image. If this is an image anchor, dig the URL, cache, and pass. 158 | if let imageUrl = scrapeSessionImage(from: anchor) { 159 | sessionImages[identifier] = imageUrl 160 | return 161 | } 162 | 163 | // `anchor` is the title link, which follows any image link. 164 | 165 | let imageUrl = sessionImages[identifier] 166 | 167 | let title = scrapeSessionTitle(from: anchor) 168 | let webpageUrl = scrapeSessionPageUrl(from: anchor) 169 | 170 | if let filter = filterSession, number != filter { 171 | return 172 | } 173 | 174 | guard let sessionDoc = try? HTML(url: webpageUrl, encoding: .utf8) else { 175 | if verboseEnabled { print("could not read session page: \(webpageUrl.absoluteString)") } 176 | return 177 | } 178 | 179 | if verboseEnabled { print("Scraping \(year) session #\(number)...", terminator: "") } 180 | 181 | guard let (description, focuses) = scrapeSessionDetails(from: sessionDoc) else { 182 | if verboseEnabled { print("could not find the description of \(year) session #\(number)") } 183 | return 184 | } 185 | 186 | guard let (sdVideoUrl, hdVideoUrl) = scrapeSessionResources(from: sessionDoc) else { 187 | if verboseEnabled { print("could not find any resources for \(year) session #\(number)") } 188 | return 189 | } 190 | 191 | let yearString = String(year) 192 | let session = Session( 193 | conference: .wwdc, 194 | description: description, 195 | downloadHD: hdVideoUrl, 196 | downloadSD: sdVideoUrl, 197 | duration: nil, 198 | focuses: focuses.components(separatedBy: ", ").compactMap(Focus.init(rawValue:)), 199 | image: imageUrl, 200 | number: number, 201 | title: title, 202 | track: knownTrack, 203 | year: yearString 204 | ) 205 | 206 | if verboseEnabled { print("done.") } 207 | 208 | sessions.append(session) 209 | } 210 | return sessions 211 | } 212 | 213 | 
fileprivate func scrapeSessionNumber(from anchor: Kanna.XMLElement) -> String { 214 | return scrapeSessionPageUrl(from: anchor).lastPathComponent 215 | } 216 | 217 | fileprivate func scrapeSessionImage(from anchor: Kanna.XMLElement) -> URL? { 218 | guard 219 | let image = anchor.xpath("child::*").first, 220 | let imageUrlValue = image["src"], 221 | let imageUrl = URL(string: imageUrlValue) 222 | else { 223 | return nil 224 | } 225 | return imageUrl 226 | } 227 | 228 | fileprivate func scrapeSessionTitle(from anchor: Kanna.XMLElement) -> String { 229 | return anchor.text?.trimmingCharacters(in: .whitespacesAndNewlines) ?? "" 230 | } 231 | 232 | fileprivate func scrapeSessionPageUrl(from anchor: Kanna.XMLElement) -> URL { 233 | guard 234 | let href = anchor["href"], 235 | let hrefUrl = URL(string: href, relativeTo: baseUrl) 236 | else { 237 | return baseUrl 238 | } 239 | return hrefUrl 240 | } 241 | 242 | fileprivate func scrapeSessionDetails(from doc: HTMLDocument) -> (description: String, focuses: String)? { 243 | for listItem in doc.xpath("//li[contains(@data-supplement-id, 'details')]") { 244 | // Skip the details tab element. 245 | if let `class` = listItem["class"], `class`.range(of: "supplement details") == nil { 246 | continue 247 | } 248 | 249 | let paragraphs = listItem.xpath(".//p") 250 | guard paragraphs.count >= 2 else { 251 | return nil 252 | } 253 | 254 | guard 255 | let description = paragraphs[0].text, 256 | let tagsLine = paragraphs[1].text 257 | else { 258 | return nil 259 | } 260 | 261 | let tags = tagsLine.components(separatedBy: " - ") 262 | 263 | guard let focuses = tags.last else { 264 | return nil 265 | } 266 | 267 | return (description, focuses) 268 | } 269 | 270 | return nil 271 | } 272 | 273 | fileprivate func scrapeSessionResources(from doc: HTMLDocument) -> (sdVideoUrl: URL, hdVideoUrl: URL)? 
{ 274 | for resourcesListItem in doc.xpath("//li[contains(@data-supplement-id, 'details')]") { 275 | // Skip the tab element, it's the tab's content we want. 276 | if let `class` = resourcesListItem["class"], `class`.range(of: "supplement details") == nil { 277 | continue 278 | } 279 | 280 | var sdVideoUrl: URL? 281 | var hdVideoUrl: URL? 282 | 283 | resourcesListItem.xpath(".//a").forEach { anchor in 284 | guard let text = anchor.innerHTML else { 285 | return 286 | } 287 | switch text { 288 | case "HD Video": 289 | guard let value = anchor["href"], let url = URL(string: value) else { 290 | return 291 | } 292 | hdVideoUrl = url 293 | case "SD Video": 294 | guard let value = anchor["href"], let url = URL(string: value) else { 295 | return 296 | } 297 | sdVideoUrl = url 298 | default: 299 | // Not yet handling non-video resources 300 | break 301 | } 302 | } 303 | 304 | // Not handling the case when missing only SD or HD 305 | if let sdVideoUrl = sdVideoUrl, let hdVideoUrl = hdVideoUrl { 306 | return (sdVideoUrl, hdVideoUrl) 307 | } 308 | } 309 | return nil 310 | } 311 | -------------------------------------------------------------------------------- /Sources/Identifiable.swift: -------------------------------------------------------------------------------- 1 | struct Identifier: RawRepresentable { 2 | let rawValue: String 3 | 4 | init(rawValue: String) { 5 | self.rawValue = rawValue 6 | } 7 | } 8 | 9 | protocol IdentifierProtocol: Hashable {} 10 | 11 | extension Identifier: IdentifierProtocol {} 12 | 13 | extension Identifier: Equatable { 14 | static func == (lhs: Identifier, rhs: Identifier) -> Bool { 15 | return lhs.rawValue == rhs.rawValue 16 | } 17 | } 18 | 19 | extension Identifier: Hashable { 20 | var hashValue: Int { 21 | return rawValue.hashValue 22 | } 23 | } 24 | 25 | protocol Identifiable: Hashable { 26 | associatedtype IdentifierType: IdentifierProtocol 27 | var identifier: IdentifierType { get } 28 | } 29 | 30 | extension Identifiable { 31 | var 
hashValue: Int { 32 | return identifier.hashValue 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /Sources/Session+JSON.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | 3 | extension Session { 4 | var dictionary: [String: Any] { 5 | var d: [String: Any] = [:] 6 | d["description"] = description 7 | d["download_hd"] = downloadHD.absoluteString 8 | d["download_sd"] = downloadSD.absoluteString 9 | d["duration"] = duration ?? nil 10 | d["focus"] = focuses.map({ $0.rawValue }) 11 | d["image"] = image?.absoluteString 12 | d["id"] = number 13 | d["track"] = track.rawValue 14 | d["title"] = title 15 | d["year"] = Int(year) ?? nil 16 | return d 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /Sources/Session.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | 3 | enum Focus: String { 4 | case iOS 5 | case macOS 6 | case tvOS 7 | case watchOS 8 | } 9 | 10 | enum Track: String { 11 | case appFrameworks = "App Frameworks" 12 | case systemFrameworks = "System Frameworks" 13 | case developerTools = "Developer Tools" 14 | case featured = "Featured" 15 | case graphicsAndGames = "Graphics and Games" 16 | case design = "Design" 17 | case media = "Media" 18 | case distribution = "Distribution" 19 | 20 | // pre-2015 tracks 21 | case appServices = "App Services" 22 | case coreOS = "Core OS" 23 | case essentials = "Essentials" 24 | case general = "General" 25 | case graphicsMediaAndGames = "Graphics, Media & Games" 26 | case safariAndWeb = "Safari & Web" 27 | case frameworks = "Frameworks" 28 | case services = "Services" 29 | case specialEvents = "Special Events" 30 | case tools = "Tools" 31 | } 32 | 33 | enum Conference: String { 34 | case wwdc = "WWDC" 35 | } 36 | 37 | struct Session: Identifiable { 38 | let identifier: Identifier 39 | let conference: 
Conference 40 | let description: String 41 | let downloadHD: URL 42 | let downloadSD: URL 43 | let duration: Int? 44 | let focuses: [Focus] 45 | let image: URL? 46 | let number: String 47 | let title: String 48 | let track: Track 49 | let year: String 50 | let webVtt: URL 51 | 52 | init( 53 | conference: Conference, 54 | description: String, 55 | downloadHD: URL, 56 | downloadSD: URL, 57 | duration: Int?, 58 | focuses: [Focus], 59 | image: URL?, 60 | number: String, 61 | title: String, 62 | track: Track, 63 | webVtt: URL? = nil, 64 | year: String) { 65 | 66 | self.identifier = Session.makeIdentifier(conference: conference, year: year, number: number) 67 | self.conference = conference 68 | self.description = description 69 | self.downloadHD = downloadHD 70 | self.downloadSD = downloadSD 71 | self.duration = duration 72 | self.focuses = focuses 73 | self.image = image 74 | self.number = number 75 | self.title = title 76 | self.track = track 77 | self.year = year 78 | 79 | if let webVtt = webVtt { 80 | self.webVtt = webVtt 81 | } else { 82 | var url = downloadSD.deletingQuery 83 | url.deletePathExtension() 84 | let basename = url.lastPathComponent 85 | url.deleteLastPathComponent() 86 | url.appendPathComponent("subtitles/eng/\(basename).vtt") 87 | self.webVtt = url 88 | } 89 | } 90 | } 91 | 92 | extension Session { 93 | static func makeIdentifier(conference: Conference, year: String, number: String) -> Identifier { 94 | return Identifier(rawValue: [conference.rawValue, year, number].joined(separator: "-")) 95 | } 96 | } 97 | 98 | extension Session: Equatable { 99 | 100 | static func == (lhs: Session, rhs: Session) -> Bool { 101 | return lhs.year == rhs.year && 102 | lhs.number == rhs.number && 103 | lhs.description == rhs.description && 104 | lhs.downloadHD == rhs.downloadHD && 105 | lhs.downloadSD == rhs.downloadSD && 106 | lhs.duration == rhs.duration && 107 | lhs.focuses == rhs.focuses && 108 | lhs.image == rhs.image && 109 | lhs.title == rhs.title && 110 | 
lhs.track == rhs.track 111 | } 112 | } 113 | 114 | extension Session: Hashable { 115 | var hashValue: Int { 116 | return identifier.hashValue 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /Sources/URLExtensions.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | 3 | public extension URL { 4 | /// Return a URL with query removed. If something goes wrong, return the instance. 5 | var deletingQuery: URL { 6 | guard var components = URLComponents(url: self, resolvingAgainstBaseURL: true) else { 7 | return self 8 | } 9 | components.query = nil 10 | guard let url = components.url else { 11 | return self 12 | } 13 | return url 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /Sources/main.swift: -------------------------------------------------------------------------------- 1 | import Guaka 2 | 3 | setupCommands() 4 | 5 | rootCommand.execute() 6 | -------------------------------------------------------------------------------- /Sources/meta.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | import Guaka 3 | import HTMLEntities 4 | import Yams 5 | 6 | var metaCommand = Command(usage: "meta", configuration: configuration, run: execute) 7 | 8 | fileprivate let outputFormatOption = Flag( 9 | shortName: "f", 10 | longName: "format", 11 | type: String.self, 12 | description: "output format: json (default, for Major Input) or yaml (for ASCIIwwdc)", 13 | required: false, 14 | inheritable: false 15 | ) 16 | 17 | fileprivate enum Format: String { 18 | /// Format used by Major Input. Includes all scraped information. 19 | case json 20 | /// Format used by ASCIIwwdc.com. Includes limited information. 
21 | case yaml 22 | } 23 | 24 | fileprivate var format: Format = .json 25 | 26 | private func configuration(command: Command) { 27 | command.shortMessage = "Collect session information in a file." 28 | command.longMessage = "Collect meta information on each session, write it to a file." 29 | 30 | command.add(flags: [outputFormatOption]) 31 | 32 | command.preRun = { flags, args in 33 | guard let formatRawValue = flags.getString(name: outputFormatOption.longName) else { 34 | return true 35 | } 36 | guard let formatValue = Format(rawValue: formatRawValue) else { 37 | print("Invalid output format \"\(formatRawValue)\".") 38 | return false 39 | } 40 | format = formatValue 41 | return true 42 | } 43 | } 44 | 45 | private func execute(flags: Flags, args: [String]) { 46 | let sessions = scrapeSessions(filterBy: filterYear, session: filterSession) 47 | switch format { 48 | case .json: 49 | outputJson(for: sessions) 50 | case .yaml: 51 | outputYaml(for: sessions) 52 | } 53 | } 54 | 55 | private func outputJson(for sessions: [Session]) { 56 | do { 57 | let dictionaries = sessions.map({ $0.dictionary }) 58 | try JSONSerialization 59 | .data(withJSONObject: dictionaries, options: .prettyPrinted) 60 | .write(to: outputPath ?? URL(fileURLWithPath: "./sessions.json")) 61 | } catch { 62 | if verboseEnabled { print(error) } 63 | } 64 | } 65 | 66 | private func outputYaml(for sessions: [Session]) { 67 | do { 68 | let yaml = try dump(object: sessions.asciiWwdcYamlObject, width: -1) 69 | try yaml.write(to: outputPath ?? URL(fileURLWithPath: "./sessions.yml"), atomically: true, encoding: .utf8) 70 | } catch { 71 | if verboseEnabled { print(error) } 72 | } 73 | } 74 | 75 | extension Array where Element == Session { 76 | /// Provides an object emittable as YAML of the form expected by ASCIIwwdc. 
77 | var asciiWwdcYamlObject: [Node: NodeRepresentable] { 78 | var structured: [Node: NodeRepresentable] = [:] 79 | forEach { session in 80 | let key = Node(session.number) 81 | let value = [ 82 | ":title": Node(session.title.asciiwwdcEscaped), 83 | ":track": Node(session.track.rawValue), 84 | ":description": Node(session.description.asciiwwdcEscaped) 85 | ] 86 | structured[key] = value 87 | } 88 | return structured 89 | } 90 | } 91 | 92 | private extension String { 93 | /// Mimic escaping found in existing asciiwwdc.com YAML. 94 | /// 95 | /// Yams emits colon-containing strings in single quotes, but asciiwwdc.com may not expect this. 96 | /// asciiwwdc.com may expect HTML entities and not unicode escape sequences. NOTE(review): as written here, the first replacement below maps ":" to ":" (a no-op); presumably the replacement text was an HTML entity - confirm against the original file. 97 | var asciiwwdcEscaped: String { 98 | return self 99 | .replacingOccurrences(of: ":", with: ":") 100 | .htmlEscape(allowUnsafeSymbols: false, decimal: true, encodeEverything: false, useNamedReferences: true) 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /Sources/root.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | import Guaka 3 | 4 | var rootCommand = Command(usage: "jonyfive", configuration: configuration, run: execute) 5 | 6 | private let verboseOption = Flag( 7 | shortName: "v", 8 | longName: "verbose", 9 | value: false, 10 | description: "show work along the way", 11 | inheritable: true 12 | ) 13 | 14 | private let yearOption = Flag( 15 | shortName: "y", 16 | longName: "year", 17 | type: Int.self, 18 | description: "filter by year", 19 | required: false, 20 | inheritable: true 21 | ) 22 | 23 | private let sessionOption = Flag( 24 | shortName: "s", 25 | longName: "session", 26 | type: String.self, 27 | description: "filter by session", 28 | required: false, 29 | inheritable: true 30 | ) 31 | 32 | private let outputPathOption = Flag( 33 | shortName: "o", 34 | longName: "output", 35 | type: String.self, 36 | description: 
"output path", 37 | required: false, 38 | inheritable: true 39 | ) 40 | 41 | var verboseEnabled = false 42 | var filterYear: Int? 43 | var filterSession: String? 44 | var outputPath: URL? 45 | 46 | private func configuration(command: Command) { 47 | command.longMessage = "Collect public information available at Apple's developer site and act on it in various ways." 48 | command.add(flags: [verboseOption, yearOption, sessionOption, outputPathOption]) 49 | 50 | command.inheritablePreRun = { flags, args in 51 | 52 | if let enabled = flags.getBool(name: verboseOption.longName) { 53 | verboseEnabled = enabled 54 | } 55 | 56 | if let year = flags.getInt(name: yearOption.longName) { 57 | // TODO: can we hook into validation to fail the command? 58 | if year < 2012 || year > 2017 { 59 | print("Year not supported: \(year)") 60 | return false 61 | } 62 | filterYear = year 63 | } 64 | 65 | if let session = flags.getString(name: sessionOption.longName) { 66 | if filterYear == nil { 67 | print("Session filtering requires year filtering. 
Use `--year` flag to select a year.") 68 | return false 69 | } else { 70 | filterSession = session 71 | } 72 | } 73 | 74 | if let output = flags.getString(name: outputPathOption.longName) { 75 | outputPath = URL(fileURLWithPath: output) 76 | } 77 | 78 | return true 79 | } 80 | } 81 | 82 | private func execute(flags: Flags, args: [String]) { 83 | } 84 | -------------------------------------------------------------------------------- /Sources/setup.swift: -------------------------------------------------------------------------------- 1 | import Guaka 2 | 3 | // Generated, dont update 4 | func setupCommands() { 5 | rootCommand.add(subCommand: metaCommand) 6 | rootCommand.add(subCommand: webvttCommand) 7 | // Command adding placeholder, edit this line 8 | } 9 | -------------------------------------------------------------------------------- /Sources/webvtt.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | import Guaka 3 | import FileUtils 4 | 5 | var webvttCommand = Command( 6 | usage: "webvtt", configuration: configuration, run: execute) 7 | 8 | fileprivate let methodOption = Flag( 9 | shortName: "p", 10 | longName: "playlist", 11 | type: Bool.self, 12 | description: "WebVTT cues acquisition method. Default is false for direct download.\n\t[true] Concatenate HLS subtitles playlists.\n\t[false] Download file from the URL with expected format.", 13 | required: false, 14 | inheritable: false 15 | ) 16 | 17 | fileprivate let fallbackOption = Flag( 18 | shortName: "f", 19 | longName: "fallback", 20 | type: Bool.self, 21 | description: "On failure of acquisition method, fall back to other methods. Default is true.", 22 | required: false, 23 | inheritable: false 24 | ) 25 | 26 | /// Method of acquiring WebVTT cues. 27 | fileprivate enum AcquisitionMethod { 28 | /// Fetch the full WebVTT file from a likely location. 
The full WebVTT file is commonly located at 29 | /// a URL following a particular format, though this does not work for all sessions. 30 | case directDownload 31 | /// Read the session's HLS master playlist, read its subtitles media playlist, fetch each file in 32 | /// the sequence, and concatenate the files. Perform some post-processing to eliminate artifacts 33 | /// of concatenation. 34 | case subtitlesPlaylist 35 | } 36 | 37 | /// Strategy for acquiring WebVTT cues. 38 | /// 39 | /// Our two methods of acquiring WebVTT cues yield different result sets. The timing can be 40 | /// different by milliseconds, a largely inconsequential difference. Also, transcription 41 | /// content may differ. Most notably, the direct download transcript typically includes a closing 42 | /// caption ("Thank you", or "[Applause]") not included in the streaming captions. 43 | fileprivate enum AcquisitionStrategy { 44 | /// Use HLS subtitles playlist file concatenation exclusively. 45 | case onlySubtitlesPlaylist 46 | /// Use direct download, exclusively. 47 | case onlyDirectDownload 48 | /// Attempt subtitles playlist file concatenation, falling back to URL download if necessary. 49 | case preferSubtitlesPlaylist 50 | /// Attempt URL download, falling back to subtitles playlist file concatenation, if necessary. 
51 | case preferDirectDownload 52 | 53 | var methods: [AcquisitionMethod] { 54 | switch self { 55 | case .onlySubtitlesPlaylist: 56 | return [.subtitlesPlaylist] 57 | case .onlyDirectDownload: 58 | return [.directDownload] 59 | case .preferSubtitlesPlaylist: 60 | return [.subtitlesPlaylist, .directDownload] 61 | case .preferDirectDownload: 62 | return [.directDownload, .subtitlesPlaylist] 63 | } 64 | } 65 | } 66 | 67 | fileprivate var acquisitionStrategy: AcquisitionStrategy = .preferSubtitlesPlaylist 68 | 69 | private func configuration(command: Command) { 70 | command.shortMessage = "download WebVTT files" 71 | command.longMessage = "Download each session's WebVTT file, write to disk." 72 | 73 | command.add(flags: [methodOption, fallbackOption]) 74 | 75 | command.preRun = { flags, args in 76 | let usePlaylists = flags.getBool(name: methodOption.longName) ?? false 77 | let fallback = flags.getBool(name: fallbackOption.longName) ?? true 78 | 79 | if usePlaylists && fallback { 80 | acquisitionStrategy = .preferSubtitlesPlaylist 81 | } else if usePlaylists && fallback == false { 82 | acquisitionStrategy = .onlySubtitlesPlaylist 83 | } else if usePlaylists == false && fallback { 84 | acquisitionStrategy = .preferDirectDownload 85 | } else if usePlaylists == false && fallback == false { 86 | acquisitionStrategy = .onlyDirectDownload 87 | } 88 | return true 89 | } 90 | } 91 | 92 | private func execute(flags: Flags, args: [String]) { 93 | let sessions = scrapeSessions(filterBy: filterYear, session: filterSession) 94 | 95 | Directory.create(atPath: path) 96 | 97 | let years = Set(sessions.map { String($0.year) }) 98 | years.forEach { year in 99 | Directory.create(atPath: [path, year].joined(separator: "/")) 100 | } 101 | 102 | sessions.forEach { session in 103 | if verboseEnabled { print("##### \(session.year) session #\(session.number) #####") } 104 | 105 | if var webVttText = acquireWebVttText(for: session, using: acquisitionStrategy.methods) { 106 | webVttText = 
normalize(webVttText) 107 | write(webVttText, for: session) 108 | } 109 | } 110 | } 111 | 112 | fileprivate var path: String { 113 | return outputPath?.path ?? "." 114 | } 115 | 116 | fileprivate func path(for session: Session) -> String { 117 | return [path, String(session.year), "\(session.number).vtt"].joined(separator: "/") 118 | } 119 | 120 | fileprivate func acquireWebVttText(for session: Session, using methods: [AcquisitionMethod]) -> String? { 121 | for method in methods { 122 | if let text = acquireWebVttText(for: session, using: method) { 123 | return text 124 | } 125 | } 126 | return nil 127 | } 128 | 129 | fileprivate func acquireWebVttText(for session: Session, using method: AcquisitionMethod) -> String? { 130 | switch method { 131 | case .directDownload: 132 | return webVttText(from: session.webVtt) 133 | case .subtitlesPlaylist: 134 | return concatenateSubtitlesPlaylistFiles(for: session) 135 | } 136 | } 137 | 138 | fileprivate func webVttText(from url: URL) -> String? { 139 | var vttText: String! 140 | do { 141 | if verboseEnabled { print("Fetching WebVTT from \(url.absoluteString)") } 142 | vttText = try String(contentsOf: url) 143 | } catch { 144 | if verboseEnabled { print("Could not fetch WebVTT at \(url.absoluteString)") } 145 | return nil 146 | } 147 | 148 | if vttText.range(of: "WEBVTT") == nil { 149 | if verboseEnabled { print("Received non-WebVTT response") } 150 | return nil 151 | } 152 | 153 | return vttText 154 | } 155 | 156 | fileprivate func concatenateSubtitlesPlaylistFiles(for session: Session) -> String? 
{ 157 | if verboseEnabled { print("Concatenating subtitles media playlist files") } 158 | 159 | let queryless = session.downloadSD.deletingQuery 160 | let baseUrl = queryless.deletingLastPathComponent() 161 | let m3u8Url = baseUrl.appendingPathComponent("subtitles/eng/prog_index.m3u8") 162 | 163 | var vttText = "" 164 | var m3u8Text: String = "" 165 | 166 | do { 167 | m3u8Text = try String(contentsOf: m3u8Url, encoding: .utf8) 168 | } catch { 169 | if verboseEnabled { print("Could not fetch subtitles media playlist: \(error)") } 170 | } 171 | 172 | if let signature = m3u8Text.components(separatedBy: .whitespacesAndNewlines).first { 173 | if signature.range(of: "#EXTM3U") == nil { 174 | if verboseEnabled { print("Subtitles media playlist unavailable") } 175 | return nil 176 | } 177 | } 178 | 179 | let fileLines = m3u8Text 180 | .components(separatedBy: .newlines) 181 | .filter { $0.range(of: ".webvtt") != nil } 182 | 183 | for fileLine in fileLines { 184 | let file = fileLine.trimmingCharacters(in: .whitespacesAndNewlines) 185 | let fileUrl = baseUrl.appendingPathComponent("subtitles/eng/\(file)") 186 | do { 187 | let fileText = try String(contentsOf: fileUrl, encoding: .utf8) 188 | vttText.append(fileText) 189 | } catch { 190 | if verboseEnabled { print("Could not fetch subtitles sequence file: \(fileUrl)") } 191 | return nil 192 | } 193 | } 194 | 195 | vttText = vttText.trimmingCharacters(in: .whitespacesAndNewlines) 196 | if vttText.isEmpty { 197 | return nil 198 | } else { 199 | return vttText 200 | } 201 | } 202 | 203 | /// Resolves artifacts of subtitle media playlist files concatenation. 
fileprivate func normalize(_ vttText: String) -> String {
  // Order matters: line endings are unified first because every later step splits on "\n".
  var text = removeCarriageReturns(from: vttText)
  text = removeRedundantFileSignatures(from: text)
  text = removeTimestampHeaders(from: text)
  text = removeRedundantCues(from: text)
  return text
}

/// Line breaks within cue text are CRLF, while the rest of the file uses LF. Convert entire file to
/// LF, simplifying processing and generally making life easier.
///
/// A lone CR (also a valid WebVTT line terminator) is converted to LF as well.
fileprivate func removeCarriageReturns(from text: String) -> String {
  return text
    .replacingOccurrences(of: "\r\n", with: "\n")
    .replacingOccurrences(of: "\r", with: "\n")
}

/// A WebVTT file begins with file signature "WEBVTT" to identify it as such. Processing removes
/// subsequent `WEBVTT` lines introduced by concatenating secondary sequence files.
///
/// Every signature line is dropped and a single `WEBVTT` line is prepended, so the result always
/// starts with exactly one signature. Only real signature lines are dropped; cue text that merely
/// mentions "WEBVTT" is kept.
fileprivate func removeRedundantFileSignatures(from webVttText: String) -> String {
  let lines = webVttText.components(separatedBy: .newlines)
  let body = lines.filter { !isFileSignatureLine($0) }
  return "WEBVTT\n" + body.joined(separator: "\n")
}

/// Whether `line` is a WebVTT file signature line: an optional byte order mark, then `WEBVTT`,
/// then either the end of the line or a space/tab followed by arbitrary header text.
fileprivate func isFileSignatureLine(_ line: String) -> Bool {
  var scalars = line.unicodeScalars
  if let first = scalars.first, first == "\u{FEFF}" {
    scalars.removeFirst()
  }
  let candidate = String(scalars)
  return candidate == "WEBVTT"
    || candidate.hasPrefix("WEBVTT ")
    || candidate.hasPrefix("WEBVTT\t")
}

/// The `X-TIMESTAMP-MAP` header synchronizes timestamps between audio and video. In a monolithic
/// WebVTT file, synchronization is unnecessary. Remove these headers.
///
/// Removing header lines leaves extra blank lines behind, so every run of three or more line
/// breaks is collapsed to a single blank line afterwards.
fileprivate func removeTimestampHeaders(from webVttText: String) -> String {
  let lines = webVttText.components(separatedBy: .newlines)
  let kept = lines.filter { !$0.hasPrefix("X-TIMESTAMP-MAP") }
  let text = kept.joined(separator: "\n")
  // A regular expression collapses runs of any length; a single literal "\n\n\n" replacement pass
  // would leave three line breaks behind for a run of four.
  return text.replacingOccurrences(of: "\\n{3,}", with: "\n\n", options: .regularExpression)
}

/// Cues are intended to be unique, and concatenation introduces redundant cues.
/// - note: Handles one cue repeated once.
Does not handle multiple repetition or repetition of 242 | /// multiple cues. 243 | fileprivate func removeRedundantCues(from webVttText: String) -> String { 244 | // Hypothesis is that splitting on "\n\n" effectively chunks by cue, which supports a minimal 245 | // comparison of consecutive elements. Line-by-line processing would be significantly more 246 | // complex with multiple comparisons per element and handling false matching of empty lines. 247 | let lines = webVttText.components(separatedBy: "\n\n") 248 | 249 | guard lines.count > 1 else { 250 | return webVttText 251 | } 252 | 253 | let firstElements = lines[0..