├── .gitignore ├── LICENSE.md ├── README.md ├── pom.xml ├── serve └── style.css └── src └── main └── java ├── dev └── nipafx │ └── demo │ └── modern │ ├── GitHubCrawl.java │ ├── crawler │ ├── PageFactory.java │ ├── PageTreeFactory.java │ └── PageWithLinks.java │ ├── operations │ ├── Pretty.java │ ├── ResultServer.java │ └── Statistician.java │ └── page │ ├── ErrorPage.java │ ├── ExternalPage.java │ ├── GitHubIssuePage.java │ ├── GitHubPage.java │ ├── GitHubPrPage.java │ ├── Page.java │ └── SuccessfulPage.java └── module-info.java /.gitignore: -------------------------------------------------------------------------------- 1 | # Eclipse 2 | 3 | .metadata 4 | bin/ 5 | tmp/ 6 | *.tmp 7 | *.bak 8 | *.swp 9 | *~.nib 10 | local.properties 11 | .settings/ 12 | .loadpath 13 | .recommenders 14 | 15 | .project 16 | .classpath 17 | *.launch 18 | 19 | # JetBrains 20 | 21 | .idea/ 22 | *.iws 23 | *.iml 24 | /out/ 25 | 26 | # Visual Studio Code 27 | 28 | .factoryPath 29 | .vscode/ 30 | 31 | # Gradle 32 | 33 | .gradle/ 34 | build/ 35 | 36 | # Maven 37 | 38 | target/ 39 | 40 | # JVM crash logs 41 | # see https://www.java.com/en/download/help/error_hotspot.html 42 | hs_err_pid* 43 | 44 | # MacOS 45 | .DS_Store 46 | 47 | # thread dumps 48 | threads.json 49 | 50 | # app 51 | serve/index.html 52 | jars/* 53 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Eclipse Public License - v 2.0 2 | 3 | THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE 4 | PUBLIC LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION 5 | OF THE PROGRAM CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. 6 | 7 | 1. 
DEFINITIONS 8 | 9 | "Contribution" means: 10 | 11 | a) in the case of the initial Contributor, the initial content 12 | Distributed under this Agreement, and 13 | 14 | b) in the case of each subsequent Contributor: 15 | i) changes to the Program, and 16 | ii) additions to the Program; 17 | where such changes and/or additions to the Program originate from 18 | and are Distributed by that particular Contributor. A Contribution 19 | "originates" from a Contributor if it was added to the Program by 20 | such Contributor itself or anyone acting on such Contributor's behalf. 21 | Contributions do not include changes or additions to the Program that 22 | are not Modified Works. 23 | 24 | "Contributor" means any person or entity that Distributes the Program. 25 | 26 | "Licensed Patents" mean patent claims licensable by a Contributor which 27 | are necessarily infringed by the use or sale of its Contribution alone 28 | or when combined with the Program. 29 | 30 | "Program" means the Contributions Distributed in accordance with this 31 | Agreement. 32 | 33 | "Recipient" means anyone who receives the Program under this Agreement 34 | or any Secondary License (as applicable), including Contributors. 35 | 36 | "Derivative Works" shall mean any work, whether in Source Code or other 37 | form, that is based on (or derived from) the Program and for which the 38 | editorial revisions, annotations, elaborations, or other modifications 39 | represent, as a whole, an original work of authorship. 40 | 41 | "Modified Works" shall mean any work in Source Code or other form that 42 | results from an addition to, deletion from, or modification of the 43 | contents of the Program, including, for purposes of clarity any new file 44 | in Source Code form that contains any contents of the Program. 
Modified 45 | Works shall not include works that contain only declarations, 46 | interfaces, types, classes, structures, or files of the Program solely 47 | in each case in order to link to, bind by name, or subclass the Program 48 | or Modified Works thereof. 49 | 50 | "Distribute" means the acts of a) distributing or b) making available 51 | in any manner that enables the transfer of a copy. 52 | 53 | "Source Code" means the form of a Program preferred for making 54 | modifications, including but not limited to software source code, 55 | documentation source, and configuration files. 56 | 57 | "Secondary License" means either the GNU General Public License, 58 | Version 2.0, or any later versions of that license, including any 59 | exceptions or additional permissions as identified by the initial 60 | Contributor. 61 | 62 | 2. GRANT OF RIGHTS 63 | 64 | a) Subject to the terms of this Agreement, each Contributor hereby 65 | grants Recipient a non-exclusive, worldwide, royalty-free copyright 66 | license to reproduce, prepare Derivative Works of, publicly display, 67 | publicly perform, Distribute and sublicense the Contribution of such 68 | Contributor, if any, and such Derivative Works. 69 | 70 | b) Subject to the terms of this Agreement, each Contributor hereby 71 | grants Recipient a non-exclusive, worldwide, royalty-free patent 72 | license under Licensed Patents to make, use, sell, offer to sell, 73 | import and otherwise transfer the Contribution of such Contributor, 74 | if any, in Source Code or other form. This patent license shall 75 | apply to the combination of the Contribution and the Program if, at 76 | the time the Contribution is added by the Contributor, such addition 77 | of the Contribution causes such combination to be covered by the 78 | Licensed Patents. The patent license shall not apply to any other 79 | combinations which include the Contribution. No hardware per se is 80 | licensed hereunder. 
81 | 82 | c) Recipient understands that although each Contributor grants the 83 | licenses to its Contributions set forth herein, no assurances are 84 | provided by any Contributor that the Program does not infringe the 85 | patent or other intellectual property rights of any other entity. 86 | Each Contributor disclaims any liability to Recipient for claims 87 | brought by any other entity based on infringement of intellectual 88 | property rights or otherwise. As a condition to exercising the 89 | rights and licenses granted hereunder, each Recipient hereby 90 | assumes sole responsibility to secure any other intellectual 91 | property rights needed, if any. For example, if a third party 92 | patent license is required to allow Recipient to Distribute the 93 | Program, it is Recipient's responsibility to acquire that license 94 | before distributing the Program. 95 | 96 | d) Each Contributor represents that to its knowledge it has 97 | sufficient copyright rights in its Contribution, if any, to grant 98 | the copyright license set forth in this Agreement. 99 | 100 | e) Notwithstanding the terms of any Secondary License, no 101 | Contributor makes additional grants to any Recipient (other than 102 | those set forth in this Agreement) as a result of such Recipient's 103 | receipt of the Program under the terms of a Secondary License 104 | (if permitted under the terms of Section 3). 105 | 106 | 3. 
REQUIREMENTS 107 | 108 | 3.1 If a Contributor Distributes the Program in any form, then: 109 | 110 | a) the Program must also be made available as Source Code, in 111 | accordance with section 3.2, and the Contributor must accompany 112 | the Program with a statement that the Source Code for the Program 113 | is available under this Agreement, and informs Recipients how to 114 | obtain it in a reasonable manner on or through a medium customarily 115 | used for software exchange; and 116 | 117 | b) the Contributor may Distribute the Program under a license 118 | different than this Agreement, provided that such license: 119 | i) effectively disclaims on behalf of all other Contributors all 120 | warranties and conditions, express and implied, including 121 | warranties or conditions of title and non-infringement, and 122 | implied warranties or conditions of merchantability and fitness 123 | for a particular purpose; 124 | 125 | ii) effectively excludes on behalf of all other Contributors all 126 | liability for damages, including direct, indirect, special, 127 | incidental and consequential damages, such as lost profits; 128 | 129 | iii) does not attempt to limit or alter the recipients' rights 130 | in the Source Code under section 3.2; and 131 | 132 | iv) requires any subsequent distribution of the Program by any 133 | party to be under a license that satisfies the requirements 134 | of this section 3. 
135 | 136 | 3.2 When the Program is Distributed as Source Code: 137 | 138 | a) it must be made available under this Agreement, or if the 139 | Program (i) is combined with other material in a separate file or 140 | files made available under a Secondary License, and (ii) the initial 141 | Contributor attached to the Source Code the notice described in 142 | Exhibit A of this Agreement, then the Program may be made available 143 | under the terms of such Secondary Licenses, and 144 | 145 | b) a copy of this Agreement must be included with each copy of 146 | the Program. 147 | 148 | 3.3 Contributors may not remove or alter any copyright, patent, 149 | trademark, attribution notices, disclaimers of warranty, or limitations 150 | of liability ("notices") contained within the Program from any copy of 151 | the Program which they Distribute, provided that Contributors may add 152 | their own appropriate notices. 153 | 154 | 4. COMMERCIAL DISTRIBUTION 155 | 156 | Commercial distributors of software may accept certain responsibilities 157 | with respect to end users, business partners and the like. While this 158 | license is intended to facilitate the commercial use of the Program, 159 | the Contributor who includes the Program in a commercial product 160 | offering should do so in a manner which does not create potential 161 | liability for other Contributors. Therefore, if a Contributor includes 162 | the Program in a commercial product offering, such Contributor 163 | ("Commercial Contributor") hereby agrees to defend and indemnify every 164 | other Contributor ("Indemnified Contributor") against any losses, 165 | damages and costs (collectively "Losses") arising from claims, lawsuits 166 | and other legal actions brought by a third party against the Indemnified 167 | Contributor to the extent caused by the acts or omissions of such 168 | Commercial Contributor in connection with its distribution of the Program 169 | in a commercial product offering. 
The obligations in this section do not 170 | apply to any claims or Losses relating to any actual or alleged 171 | intellectual property infringement. In order to qualify, an Indemnified 172 | Contributor must: a) promptly notify the Commercial Contributor in 173 | writing of such claim, and b) allow the Commercial Contributor to control, 174 | and cooperate with the Commercial Contributor in, the defense and any 175 | related settlement negotiations. The Indemnified Contributor may 176 | participate in any such claim at its own expense. 177 | 178 | For example, a Contributor might include the Program in a commercial 179 | product offering, Product X. That Contributor is then a Commercial 180 | Contributor. If that Commercial Contributor then makes performance 181 | claims, or offers warranties related to Product X, those performance 182 | claims and warranties are such Commercial Contributor's responsibility 183 | alone. Under this section, the Commercial Contributor would have to 184 | defend claims against the other Contributors related to those performance 185 | claims and warranties, and if a court requires any other Contributor to 186 | pay any damages as a result, the Commercial Contributor must pay 187 | those damages. 188 | 189 | 5. NO WARRANTY 190 | 191 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, AND TO THE EXTENT 192 | PERMITTED BY APPLICABLE LAW, THE PROGRAM IS PROVIDED ON AN "AS IS" 193 | BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR 194 | IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF 195 | TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR 196 | PURPOSE. 
Each Recipient is solely responsible for determining the 197 | appropriateness of using and distributing the Program and assumes all 198 | risks associated with its exercise of rights under this Agreement, 199 | including but not limited to the risks and costs of program errors, 200 | compliance with applicable laws, damage to or loss of data, programs 201 | or equipment, and unavailability or interruption of operations. 202 | 203 | 6. DISCLAIMER OF LIABILITY 204 | 205 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, AND TO THE EXTENT 206 | PERMITTED BY APPLICABLE LAW, NEITHER RECIPIENT NOR ANY CONTRIBUTORS 207 | SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 208 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION LOST 209 | PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 210 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 211 | ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE 212 | EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE 213 | POSSIBILITY OF SUCH DAMAGES. 214 | 215 | 7. GENERAL 216 | 217 | If any provision of this Agreement is invalid or unenforceable under 218 | applicable law, it shall not affect the validity or enforceability of 219 | the remainder of the terms of this Agreement, and without further 220 | action by the parties hereto, such provision shall be reformed to the 221 | minimum extent necessary to make such provision valid and enforceable. 222 | 223 | If Recipient institutes patent litigation against any entity 224 | (including a cross-claim or counterclaim in a lawsuit) alleging that the 225 | Program itself (excluding combinations of the Program with other software 226 | or hardware) infringes such Recipient's patent(s), then such Recipient's 227 | rights granted under Section 2(b) shall terminate as of the date such 228 | litigation is filed. 
229 | 230 | All Recipient's rights under this Agreement shall terminate if it 231 | fails to comply with any of the material terms or conditions of this 232 | Agreement and does not cure such failure in a reasonable period of 233 | time after becoming aware of such noncompliance. If all Recipient's 234 | rights under this Agreement terminate, Recipient agrees to cease use 235 | and distribution of the Program as soon as reasonably practicable. 236 | However, Recipient's obligations under this Agreement and any licenses 237 | granted by Recipient relating to the Program shall continue and survive. 238 | 239 | Everyone is permitted to copy and distribute copies of this Agreement, 240 | but in order to avoid inconsistency the Agreement is copyrighted and 241 | may only be modified in the following manner. The Agreement Steward 242 | reserves the right to publish new versions (including revisions) of 243 | this Agreement from time to time. No one other than the Agreement 244 | Steward has the right to modify this Agreement. The Eclipse Foundation 245 | is the initial Agreement Steward. The Eclipse Foundation may assign the 246 | responsibility to serve as the Agreement Steward to a suitable separate 247 | entity. Each new version of the Agreement will be given a distinguishing 248 | version number. The Program (including Contributions) may always be 249 | Distributed subject to the version of the Agreement under which it was 250 | received. In addition, after a new version of the Agreement is published, 251 | Contributor may elect to Distribute the Program (including its 252 | Contributions) under the new version. 253 | 254 | Except as expressly stated in Sections 2(a) and 2(b) above, Recipient 255 | receives no rights or licenses to the intellectual property of any 256 | Contributor under this Agreement, whether expressly, by implication, 257 | estoppel or otherwise. All rights in the Program not expressly granted 258 | under this Agreement are reserved. 
Nothing in this Agreement is intended 259 | to be enforceable by any entity that is not a Contributor or Recipient. 260 | No third-party beneficiary rights are created under this Agreement. 261 | 262 | Exhibit A - Form of Secondary Licenses Notice 263 | 264 | "This Source Code may also be made available under the following 265 | Secondary Licenses when the conditions for such availability set forth 266 | in the Eclipse Public License, v. 2.0 are satisfied: {name license(s), 267 | version(s), and exceptions or additional permissions here}." 268 | 269 | Simply including a copy of this Agreement, including this Exhibit A 270 | is not sufficient to license the Source Code under Secondary Licenses. 271 | 272 | If it is not possible or desirable to put the notice in a particular 273 | file, then You may include the notice in a location (such as a LICENSE 274 | file in a relevant directory) where a recipient would be likely to 275 | look for such a notice. 276 | 277 | You may add additional accurate notices of copyright ownership. 278 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Modern Java in Action 2 | 3 | A repository for my live-coding talk [Modern Java in Action](https://nipafx.dev/talk-java-action). 4 | 5 | Each step is its own commit. 6 | Check them out to see what needs to be done. 
7 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | dev.nipafx.demo 8 | modern-java 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 23 13 | UTF-8 14 | 15 | 16 | 17 | 18 | org.jsoup 19 | jsoup 20 | 1.17.2 21 | 22 | 23 | 24 | 25 | 26 | 27 | org.apache.maven.plugins 28 | maven-dependency-plugin 29 | 3.6.1 30 | 31 | 32 | copy-dependencies 33 | package 34 | 35 | copy-dependencies 36 | 37 | 38 | jars 39 | 40 | 41 | 42 | 43 | 44 | org.apache.maven.plugins 45 | maven-compiler-plugin 46 | 3.12.1 47 | 48 | true 49 | true 50 | 51 | 52 | 53 | org.apache.maven.plugins 54 | maven-jar-plugin 55 | 3.3.0 56 | 57 | gh-crawler 58 | 59 | 60 | dev.nipafx.demo.modern.GitHubCrawl 61 | true 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /serve/style.css: -------------------------------------------------------------------------------- 1 | body { 2 | background-color: #262429; 3 | color: white; 4 | font-size: 24px; 5 | } 6 | 7 | .container { 8 | width: 600px; 9 | margin: 2em auto; 10 | } 11 | 12 | .page { 13 | margin: 1em 0; 14 | padding: 0.5em; 15 | border: 1px solid #69ea7d; 16 | border-radius: 0.25em; 17 | 18 | display: grid; 19 | grid-template-columns: 1fr auto; 20 | } 21 | 22 | .page .ref:before { 23 | content: "⤴"; 24 | } 25 | 26 | .page.level-1 { 27 | margin-left: 1em; 28 | } 29 | 30 | .page.level-2 { 31 | margin-left: 2em; 32 | } 33 | 34 | .page.level-3 { 35 | margin-left: 3em; 36 | } 37 | 38 | .page.level-4 { 39 | margin-left: 4em; 40 | } 41 | 42 | .page.level-5 { 43 | margin-left: 5em; 44 | } 45 | 46 | a, a:visited { 47 | color: white; 48 | text-decoration: none; 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/dev/nipafx/demo/modern/GitHubCrawl.java: 
--------------------------------------------------------------------------------
package dev.nipafx.demo.modern;

import dev.nipafx.demo.modern.crawler.PageTreeFactory;
import dev.nipafx.demo.modern.operations.Pretty;
import dev.nipafx.demo.modern.operations.ResultServer;
import dev.nipafx.demo.modern.operations.Statistician;
import dev.nipafx.demo.modern.page.Page;

import java.net.URI;
import java.net.URISyntaxException;
import java.net.http.HttpClient;
import java.nio.file.Path;

/**
 * Command-line entry point: builds a page tree starting at a seed URL, prints
 * statistics and a page list to standard out, and then serves the result as
 * HTML from the {@code serve} directory.
 */
public class GitHubCrawl {

    /**
     * Runs the crawl and launches the result server.
     *
     * @param args 0: path to GitHub issue or PR page
     *             1: depth of tree that will be built
     * @throws Exception if the arguments cannot be parsed, the crawl is interrupted,
     *                   or the result cannot be written
     */
    public static void main(String[] args) throws Exception {
        var config = Configuration.parse(args);

        System.out.printf("%nTo see virtual threads in action, run this while the app is resolving a bunch of links:%n");
        System.out.printf("jcmd %s Thread.dump_to_file -format=json -overwrite threads.json%n%n", ProcessHandle.current().pid());

        var rootPage = crawl(config);

        System.out.printf("""

                ---

                %s

                %s


                """, Statistician.evaluate(rootPage), Pretty.pageList(rootPage));

        ResultServer.serve(rootPage, Path.of("serve"));
    }

    /**
     * Builds the page tree described by {@code config}.
     *
     * The HTTP client is only needed while pages are being fetched, so it is
     * scoped to the crawl and closed before the results are printed and served.
     */
    private static Page crawl(Configuration config) throws InterruptedException {
        try (var client = HttpClient.newHttpClient()) {
            var factory = new PageTreeFactory(client);
            return factory.createPage(config.seedUrl(), config.depth());
        }
    }

    /**
     * The parsed command-line arguments.
     *
     * @param seedUrl URL of the page the crawl starts from
     * @param depth depth of the tree that will be built
     */
    private record Configuration(URI seedUrl, int depth) {

        /**
         * Parses the first two command-line arguments.
         *
         * @throws IllegalArgumentException if fewer than two arguments are given
         *         (or, as {@link NumberFormatException}, if the depth is not an integer)
         * @throws URISyntaxException if the first argument is not a valid URI
         */
        static Configuration parse(String[] args) throws URISyntaxException {
            if (args.length < 2)
                throw new IllegalArgumentException("Please specify the seed URL and depth.");
            var seedUrl = new URI(args[0]);
            var depth = Integer.parseInt(args[1]);
            return new Configuration(seedUrl, depth);
        }

    }

}
--------------------------------------------------------------------------------
/src/main/java/dev/nipafx/demo/modern/crawler/PageFactory.java:
--------------------------------------------------------------------------------
package dev.nipafx.demo.modern.crawler;

import dev.nipafx.demo.modern.page.ExternalPage;
import dev.nipafx.demo.modern.page.GitHubIssuePage;
import dev.nipafx.demo.modern.page.GitHubPrPage;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.net.URI;
import java.net.URISyntaxException;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Stream;

import static java.util.stream.Collectors.toSet;

/**
 * Turns a fetched HTML document into a page plus the set of outgoing links
 * that should be followed from it.
 */
class PageFactory {

    private static final String GITHUB_HOST = "github.com";
    private static final Set<String> GITHUB_HOSTS = Set.of("github.com", "user-images.githubusercontent.com");
    private static final Pattern GITHUB_TRACKED_PAGE = Pattern.compile("/issues/\\d+/?$|/pull/\\d+/?$");
    private static final Pattern GITHUB_ISSUE_NUMBER = Pattern.compile(".*/issues/(\\d+)/?.*");
    private static final Pattern GITHUB_PR_NUMBER = Pattern.compile(".*/pull/(\\d+)/?.*");

    private static final String GITHUB_ISSUE_CONTENT_SELECTOR = "#show_issue";
    private static final String GITHUB_PR_CONTENT_SELECTOR = ".clearfix.js-issues-results";

    private PageFactory() {
        // private constructor to prevent instantiation of factory class
    }

    /**
     * Parses {@code html} as an issue page, a PR page, or an external page,
     * depending on the host and path of {@code url}.
     *
     * @param url the URL the HTML was fetched from
     * @param html the page's HTML
     * @return the parsed page together with the links found on it
     */
    public static PageWithLinks parsePage(URI url, String html) {
        // turn this into an `if`, I dare you!
        return switch (url) {
            case URI u when isOnGitHub(u, "/issues/") -> parseIssuePage(url, html);
            case URI u when isOnGitHub(u, "/pull/") -> parsePrPage(url, html);
            default -> parseExternalPage(url, html);
        };
    }

    /**
     * Checks whether {@code url} points at github.com and its path contains {@code pathSegment}.
     * URIs without a host or path (e.g. relative or opaque ones) never match.
     */
    private static boolean isOnGitHub(URI url, String pathSegment) {
        var path = url.getPath();
        return GITHUB_HOST.equals(url.getHost()) && path != null && path.contains(pathSegment);
    }

    /**
     * Parses a GitHub issue page.
     *
     * @throws IllegalArgumentException if the issue content selector does not match exactly one element
     * @throws IllegalStateException if {@code url} contains no issue number
     */
    static PageWithLinks parseIssuePage(URI url, String html) {
        var document = Jsoup.parse(html);
        var content = extractContent(document, GITHUB_ISSUE_CONTENT_SELECTOR);
        var links = extractLinks(url, document, GITHUB_ISSUE_CONTENT_SELECTOR);
        var issueNr = getFirstMatchAsNumber(GITHUB_ISSUE_NUMBER, url);
        return new PageWithLinks(new GitHubIssuePage(url, content, issueNr), links);
    }

    /**
     * Parses a GitHub pull request page.
     *
     * @throws IllegalArgumentException if the PR content selector does not match exactly one element
     * @throws IllegalStateException if {@code url} contains no PR number
     */
    static PageWithLinks parsePrPage(URI url, String html) {
        var document = Jsoup.parse(html);
        var content = extractContent(document, GITHUB_PR_CONTENT_SELECTOR);
        var links = extractLinks(url, document, GITHUB_PR_CONTENT_SELECTOR);
        var prNr = getFirstMatchAsNumber(GITHUB_PR_NUMBER, url);
        return new PageWithLinks(new GitHubPrPage(url, content, prNr), links);
    }

    /** Wraps the raw HTML in an external page; links on external pages are not collected. */
    private static PageWithLinks parseExternalPage(URI url, String html) {
        return new PageWithLinks(new ExternalPage(url, html), Set.of());
    }

    /** Returns the markup of the single element matched by {@code cssContentSelector}. */
    private static String extractContent(Document document, String cssContentSelector) {
        var selectedElements = document.select(cssContentSelector);
        if (selectedElements.size() != 1)
            throw new IllegalArgumentException("The CSS selector '%s' yielded %d elements".formatted(cssContentSelector, selectedElements.size()));
        return selectedElements.getFirst().toString();
    }

    /** Collects the links inside the selected content that should be followed. */
    private static Set<URI> extractLinks(URI url, Document document, String cssContentSelector) {
        return document
                .select(cssContentSelector + " a[href]").stream()
                .map(element -> element.attribute("href").getValue())
                .flatMap(href -> normalizePotentialLink(url, href))
                .filter(PageFactory::shouldRegisterLink)
                .collect(toSet());
    }

    /**
     * Resolves {@code href} against {@code pageUrl}; yields an empty stream for blank
     * or syntactically invalid links and for links that resolve to the page itself.
     */
    private static Stream<URI> normalizePotentialLink(URI pageUrl, String href) {
        if (href == null || href.isBlank())
            return Stream.empty();

        try {
            var url = pageUrl.resolve(new URI(href));
            var isCyclicLink = url.equals(pageUrl);
            if (isCyclicLink)
                return Stream.empty();
            return Stream.of(url);
        } catch (URISyntaxException ex) {
            // nothing to be done
            return Stream.empty();
        }
    }

    /**
     * Accepts links with a host that either point outside the known GitHub hosts
     * or end in an issue/PR path.
     */
    private static boolean shouldRegisterLink(URI url) {
        if (url.getHost() == null)
            return false;

        var isExternalUrl = !GITHUB_HOSTS.contains(url.getHost());
        return isExternalUrl || GITHUB_TRACKED_PAGE.matcher(url.toString()).find();
    }

    /** Extracts the first capture group of {@code pattern} from {@code url} as an {@code int}. */
    private static int getFirstMatchAsNumber(Pattern pattern, URI url) {
        var issueNumberMatcher = pattern.matcher(url.toString());
        var found = issueNumberMatcher.find();
        if (!found)
            throw new IllegalStateException("Alleged issue/PR URL %s does not seem to contain a number.".formatted(url));
        return Integer.parseInt(issueNumberMatcher.group(1));
    }

}
--------------------------------------------------------------------------------
/src/main/java/dev/nipafx/demo/modern/crawler/PageTreeFactory.java:
--------------------------------------------------------------------------------
1 | package dev.nipafx.demo.modern.crawler; 2 | 3 | import dev.nipafx.demo.modern.page.ErrorPage; 4 | import dev.nipafx.demo.modern.page.ExternalPage; 5 | import dev.nipafx.demo.modern.page.GitHubIssuePage; 6 | import dev.nipafx.demo.modern.page.GitHubPrPage; 7 | import dev.nipafx.demo.modern.page.Page; 8 | 9 | import java.io.IOException; 10 | import java.net.URI; 11 | import java.net.http.HttpClient; 12 | import java.net.http.HttpRequest; 13 | import java.net.http.HttpResponse.BodyHandlers; 14 | import java.util.ArrayList; 15 | import java.util.Collections; 16
| import java.util.Set; 17 | import java.util.concurrent.ConcurrentHashMap; 18 | import java.util.concurrent.ConcurrentMap; 19 | import java.util.concurrent.ExecutionException; 20 | import java.util.concurrent.StructuredTaskScope; 21 | import java.util.concurrent.StructuredTaskScope.Subtask; 22 | 23 | import static java.util.Objects.requireNonNull; 24 | import static java.util.stream.Collectors.toSet; 25 | 26 | public class PageTreeFactory { 27 | 28 | private final HttpClient client; 29 | private final ConcurrentMap resolvedPages; 30 | 31 | public PageTreeFactory(HttpClient client) { 32 | this.client = requireNonNull(client); 33 | resolvedPages = new ConcurrentHashMap<>(); 34 | } 35 | 36 | public Page createPage(URI url, int depth) throws InterruptedException { 37 | if (resolvedPages.containsKey(url)) { 38 | System.out.printf("Found cached '%s'%n", url); 39 | return resolvedPages.get(url); 40 | } 41 | 42 | System.out.printf("Resolving '%s'...%n", url); 43 | var pageWithLinks = fetchPageWithLinks(url); 44 | var page = pageWithLinks.page(); 45 | resolvedPages.computeIfAbsent(page.url(), __ -> page); 46 | System.out.printf("Resolved '%s' with children: %s%n", url, pageWithLinks.links()); 47 | 48 | return switch (page) { 49 | case GitHubIssuePage(var isUrl, var content, _, int nr) -> 50 | new GitHubIssuePage(isUrl, content, resolveLinks(pageWithLinks.links(), depth - 1), nr); 51 | case GitHubPrPage(var prUrl, var content, _, int nr) -> 52 | new GitHubIssuePage(prUrl, content, resolveLinks(pageWithLinks.links(), depth - 1), nr); 53 | case ExternalPage _, ErrorPage _ -> page; 54 | }; 55 | } 56 | 57 | private PageWithLinks fetchPageWithLinks(URI url) throws InterruptedException { 58 | try { 59 | var pageBody = fetchPageAsString(url); 60 | return PageFactory.parsePage(url, pageBody); 61 | } catch (InterruptedException iex) { 62 | throw iex; 63 | } catch (Exception ex) { 64 | return new PageWithLinks(new ErrorPage(url, ex)); 65 | } 66 | } 67 | 68 | private String 
fetchPageAsString(URI url) throws IOException, InterruptedException { 69 | var request = HttpRequest 70 | .newBuilder(url) 71 | .GET() 72 | .build(); 73 | return client 74 | .send(request, BodyHandlers.ofString()) 75 | .body(); 76 | } 77 | 78 | private Set resolveLinks(Set links, int depth) throws InterruptedException { 79 | if (depth < 0) 80 | return Collections.emptySet(); 81 | 82 | try (var scope = new StructuredTaskScope.ShutdownOnFailure()) { 83 | var futurePages = new ArrayList>(); 84 | for (URI link : links) 85 | futurePages.add(scope.fork(() -> createPage(link, depth))); 86 | 87 | scope.join(); 88 | scope.throwIfFailed(); 89 | 90 | return futurePages.stream() 91 | .map(Subtask::get) 92 | .collect(toSet()); 93 | } catch (ExecutionException ex) { 94 | // this should not happen as `ErrorPage` instances should have been created for all errors 95 | throw new IllegalStateException("Error cases should have been handled during page creation!", ex); 96 | } 97 | } 98 | 99 | } 100 | -------------------------------------------------------------------------------- /src/main/java/dev/nipafx/demo/modern/crawler/PageWithLinks.java: -------------------------------------------------------------------------------- 1 | package dev.nipafx.demo.modern.crawler; 2 | 3 | import dev.nipafx.demo.modern.page.Page; 4 | 5 | import java.net.URI; 6 | import java.util.Set; 7 | 8 | import static java.util.Objects.requireNonNull; 9 | 10 | record PageWithLinks(Page page, Set links) { 11 | 12 | PageWithLinks { 13 | requireNonNull(page); 14 | requireNonNull(links); 15 | links = Set.copyOf(links); 16 | } 17 | 18 | public PageWithLinks(Page page) { 19 | this(page, Set.of()); 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/dev/nipafx/demo/modern/operations/Pretty.java: -------------------------------------------------------------------------------- 1 | package dev.nipafx.demo.modern.operations; 2 | 3 | import 
dev.nipafx.demo.modern.page.ErrorPage; 4 | import dev.nipafx.demo.modern.page.ExternalPage; 5 | import dev.nipafx.demo.modern.page.GitHubIssuePage; 6 | import dev.nipafx.demo.modern.page.GitHubPage; 7 | import dev.nipafx.demo.modern.page.GitHubPrPage; 8 | import dev.nipafx.demo.modern.page.Page; 9 | 10 | import java.net.URI; 11 | 12 | import static java.util.stream.Collectors.joining; 13 | 14 | public class Pretty { 15 | 16 | private Pretty() { 17 | // private constructor to prevent instantiation 18 | } 19 | 20 | public static String pageList(Page rootPage) { 21 | if (!(rootPage instanceof GitHubPage ghPage)) 22 | return pageName(rootPage); 23 | 24 | return ghPage 25 | .subtree() 26 | .map(Pretty::pageName) 27 | .collect(joining("\n")); 28 | } 29 | 30 | public static String pageName(Page page) { 31 | return switch (page) { 32 | case ErrorPage(URI url, _) -> "💥 ERROR: " + url.getHost(); 33 | case ExternalPage(URI url, _) -> "💤 EXTERNAL: " + url.getHost(); 34 | case GitHubIssuePage(_, _, _, int nr) -> "🐈 ISSUE #" + nr; 35 | case GitHubPrPage(_, _, _, int nr) -> "🐙 PR #" + nr; 36 | }; 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/dev/nipafx/demo/modern/operations/ResultServer.java: -------------------------------------------------------------------------------- 1 | package dev.nipafx.demo.modern.operations; 2 | 3 | import com.sun.net.httpserver.SimpleFileServer; 4 | import com.sun.net.httpserver.SimpleFileServer.OutputLevel; 5 | import dev.nipafx.demo.modern.page.GitHubPage; 6 | import dev.nipafx.demo.modern.page.Page; 7 | import org.jsoup.Jsoup; 8 | 9 | import java.io.IOException; 10 | import java.net.InetSocketAddress; 11 | import java.nio.file.Files; 12 | import java.nio.file.Path; 13 | import java.util.HashSet; 14 | import java.util.Set; 15 | 16 | import static java.util.stream.Collectors.joining; 17 | 18 | public class ResultServer { 19 | 20 | public static void serve(Page rootPage, Path 
serverDir) throws IOException { 21 | if (!Files.exists(serverDir)) 22 | Files.createDirectory(serverDir); 23 | 24 | var html = Jsoup.parse(""" 25 | 26 | 27 | 28 | 29 | %s 30 | 31 | 32 | 33 |
34 | %s 35 |
36 | 37 | 38 | """.formatted(Pretty.pageName(rootPage), pageTreeHtml(rootPage))); 39 | Files.writeString(serverDir.resolve("index.html"), html.html()); 40 | 41 | launchWebServer(serverDir); 42 | } 43 | 44 | private static void launchWebServer(Path serverDir) { 45 | System.out.println("Visit localhost:8080"); 46 | new Thread(() -> 47 | SimpleFileServer 48 | .createFileServer( 49 | new InetSocketAddress(8080), 50 | serverDir.toAbsolutePath(), 51 | OutputLevel.INFO) 52 | .start()) 53 | .start(); 54 | } 55 | 56 | private static String pageTreeHtml(Page rootPage) { 57 | var printedPages = new HashSet(); 58 | return appendPageTreeHtml(printedPages, rootPage, 0); 59 | } 60 | 61 | private static String appendPageTreeHtml(Set printedPages, Page page, int level) { 62 | var pageHtml = pageHtml(page, printedPages.contains(page), level); 63 | if (printedPages.contains(page)) { 64 | printedPages.add(page); 65 | return pageHtml; 66 | } else { 67 | printedPages.add(page); 68 | var descendantsHtml = page instanceof GitHubPage ghPage 69 | ? ghPage 70 | .links().stream() 71 | .map(linkedPage -> appendPageTreeHtml(printedPages, linkedPage, level + 1)) 72 | .collect(joining("\n")) 73 | : ""; 74 | return """ 75 | %s 76 | %s 77 | """.formatted(pageHtml, descendantsHtml); 78 | } 79 | } 80 | 81 | private static String pageHtml(Page page, boolean reference, int level) { 82 | return """ 83 |
84 | %s 85 | %s 86 |
87 | """.formatted( 88 | level, 89 | page.url().toString(), 90 | Pretty.pageName(page), 91 | reference ? "" : ""); 92 | } 93 | 94 | } 95 | -------------------------------------------------------------------------------- /src/main/java/dev/nipafx/demo/modern/operations/Statistician.java: -------------------------------------------------------------------------------- 1 | package dev.nipafx.demo.modern.operations; 2 | 3 | import dev.nipafx.demo.modern.page.ErrorPage; 4 | import dev.nipafx.demo.modern.page.ExternalPage; 5 | import dev.nipafx.demo.modern.page.GitHubIssuePage; 6 | import dev.nipafx.demo.modern.page.GitHubPage; 7 | import dev.nipafx.demo.modern.page.GitHubPrPage; 8 | import dev.nipafx.demo.modern.page.Page; 9 | 10 | import java.util.HashSet; 11 | import java.util.Set; 12 | 13 | public class Statistician { 14 | 15 | private final Set evaluatedPages; 16 | 17 | private int numberOfIssues; 18 | private int numberOfPrs; 19 | private int numberOfExternalLinks; 20 | private int numberOfErrors; 21 | 22 | private Statistician() { 23 | this.evaluatedPages = new HashSet<>(); 24 | } 25 | 26 | public static Stats evaluate(Page rootPage) { 27 | Statistician statistician = new Statistician(); 28 | statistician.evaluateTree(rootPage); 29 | return statistician.result(); 30 | } 31 | 32 | private void evaluateTree(Page page) { 33 | if (page instanceof GitHubPage ghPage) 34 | ghPage.subtree().forEach(this::evaluatePage); 35 | else 36 | evaluatePage(page); 37 | } 38 | 39 | private void evaluatePage(Page page) { 40 | if (evaluatedPages.contains(page)) 41 | return; 42 | evaluatedPages.add(page); 43 | 44 | switch (page) { 45 | case ErrorPage _ -> numberOfErrors++; 46 | case ExternalPage _ -> numberOfExternalLinks++; 47 | case GitHubIssuePage _ -> numberOfIssues++; 48 | case GitHubPrPage _ -> numberOfPrs++; 49 | } 50 | } 51 | 52 | private Stats result() { 53 | return new Stats(numberOfIssues, numberOfPrs, numberOfExternalLinks, numberOfErrors); 54 | } 55 | 56 | public record 
Stats(int numberOfIssues, int numberOfPrs, int numberOfExternalLinks, int numberOfErrors) { }

}
--------------------------------------------------------------------------------
/src/main/java/dev/nipafx/demo/modern/page/ErrorPage.java:
--------------------------------------------------------------------------------
package dev.nipafx.demo.modern.page;

import java.net.URI;
import java.util.Objects;

import static java.util.Objects.requireNonNull;

/**
 * A page for which an exception was recorded.
 *
 * @param url the page's URL; must not be {@code null}
 * @param ex the recorded exception; must not be {@code null}
 */
public record ErrorPage(URI url, Exception ex) implements Page {

	public ErrorPage {
		requireNonNull(url);
		requireNonNull(ex);
	}

	/**
	 * Two error pages are equal if they have the same URL; the exception is ignored.
	 */
	@Override
	public boolean equals(Object other) {
		// compare against `ErrorPage` (not another page type), so that equality is symmetric
		// and error pages with the same URL are recognized as equal
		return other == this
				|| other instanceof ErrorPage page
				&& this.url.equals(page.url());
	}

	@Override
	public int hashCode() {
		return Objects.hash(url);
	}

}
--------------------------------------------------------------------------------
/src/main/java/dev/nipafx/demo/modern/page/ExternalPage.java:
--------------------------------------------------------------------------------
package dev.nipafx.demo.modern.page;

import java.net.URI;
import java.util.Objects;

import static java.util.Objects.requireNonNull;

/**
 * A successfully created page that is not a GitHub page.
 *
 * @param url the page's URL; must not be {@code null}
 * @param content the page's content; must not be {@code null}
 */
public record ExternalPage(URI url, String content) implements SuccessfulPage {

	public ExternalPage {
		requireNonNull(url);
		requireNonNull(content);
	}

	/**
	 * Two external pages are equal if they have the same URL; the content is ignored.
	 */
	@Override
	public boolean equals(Object other) {
		// compare against `ExternalPage` (not another page type), so that equality is symmetric
		// and external pages with the same URL are recognized as equal
		return other == this
				|| other instanceof ExternalPage page
				&& this.url.equals(page.url());
	}

	@Override
	public int hashCode() {
		return Objects.hash(url);
	}

}
--------------------------------------------------------------------------------
/src/main/java/dev/nipafx/demo/modern/page/GitHubIssuePage.java:
--------------------------------------------------------------------------------
package dev.nipafx.demo.modern.page;

import java.net.URI;
import java.util.HashSet;
import java.util.Objects;
import java.util.Set;

import static java.util.Objects.requireNonNull;

/**
 * A GitHub issue page.
 *
 * @param url the page's URL; must not be {@code null}
 * @param content the page's content; must not be {@code null}
 * @param links the pages this page links to; must not be {@code null}, stored as an unmodifiable copy
 * @param issueNumber the issue's number; must be 1 or greater
 */
public record GitHubIssuePage(URI url, String content, Set<Page> links, int issueNumber) implements GitHubPage {

	public GitHubIssuePage {
		requireNonNull(url);
		requireNonNull(content);
		// defensive copy (throws a `NullPointerException` if `links` is null)
		links = Set.copyOf(links);
		if (issueNumber <= 0)
			throw new IllegalArgumentException("Issue number must be 1 or greater - was '%s' at '%s'.".formatted(issueNumber, url));
	}

	/**
	 * Creates an issue page without any links.
	 */
	public GitHubIssuePage(URI url, String content, int issueNumber) {
		this(url, content, new HashSet<>(), issueNumber);
	}

	/**
	 * Two issue pages are equal if they have the same URL; all other components are ignored.
	 */
	@Override
	public boolean equals(Object other) {
		return other == this
				|| other instanceof GitHubIssuePage page
				&& this.url.equals(page.url());
	}

	@Override
	public int hashCode() {
		return Objects.hash(url);
	}

}
--------------------------------------------------------------------------------
/src/main/java/dev/nipafx/demo/modern/page/GitHubPage.java:
--------------------------------------------------------------------------------
package dev.nipafx.demo.modern.page;

import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.stream.Stream;

/**
 * A successfully created GitHub page, which is either an issue or a PR page and links to other pages.
 */
public sealed interface GitHubPage extends SuccessfulPage permits GitHubIssuePage, GitHubPrPage {

	/**
	 * @return the pages this page links to
	 */
	Set<Page> links();

	/**
	 * Lists this page followed by the pages reachable from it via {@link #links()}.
	 *
	 * <p>A page can show up more than once, but the links of a GitHub page are only followed
	 * the first time that page is encountered.
	 *
	 * @return a stream that starts with this page, followed by the reachable pages
	 */
	default Stream<Page> subtree() {
		var subtree = new ArrayList<Page>(Set.of(this));
		var upcomingPages = new LinkedHashSet<>(this.links());

		while (!upcomingPages.isEmpty()) {
			var nextPage = upcomingPages.removeFirst();
			if (!subtree.contains(nextPage) && nextPage instanceof GitHubPage nextGhPage)
				// put the page's links at the front (in their iteration order), so they are visited next
				new LinkedHashSet<>(nextGhPage.links())
						.reversed()
						.forEach(upcomingPages::addFirst);
			subtree.add(nextPage);
		}

		return subtree.stream();
	}

}
--------------------------------------------------------------------------------
/src/main/java/dev/nipafx/demo/modern/page/GitHubPrPage.java:
--------------------------------------------------------------------------------
package dev.nipafx.demo.modern.page;

import java.net.URI;
import java.util.HashSet;
import java.util.Objects;
import java.util.Set;

import static java.util.Objects.requireNonNull;

/**
 * A GitHub pull request page.
 *
 * @param url the page's URL; must not be {@code null}
 * @param content the page's content; must not be {@code null}
 * @param links the pages this page links to; must not be {@code null}, stored as an unmodifiable copy
 * @param prNumber the pull request's number; must be 1 or greater
 */
public record GitHubPrPage(URI url, String content, Set<Page> links, int prNumber) implements GitHubPage {

	public GitHubPrPage {
		requireNonNull(url);
		requireNonNull(content);
		// defensive copy (throws a `NullPointerException` if `links` is null)
		links = Set.copyOf(links);
		if (prNumber <= 0)
			throw new IllegalArgumentException("PR number must be 1 or greater - was '%s' at '%s'.".formatted(prNumber, url));
	}

	/**
	 * Creates a PR page without any links.
	 */
	public GitHubPrPage(URI url, String content, int prNumber) {
		this(url, content, new HashSet<>(), prNumber);
	}

	/**
	 * Two PR pages are equal if they have the same URL; all other components are ignored.
	 */
	@Override
	public boolean equals(Object other) {
		return other == this
				|| other instanceof GitHubPrPage page
				&& this.url.equals(page.url());
	}

	@Override
	public int hashCode() {
		return Objects.hash(url);
	}

}
--------------------------------------------------------------------------------
/src/main/java/dev/nipafx/demo/modern/page/Page.java:
--------------------------------------------------------------------------------
package dev.nipafx.demo.modern.page;

import java.net.URI;

/**
 * A page, which is either an {@link ErrorPage} or a {@link SuccessfulPage}.
 */
public sealed interface Page permits ErrorPage, SuccessfulPage {

	/**
	 * @return the page's URL
	 */
	URI url();

}
--------------------------------------------------------------------------------
/src/main/java/dev/nipafx/demo/modern/page/SuccessfulPage.java:
--------------------------------------------------------------------------------
package dev.nipafx.demo.modern.page;

/**
 * A page with content, which is either an {@link ExternalPage} or a {@link GitHubPage}.
 */
public sealed interface SuccessfulPage extends Page permits ExternalPage, GitHubPage {

	/**
	 * @return the page's content
	 */
	String content();

}
--------------------------------------------------------------------------------
/src/main/java/module-info.java:
--------------------------------------------------------------------------------
// requires the JDK's HTTP client and simple web server as well as jsoup
module gh.crawler {
	requires java.net.http;
	requires jdk.httpserver;
	requires org.jsoup;
}
--------------------------------------------------------------------------------