├── .gitignore
├── LICENSE.md
├── README.md
├── pom.xml
├── serve
│   └── style.css
└── src
    └── main
        └── java
            ├── dev
            │   └── nipafx
            │       └── demo
            │           └── modern
            │               ├── GitHubCrawl.java
            │               ├── crawler
            │               │   ├── PageFactory.java
            │               │   ├── PageTreeFactory.java
            │               │   └── PageWithLinks.java
            │               ├── operations
            │               │   ├── Pretty.java
            │               │   ├── ResultServer.java
            │               │   └── Statistician.java
            │               └── page
            │                   ├── ErrorPage.java
            │                   ├── ExternalPage.java
            │                   ├── GitHubIssuePage.java
            │                   ├── GitHubPage.java
            │                   ├── GitHubPrPage.java
            │                   ├── Page.java
            │                   └── SuccessfulPage.java
            └── module-info.java
/.gitignore:
--------------------------------------------------------------------------------
1 | # Eclipse
2 |
3 | .metadata
4 | bin/
5 | tmp/
6 | *.tmp
7 | *.bak
8 | *.swp
9 | *~.nib
10 | local.properties
11 | .settings/
12 | .loadpath
13 | .recommenders
14 |
15 | .project
16 | .classpath
17 | *.launch
18 |
19 | # JetBrains
20 |
21 | .idea/
22 | *.iws
23 | *.iml
24 | /out/
25 |
26 | # Visual Studio Code
27 |
28 | .factoryPath
29 | .vscode/
30 |
31 | # Gradle
32 |
33 | .gradle/
34 | build/
35 |
36 | # Maven
37 |
38 | target/
39 |
40 | # JVM crash logs
41 | # see https://www.java.com/en/download/help/error_hotspot.html
42 | hs_err_pid*
43 |
44 | # MacOS
45 | .DS_Store
46 |
47 | # thread dumps
48 | threads.json
49 |
50 | # app
51 | serve/index.html
52 | jars/*
53 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | Eclipse Public License - v 2.0
2 |
3 | THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE
4 | PUBLIC LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION
5 | OF THE PROGRAM CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT.
6 |
7 | 1. DEFINITIONS
8 |
9 | "Contribution" means:
10 |
11 | a) in the case of the initial Contributor, the initial content
12 | Distributed under this Agreement, and
13 |
14 | b) in the case of each subsequent Contributor:
15 | i) changes to the Program, and
16 | ii) additions to the Program;
17 | where such changes and/or additions to the Program originate from
18 | and are Distributed by that particular Contributor. A Contribution
19 | "originates" from a Contributor if it was added to the Program by
20 | such Contributor itself or anyone acting on such Contributor's behalf.
21 | Contributions do not include changes or additions to the Program that
22 | are not Modified Works.
23 |
24 | "Contributor" means any person or entity that Distributes the Program.
25 |
26 | "Licensed Patents" mean patent claims licensable by a Contributor which
27 | are necessarily infringed by the use or sale of its Contribution alone
28 | or when combined with the Program.
29 |
30 | "Program" means the Contributions Distributed in accordance with this
31 | Agreement.
32 |
33 | "Recipient" means anyone who receives the Program under this Agreement
34 | or any Secondary License (as applicable), including Contributors.
35 |
36 | "Derivative Works" shall mean any work, whether in Source Code or other
37 | form, that is based on (or derived from) the Program and for which the
38 | editorial revisions, annotations, elaborations, or other modifications
39 | represent, as a whole, an original work of authorship.
40 |
41 | "Modified Works" shall mean any work in Source Code or other form that
42 | results from an addition to, deletion from, or modification of the
43 | contents of the Program, including, for purposes of clarity any new file
44 | in Source Code form that contains any contents of the Program. Modified
45 | Works shall not include works that contain only declarations,
46 | interfaces, types, classes, structures, or files of the Program solely
47 | in each case in order to link to, bind by name, or subclass the Program
48 | or Modified Works thereof.
49 |
50 | "Distribute" means the acts of a) distributing or b) making available
51 | in any manner that enables the transfer of a copy.
52 |
53 | "Source Code" means the form of a Program preferred for making
54 | modifications, including but not limited to software source code,
55 | documentation source, and configuration files.
56 |
57 | "Secondary License" means either the GNU General Public License,
58 | Version 2.0, or any later versions of that license, including any
59 | exceptions or additional permissions as identified by the initial
60 | Contributor.
61 |
62 | 2. GRANT OF RIGHTS
63 |
64 | a) Subject to the terms of this Agreement, each Contributor hereby
65 | grants Recipient a non-exclusive, worldwide, royalty-free copyright
66 | license to reproduce, prepare Derivative Works of, publicly display,
67 | publicly perform, Distribute and sublicense the Contribution of such
68 | Contributor, if any, and such Derivative Works.
69 |
70 | b) Subject to the terms of this Agreement, each Contributor hereby
71 | grants Recipient a non-exclusive, worldwide, royalty-free patent
72 | license under Licensed Patents to make, use, sell, offer to sell,
73 | import and otherwise transfer the Contribution of such Contributor,
74 | if any, in Source Code or other form. This patent license shall
75 | apply to the combination of the Contribution and the Program if, at
76 | the time the Contribution is added by the Contributor, such addition
77 | of the Contribution causes such combination to be covered by the
78 | Licensed Patents. The patent license shall not apply to any other
79 | combinations which include the Contribution. No hardware per se is
80 | licensed hereunder.
81 |
82 | c) Recipient understands that although each Contributor grants the
83 | licenses to its Contributions set forth herein, no assurances are
84 | provided by any Contributor that the Program does not infringe the
85 | patent or other intellectual property rights of any other entity.
86 | Each Contributor disclaims any liability to Recipient for claims
87 | brought by any other entity based on infringement of intellectual
88 | property rights or otherwise. As a condition to exercising the
89 | rights and licenses granted hereunder, each Recipient hereby
90 | assumes sole responsibility to secure any other intellectual
91 | property rights needed, if any. For example, if a third party
92 | patent license is required to allow Recipient to Distribute the
93 | Program, it is Recipient's responsibility to acquire that license
94 | before distributing the Program.
95 |
96 | d) Each Contributor represents that to its knowledge it has
97 | sufficient copyright rights in its Contribution, if any, to grant
98 | the copyright license set forth in this Agreement.
99 |
100 | e) Notwithstanding the terms of any Secondary License, no
101 | Contributor makes additional grants to any Recipient (other than
102 | those set forth in this Agreement) as a result of such Recipient's
103 | receipt of the Program under the terms of a Secondary License
104 | (if permitted under the terms of Section 3).
105 |
106 | 3. REQUIREMENTS
107 |
108 | 3.1 If a Contributor Distributes the Program in any form, then:
109 |
110 | a) the Program must also be made available as Source Code, in
111 | accordance with section 3.2, and the Contributor must accompany
112 | the Program with a statement that the Source Code for the Program
113 | is available under this Agreement, and informs Recipients how to
114 | obtain it in a reasonable manner on or through a medium customarily
115 | used for software exchange; and
116 |
117 | b) the Contributor may Distribute the Program under a license
118 | different than this Agreement, provided that such license:
119 | i) effectively disclaims on behalf of all other Contributors all
120 | warranties and conditions, express and implied, including
121 | warranties or conditions of title and non-infringement, and
122 | implied warranties or conditions of merchantability and fitness
123 | for a particular purpose;
124 |
125 | ii) effectively excludes on behalf of all other Contributors all
126 | liability for damages, including direct, indirect, special,
127 | incidental and consequential damages, such as lost profits;
128 |
129 | iii) does not attempt to limit or alter the recipients' rights
130 | in the Source Code under section 3.2; and
131 |
132 | iv) requires any subsequent distribution of the Program by any
133 | party to be under a license that satisfies the requirements
134 | of this section 3.
135 |
136 | 3.2 When the Program is Distributed as Source Code:
137 |
138 | a) it must be made available under this Agreement, or if the
139 | Program (i) is combined with other material in a separate file or
140 | files made available under a Secondary License, and (ii) the initial
141 | Contributor attached to the Source Code the notice described in
142 | Exhibit A of this Agreement, then the Program may be made available
143 | under the terms of such Secondary Licenses, and
144 |
145 | b) a copy of this Agreement must be included with each copy of
146 | the Program.
147 |
148 | 3.3 Contributors may not remove or alter any copyright, patent,
149 | trademark, attribution notices, disclaimers of warranty, or limitations
150 | of liability ("notices") contained within the Program from any copy of
151 | the Program which they Distribute, provided that Contributors may add
152 | their own appropriate notices.
153 |
154 | 4. COMMERCIAL DISTRIBUTION
155 |
156 | Commercial distributors of software may accept certain responsibilities
157 | with respect to end users, business partners and the like. While this
158 | license is intended to facilitate the commercial use of the Program,
159 | the Contributor who includes the Program in a commercial product
160 | offering should do so in a manner which does not create potential
161 | liability for other Contributors. Therefore, if a Contributor includes
162 | the Program in a commercial product offering, such Contributor
163 | ("Commercial Contributor") hereby agrees to defend and indemnify every
164 | other Contributor ("Indemnified Contributor") against any losses,
165 | damages and costs (collectively "Losses") arising from claims, lawsuits
166 | and other legal actions brought by a third party against the Indemnified
167 | Contributor to the extent caused by the acts or omissions of such
168 | Commercial Contributor in connection with its distribution of the Program
169 | in a commercial product offering. The obligations in this section do not
170 | apply to any claims or Losses relating to any actual or alleged
171 | intellectual property infringement. In order to qualify, an Indemnified
172 | Contributor must: a) promptly notify the Commercial Contributor in
173 | writing of such claim, and b) allow the Commercial Contributor to control,
174 | and cooperate with the Commercial Contributor in, the defense and any
175 | related settlement negotiations. The Indemnified Contributor may
176 | participate in any such claim at its own expense.
177 |
178 | For example, a Contributor might include the Program in a commercial
179 | product offering, Product X. That Contributor is then a Commercial
180 | Contributor. If that Commercial Contributor then makes performance
181 | claims, or offers warranties related to Product X, those performance
182 | claims and warranties are such Commercial Contributor's responsibility
183 | alone. Under this section, the Commercial Contributor would have to
184 | defend claims against the other Contributors related to those performance
185 | claims and warranties, and if a court requires any other Contributor to
186 | pay any damages as a result, the Commercial Contributor must pay
187 | those damages.
188 |
189 | 5. NO WARRANTY
190 |
191 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, AND TO THE EXTENT
192 | PERMITTED BY APPLICABLE LAW, THE PROGRAM IS PROVIDED ON AN "AS IS"
193 | BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR
194 | IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF
195 | TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR
196 | PURPOSE. Each Recipient is solely responsible for determining the
197 | appropriateness of using and distributing the Program and assumes all
198 | risks associated with its exercise of rights under this Agreement,
199 | including but not limited to the risks and costs of program errors,
200 | compliance with applicable laws, damage to or loss of data, programs
201 | or equipment, and unavailability or interruption of operations.
202 |
203 | 6. DISCLAIMER OF LIABILITY
204 |
205 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, AND TO THE EXTENT
206 | PERMITTED BY APPLICABLE LAW, NEITHER RECIPIENT NOR ANY CONTRIBUTORS
207 | SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
208 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION LOST
209 | PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
210 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
211 | ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE
212 | EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE
213 | POSSIBILITY OF SUCH DAMAGES.
214 |
215 | 7. GENERAL
216 |
217 | If any provision of this Agreement is invalid or unenforceable under
218 | applicable law, it shall not affect the validity or enforceability of
219 | the remainder of the terms of this Agreement, and without further
220 | action by the parties hereto, such provision shall be reformed to the
221 | minimum extent necessary to make such provision valid and enforceable.
222 |
223 | If Recipient institutes patent litigation against any entity
224 | (including a cross-claim or counterclaim in a lawsuit) alleging that the
225 | Program itself (excluding combinations of the Program with other software
226 | or hardware) infringes such Recipient's patent(s), then such Recipient's
227 | rights granted under Section 2(b) shall terminate as of the date such
228 | litigation is filed.
229 |
230 | All Recipient's rights under this Agreement shall terminate if it
231 | fails to comply with any of the material terms or conditions of this
232 | Agreement and does not cure such failure in a reasonable period of
233 | time after becoming aware of such noncompliance. If all Recipient's
234 | rights under this Agreement terminate, Recipient agrees to cease use
235 | and distribution of the Program as soon as reasonably practicable.
236 | However, Recipient's obligations under this Agreement and any licenses
237 | granted by Recipient relating to the Program shall continue and survive.
238 |
239 | Everyone is permitted to copy and distribute copies of this Agreement,
240 | but in order to avoid inconsistency the Agreement is copyrighted and
241 | may only be modified in the following manner. The Agreement Steward
242 | reserves the right to publish new versions (including revisions) of
243 | this Agreement from time to time. No one other than the Agreement
244 | Steward has the right to modify this Agreement. The Eclipse Foundation
245 | is the initial Agreement Steward. The Eclipse Foundation may assign the
246 | responsibility to serve as the Agreement Steward to a suitable separate
247 | entity. Each new version of the Agreement will be given a distinguishing
248 | version number. The Program (including Contributions) may always be
249 | Distributed subject to the version of the Agreement under which it was
250 | received. In addition, after a new version of the Agreement is published,
251 | Contributor may elect to Distribute the Program (including its
252 | Contributions) under the new version.
253 |
254 | Except as expressly stated in Sections 2(a) and 2(b) above, Recipient
255 | receives no rights or licenses to the intellectual property of any
256 | Contributor under this Agreement, whether expressly, by implication,
257 | estoppel or otherwise. All rights in the Program not expressly granted
258 | under this Agreement are reserved. Nothing in this Agreement is intended
259 | to be enforceable by any entity that is not a Contributor or Recipient.
260 | No third-party beneficiary rights are created under this Agreement.
261 |
262 | Exhibit A - Form of Secondary Licenses Notice
263 |
264 | "This Source Code may also be made available under the following
265 | Secondary Licenses when the conditions for such availability set forth
266 | in the Eclipse Public License, v. 2.0 are satisfied: {name license(s),
267 | version(s), and exceptions or additional permissions here}."
268 |
269 | Simply including a copy of this Agreement, including this Exhibit A
270 | is not sufficient to license the Source Code under Secondary Licenses.
271 |
272 | If it is not possible or desirable to put the notice in a particular
273 | file, then You may include the notice in a location (such as a LICENSE
274 | file in a relevant directory) where a recipient would be likely to
275 | look for such a notice.
276 |
277 | You may add additional accurate notices of copyright ownership.
278 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Modern Java in Action
2 |
3 | A repository for my live-coding talk [Modern Java in Action](https://nipafx.dev/talk-java-action).
4 |
5 | Each step is its own commit.
6 | Check them out to see what needs to be done at each step.
7 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | dev.nipafx.demo
8 | modern-java
9 | 1.0-SNAPSHOT
10 |
11 |
12 | 23
13 | UTF-8
14 |
15 |
16 |
17 |
18 | org.jsoup
19 | jsoup
20 | 1.17.2
21 |
22 |
23 |
24 |
25 |
26 |
27 | org.apache.maven.plugins
28 | maven-dependency-plugin
29 | 3.6.1
30 |
31 |
32 | copy-dependencies
33 | package
34 |
35 | copy-dependencies
36 |
37 |
38 | jars
39 |
40 |
41 |
42 |
43 |
44 | org.apache.maven.plugins
45 | maven-compiler-plugin
46 | 3.12.1
47 |
48 | true
49 | true
50 |
51 |
52 |
53 | org.apache.maven.plugins
54 | maven-jar-plugin
55 | 3.3.0
56 |
57 | gh-crawler
58 |
59 |
60 | dev.nipafx.demo.modern.GitHubCrawl
61 | true
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
--------------------------------------------------------------------------------
/serve/style.css:
--------------------------------------------------------------------------------
1 | body { /* page-wide defaults: dark background, white text, 24px font */
2 | background-color: #262429;
3 | color: white;
4 | font-size: 24px;
5 | }
6 | /* fixed-width column; the `auto` side margins center it horizontally */
7 | .container {
8 | width: 600px;
9 | margin: 2em auto;
10 | }
11 | /* bordered, rounded box laid out as a two-column grid: flexible first column, auto-sized second */
12 | .page {
13 | margin: 1em 0;
14 | padding: 0.5em;
15 | border: 1px solid #69ea7d;
16 | border-radius: 0.25em;
17 |
18 | display: grid;
19 | grid-template-columns: 1fr auto;
20 | }
21 | /* inserts the arrow glyph before the content of `.ref` elements inside a `.page` */
22 | .page .ref:before {
23 | content: "⤴";
24 | }
25 | /* `level-N` indents a `.page` by N em (presumably N is the page's depth in the tree - confirm against the generated HTML) */
26 | .page.level-1 {
27 | margin-left: 1em;
28 | }
29 |
30 | .page.level-2 {
31 | margin-left: 2em;
32 | }
33 |
34 | .page.level-3 {
35 | margin-left: 3em;
36 | }
37 |
38 | .page.level-4 {
39 | margin-left: 4em;
40 | }
41 |
42 | .page.level-5 {
43 | margin-left: 5em;
44 | }
45 | /* links are white and not underlined, whether visited or not */
46 | a, a:visited {
47 | color: white;
48 | text-decoration: none;
49 | }
50 |
--------------------------------------------------------------------------------
/src/main/java/dev/nipafx/demo/modern/GitHubCrawl.java:
--------------------------------------------------------------------------------
1 | package dev.nipafx.demo.modern;
2 |
3 | import dev.nipafx.demo.modern.crawler.PageTreeFactory;
4 | import dev.nipafx.demo.modern.operations.Pretty;
5 | import dev.nipafx.demo.modern.operations.ResultServer;
6 | import dev.nipafx.demo.modern.operations.Statistician;
7 |
8 | import java.net.URI;
9 | import java.net.URISyntaxException;
10 | import java.net.http.HttpClient;
11 | import java.nio.file.Path;
12 |
13 | public class GitHubCrawl {
14 |
15 |     /**
16 |      * Builds the page tree below a seed URL, prints the output of {@link Statistician#evaluate}
17 |      * and {@link Pretty#pageList} for it, and hands the root page to {@link ResultServer#serve}.
18 |      *
19 |      * @param args 0: URL of the GitHub issue or PR page to start from
20 |      *             1: depth of tree that will be built
21 |      * @throws Exception if the arguments cannot be parsed, the crawl is interrupted,
22 |      *                   or the results cannot be served
23 |      */
24 |     public static void main(String[] args) throws Exception {
25 |         var config = Configuration.parse(args);
26 |
27 |         System.out.printf("%nTo see virtual threads in action, run this while the app is resolving a bunch of links:%n");
28 |         System.out.printf("jcmd %s Thread.dump_to_file -format=json -overwrite threads.json%n%n", ProcessHandle.current().pid());
29 |
30 |         // `HttpClient` is `AutoCloseable`, so release its resources deterministically
31 |         // instead of leaving the client open for the rest of the JVM's lifetime
32 |         try (var client = HttpClient.newHttpClient()) {
33 |             var factory = new PageTreeFactory(client);
34 |             var rootPage = factory.createPage(config.seedUrl(), config.depth());
35 |
36 |             System.out.printf("""
37 |
38 |                     ---
39 |
40 |                     %s
41 |
42 |                     %s
43 |
44 |
45 |                     """, Statistician.evaluate(rootPage), Pretty.pageList(rootPage));
46 |
47 |             ResultServer.serve(rootPage, Path.of("serve"));
48 |         }
49 |     }
50 |
51 |     /**
52 |      * Command line configuration of a crawl.
53 |      *
54 |      * @param seedUrl URL of the page the crawl starts from
55 |      * @param depth depth of the page tree that will be built
56 |      */
57 |     private record Configuration(URI seedUrl, int depth) {
58 |
59 |         /**
60 |          * Reads the configuration from the command line arguments.
61 |          *
62 |          * @param args at least two elements: the seed URL followed by the depth
63 |          * @return the parsed configuration
64 |          * @throws IllegalArgumentException if fewer than two arguments are given or
65 |          *                                  (as {@link NumberFormatException}) the depth is not an integer
66 |          * @throws URISyntaxException if the first argument is not a valid URI
67 |          */
68 |         static Configuration parse(String[] args) throws URISyntaxException {
69 |             if (args.length < 2) {
70 |                 throw new IllegalArgumentException("Please specify the seed URL and depth.");
71 |             }
72 |             var seedUrl = new URI(args[0]);
73 |             var depth = Integer.parseInt(args[1]);
74 |             return new Configuration(seedUrl, depth);
75 |         }
76 |
77 |     }
78 |
79 | }
56 |
--------------------------------------------------------------------------------
/src/main/java/dev/nipafx/demo/modern/crawler/PageFactory.java:
--------------------------------------------------------------------------------
1 | package dev.nipafx.demo.modern.crawler;
2 |
3 | import dev.nipafx.demo.modern.page.ExternalPage;
4 | import dev.nipafx.demo.modern.page.GitHubIssuePage;
5 | import dev.nipafx.demo.modern.page.GitHubPrPage;
6 | import org.jsoup.Jsoup;
7 | import org.jsoup.nodes.Document;
8 |
9 | import java.net.URI;
10 | import java.net.URISyntaxException;
11 | import java.util.Set;
12 | import java.util.regex.Pattern;
13 | import java.util.stream.Stream;
14 |
15 | import static java.util.stream.Collectors.toSet;
16 |
17 | /**
18 |  * Turns the HTML of a fetched page into a {@link PageWithLinks}: GitHub issue and PR pages
19 |  * are parsed for their content and outgoing links, everything else becomes an external page.
20 |  */
21 | class PageFactory {
22 |
23 |     private static final String GITHUB_HOST = "github.com";
24 |     private static final Set<String> GITHUB_HOSTS = Set.of(GITHUB_HOST, "user-images.githubusercontent.com");
25 |     private static final Pattern GITHUB_TRACKED_PAGE = Pattern.compile("/issues/\\d+/?$|/pull/\\d+/?$");
26 |     private static final Pattern GITHUB_ISSUE_NUMBER = Pattern.compile(".*/issues/(\\d+)/?.*");
27 |     private static final Pattern GITHUB_PR_NUMBER = Pattern.compile(".*/pull/(\\d+)/?.*");
28 |
29 |     private static final String GITHUB_ISSUE_CONTENT_SELECTOR = "#show_issue";
30 |     private static final String GITHUB_PR_CONTENT_SELECTOR = ".clearfix.js-issues-results";
31 |
32 |     private PageFactory() {
33 |         // private constructor to prevent instantiation of factory class
34 |     }
35 |
36 |     /**
37 |      * Parses a page according to its URL: {@code github.com} URLs whose path contains
38 |      * {@code /issues/} or {@code /pull/} are parsed as issue or PR pages, all others
39 |      * (including URLs without a host or path) as external pages.
40 |      *
41 |      * @param url the URL the HTML was loaded from
42 |      * @param html the page's HTML
43 |      * @return the parsed page together with the links found in it
44 |      */
45 |     public static PageWithLinks parsePage(URI url, String html) {
46 |         // turn this into an `if`, I dare you!
47 |         return switch (url) {
48 |             case URI u when isGitHubPath(u, "/issues/") -> parseIssuePage(url, html);
49 |             case URI u when isGitHubPath(u, "/pull/") -> parsePrPage(url, html);
50 |             default -> parseExternalPage(url, html);
51 |         };
52 |     }
53 |
54 |     /** Checks null-safely whether the URL points to github.com and its path contains the segment. */
55 |     private static boolean isGitHubPath(URI url, String pathSegment) {
56 |         // `getHost()` and `getPath()` return null for, e.g., relative or opaque URIs
57 |         var path = url.getPath();
58 |         return GITHUB_HOST.equals(url.getHost()) && path != null && path.contains(pathSegment);
59 |     }
60 |
61 |     /**
62 |      * Parses the HTML of a GitHub issue page: the content is the single element matching
63 |      * {@code #show_issue}, the links are taken from the anchors within it, and the issue
64 |      * number is read from the URL.
65 |      *
66 |      * @throws IllegalArgumentException if the content selector does not match exactly one element
67 |      * @throws IllegalStateException if the URL contains no issue number
68 |      */
69 |     static PageWithLinks parseIssuePage(URI url, String html) {
70 |         var document = Jsoup.parse(html);
71 |         var content = extractContent(document, GITHUB_ISSUE_CONTENT_SELECTOR);
72 |         var links = extractLinks(url, document, GITHUB_ISSUE_CONTENT_SELECTOR);
73 |         var issueNr = getFirstMatchAsNumber(GITHUB_ISSUE_NUMBER, url);
74 |         return new PageWithLinks(new GitHubIssuePage(url, content, issueNr), links);
75 |     }
76 |
77 |     /**
78 |      * Parses the HTML of a GitHub PR page: the content is the single element matching
79 |      * {@code .clearfix.js-issues-results}, the links are taken from the anchors within it,
80 |      * and the PR number is read from the URL.
81 |      *
82 |      * @throws IllegalArgumentException if the content selector does not match exactly one element
83 |      * @throws IllegalStateException if the URL contains no PR number
84 |      */
85 |     static PageWithLinks parsePrPage(URI url, String html) {
86 |         var document = Jsoup.parse(html);
87 |         var content = extractContent(document, GITHUB_PR_CONTENT_SELECTOR);
88 |         var links = extractLinks(url, document, GITHUB_PR_CONTENT_SELECTOR);
89 |         var prNr = getFirstMatchAsNumber(GITHUB_PR_NUMBER, url);
90 |         return new PageWithLinks(new GitHubPrPage(url, content, prNr), links);
91 |     }
92 |
93 |     /** Wraps the unparsed HTML in an {@link ExternalPage}; no links are extracted. */
94 |     private static PageWithLinks parseExternalPage(URI url, String html) {
95 |         return new PageWithLinks(new ExternalPage(url, html), Set.of());
96 |     }
97 |
98 |     /**
99 |      * Returns the string form of the one element the selector matches.
100 |      *
101 |      * @throws IllegalArgumentException if the selector matches no or more than one element
102 |      */
103 |     private static String extractContent(Document document, String cssContentSelector) {
104 |         var selectedElements = document.select(cssContentSelector);
105 |         if (selectedElements.size() != 1) {
106 |             throw new IllegalArgumentException("The CSS selector '%s' yielded %d elements".formatted(cssContentSelector, selectedElements.size()));
107 |         }
108 |         return selectedElements.getFirst().toString();
109 |     }
110 |
111 |     /**
112 |      * Collects the targets of all {@code a[href]} elements below the content selector,
113 |      * resolved against the page URL and reduced to those that should be registered.
114 |      */
115 |     private static Set<URI> extractLinks(URI url, Document document, String cssContentSelector) {
116 |         return document
117 |                 .select(cssContentSelector + " a[href]").stream()
118 |                 .map(element -> element.attribute("href").getValue())
119 |                 .flatMap(href -> normalizePotentialLink(url, href))
120 |                 .filter(PageFactory::shouldRegisterLink)
121 |                 .collect(toSet());
122 |     }
123 |
124 |     /**
125 |      * Resolves {@code href} against the page URL.
126 |      *
127 |      * @return a stream with the resolved URL; empty if {@code href} is blank, is not a
128 |      *         valid URI, or resolves to the page itself
129 |      */
130 |     private static Stream<URI> normalizePotentialLink(URI pageUrl, String href) {
131 |         if (href == null || href.isBlank()) {
132 |             return Stream.empty();
133 |         }
134 |
135 |         try {
136 |             var url = pageUrl.resolve(new URI(href));
137 |             var isCyclicLink = url.equals(pageUrl);
138 |             return isCyclicLink ? Stream.empty() : Stream.of(url);
139 |         } catch (URISyntaxException ignored) {
140 |             // an unparseable link cannot be followed, so it is simply dropped
141 |             return Stream.empty();
142 |         }
143 |     }
144 |
145 |     /**
146 |      * Decides whether a link is kept: URLs without a host never are, URLs on non-GitHub hosts
147 |      * always are, and URLs on GitHub hosts only if they end in an issue or PR number.
148 |      */
149 |     private static boolean shouldRegisterLink(URI url) {
150 |         if (url.getHost() == null) {
151 |             return false;
152 |         }
153 |
154 |         var isExternalUrl = !GITHUB_HOSTS.contains(url.getHost());
155 |         return isExternalUrl || GITHUB_TRACKED_PAGE.matcher(url.toString()).find();
156 |     }
157 |
158 |     /**
159 |      * Extracts the pattern's first capture group from the URL and parses it as an {@code int}.
160 |      *
161 |      * @throws IllegalStateException if the pattern does not match the URL
162 |      */
163 |     private static int getFirstMatchAsNumber(Pattern pattern, URI url) {
164 |         var numberMatcher = pattern.matcher(url.toString());
165 |         if (!numberMatcher.find()) {
166 |             throw new IllegalStateException("Alleged issue/PR URL %s does not seem to contain a number.".formatted(url));
167 |         }
168 |         return Integer.parseInt(numberMatcher.group(1));
169 |     }
170 |
171 | }
109 |
--------------------------------------------------------------------------------
/src/main/java/dev/nipafx/demo/modern/crawler/PageTreeFactory.java:
--------------------------------------------------------------------------------
1 | package dev.nipafx.demo.modern.crawler;
2 |
3 | import dev.nipafx.demo.modern.page.ErrorPage;
4 | import dev.nipafx.demo.modern.page.ExternalPage;
5 | import dev.nipafx.demo.modern.page.GitHubIssuePage;
6 | import dev.nipafx.demo.modern.page.GitHubPrPage;
7 | import dev.nipafx.demo.modern.page.Page;
8 |
9 | import java.io.IOException;
10 | import java.net.URI;
11 | import java.net.http.HttpClient;
12 | import java.net.http.HttpRequest;
13 | import java.net.http.HttpResponse.BodyHandlers;
14 | import java.util.ArrayList;
15 | import java.util.Collections;
16 | import java.util.Set;
17 | import java.util.concurrent.ConcurrentHashMap;
18 | import java.util.concurrent.ConcurrentMap;
19 | import java.util.concurrent.ExecutionException;
20 | import java.util.concurrent.StructuredTaskScope;
21 | import java.util.concurrent.StructuredTaskScope.Subtask;
22 |
23 | import static java.util.Objects.requireNonNull;
24 | import static java.util.stream.Collectors.toSet;
25 |
26 | /**
27 |  * Builds a tree of {@link Page}s by fetching a URL, parsing the response with
28 |  * {@link PageFactory}, and resolving the links of GitHub issue and PR pages recursively.
29 |  */
30 | public class PageTreeFactory {
31 |
32 |     private final HttpClient client;
33 |     private final ConcurrentMap<URI, Page> resolvedPages;
34 |
35 |     /**
36 |      * @param client the HTTP client used to fetch pages; must not be null
37 |      */
38 |     public PageTreeFactory(HttpClient client) {
39 |         this.client = requireNonNull(client);
40 |         resolvedPages = new ConcurrentHashMap<>();
41 |     }
42 |
43 |     /**
44 |      * Creates the page for the URL and, for GitHub issue and PR pages, the pages it links to.
45 |      *
46 |      * <p>The page that is put into the cache is the one that was just fetched, i.e. before
47 |      * its links were resolved; a later request for the same URL returns that cached instance.
48 |      *
49 |      * @param url the URL of the page to create
50 |      * @param depth links are resolved with {@code depth - 1}; resolution stops once the
51 |      *              depth drops below zero
52 |      * @return the cached page if the URL was resolved before, otherwise the newly created page
53 |      *         (an {@link ErrorPage} if fetching or parsing failed)
54 |      * @throws InterruptedException if the thread is interrupted while fetching or waiting for linked pages
55 |      */
56 |     public Page createPage(URI url, int depth) throws InterruptedException {
57 |         // a single lookup avoids the check-then-act gap of `containsKey` followed by `get`
58 |         var cachedPage = resolvedPages.get(url);
59 |         if (cachedPage != null) {
60 |             System.out.printf("Found cached '%s'%n", url);
61 |             return cachedPage;
62 |         }
63 |
64 |         System.out.printf("Resolving '%s'...%n", url);
65 |         var pageWithLinks = fetchPageWithLinks(url);
66 |         var page = pageWithLinks.page();
67 |         resolvedPages.putIfAbsent(page.url(), page);
68 |         System.out.printf("Resolved '%s' with children: %s%n", url, pageWithLinks.links());
69 |
70 |         return switch (page) {
71 |             case GitHubIssuePage(var isUrl, var content, _, int nr) ->
72 |                     new GitHubIssuePage(isUrl, content, resolveLinks(pageWithLinks.links(), depth - 1), nr);
73 |             // a PR page must stay a PR page - it used to be rebuilt as an issue page
74 |             case GitHubPrPage(var prUrl, var content, _, int nr) ->
75 |                     new GitHubPrPage(prUrl, content, resolveLinks(pageWithLinks.links(), depth - 1), nr);
76 |             case ExternalPage _, ErrorPage _ -> page;
77 |         };
78 |     }
79 |
80 |     /**
81 |      * Fetches and parses the page; any failure other than an interrupt is turned into
82 |      * an {@link ErrorPage} without links.
83 |      */
84 |     private PageWithLinks fetchPageWithLinks(URI url) throws InterruptedException {
85 |         try {
86 |             var pageBody = fetchPageAsString(url);
87 |             return PageFactory.parsePage(url, pageBody);
88 |         } catch (InterruptedException iex) {
89 |             // interrupts must propagate instead of being reported as a page error
90 |             throw iex;
91 |         } catch (Exception ex) {
92 |             return new PageWithLinks(new ErrorPage(url, ex));
93 |         }
94 |     }
95 |
96 |     /** Sends a GET request to the URL and returns the response body as a string. */
97 |     private String fetchPageAsString(URI url) throws IOException, InterruptedException {
98 |         var request = HttpRequest
99 |                 .newBuilder(url)
100 |                 .GET()
101 |                 .build();
102 |         return client
103 |                 .send(request, BodyHandlers.ofString())
104 |                 .body();
105 |     }
106 |
107 |     /**
108 |      * Creates the pages for all links, forking one subtask per link and waiting for all of them.
109 |      *
110 |      * @return the created pages; empty if {@code depth} is negative
111 |      * @throws IllegalStateException if a subtask failed with an exception
112 |      */
113 |     private Set<Page> resolveLinks(Set<URI> links, int depth) throws InterruptedException {
114 |         if (depth < 0) {
115 |             return Collections.emptySet();
116 |         }
117 |
118 |         try (var scope = new StructuredTaskScope.ShutdownOnFailure()) {
119 |             var futurePages = new ArrayList<Subtask<Page>>(links.size());
120 |             for (URI link : links) {
121 |                 futurePages.add(scope.fork(() -> createPage(link, depth)));
122 |             }
123 |
124 |             scope.join();
125 |             scope.throwIfFailed();
126 |
127 |             return futurePages.stream()
128 |                     .map(Subtask::get)
129 |                     .collect(toSet());
130 |         } catch (ExecutionException ex) {
131 |             // this should not happen as `ErrorPage` instances should have been created for all errors
132 |             throw new IllegalStateException("Error cases should have been handled during page creation!", ex);
133 |         }
134 |     }
135 |
136 | }
100 |
--------------------------------------------------------------------------------
/src/main/java/dev/nipafx/demo/modern/crawler/PageWithLinks.java:
--------------------------------------------------------------------------------
1 | package dev.nipafx.demo.modern.crawler;
2 |
3 | import dev.nipafx.demo.modern.page.Page;
4 |
5 | import java.net.URI;
6 | import java.util.Set;
7 |
8 | import static java.util.Objects.requireNonNull;
9 |
10 | /**
11 |  * A parsed page together with the links that were found in it.
12 |  *
13 |  * @param page the parsed page; must not be null
14 |  * @param links the links found in the page; must not be null, stored as an unmodifiable copy
15 |  */
16 | record PageWithLinks(Page page, Set<URI> links) {
17 |
18 |     PageWithLinks {
19 |         requireNonNull(page);
20 |         requireNonNull(links);
21 |         // defensive copy, so later changes to the caller's set do not leak into the record
22 |         links = Set.copyOf(links);
23 |     }
24 |
25 |     /** Creates an instance for a page without links. */
26 |     public PageWithLinks(Page page) {
27 |         this(page, Set.of());
28 |     }
29 |
30 | }
23 |
--------------------------------------------------------------------------------
/src/main/java/dev/nipafx/demo/modern/operations/Pretty.java:
--------------------------------------------------------------------------------
1 | package dev.nipafx.demo.modern.operations;
2 |
3 | import dev.nipafx.demo.modern.page.ErrorPage;
4 | import dev.nipafx.demo.modern.page.ExternalPage;
5 | import dev.nipafx.demo.modern.page.GitHubIssuePage;
6 | import dev.nipafx.demo.modern.page.GitHubPage;
7 | import dev.nipafx.demo.modern.page.GitHubPrPage;
8 | import dev.nipafx.demo.modern.page.Page;
9 |
10 | import java.net.URI;
11 |
12 | import static java.util.stream.Collectors.joining;
13 |
14 | public class Pretty {
15 |
16 | private Pretty() {
17 | // private constructor to prevent instantiation
18 | }
19 |
20 | public static String pageList(Page rootPage) {
21 | if (!(rootPage instanceof GitHubPage ghPage))
22 | return pageName(rootPage);
23 |
24 | return ghPage
25 | .subtree()
26 | .map(Pretty::pageName)
27 | .collect(joining("\n"));
28 | }
29 |
30 | public static String pageName(Page page) {
31 | return switch (page) {
32 | case ErrorPage(URI url, _) -> "💥 ERROR: " + url.getHost();
33 | case ExternalPage(URI url, _) -> "💤 EXTERNAL: " + url.getHost();
34 | case GitHubIssuePage(_, _, _, int nr) -> "🐈 ISSUE #" + nr;
35 | case GitHubPrPage(_, _, _, int nr) -> "🐙 PR #" + nr;
36 | };
37 | }
38 |
39 | }
40 |
--------------------------------------------------------------------------------
/src/main/java/dev/nipafx/demo/modern/operations/ResultServer.java:
--------------------------------------------------------------------------------
1 | package dev.nipafx.demo.modern.operations;
2 |
3 | import com.sun.net.httpserver.SimpleFileServer;
4 | import com.sun.net.httpserver.SimpleFileServer.OutputLevel;
5 | import dev.nipafx.demo.modern.page.GitHubPage;
6 | import dev.nipafx.demo.modern.page.Page;
7 | import org.jsoup.Jsoup;
8 |
9 | import java.io.IOException;
10 | import java.net.InetSocketAddress;
11 | import java.nio.file.Files;
12 | import java.nio.file.Path;
13 | import java.util.HashSet;
14 | import java.util.Set;
15 |
16 | import static java.util.stream.Collectors.joining;
17 |
18 | public class ResultServer {
19 |
20 | public static void serve(Page rootPage, Path serverDir) throws IOException {
21 | if (!Files.exists(serverDir))
22 | Files.createDirectory(serverDir);
23 |
24 | var html = Jsoup.parse("""
25 |
26 |
27 |
28 |
29 | %s
30 |
31 |
32 |
33 |