├── .github └── workflows │ └── build.yml ├── .gitignore ├── LICENSE ├── README.md ├── graph-exploration-README.md ├── pom.xml └── src ├── main ├── java │ └── org │ │ └── commoncrawl │ │ └── webgraph │ │ ├── CountingMergedIntIterator.java │ │ ├── CreatePreferenceVector.java │ │ ├── HostToDomainGraph.java │ │ ├── JoinSortRanks.java │ │ ├── explore │ │ ├── Graph.java │ │ └── GraphExplorer.java │ │ └── package-info.java └── resources │ └── simplelogger.properties ├── script ├── host2domaingraph.sh ├── hostgraph │ ├── build_hostgraph.sh │ └── hostgraph_config.sh ├── webgraph_ranking │ ├── graph_explore_build_vertex_map.sh │ ├── graph_explore_download_webgraph.sh │ ├── graph_explore_load_graph.jsh │ ├── process_webgraph.sh │ ├── process_webgraph_degrees.sh │ ├── run_webgraph.sh │ └── webgraph_config.sh └── workflow_lib.sh └── test └── java └── org └── commoncrawl └── webgraph ├── TestCountingMergedIntIterator.java ├── TestHostToDomainGraph.java └── TestJoinSortRanks.java /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: cc-webgraph build 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | java: [ 11, 17, 21 ] 17 | name: Java ${{ matrix.java }} 18 | steps: 19 | - uses: actions/checkout@v4 20 | 21 | - name: Setup JDK 22 | uses: actions/setup-java@v4 23 | with: 24 | distribution: 'temurin' 25 | java-version: ${{ matrix.java }} 26 | cache: 'maven' 27 | 28 | - name: Build 29 | run: mvn verify javadoc:aggregate 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.ear 17 | *.zip 18 | *.tar.gz 19 | *.rar 20 | 21 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 22 | hs_err_pid* 23 | 24 | # maven build directory 25 | /target/ 26 | 27 | # Eclipse project files 28 | .project 29 | .classpath 30 | .settings/ 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cc-webgraph 2 | 3 | Tools to construct web graphs from Common Crawl data, and to process and explore them. 4 | 5 | ## Compiling and Packaging Java Tools 6 | 7 | Java 11 or later is required. 8 | 9 | The Java tools are compiled and packaged by [Maven](https://maven.apache.org/). If Maven is installed, just run `mvn package`. The Java tools can then be run via 10 | ``` 11 | java -cp target/cc-webgraph-0.1-SNAPSHOT-jar-with-dependencies.jar ... 12 | ``` 13 | 14 | The assembly jar file also includes the [WebGraph](https://webgraph.di.unimi.it/) and [LAW](https://law.di.unimi.it/software.php) packages required to process the webgraphs and compute [PageRank](https://en.wikipedia.org/wiki/PageRank) or [Harmonic Centrality](https://en.wikipedia.org/wiki/Centrality#Harmonic_centrality). 15 | 16 | 17 | ### Javadocs 18 | 19 | The Javadocs are created by `mvn javadoc:javadoc`. Then open the file `target/site/apidocs/index.html` in a browser. 20 | 21 | 22 | ## Memory and Disk Requirements 23 | 24 | Note that the webgraphs are usually multiple gigabytes in size and require for processing 25 | - a sufficient Java heap size ([Java option](https://docs.oracle.com/en/java/javase/21/docs/specs/man/java.html#extra-options-for-java) `-Xmx`) 26 | - enough disk space to store the graphs and temporary data. 27 | 28 | The exact requirements depend on the graph size and the task – graph exploration or ranking, etc. 29 | 30 | 31 | ## Construction and Ranking of Host- and Domain-Level Web Graphs 32 | 33 | ### Host-Level Web Graph 34 | 35 | The host-level web graph is built with the help of PySpark; the corresponding code is found in the project [cc-pyspark](https://github.com/commoncrawl/cc-pyspark). Instructions are found in the script [build_hostgraph.sh](src/script/hostgraph/build_hostgraph.sh). 36 | 37 | ### Domain-Level Web Graph 38 | 39 | The domain-level web graph is distilled from the host-level graph by mapping host names to domain names. The ID mapping is kept in memory as an int array or [FastUtil's big array](https://fastutil.di.unimi.it/docs/it/unimi/dsi/fastutil/BigArrays.html) if the host-level graph has more vertices than a Java array can hold (around 2³¹). The Java tool to fold the host graph is best run from the script [host2domaingraph.sh](src/script/host2domaingraph.sh). 40 | 41 | ### Processing Graphs using the WebGraph Framework 42 | 43 | To analyze the graph structure and calculate rankings, you may further process the graphs using software from the Laboratory for Web Algorithmics (LAW) at the University of Milano, namely the [WebGraph framework](https://webgraph.di.unimi.it/) and the [LAW library](https://law.di.unimi.it/software.php). 44 | 45 | A couple of scripts that help you run the WebGraph tools to build and process the graphs are provided in [src/script/webgraph_ranking/](src/script/webgraph_ranking/). They're also used to prepare the Common Crawl web graph releases. 46 | 47 | To process a webgraph and rank the nodes, you should first adapt the configuration to your graph and hardware setup: 48 | ``` 49 | vi ./src/script/webgraph_ranking/webgraph_config.sh 50 | ``` 51 | After running 52 | ``` 53 | ./src/script/webgraph_ranking/process_webgraph.sh graph_name vertices.txt.gz edges.txt.gz output_dir 54 | ``` 55 | the `output_dir/` should contain all generated files, e.g.
`graph_name.graph` and `graph_name-ranks.txt.gz`. 56 | 57 | The shell script is easily adapted to your needs. Please refer to the [LAW dataset tutorial](https://law.di.unimi.it/tutorial.php), the [API docs of LAW](https://law.di.unimi.it/software/law-docs/index.html) and [webgraph](https://webgraph.di.unimi.it/docs/) for further information. 58 | 59 | 60 | ## Exploring Webgraph Data Sets 61 | 62 | The Common Crawl webgraph data sets are announced on the [Common Crawl web site](https://commoncrawl.org/tag/webgraph/). 63 | 64 | For instructions on how to explore the webgraphs using the JShell, please see the tutorial [Interactive Graph Exploration](./graph-exploration-README.md). For an older approach using [Jython](https://www.jython.org/) and [pyWebGraph](https://github.com/mapio/py-web-graph), see the [cc-notebooks project](//github.com/commoncrawl/cc-notebooks/tree/master/cc-webgraph-statistics). 65 | 66 | 67 | ## Credits 68 | 69 | Thanks to the authors of the [WebGraph framework](https://webgraph.di.unimi.it/) used to process the graphs and compute PageRank and harmonic centrality. See also Sebastiano Vigna's projects [webgraph](//github.com/vigna/webgraph) and [webgraph-big](//github.com/vigna/webgraph-big). 70 | -------------------------------------------------------------------------------- /graph-exploration-README.md: -------------------------------------------------------------------------------- 1 | # Interactive Graph Exploration 2 | 3 | A tutorial on how to interactively explore the Common Crawl webgraphs – or other graphs using the webgraph format – using the [JShell](https://docs.oracle.com/en/java/javase/21/jshell/index.html) and the [GraphExplorer](src/main/java/org/commoncrawl/webgraph/explore/GraphExplorer.java) class. 4 | 5 | 6 | ## Quick Start 7 | 8 | 1. Change into the "cc-webgraph" project directory, [build the cc-webgraph JAR](README.md#compiling-and-packaging-java-tools) and remember the project directory and the JAR using environment variables: 9 | 10 | ``` 11 | $> cd .../cc-webgraph 12 | 13 | $> mvn clean package 14 | 15 | $> CC_WEBGRAPH="$PWD" 16 | $> CC_WEBGRAPH_JAR=$(ls "$PWD"/target/cc-webgraph-*-jar-with-dependencies.jar) 17 | ``` 18 | 19 | 2. Select a web graph you want to explore, choose a download directory and download the web graph: 20 | 21 | ``` 22 | $> GRAPH=cc-main-2024-feb-apr-may-domain 23 | 24 | $> mkdir .../my-webgraphs/$GRAPH 25 | $> cd .../my-webgraphs/$GRAPH 26 | ``` 27 | 28 | About 15 GiB of disk space is needed to hold all files of a domain-level webgraph. 29 | 30 | ``` 31 | $> "$CC_WEBGRAPH"/src/script/webgraph_ranking/graph_explore_download_webgraph.sh $GRAPH 32 | ``` 33 | 34 | 3. Build the map from vertex label to vertex ID and vice versa. This allows you to look up a reversed domain name (e.g. "org.commoncrawl") and get the corresponding vertex ID. 36 | 37 | ``` 38 | $> "$CC_WEBGRAPH"/src/script/webgraph_ranking/graph_explore_build_vertex_map.sh $GRAPH $GRAPH-vertices.txt.gz 39 | ``` 40 | 41 | 4.
Launch the [JShell](https://docs.oracle.com/en/java/javase/21/jshell/index.html) 41 | 42 | ``` 43 | $> jshell --class-path "$CC_WEBGRAPH_JAR" 44 | | Welcome to JShell -- Version 21.0.3 45 | | For an introduction type: /help intro 46 | 47 | jshell> 48 | ``` 49 | 50 | Now you may play around with the JShell or load the GraphExplorer class and your graph: 51 | 52 | ``` 53 | jshell> import org.commoncrawl.webgraph.explore.GraphExplorer 54 | 55 | jshell> GraphExplorer e = new GraphExplorer("cc-main-2024-feb-apr-may-domain") 56 | 2024-06-23 13:38:51:084 +0200 [main] INFO Graph - Loading graph cc-main-2024-feb-apr-may-domain.graph 57 | 2024-06-23 13:38:51:193 +0200 [main] INFO Graph - Loading transpose of the graph cc-main-2024-feb-apr-may-domain-t.graph 58 | 2024-06-23 13:38:51:279 +0200 [main] INFO Graph - Loading vertex map cc-main-2024-feb-apr-may-domain.iepm (ImmutableExternalPrefixMap) 59 | 2024-06-23 13:38:52:356 +0200 [main] INFO Graph - Loaded graph cc-main-2024-feb-apr-may-domain.graph 60 | e ==> org.commoncrawl.webgraph.explore.GraphExplorer@4cc0edeb 61 | ``` 62 | 63 | But for now exit the JShell 64 | ``` 65 | jshell> /exit 66 | | Goodbye 67 | ``` 68 | 69 | To make the loading easier, you may use the load script [graph_explore_load_graph.jsh](src/script/webgraph_ranking/graph_explore_load_graph.jsh) and pass the graph name as a Java property to the JShell via command-line option `-R-Dgraph=$GRAPH` 70 | 71 | ``` 72 | $> jshell --class-path "$CC_WEBGRAPH_JAR" \ 73 | -R-Dgraph=$GRAPH \ 74 | "$CC_WEBGRAPH"/src/script/webgraph_ranking/graph_explore_load_graph.jsh 75 | Loading graph cc-main-2024-feb-apr-may-domain 76 | 2024-06-23 13:30:14:134 +0200 [main] INFO Graph - Loading graph cc-main-2024-feb-apr-may-domain.graph 77 | 2024-06-23 13:30:14:340 +0200 [main] INFO Graph - Loading transpose of the graph cc-main-2024-feb-apr-may-domain-t.graph 78 | 2024-06-23 13:30:14:439 +0200 [main] INFO Graph - Loading vertex map cc-main-2024-feb-apr-may-domain.iepm (ImmutableExternalPrefixMap) 79 | 2024-06-23 13:30:15:595 +0200 [main] INFO Graph - Loaded graph cc-main-2024-feb-apr-may-domain.graph 80 | 81 | Graph cc-main-2024-feb-apr-may-domain loaded into GraphExplorer *e* 82 | Type "e." and press to list the public methods of the class GraphExplorer 83 | ... or "g." for the graph loaded for exploration 84 | 85 | ... or use one of the predefined methods: 86 | void cn(String) 87 | void cn(long) 88 | void pwn() 89 | void ls() 90 | void ls(long) 91 | void ls(String) 92 | void sl() 93 | void sl(long) 94 | void sl(String) 95 | 96 | | Welcome to JShell -- Version 21.0.3 97 | | For an introduction type: /help intro 98 | 99 | jshell> 100 | ``` 101 | 102 | The predefined methods are those provided by [pyWebGraph](https://github.com/mapio/py-web-graph). 
103 | 104 | ``` 105 | jshell> cn("org.commoncrawl") 106 | #111997321 org.commoncrawl 107 | 108 | jshell> pwn() 109 | #111997321 org.commoncrawl 110 | 111 | jshell> ls() // list successors (vertices linked from the domain commoncrawl.org or one of its subdomains) 112 | 113 | jshell> sl() // list predecessors (vertices connected via incoming links) 114 | ``` 115 | 116 | 117 | ## Using the Java Classes 118 | 119 | The Java classes "GraphExplorer" and "Graph" bundle a set of methods which help with exploring the graphs: 120 | - load the webgraph, its transpose and the vertex map 121 | - access the vertices and their successors or predecessors 122 | - utilities to import or export a list of vertices or counts from or into a file 123 | 124 | The methods are bundled in the classes of the Java package `org.commoncrawl.webgraph.explore`. To get an overview of all provided methods, inspect the source code or see the section [Javadocs](README.md#javadocs) in the main README for how to read the Javadocs. Only a few examples are presented here. 125 | 126 | We start again with launching the JShell and loading a webgraph: 127 | 128 | ``` 129 | $> jshell --class-path "$CC_WEBGRAPH_JAR" \ 130 | -R-Dgraph=$GRAPH \ 131 | "$CC_WEBGRAPH"/src/script/webgraph_ranking/graph_explore_load_graph.jsh 132 | jshell> 133 | ``` 134 | 135 | Two classes are already instantiated – the *GraphExplorer* `e` and the *Graph* `g`; the former holds a reference to the latter: 136 | 137 | ``` 138 | jshell> /vars 139 | | String graph = "cc-main-2024-feb-apr-may-domain" 140 | | GraphExplorer e = org.commoncrawl.webgraph.explore.GraphExplorer@7dc7cbad 141 | | Graph g = org.commoncrawl.webgraph.explore.Graph@4f933fd1 142 | 143 | jshell> e.getGraph() 144 | $45 ==> org.commoncrawl.webgraph.explore.Graph@4f933fd1 145 | ``` 146 | 147 | First, the vertices in the webgraphs are represented by numbers. So, we need to translate between vertex label and ID: 148 | 149 | ``` 150 | jshell> g.vertexLabelToId("org.wikipedia") 151 | $46 ==> 115107569 152 | 153 | jshell> g.vertexIdToLabel(115107569) 154 | $47 ==> "org.wikipedia" 155 | ``` 156 | 157 | One important note: Common Crawl's webgraphs list the host or domain names in [reverse domain name notation](https://en.wikipedia.org/wiki/Reverse_domain_name_notation). The vertex lists are sorted by the reversed names in lexicographic order and then numbered continuously. This gives a close-to-perfect compression of the webgraph itself. Most of the arcs are close in terms of locality because subdomains or sites of the same region (by country-code top-level domain) are listed in one continuous block. Cf. the paper [The WebGraph Framework I: Compression Techniques](https://vigna.di.unimi.it/ftp/papers/WebGraphI.pdf) by Paolo Boldi and Sebastiano Vigna. 158 | 159 | Now, let's look at how many other domains are linked from Wikipedia: 160 | 161 | ``` 162 | jshell> g.outdegree("org.wikipedia") 163 | $46 ==> 2106338 164 | ``` 165 | 166 | Another note: Common Crawl's webgraphs are based on sample crawls of the web. Like the crawls, the webgraphs are not complete, and Wikipedia may in reality link to far more domains. But 2 million linked domains is already not a small sample.
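The same lookups compose with plain Java collections and streams, so you can check several domains in one go. A small sketch using only the methods shown above (the two domain labels are just examples; any labels present in the graph work, and the printed numbers depend on the graph release):

```
jshell> java.util.stream.Stream.of("org.wikipedia", "org.commoncrawl").forEach(d -> System.out.println(d + "\t" + g.outdegree(d)))
```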
167 | 168 | The Graph class also gives you access to the successors of a vertex, as an array or stream of integers, but also as a stream of strings (vertex labels): 169 | 170 | ``` 171 | jshell> g.successors("org.wikipedia").length 172 | $48 ==> 2106338 173 | 174 | jshell> g.successorIntStream("org.wikipedia").count() 175 | $49 ==> 2106338 176 | 177 | jshell> g.successorStream("org.wikipedia").limit(10).forEach(System.out::println) 178 | abb.global 179 | abb.nic 180 | abbott.cardiovascular 181 | abbott.globalpointofcare 182 | abbott.molecular 183 | abbott.pk 184 | abc.www 185 | abudhabi.gov 186 | abudhabi.mediaoffice 187 | abudhabi.tamm 188 | ``` 189 | 190 | Using Java streams, it's easy to translate between the two representations: 191 | 192 | ``` 193 | jshell> g.successorIntStream("org.wikipedia").limit(5).mapToObj(i -> g.vertexIdToLabel(i)).forEach(System.out::println) 194 | abb.global 195 | abb.nic 196 | abbott.cardiovascular 197 | abbott.globalpointofcare 198 | abbott.molecular 199 | ``` 200 | 201 | Successors represent outgoing links to other domains. We can do the same for predecessors, that is, incoming links from other domains: 202 | 203 | ``` 204 | jshell> g.indegree("org.wikipedia") 205 | $50 ==> 2752391 206 | 207 | jshell> g.predecessorIntStream("org.wikipedia").count() 208 | $51 ==> 2752391 209 | 210 | jshell> g.predecessorStream("org.wikipedia").limit(5).forEach(System.out::println) 211 | abogado.fabiobalbuena 212 | abogado.jacksonville 213 | abogado.jaskot 214 | abogado.super 215 | ac.789bet 216 | ``` 217 | 218 | Technically, webgraphs only store successor lists. But the Graph class also holds two graphs: the "original" one and its transpose. In the transposed graph "successors" are "predecessors", and "outdegree" means "indegree". Some lower-level methods take one of the two webgraphs as an argument; there it makes a difference whether you pass `g.graph` or `g.graphT`. Here we pass each to a method which translates vertex IDs to labels and extracts the top-level domain: 219 | 220 | ``` 221 | jshell> g.successorTopLevelDomainStream(g.graph, g.vertexLabelToId("org.wikipedia")).limit(5).forEach(System.out::println) 222 | abb 223 | abb 224 | abbott 225 | abbott 226 | abbott 227 | 228 | jshell> g.successorTopLevelDomainStream(g.graphT, g.vertexLabelToId("org.wikipedia")).limit(5).forEach(System.out::println) 229 | abogado 230 | abogado 231 | abogado 232 | abogado 233 | ac 234 | ``` 235 | 236 | The top-level domains repeat, and you may want to count the occurrences and create a frequency list. There is a predefined method to perform this: 237 | 238 | ``` 239 | jshell> g.successorTopLevelDomainCounts("org.wikipedia").filter(e -> e.getKey().startsWith("abb")).forEach(e -> System.out.printf("%8d\t%s\n", e.getValue(), e.getKey())) 240 | 4 abbott 241 | 2 abb 242 | 243 | jshell> g.successorTopLevelDomainCounts("org.wikipedia").limit(10).forEach(e -> System.out.printf("%8d\t%s\n", e.getValue(), e.getKey())) 244 | 706707 com 245 | 213406 org 246 | 117042 de 247 | 86684 net 248 | 65906 ru 249 | 55914 fr 250 | 53628 uk 251 | 52828 it 252 | 51622 jp 253 | 33729 br 254 | ``` 255 | 256 | The same can be done for predecessors using the method "Graph::predecessorTopLevelDomainCounts". 257 | 258 | Dealing with large successor or predecessor lists can be painful, and viewing them in a terminal window is practically impossible. We've already discussed how to compress the list to top-level domain counts. Alternatively, you could select the labels by prefix...
259 | 260 | ``` 261 | jshell> g.successorStream("org.wikipedia", "za.org.").limit(10).forEach(System.out::println) 262 | za.org.61mech 263 | za.org.aadp 264 | za.org.aag 265 | za.org.abc 266 | za.org.acaparty 267 | za.org.acbio 268 | za.org.accord 269 | za.org.acd 270 | za.org.acdp 271 | za.org.acjr 272 | ``` 273 | 274 | ... but even then the list may be huge. Then the best option is to write the stream output (vertex labels or top-level domain frequencies) into a file and view it later using a file viewer or use any other tool for further processing: 275 | 276 | ``` 277 | jshell> e.saveVerticesToFile(g.successors("org.wikipedia"), "org-wikipedia-successors.txt") 278 | 279 | jshell> e.saveCountsToFile(g.successorTopLevelDomainCounts("org.wikipedia"), "org-wikipedia-successors-tld-counts.txt") 280 | ``` 281 | 282 | ## Final Remarks 283 | 284 | We hope these few examples will support either to have fun exploring the graphs or to develop your own pipeline to extract insights from the graphs. 285 | 286 | Finally, thanks to the authors of the [WebGraph framework](https://webgraph.di.unimi.it/) and of [pyWebGraph](https://github.com/mapio/py-web-graph) for their work on these powerful tools and for any inspiration taken into these examples. 287 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | org.commoncrawl 6 | cc-webgraph 7 | 0.1-SNAPSHOT 8 | jar 9 | 10 | cc-webgraph 11 | https://github.com/commoncrawl/cc-webgraph 12 | 13 | 14 | UTF-8 15 | 11 16 | 17 | 3.6.10 18 | 3.7.0 19 | 2.7.2 20 | 8.5.15 21 | 1.4 22 | 23 | 2.0.16 24 | 25 | 5.11.2 26 | 27 | 28 | 29 | 30 | 31 | src/main/resources 32 | 33 | 34 | 35 | 36 | maven-compiler-plugin 37 | 3.14.0 38 | 39 | ${java.version} 40 | ${java.version} 41 | 42 | 43 | 44 | maven-assembly-plugin 45 | 3.7.1 46 | 47 | 48 | jar-with-dependencies 49 | 50 | cc-webgraph-${project.version} 51 | 52 | 53 | 54 | package 55 | 56 | single 57 | 58 | 59 | 60 | 61 | 62 | maven-surefire-plugin 63 | 3.5.2 64 | 65 | 66 | org.apache.maven.plugins 67 | maven-enforcer-plugin 68 | 3.5.0 69 | 70 | 71 | enforce-maven 72 | 73 | enforce 74 | 75 | 76 | 77 | 78 | 3.6.3 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | org.junit 94 | junit-bom 95 | ${junit.version} 96 | pom 97 | import 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | org.slf4j 106 | slf4j-api 107 | ${slf4j-api.version} 108 | 109 | 110 | 111 | com.github.crawler-commons 112 | crawler-commons 113 | ${crawler.commons.version} 114 | 115 | 116 | 117 | it.unimi.dsi 118 | fastutil-core 119 | ${fastutil.version} 120 | 121 | 122 | 123 | commons-cli 124 | commons-cli 125 | 1.5.0 126 | 127 | 128 | 131 | 132 | it.unimi.dsi 133 | webgraph 134 | ${webgraph.version} 135 | 136 | 137 | net.sf.jung 138 | jung-api 139 | 140 | 141 | net.sf.jung 142 | jung-io 143 | 144 | 145 | ch.qos.logback 146 | logback-classic 147 | 148 | 149 | 150 | 151 | 152 | it.unimi.dsi 153 | webgraph-big 154 | ${webgraph.big.version} 155 | 156 | 157 | ch.qos.logback 158 | logback-classic 159 | 160 | 161 | 162 | 163 | 164 | it.unimi.dsi 165 | law 166 | ${law.version} 167 | 168 | 169 | net.sf.jung 170 | jung-api 171 | 172 | 173 | net.sf.jung 174 | jung-io 175 | 176 | 177 | org.apache.httpcomponents 178 | httpclient 179 | 180 | 181 | org.apache.httpcomponents 182 | httpasyncclient 183 | 184 | 185 | org.eclipse.jetty.aggregate 186 | jetty-all 187 | 188 | 189 | org.softee 190 | pojo-mbean 191 | 192 | 193 | 
com.fasterxml.jackson 194 | jackson-bom 195 | 196 | 197 | it.unimi.di 198 | mg4j 199 | 200 | 201 | it.unimi.di 202 | mg4j-big 203 | 204 | 205 | org.wikidata.wdtk 206 | wdtk-dumpfiles 207 | 208 | 209 | info.bliki.wiki 210 | bliki-core 211 | 212 | 213 | it.unimi.di.law 214 | jericho-html-dev 215 | 216 | 217 | ch.qos.logback 218 | logback-classic 219 | 220 | 221 | org.slf4j 222 | log4j-over-slf4j 223 | 224 | 225 | org.slf4j 226 | jcl-over-slf4j 227 | 228 | 229 | 230 | 231 | 232 | org.apache.commons 233 | commons-configuration2 234 | 2.10.1 235 | runtime 236 | 237 | 238 | org.slf4j 239 | slf4j-simple 240 | ${slf4j-api.version} 241 | 242 | 243 | 244 | 245 | org.junit.jupiter 246 | junit-jupiter 247 | test 248 | 249 | 250 | 251 | 252 | -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/webgraph/CountingMergedIntIterator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2024 Common Crawl and contributors 4 | */ 5 | package org.commoncrawl.webgraph; 6 | 7 | import java.util.PriorityQueue; 8 | 9 | import it.unimi.dsi.fastutil.ints.IntIterator; 10 | import it.unimi.dsi.webgraph.LazyIntIterator; 11 | import it.unimi.dsi.webgraph.LazyIntIterators; 12 | 13 | /** 14 | * An iterator counting the integers returned by multiple 15 | * {@link LazyIntIterator}s. The input iterators must return integers in a 16 | * monotonically non-decreasing order. The resulting iterator returns the 17 | * unified input integers in strictly non-decreasing order. The method 18 | * {@link getCount()} is used to access the count of the integer returned last 19 | * by {@link nextInt()}. The count equals the number of times any of the 20 | * iterators returned the current integer value. See also 21 | * {@link it.unimi.dsi.webgraph.MergedIntIterator}. 22 | */ 23 | public class CountingMergedIntIterator implements IntIterator { 24 | 25 | protected class QueuedIterator implements Comparable { 26 | LazyIntIterator iter; 27 | int value; 28 | 29 | public QueuedIterator(LazyIntIterator iterator) { 30 | iter = iterator; 31 | value = iterator.nextInt(); 32 | } 33 | 34 | @Override 35 | public int compareTo(QueuedIterator o) { 36 | if (value < o.value) { 37 | return -1; 38 | } 39 | if (value > o.value) { 40 | return 1; 41 | } 42 | return 0; 43 | } 44 | } 45 | 46 | public static int LAZY_INT_ITERATOR_EMPTY_VALUE = LazyIntIterators.EMPTY_ITERATOR.nextInt(); 47 | 48 | private final PriorityQueue iters = new PriorityQueue<>(); 49 | private int currentCount = 0; 50 | 51 | /** 52 | * @param iterators input iterators 53 | */ 54 | public CountingMergedIntIterator(LazyIntIterator... iterators) { 55 | for (final LazyIntIterator iter : iterators) { 56 | final QueuedIterator qiter = new QueuedIterator(iter); 57 | if (qiter.value != LAZY_INT_ITERATOR_EMPTY_VALUE) { 58 | iters.add(qiter); 59 | } 60 | } 61 | } 62 | 63 | /** 64 | * {@inheritDoc} 65 | */ 66 | @Override 67 | public boolean hasNext() { 68 | return iters.size() > 0; 69 | } 70 | 71 | /** 72 | * {@inheritDoc} 73 | * 74 | * @deprecated Please use {@link nextInt()} instead. 
75 | */ 76 | @Deprecated 77 | @Override 78 | public Integer next() { 79 | return Integer.valueOf(nextInt()); 80 | } 81 | 82 | /** 83 | * {@inheritDoc} 84 | */ 85 | @Override 86 | public int nextInt() { 87 | QueuedIterator qiter = iters.peek(); 88 | final int value = qiter.value; 89 | int count = 1; 90 | while (true) { 91 | iters.remove(); 92 | int val; 93 | while ((val = qiter.iter.nextInt()) == value) { 94 | count++; 95 | } 96 | if (val != LAZY_INT_ITERATOR_EMPTY_VALUE) { 97 | qiter.value = val; 98 | iters.add(qiter); 99 | } 100 | if (iters.isEmpty()) { 101 | break; 102 | } 103 | qiter = iters.peek(); 104 | if (qiter.value == value) { 105 | count++; 106 | } else { 107 | break; 108 | } 109 | } 110 | currentCount = count; 111 | return value; 112 | } 113 | 114 | /** 115 | * @return the count how often the last integer (returned by {@link nextInt()}) 116 | * was seen in the input iterators 117 | */ 118 | public int getCount() { 119 | return currentCount; 120 | } 121 | 122 | /** 123 | * {@inheritDoc} 124 | */ 125 | @Override 126 | public int skip(int n) { 127 | int i = 0; 128 | while (i < n && hasNext()) { 129 | nextInt(); 130 | i++; 131 | } 132 | return i; 133 | } 134 | 135 | } 136 | -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/webgraph/CreatePreferenceVector.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2022 Common Crawl and contributors 4 | */ 5 | package org.commoncrawl.webgraph; 6 | 7 | import java.io.DataOutputStream; 8 | import java.io.IOException; 9 | import java.nio.file.Files; 10 | import java.nio.file.Paths; 11 | import java.util.Iterator; 12 | import java.util.Objects; 13 | import java.util.stream.Stream; 14 | 15 | import org.slf4j.Logger; 16 | import org.slf4j.LoggerFactory; 17 | 18 | import it.unimi.dsi.fastutil.longs.LongArrayList; 19 | import it.unimi.dsi.fastutil.longs.LongList; 20 | 21 | /** 22 | * Create a preference vector used for PageRank calculations, e.g., 23 | * (Anti)TrustRank. See PageRank.buildProperties(...). 
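 *
 * The output (see showHelp() below for the exact input and output formats) is
 * a binary vector of doubles, one value per vertex in vertex-ID order:
 * preferred vertices get the preference value (1/n for n preferred vertices,
 * or the value given via --value), all other vertices get 0.0. The resulting
 * file can be passed to the LAW PageRank classes via their
 * --preference-vector option.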
25 | */ 26 | public class CreatePreferenceVector { 27 | 28 | protected static Logger LOG = LoggerFactory.getLogger(CreatePreferenceVector.class); 29 | 30 | private long lastId = 0; 31 | private Iterator preferenceIterator; 32 | private LongList preferenceIds = new LongArrayList(); 33 | private double defaultPreferenceValue; 34 | private String nextPreferenceName; 35 | long recordsProcessed; 36 | long preferenceNamesFound; 37 | 38 | 39 | public CreatePreferenceVector(double defVal) { 40 | defaultPreferenceValue = defVal; 41 | } 42 | 43 | private boolean nextPreferenceElement() { 44 | if (preferenceIterator.hasNext()) { 45 | nextPreferenceName = preferenceIterator.next(); 46 | return true; 47 | } else { 48 | nextPreferenceName = null; 49 | return false; 50 | } 51 | } 52 | 53 | private void setPrefSet(Stream pref) { 54 | preferenceIterator = pref.iterator(); 55 | nextPreferenceElement(); 56 | } 57 | 58 | private void logProgress() { 59 | LOG.info("Processed {} nodes, found {} preference elements", recordsProcessed, preferenceNamesFound); 60 | } 61 | 62 | private long readJoinNode(String line) { 63 | int sep1 = line.indexOf('\t'); 64 | if (sep1 == -1) { 65 | return -1; 66 | } 67 | lastId = Long.parseLong(line.substring(0, sep1)); 68 | sep1++; 69 | int sep2 = line.indexOf('\t', sep1); 70 | if (sep2 == -1) { 71 | sep2 = line.length(); 72 | } 73 | String name = line.substring(sep1, sep2); 74 | long res = -1; 75 | if (nextPreferenceName != null) { 76 | int c = name.compareTo(nextPreferenceName); 77 | while (c > 0 && nextPreferenceElement()) { 78 | c = name.compareTo(nextPreferenceName); 79 | } 80 | if (c == 0) { 81 | preferenceNamesFound++; 82 | nextPreferenceElement(); 83 | res = lastId; 84 | } 85 | } 86 | recordsProcessed++; 87 | if ((recordsProcessed % 1000000) == 0) { 88 | logProgress(); 89 | } 90 | return res; 91 | } 92 | 93 | private Double convertNode(String line) { 94 | if (readJoinNode(line) < 0) { 95 | return 0.0; 96 | } 97 | return defaultPreferenceValue; 98 | } 99 | 100 | private void read(Stream in) { 101 | in.map(this::readJoinNode).forEach(id -> { 102 | if (id >= 0) { 103 | preferenceIds.add((long) id); 104 | } 105 | }); 106 | } 107 | 108 | private void write(DataOutputStream out) throws IOException { 109 | long id = 0; 110 | Iterator prefIdIter = preferenceIds.iterator(); 111 | long nextPrefId = Long.MAX_VALUE; 112 | if (prefIdIter.hasNext()) { 113 | nextPrefId = prefIdIter.next(); 114 | } 115 | defaultPreferenceValue = 1.0 / preferenceIds.size(); 116 | LOG.info("Preference value = {}", defaultPreferenceValue); 117 | while (id <= lastId) { 118 | double res = 0.0; 119 | if (id == nextPrefId) { 120 | res = defaultPreferenceValue; 121 | if (prefIdIter.hasNext()) { 122 | nextPrefId = prefIdIter.next(); 123 | } else { 124 | nextPrefId = Long.MAX_VALUE; 125 | } 126 | } 127 | out.writeDouble(res); 128 | id++; 129 | if ((id % 1000000) == 0) { 130 | LOG.info("{}% of preference vector written", String.format("%.2f", (100.0 * id / lastId))); 131 | } 132 | } 133 | } 134 | 135 | private void convert(Stream in, DataOutputStream out) { 136 | in.map(this::convertNode).filter(Objects::nonNull).forEach(t -> { 137 | try { 138 | out.writeDouble(t); 139 | } catch (IOException e) { 140 | LOG.error("Failed to write preference vector:", e); 141 | System.exit(1); 142 | } 143 | }); 144 | } 145 | 146 | /** 147 | * Check preference vector whether values sum up to 1.0, see isStochastic() 149 | */ 150 | private boolean validatePreferenceVector() { 151 | double sumPreferenceValues = preferenceNamesFound * 
defaultPreferenceValue; 152 | if (Math.abs(sumPreferenceValues - 1.0) > 1E-6) { 153 | LOG.error("Sum of preference values not within tolerance: abs({} - 1.0) > {}", sumPreferenceValues, 1E-6); 154 | return false; 155 | } 156 | return true; 157 | } 158 | 159 | private static void showHelp() { 160 | System.err.println( 161 | "CreatePreferenceVector [--value ] "); 162 | System.err.println(""); 163 | System.err.println("Options:"); 164 | System.err.println(" --value \tprecalculated preference value"); 165 | System.err.println(" \t1/n for n preferred vertices)\");"); 166 | System.err.println("If no preference value is given, the preference set is kept"); 167 | System.err.println("in memory, and the preference value is calculated using"); 168 | System.err.println("the number of found preference elements"); 169 | System.err.println(""); 170 | System.err.println("Input / output parameters"); 171 | System.err.println(" \tvertices file with format:"); 172 | System.err.println(" \t \\t "); 173 | System.err.println(" \tfile containing set of \"preferred\" vertices,"); 174 | System.err.println(" \tone vertex per line"); 175 | System.err.println(" \toutput file, binary preference vector,"); 176 | System.err.println(" \tused as \"--preference-vector\""); 177 | System.err.println(" \tfor the LAW PageRank classes"); 178 | System.err.println("Both input files, vertices and preference set, must be sorted"); 179 | System.err.println("lexicographically by vertex names, vertex ids are assigned"); 180 | System.err.println("in sequential order starting from 0."); 181 | System.err.println(""); 182 | } 183 | 184 | public static void main(String[] args) { 185 | double defaultPrefVal = 0.0; 186 | boolean inMemory = true; 187 | int argpos = 0; 188 | while (argpos < args.length && args[argpos].startsWith("-")) { 189 | switch (args[argpos]) { 190 | case "--value": 191 | try { 192 | defaultPrefVal = Double.parseDouble(args[++argpos]); 193 | } catch (NumberFormatException e) { 194 | LOG.error("Invalid number: " + args[argpos]); 195 | System.exit(1); 196 | } 197 | inMemory = false; 198 | break; 199 | default: 200 | System.err.println("Unknown option " + args[argpos]); 201 | showHelp(); 202 | System.exit(1); 203 | } 204 | argpos++; 205 | } 206 | 207 | if (args.length < 3) { 208 | showHelp(); 209 | System.exit(1); 210 | } 211 | String nodesIn = args[argpos++]; 212 | String prefSet = args[argpos++]; 213 | String prefOut = args[argpos++]; 214 | 215 | CreatePreferenceVector converter = new CreatePreferenceVector(defaultPrefVal); 216 | 217 | try (Stream in = Files.lines(Paths.get(nodesIn)); 218 | Stream pref = Files.lines(Paths.get(prefSet))) { 219 | DataOutputStream out; 220 | if (prefOut.equals("-")) { 221 | out = new DataOutputStream(System.out); 222 | } else { 223 | out = new DataOutputStream(Files.newOutputStream(Paths.get(prefOut))); 224 | } 225 | converter.setPrefSet(pref); 226 | if (inMemory) { 227 | LOG.info("Reading preference vector..."); 228 | converter.read(in); 229 | LOG.info("Writing preference vector..."); 230 | converter.write(out); 231 | } else { 232 | LOG.info("Converting preference vector..."); 233 | converter.convert(in, out); 234 | } 235 | converter.logProgress(); 236 | if (!converter.validatePreferenceVector()) { 237 | System.exit(2); 238 | } 239 | } catch (IOException e) { 240 | LOG.error("Failed to create preference vector:", e); 241 | System.exit(1); 242 | } 243 | } 244 | 245 | } -------------------------------------------------------------------------------- 
/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2022 Common Crawl and contributors 4 | */ 5 | package org.commoncrawl.webgraph; 6 | 7 | import java.io.IOException; 8 | import java.io.PrintStream; 9 | import java.nio.charset.StandardCharsets; 10 | import java.nio.file.Files; 11 | import java.nio.file.Paths; 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | import java.util.Objects; 15 | import java.util.TreeMap; 16 | import java.util.function.Consumer; 17 | import java.util.function.Function; 18 | import java.util.regex.Pattern; 19 | import java.util.stream.Stream; 20 | 21 | import org.slf4j.Logger; 22 | import org.slf4j.LoggerFactory; 23 | 24 | import crawlercommons.domains.EffectiveTldFinder; 25 | import it.unimi.dsi.fastutil.Arrays; 26 | import it.unimi.dsi.fastutil.BigArrays; 27 | import it.unimi.dsi.fastutil.longs.LongBigArrays; 28 | 29 | /** 30 | * Convert host-level webgraph to domain-level webgraph. A webgraph is 31 | * represented by two text files/streams with tab-separated columns 32 | *
vertices: <id, revName>
 * edges: <fromId, toId>
 *
 * Host or domain names are reversed (www.example.com is written as
 * com.example.www). The vertices file is sorted lexicographically by host
 * name in reverse domain name notation. IDs (0,1,...,n) are assigned in this
 * sort order. The edges file is sorted numerically, first by fromId, second
 * by toId. These sorting restrictions allow converting large host graphs with
 * acceptable memory requirements (number of hosts × 4 bytes, plus some memory
 * to queue domains until all hosts under a domain are processed).
 *
 * Notes, assumptions and preconditions:
 *
 * - host vertices must be sorted lexicographically by reversed host name,
 *   see above
 * - the host-domain map is held as an array. To overcome Java's max array
 *   size (approx. 2^31 or {@link Arrays#MAX_ARRAY_SIZE}),
 *   {@link HostToDomainGraphBig} (based on fastutil's {@link BigArrays}) is
 *   used if the array size limit is hit by the number of hosts. This number
 *   (or an estimate) needs to be known ahead of time.
 * - the number of resulting domains is limited by Java's max. array size.
 *   This shouldn't be a problem.
 * - the number of hosts per domain is also limited by Java's max. array size.
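 *
 * Illustrative example (hypothetical input, added for clarity): given the
 * sorted host vertices "0 com.example.api", "1 com.example.www" and
 * "2 org.wikipedia.en", the folded domain vertices are "0 com.example" and
 * "1 org.wikipedia" (with host counts 2 and 1 if -c is given), and the host
 * edge 1 -> 2 becomes the domain edge 0 -> 1.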
64 | */ 65 | public class HostToDomainGraph { 66 | 67 | protected static Logger LOG = LoggerFactory.getLogger(HostToDomainGraph.class); 68 | 69 | protected boolean countHosts = false; 70 | protected boolean privateDomains = false; 71 | protected boolean includeMultiPartSuffixes = false; 72 | 73 | protected long maxSize; 74 | private int[] ids; 75 | protected long currentId = -1; 76 | protected long lastFromId = -1; 77 | protected long lastToId = -1; 78 | private long numInputLinesNodes = 0; 79 | private long numInputLinesEdges = 0; 80 | protected String lastRevHost = null; 81 | protected Domain lastDomain = null; 82 | private TreeMap domainQueue = new TreeMap<>(); 83 | private int maxQueueUsed = 0; 84 | 85 | private static Pattern SPLIT_HOST_PATTERN = Pattern.compile("\\."); 86 | 87 | private Consumer reporterInputNodes = (String line) -> { 88 | if ((numInputLinesNodes % 500000) != 0 || numInputLinesNodes == 0) { 89 | return; 90 | } 91 | LOG.info("Processed {} node input lines, mapped to {} domains, domain queue usage: {} (max. {})", 92 | numInputLinesNodes, (currentId + 1), domainQueue.size(), maxQueueUsed); 93 | }; 94 | 95 | private Consumer reporterInputEdges = (String line) -> { 96 | if ((numInputLinesEdges % 5000000) != 0 || numInputLinesEdges == 0) { 97 | return; 98 | } 99 | LOG.info("Processed {} edge input lines, last edge from node id = {}", numInputLinesEdges, lastFromId); 100 | }; 101 | 102 | private void reportConfig() { 103 | LOG.info("{} with {} host vertices", this.getClass().getSimpleName(), maxSize); 104 | LOG.info(" - map to {} domains", (privateDomains ? "private" : "ICANN")); 105 | LOG.info(" - {}multi-part public suffixes as domains", (includeMultiPartSuffixes ? "" : "no ")); 106 | } 107 | 108 | /** 109 | * Representation of a domain as a result of folding one or more host names to a 110 | * domain name. Holds all information for the given domain to convert host 111 | * vertices and associated edges into a domain graph. 112 | */ 113 | protected static class Domain implements Comparable { 114 | final static char HYPHEN = '-'; 115 | final static char DOT = '.'; 116 | String name; 117 | String revName; 118 | long id; 119 | long numberOfHosts; 120 | List ids = new ArrayList<>(); 121 | 122 | public Domain(String name, String revName, long id, long numberOfHosts) { 123 | this.name = name; 124 | this.revName = revName; 125 | this.id = id; 126 | this.numberOfHosts = numberOfHosts; 127 | } 128 | 129 | public Domain(String name, long id, long numberOfHosts) { 130 | this(name, reverseHost(name), id, numberOfHosts); 131 | } 132 | 133 | public Domain(String name) { 134 | this(name, -1, 0); 135 | } 136 | 137 | public Domain(String name, String revName) { 138 | this(name, revName, -1, 0); 139 | } 140 | 141 | public Domain(String name, long hostId) { 142 | this(name, -1, 0); 143 | add(hostId); 144 | } 145 | 146 | public void add(long hostId) { 147 | ids.add(hostId); 148 | numberOfHosts++; 149 | } 150 | 151 | @Override 152 | public String toString() { 153 | return name; 154 | } 155 | 156 | @Override 157 | public int compareTo(Domain o) { 158 | return revName.compareTo(o.revName); 159 | } 160 | 161 | /** 162 | * Whether the domain is safe to output given the reversed domain name seen 163 | * next. 
164 | * 165 | * @param nextDomainRevName next name in lexicographically sorted list of 166 | * reversed domain names 167 | * @return true if the domain is safe to output, that is from a list of sorted 168 | * host names no host later in this list may fold to this domain name 169 | */ 170 | public boolean isSafeToOutput(String nextDomainRevName) { 171 | return isSafeToOutput(this.revName, nextDomainRevName); 172 | } 173 | 174 | public static boolean isSafeToOutput(String domainRevName, String nextDomainRevName) { 175 | return compareRevDomainsSafe(domainRevName, nextDomainRevName) < 0; 176 | } 177 | 178 | public static int compareRevDomainsSafe(String d1, String d2) { 179 | int l1 = d1.length(); 180 | int l2 = d2.length(); 181 | int l = Math.min(l1, l2); 182 | int dots = 0; 183 | for (int i = 0; i < l; i++) { 184 | char c1 = d1.charAt(i); 185 | char c2 = d2.charAt(i); 186 | if (c1 != c2) { 187 | return c1 - c2; 188 | } else if (c1 == HYPHEN) { 189 | /* 190 | * cannot finish "org.example-domain" unless "org.example" is done 191 | */ 192 | return 0; 193 | } else if (c1 == DOT) { 194 | dots++; 195 | if (dots > 1) { 196 | /* 197 | * cannot finish "name.his.forgot.foobar" unless "name.his" is done 198 | * 199 | * This is a special case of multi-part suffixes with more than two parts when 200 | * the first part is also a public suffix, e.g. (in reversed domain name 201 | * notation) if "a" and "a.b.c" are public suffixes, and the input hosts are 202 | * (sorted): "a.b.c.d", "a.b.c.e" and "a.b.f", then we need to delay the output 203 | * of "a.b.c.*" until "a.b" is done. 204 | */ 205 | return 0; 206 | } 207 | } 208 | } 209 | if (l1 == l2) { 210 | return 0; 211 | } 212 | if (l1 > l2) { 213 | char c1 = d1.charAt(l2); 214 | switch (c1) { 215 | case HYPHEN: 216 | /* 217 | * cannot finish "org.example-domain" unless "org.example" is done 218 | */ 219 | case DOT: 220 | // cannot finish "tld.suffix.suffix2.domain" unless "tld.suffix" is done 221 | return 1; 222 | } 223 | return c1 - DOT; 224 | } 225 | char c2 = d2.charAt(l1); 226 | if (c2 == HYPHEN || c2 == DOT) 227 | return 1; 228 | return DOT - c2; 229 | } 230 | } 231 | 232 | private HostToDomainGraph() { 233 | } 234 | 235 | public HostToDomainGraph(int maxSize) { 236 | this.maxSize = maxSize; 237 | ids = new int[maxSize]; 238 | } 239 | 240 | /** 241 | * @param countHosts if true count the number of hosts per domain 242 | */ 243 | public void doCount(boolean countHosts) { 244 | this.countHosts = countHosts; 245 | } 246 | 247 | /** 248 | * @param privateDomains if true map host to domain names using also the 249 | * suffixes from the subdivision 251 | * of "private domains" in the public suffix list in 252 | * addition to the "ICANN domains" used otherwise 253 | */ 254 | public void doPrivateDomains(boolean privateDomains) { 255 | this.privateDomains = privateDomains; 256 | } 257 | 258 | /** 259 | * deprecated, use {@link #multiPartSuffixesAsDomains(boolean)} instead (note 260 | * that this requires to invert boolean parameter) 261 | * 262 | * @param strict if false map host names equal to any multi-part public suffix 263 | * (the suffix contains a dot) (eg. gov.uk or 264 | * freight.aero) one by one to domain names. 265 | */ 266 | @Deprecated 267 | public void setStrictDomainValidate(boolean strict) { 268 | this.includeMultiPartSuffixes = !strict; 269 | } 270 | 271 | /** 272 | * @param include if true map host names equal to any multi-part public suffix 273 | * (the suffix contains a dot) (eg. gov.uk or 274 | * freight.aero) one by one to domain names. 
275 | */ 276 | public void multiPartSuffixesAsDomains(boolean include) { 277 | this.includeMultiPartSuffixes = include; 278 | } 279 | 280 | /** 281 | * Reverse host name, eg. www.example.com is reversed to 282 | * com.example.www. Can also be used to "unreverse" a reversed host 283 | * name. 284 | * 285 | * @param host name 286 | * @return host in reverse 288 | * domain name notation 289 | */ 290 | public static String reverseHost(String host) { 291 | String[] rev = SPLIT_HOST_PATTERN.split(host); 292 | for (int i = 0; i < (rev.length / 2); i++) { 293 | String temp = rev[i]; 294 | rev[i] = rev[rev.length - i - 1]; 295 | rev[rev.length - i - 1] = temp; 296 | } 297 | return String.join(".", rev); 298 | } 299 | 300 | protected void setValue(long id, long value) { 301 | ids[(int) id] = (int) value; 302 | } 303 | 304 | protected long getValue(long id) { 305 | return ids[(int) id]; 306 | } 307 | 308 | public String convertNode(String line) { 309 | numInputLinesNodes++; 310 | int sep = line.indexOf('\t'); 311 | if (sep == -1) { 312 | LOG.warn("Skipping invalid line: <{}>", line); 313 | return ""; 314 | } 315 | long id = Long.parseLong(line.substring(0, sep)); 316 | String revHost = line.substring(sep + 1); 317 | if (lastRevHost != null) { 318 | if (lastRevHost.compareTo(revHost) >= 0) { 319 | String msg = "Reversed host names in input are not properly sorted: " + lastRevHost + " <> " + revHost; 320 | LOG.error(msg); 321 | throw new RuntimeException(msg); 322 | } 323 | } 324 | lastRevHost = revHost; 325 | String host = reverseHost(revHost); 326 | String domain = EffectiveTldFinder.getAssignedDomain(host, true, !privateDomains); 327 | StringBuilder sb = new StringBuilder(); 328 | if (domain == null && includeMultiPartSuffixes) { 329 | if (EffectiveTldFinder.getEffectiveTLDs().containsKey(host) && host.indexOf('.') != -1) { 330 | LOG.info("Accepting public suffix (containing dot) as domain: {}", host); 331 | } 332 | domain = host; 333 | } 334 | if (domain == null) { 335 | LOG.warn("No domain for host: {}", host); 336 | setValue(id, -1); 337 | return null; 338 | } 339 | if (lastDomain != null && domain.equals(lastDomain.name)) { 340 | // short cut for the common case of many subsequent subdomains of the same domain 341 | lastDomain.add(id); 342 | return null; 343 | } 344 | lastDomain = queueDomain(sb, domain); 345 | if (lastDomain != null) { 346 | lastDomain.add(id); 347 | } 348 | if (sb.length() == 0) { 349 | return null; 350 | } 351 | return sb.toString(); 352 | } 353 | 354 | /** 355 | * Add the domain name to the queue if it is not already queued. Flush the 356 | * queue, assuming properly sorted input. 357 | * 358 | * @param sb domains which are safe to print are added to this 359 | * StringBuilder. 360 | * @param domainName domain name to be queued 361 | * @return the queued domain object 362 | */ 363 | private Domain queueDomain(StringBuilder sb, String domainName) { 364 | String revDomainName = reverseHost(domainName); 365 | Domain domain = null; 366 | // first, poll all queued domains safe to output 367 | while (!domainQueue.isEmpty()) { 368 | String firstDomain = domainQueue.firstKey(); 369 | if (!Domain.isSafeToOutput(firstDomain, revDomainName)) { 370 | /* 371 | * queued domains are sorted lexicographically: if the first/current domain 372 | * cannot be safely dequeued and written to output, this is also the case for 373 | * the following ones. 
374 | */ 375 | break; 376 | } 377 | Domain d = domainQueue.pollFirstEntry().getValue(); 378 | d.id = ++currentId; 379 | getNodeLine(sb, d); 380 | } 381 | if (domainQueue.containsKey(revDomainName)) { 382 | domain = domainQueue.get(revDomainName); 383 | } else { 384 | domain = new Domain(domainName); 385 | domainQueue.put(revDomainName, domain); 386 | if (domainQueue.size() > maxQueueUsed) { 387 | maxQueueUsed = domainQueue.size(); 388 | } 389 | } 390 | return domain; 391 | } 392 | 393 | private String getNodeLine(Domain domain) { 394 | StringBuilder b = new StringBuilder(); 395 | getNodeLine(b, domain); 396 | return b.toString(); 397 | } 398 | 399 | private void getNodeLine(StringBuilder b, Domain domain) { 400 | if (domain == null) 401 | return; 402 | if (domain.id >= 0 && domain.name != null) { 403 | if (b.length() > 0) { 404 | b.append('\n'); 405 | } 406 | b.append(domain.id); 407 | b.append('\t'); 408 | b.append(reverseHost(domain.name)); 409 | if (countHosts) { 410 | b.append('\t'); 411 | b.append(domain.numberOfHosts); 412 | } 413 | } 414 | for (Long hostId : domain.ids) { 415 | setValue(hostId.longValue(), domain.id); 416 | } 417 | } 418 | 419 | public String convertEdge(String line) { 420 | numInputLinesEdges++; 421 | int sep = line.indexOf('\t'); 422 | if (sep == -1) { 423 | return ""; 424 | } 425 | long fromId = Long.parseLong(line.substring(0, sep)); 426 | long toId = Long.parseLong(line.substring(sep + 1)); 427 | fromId = getValue(fromId); 428 | toId = getValue(toId); 429 | if (fromId == toId || fromId == -1 || toId == -1 || (lastFromId == fromId && lastToId == toId)) { 430 | return null; 431 | } 432 | lastFromId = fromId; 433 | lastToId = toId; 434 | return fromId + "\t" + toId; 435 | } 436 | 437 | public void convert(Function func, Stream in, PrintStream out) { 438 | in.map(func).filter(Objects::nonNull).forEach(out::println); 439 | } 440 | 441 | public void convert(Function func, Stream in, PrintStream out, 442 | Consumer reporter) { 443 | convert(func, in.peek(reporter), out); 444 | } 445 | 446 | public void finishNodes(PrintStream out) { 447 | for (Domain domain : domainQueue.values()) { 448 | domain.id = ++currentId; 449 | out.println(getNodeLine(domain)); 450 | } 451 | out.flush(); 452 | domainQueue.clear(); 453 | LOG.info("Number of input lines: {}", numInputLinesNodes); 454 | LOG.info("Number of domain nodes: {}", currentId + 1); 455 | LOG.info("Max. domain queue usage: {}", maxQueueUsed); 456 | } 457 | 458 | /** 459 | * Holds a host to domain graph mapping if the size of the host graph exceeds 460 | * {@link Arrays#MAX_ARRAY_SIZE}. 461 | */ 462 | public static class HostToDomainGraphBig extends HostToDomainGraph { 463 | 464 | private long[][] ids; 465 | 466 | public HostToDomainGraphBig(long maxSize) { 467 | this.maxSize = maxSize; 468 | ids = LongBigArrays.newBigArray(maxSize); 469 | } 470 | 471 | @Override 472 | protected void setValue(long id, long value) { 473 | BigArrays.set(ids, id, value); 474 | } 475 | 476 | @Override 477 | protected long getValue(long id) { 478 | return BigArrays.get(ids, id); 479 | } 480 | } 481 | 482 | private static void showHelp() { 483 | System.err.println("HostToDomainGraph [options]... 
"); 484 | System.err.println(""); 485 | System.err.println("Convert host-level webgraph to domain-level webgraph."); 486 | System.err.println("Both input and output must be UTF-8 or ASCII, the input is required"); 487 | System.err.println("to be sorted lexicographically by node labels given in reversed domain name notation."); 488 | System.err.println(""); 489 | System.err.println("Options:"); 490 | System.err.println(" -h\t(also -? or --help) show usage message and exit"); 491 | System.err.println(" -c\tcount hosts per domain (additional column in "); 492 | System.err.println(" --private-domains\tconvert to private domains (include suffixes from the"); 493 | System.err.println(" \tPRIVATE domains subdivision of the public suffix list,"); 494 | System.err.println(" \tsee https://github.com/publicsuffix/list/wiki/Format#divisions"); 495 | System.err.println(" --multipart-suffixes-as-domains\toutput host names which are equal to multi-part"); 496 | System.err.println(" \tpublic suffixes (the suffix contains a dot) as domain"); 497 | System.err.println(" \tnames, eg. `gov.uk', `freight.aero' or `altoadige.it'."); 498 | System.err.println(" \tNo further validation (DNS lookup) is performed."); 499 | } 500 | 501 | public static void main(String[] args) { 502 | boolean countHosts = false; 503 | boolean includeMultiPartSuffixes = false; 504 | boolean privateDomains = false; 505 | int argpos = 0; 506 | while (argpos < args.length && args[argpos].startsWith("-")) { 507 | switch (args[argpos]) { 508 | case "-?": 509 | case "-h": 510 | case "--help": 511 | showHelp(); 512 | System.exit(0); 513 | case "-c": 514 | countHosts = true; 515 | break; 516 | case "--multipart-suffixes-as-domains": 517 | case "--no-strict-domain-validate": // back-ward compatibility 518 | includeMultiPartSuffixes = true; 519 | break; 520 | case "--private-domains": 521 | case "--private": // back-ward compatibility 522 | privateDomains = true; 523 | break; 524 | default: 525 | System.err.println("Unknown option " + args[argpos]); 526 | showHelp(); 527 | System.exit(1); 528 | } 529 | argpos++; 530 | } 531 | if ((args.length - argpos) < 5) { 532 | showHelp(); 533 | System.exit(1); 534 | } 535 | long maxSize = 0; 536 | try { 537 | maxSize = Long.parseLong(args[argpos + 0]); 538 | } catch (NumberFormatException e) { 539 | LOG.error("Invalid number: " + args[argpos + 0]); 540 | System.exit(1); 541 | } 542 | HostToDomainGraph converter; 543 | if (maxSize <= Arrays.MAX_ARRAY_SIZE) { 544 | converter = new HostToDomainGraph((int) maxSize); 545 | } else { 546 | converter = new HostToDomainGraphBig(maxSize); 547 | } 548 | converter.doCount(countHosts); 549 | converter.multiPartSuffixesAsDomains(includeMultiPartSuffixes); 550 | converter.doPrivateDomains(privateDomains); 551 | converter.reportConfig(); 552 | String nodesIn = args[argpos + 1]; 553 | String nodesOut = args[argpos + 2]; 554 | try (Stream in = Files.lines(Paths.get(nodesIn), StandardCharsets.UTF_8); 555 | PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(nodesOut)), false, 556 | StandardCharsets.UTF_8)) { 557 | converter.convert(converter::convertNode, in, out, converter.reporterInputNodes); 558 | converter.finishNodes(out); 559 | LOG.info("Finished conversion of nodes/vertices"); 560 | } catch (IOException e) { 561 | LOG.error("Failed to convert nodes", e); 562 | System.exit(1); 563 | } 564 | String edgesIn = args[argpos + 3]; 565 | String edgesOut = args[argpos + 4]; 566 | try (Stream in = Files.lines(Paths.get(edgesIn), StandardCharsets.UTF_8); 567 | 
PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(edgesOut)), false, 568 | StandardCharsets.UTF_8)) { 569 | converter.convert(converter::convertEdge, in, out, converter.reporterInputEdges); 570 | LOG.info("Finished conversion of edges"); 571 | } catch (IOException e) { 572 | LOG.error("Failed to convert edges", e); 573 | System.exit(1); 574 | } 575 | } 576 | 577 | } 578 | -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/webgraph/JoinSortRanks.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2022 Common Crawl and contributors 4 | */ 5 | package org.commoncrawl.webgraph; 6 | 7 | import java.io.IOException; 8 | import java.io.OutputStream; 9 | import java.io.PrintStream; 10 | import java.nio.charset.StandardCharsets; 11 | import java.nio.file.Files; 12 | import java.nio.file.Paths; 13 | import java.util.function.Function; 14 | import java.util.stream.Stream; 15 | 16 | import org.slf4j.Logger; 17 | import org.slf4j.LoggerFactory; 18 | 19 | import it.unimi.dsi.fastutil.Arrays; 20 | import it.unimi.dsi.fastutil.BigArrays; 21 | import it.unimi.dsi.fastutil.ints.IntArrays; 22 | import it.unimi.dsi.fastutil.ints.IntComparator; 23 | import it.unimi.dsi.fastutil.io.BinIO; 24 | import it.unimi.dsi.fastutil.longs.LongBigArrays; 25 | import it.unimi.dsi.fastutil.longs.LongComparator; 26 | 27 | /** 28 | * Assign ranks to harmonic centrality and page rank values, join ranks with 29 | * node names and sort by decreasing harmonic centrality rank/score. 30 | * 31 | * Sorting and joining is done in memory. For a graph with n nodes, the 32 | * required memory is 24 * n bytes, resp. 36 * n bytes if n 33 | * > {@link Arrays#MAX_ARRAY_SIZE}. In practice, the requirements are higher 34 | * by about 50%. 
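 *
 * Added worked example (annotation, not part of the original source): the 24
 * resp. 36 bytes per node correspond to the per-node arrays held on the heap:
 *   standard arrays (n not larger than Arrays.MAX_ARRAY_SIZE):
 *     4 (float HC value) + 8 (double PR value) + 4 (int HC rank)
 *       + 4 (int PR rank) + 4 (int sort permutation) = 24 bytes per node
 *   big arrays: 4 + 8 + 8 (long HC rank) + 8 (long PR rank) + 8 (long permutation)
 *       = 36 bytes per node
 * For example, a graph with 100 million nodes needs about 2.4 GB, i.e. plan for
 * roughly 3.6 GB of Java heap including the ~50% practical overhead.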
35 | */ 36 | public class JoinSortRanks { 37 | 38 | protected static Logger LOG = LoggerFactory.getLogger(JoinSortRanks.class); 39 | 40 | private float[] harmonicCentralityValues; 41 | private double[] pageRankValues; 42 | 43 | private int[] harmonicCentralityRanks; 44 | private int[] pageRankRanks; 45 | private int[] indirectSortPerm; 46 | 47 | public void loadHarmonicCentrality(String ranksHC) throws IOException { 48 | harmonicCentralityValues = BinIO.loadFloats(ranksHC); 49 | harmonicCentralityRanks = new int[harmonicCentralityValues.length]; 50 | } 51 | 52 | public void loadPageRank(String ranksPR) throws IOException { 53 | pageRankValues = BinIO.loadDoubles(ranksPR); 54 | pageRankRanks = new int[pageRankValues.length]; 55 | } 56 | 57 | private int compareHarmonicCentralityIndirect(int k1, int k2) { 58 | k1 = indirectSortPerm[k1]; 59 | k2 = indirectSortPerm[k2]; 60 | float f1 = harmonicCentralityValues[k1]; 61 | float f2 = harmonicCentralityValues[k2]; 62 | // sort in reverse order, higher values first 63 | if (f1 < f2) { 64 | return 1; 65 | } 66 | if (f1 > f2) { 67 | return -1; 68 | } 69 | // secondary sorting by original order (node IDs) 70 | return Integer.compare(k1, k2); 71 | } 72 | 73 | private int comparePageRankIndirect(int k1, int k2) { 74 | k1 = indirectSortPerm[k1]; 75 | k2 = indirectSortPerm[k2]; 76 | double f1 = pageRankValues[k1]; 77 | double f2 = pageRankValues[k2]; 78 | // sort in reverse order, higher values first 79 | if (f1 < f2) { 80 | return 1; 81 | } 82 | if (f1 > f2) { 83 | return -1; 84 | } 85 | // secondary sorting by original order (node IDs) 86 | return Integer.compare(k1, k2); 87 | } 88 | 89 | private void swapIndirect(int k1, int k2) { 90 | IntArrays.swap(indirectSortPerm, k1, k2); 91 | } 92 | 93 | private void assignRank(int[] ranks, IntComparator comp) { 94 | int length = ranks.length; 95 | indirectSortPerm = new int[length]; 96 | for (int i = 0; i < length; i++) { 97 | indirectSortPerm[i] = i; 98 | } 99 | Arrays.parallelQuickSort(0, length, comp, this::swapIndirect); 100 | for (int i = 0; i < length; ) { 101 | ranks[indirectSortPerm[i]] = ++i; 102 | } 103 | indirectSortPerm = null; 104 | } 105 | 106 | public void assignHarmonicCentralityRank() { 107 | assignRank(harmonicCentralityRanks, this::compareHarmonicCentralityIndirect); 108 | } 109 | 110 | public void assignPageRankRank() { 111 | assignRank(pageRankRanks, this::comparePageRankIndirect); 112 | } 113 | 114 | protected float getHarmonicCentralityValue(long id) { 115 | return harmonicCentralityValues[(int) id]; 116 | } 117 | 118 | protected long getHarmonicCentralityRank(long id) { 119 | return harmonicCentralityRanks[(int) id]; 120 | } 121 | 122 | protected double getPageRankValue(long id) { 123 | return pageRankValues[(int) id]; 124 | } 125 | 126 | protected long getPageRankRank(long id) { 127 | return pageRankRanks[(int) id]; 128 | } 129 | 130 | public void convert(Function func, Stream in, PrintStream out) { 131 | in.map(func).forEach(out::println); 132 | } 133 | 134 | public String addRanks(String line) { 135 | int sep = line.indexOf('\t'); 136 | if (sep == -1) { 137 | return ""; 138 | } 139 | long id = Long.parseLong(line.substring(0, sep)); 140 | // check whether new line is already contained 141 | int end = line.lastIndexOf('\n'); 142 | String revHost = line.substring(sep+1); 143 | float hcv = getHarmonicCentralityValue(id); 144 | long hcr = getHarmonicCentralityRank(id); 145 | double prv = getPageRankValue(id); 146 | long prr = getPageRankRank(id); 147 | StringBuilder sb = new 
StringBuilder(); 148 | sb.append(hcr); 149 | sb.append('\t'); 150 | sb.append(hcv); 151 | sb.append('\t'); 152 | sb.append(prr); 153 | sb.append('\t'); 154 | sb.append(prv); 155 | sb.append('\t'); 156 | sb.append(revHost); 157 | if (end != -1) { 158 | sb.append('\n'); 159 | } 160 | return sb.toString(); 161 | } 162 | 163 | 164 | /** 165 | * Implementation of {@link JoinSortRanks} for lists exceeding 166 | * {@link Arrays#MAX_ARRAY_SIZE}. 167 | */ 168 | public static class JoinSortRanksBig extends JoinSortRanks { 169 | 170 | private float[][] harmonicCentralityValues; 171 | private double[][] pageRankValues; 172 | 173 | private long[][] harmonicCentralityRanks; 174 | private long[][] pageRankRanks; 175 | private long[][] indirectSortPerm; 176 | 177 | public void loadHarmonicCentrality(String ranksFile) throws IOException { 178 | harmonicCentralityValues = BinIO.loadFloatsBig(ranksFile); 179 | long length = BigArrays.length(harmonicCentralityValues); 180 | harmonicCentralityRanks = LongBigArrays.newBigArray(length); 181 | } 182 | 183 | public void loadPageRank(String ranksFile) throws IOException { 184 | pageRankValues = BinIO.loadDoublesBig(ranksFile); 185 | long length = BigArrays.length(pageRankValues); 186 | pageRankRanks = LongBigArrays.newBigArray(length); 187 | } 188 | 189 | private int compareHarmonicCentralityIndirect(long k1, long k2) { 190 | k1 = BigArrays.get(indirectSortPerm, k1); 191 | k2 = BigArrays.get(indirectSortPerm, k2); 192 | float f1 = BigArrays.get(harmonicCentralityValues, k1); 193 | float f2 = BigArrays.get(harmonicCentralityValues, k2); 194 | // sort in reverse order, higher values first 195 | if (f1 < f2) { 196 | return 1; 197 | } 198 | if (f1 > f2) { 199 | return -1; 200 | } 201 | // secondary sorting by original order (node IDs) 202 | return Long.compare(k1, k2); 203 | } 204 | 205 | private int comparePageRankIndirect(long k1, long k2) { 206 | k1 = BigArrays.get(indirectSortPerm, k1); 207 | k2 = BigArrays.get(indirectSortPerm, k2); 208 | double f1 = BigArrays.get(pageRankValues, k1); 209 | double f2 = BigArrays.get(pageRankValues, k2); 210 | // sort in reverse order, higher values first 211 | if (f1 < f2) { 212 | return 1; 213 | } 214 | if (f1 > f2) { 215 | return -1; 216 | } 217 | // secondary sorting by original order (node IDs) 218 | return Long.compare(k1, k2); 219 | } 220 | 221 | private void swapIndirect(long k1, long k2) { 222 | BigArrays.swap(indirectSortPerm, k1, k2); 223 | } 224 | 225 | private void assignRank(long[][] ranks, LongComparator comp) { 226 | long length = BigArrays.length(ranks); 227 | indirectSortPerm = LongBigArrays.newBigArray(length); 228 | for (long i = 0; i < length; i++) { 229 | BigArrays.set(indirectSortPerm, i, i); 230 | } 231 | BigArrays.quickSort(0, length, comp, this::swapIndirect); 232 | for (long i = 0; i < length; ) { 233 | BigArrays.set(ranks, BigArrays.get(indirectSortPerm, i), ++i); 234 | } 235 | indirectSortPerm = null; 236 | } 237 | 238 | public void assignHarmonicCentralityRank() { 239 | assignRank(harmonicCentralityRanks, this::compareHarmonicCentralityIndirect); 240 | } 241 | 242 | public void assignPageRankRank() { 243 | assignRank(pageRankRanks, this::comparePageRankIndirect); 244 | } 245 | 246 | protected float getHarmonicCentralityValue(long id) { 247 | return BigArrays.get(harmonicCentralityValues, id); 248 | } 249 | 250 | protected long getHarmonicCentralityRank(long id) { 251 | return BigArrays.get(harmonicCentralityRanks, id); 252 | } 253 | 254 | protected double getPageRankValue(long id) { 255 | return 
BigArrays.get(pageRankValues, id); 256 | } 257 | 258 | protected long getPageRankRank(long id) { 259 | return BigArrays.get(pageRankRanks, id); 260 | } 261 | 262 | } 263 | 264 | private static void showHelp() { 265 | System.err.println("JoinSortRanks [--big] "); 266 | System.err.println(""); 267 | System.err.println("Assign ranks to harmonic centrality and page rank values,"); 268 | System.err.println("and join ranks with node names."); 269 | System.err.println(""); 270 | System.err.println("Options:"); 271 | System.err.println(" --big\tgraphs are \"big\" (more than 2^31 nodes)"); 272 | System.err.println(""); 273 | System.err.println("Input / output parameters (text must be UTF-8)"); 274 | System.err.println(" \tvertices file with format:"); 275 | System.err.println(" \t \\t [ \\t ]..."); 276 | System.err.println(" \tharmonic centrality values, binary floats"); 277 | System.err.println(" \tpage rank values, binary doubles"); 278 | System.err.println(" \tranks output, tab-separated:"); 279 | System.err.println(" \t ..."); 280 | System.err.println(""); 281 | } 282 | 283 | public static void main(String[] args) { 284 | boolean useBigGraph = false; 285 | int argpos = 0; 286 | while (argpos < args.length && args[argpos].startsWith("-")) { 287 | switch (args[argpos]) { 288 | case "--big": 289 | useBigGraph = true; 290 | break; 291 | default: 292 | System.err.println("Unknown option " + args[argpos]); 293 | showHelp(); 294 | System.exit(1); 295 | } 296 | argpos++; 297 | } 298 | if ((args.length - argpos) < 4) { 299 | showHelp(); 300 | System.exit(1); 301 | } 302 | JoinSortRanks converter; 303 | if (useBigGraph) { 304 | converter = new JoinSortRanksBig(); 305 | } else { 306 | converter = new JoinSortRanks(); 307 | } 308 | 309 | String nodesIn = args[argpos++]; 310 | String ranksHC = args[argpos++]; 311 | String ranksPR = args[argpos++]; 312 | String ranksOut = args[argpos++]; 313 | try (Stream in = Files.lines(Paths.get(nodesIn), StandardCharsets.UTF_8)) { 314 | OutputStream ranksOutStream; 315 | if (ranksOut.equals("-")) { 316 | ranksOutStream = System.out; 317 | } else { 318 | ranksOutStream = Files.newOutputStream(Paths.get(ranksOut)); 319 | } 320 | PrintStream out = new PrintStream(ranksOutStream, false, StandardCharsets.UTF_8); 321 | LOG.info("Loading harmonic centrality values from {}", ranksHC); 322 | converter.loadHarmonicCentrality(ranksHC); 323 | LOG.info("Loading page rank values from {}", ranksPR); 324 | converter.loadPageRank(ranksPR); 325 | LOG.info("Assigning harmonic centrality ranks"); 326 | converter.assignHarmonicCentralityRank(); 327 | LOG.info("Assigning page rank ranks"); 328 | converter.assignPageRankRank(); 329 | LOG.info("Joining ranks"); 330 | converter.convert(converter::addRanks, in, out); 331 | LOG.info("Finished joining ranks"); 332 | } catch (IOException e) { 333 | LOG.error("Failed to join ranks:", e); 334 | System.exit(1); 335 | } 336 | } 337 | 338 | } 339 | -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/webgraph/explore/Graph.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2024 Common Crawl and contributors 4 | */ 5 | package org.commoncrawl.webgraph.explore; 6 | 7 | import java.io.IOException; 8 | import java.nio.file.Files; 9 | import java.nio.file.Paths; 10 | import java.util.AbstractMap.SimpleEntry; 11 | import java.util.Arrays; 12 | import java.util.Collections; 13 | import 
java.util.LinkedList; 14 | import java.util.List; 15 | import java.util.Map; 16 | import java.util.Map.Entry; 17 | import java.util.PrimitiveIterator; 18 | import java.util.stream.IntStream; 19 | import java.util.stream.Stream; 20 | 21 | import org.commoncrawl.webgraph.CountingMergedIntIterator; 22 | import org.commoncrawl.webgraph.HostToDomainGraph; 23 | import org.slf4j.Logger; 24 | import org.slf4j.LoggerFactory; 25 | 26 | import crawlercommons.domains.EffectiveTldFinder; 27 | import it.unimi.dsi.fastutil.io.BinIO; 28 | import it.unimi.dsi.fastutil.longs.LongArrayList; 29 | import it.unimi.dsi.lang.MutableString; 30 | import it.unimi.dsi.sux4j.mph.GOV4Function; 31 | import it.unimi.dsi.util.FrontCodedStringList; 32 | import it.unimi.dsi.util.ImmutableExternalPrefixMap; 33 | import it.unimi.dsi.util.Interval; 34 | import it.unimi.dsi.util.LiterallySignedStringMap; 35 | import it.unimi.dsi.util.ShiftAddXorSignedStringMap; 36 | import it.unimi.dsi.webgraph.ImmutableGraph; 37 | import it.unimi.dsi.webgraph.LazyIntIterator; 38 | import it.unimi.dsi.webgraph.LazyIntIterators; 39 | 40 | /** 41 | * Holds webgraph-related data structures and access methods for graph 42 | * exploration. 43 | */ 44 | public class Graph { 45 | 46 | private static Logger LOG = LoggerFactory.getLogger(Graph.class); 47 | 48 | /** The base name of the graph */ 49 | public String name; 50 | /** The graph */ 51 | public ImmutableGraph graph; 52 | /** The transpose of the graph */ 53 | public ImmutableGraph graphT; 54 | 55 | /* Maps to translate between vertex label an ID */ 56 | protected ImmutableExternalPrefixMap vertexMap; 57 | protected FrontCodedStringList vertexMapFcl; 58 | protected ShiftAddXorSignedStringMap vertexMapSmph; 59 | protected GOV4Function vertexMapMph; 60 | protected LiterallySignedStringMap vertexMapLmap; 61 | 62 | private static int LAZY_INT_ITERATOR_EMPTY_VALUE = LazyIntIterators.EMPTY_ITERATOR.nextInt(); 63 | 64 | public Graph(String name) throws Exception { 65 | this.name = name; 66 | try { 67 | LOG.info("Loading graph {}.graph", name); 68 | graph = ImmutableGraph.loadMapped(name); 69 | LOG.info("Loading transpose of the graph {}-t.graph", name); 70 | graphT = ImmutableGraph.loadMapped(name + "-t"); 71 | if (Files.exists(Paths.get(name + ".iepm"))) { 72 | LOG.info("Loading vertex map {}.iepm (ImmutableExternalPrefixMap)", name); 73 | vertexMap = (ImmutableExternalPrefixMap) BinIO.loadObject(name + ".iepm"); 74 | } else if (Files.exists(Paths.get(name + ".fcl"))) { 75 | LOG.info("Loading vertex map {}.fcl (FrontCodedStringList, maps vertex IDs to labels)", name); 76 | vertexMapFcl = (FrontCodedStringList) BinIO.loadObject(name + ".fcl"); 77 | if (Files.exists(Paths.get(name + ".smph"))) { 78 | LOG.info("Loading vertex map {}.smph (string map perfect hash, maps vertex labels to IDs)", name); 79 | vertexMapSmph = (ShiftAddXorSignedStringMap) BinIO.loadObject(name + ".smph"); 80 | } else if (Files.exists(Paths.get(name + ".mph"))) { 81 | LOG.info("Loading vertex map {}.mph (minimal perfect hash, maps vertex labels to IDs)", name); 82 | vertexMapMph = (GOV4Function) BinIO.loadObject(name + ".mph"); 83 | LOG.warn( 84 | "Using a minimal perfect hash as vertex map does not allow to verify that a vertex label exists. 
" 85 | + "Non-existant labels are mapped to quasi-random IDs."); 86 | } else { 87 | LOG.error("No vertex mapping found, cannot translate from vertex names to IDs."); 88 | } 89 | } else if (Files.exists(Paths.get(name + ".lmap"))) { 90 | LOG.info("Loading vertex map {}.lmap (LiterallySignedStringMap)", name); 91 | vertexMapLmap = (LiterallySignedStringMap) BinIO.loadObject(name + ".lmap"); 92 | } else { 93 | LOG.error("No vertex mapping found, cannot translate from vertex names to IDs."); 94 | } 95 | } catch (IOException | ClassNotFoundException e) { 96 | LOG.error("Failed to load graph {}:", name, e); 97 | throw e; 98 | } 99 | LOG.info("Loaded graph {}.graph", name); 100 | } 101 | 102 | public String vertexIdToLabel(long id) { 103 | if (vertexMap != null) { 104 | return vertexMap.list().get((int) id).toString(); 105 | } else if (vertexMapFcl != null) { 106 | return vertexMapFcl.get((int) id).toString(); 107 | } else if (vertexMapLmap != null) { 108 | return vertexMapLmap.list().get((int) id).toString(); 109 | } else { 110 | throw new RuntimeException("No vertex map loaded."); 111 | } 112 | } 113 | 114 | public long vertexLabelToId(String label) { 115 | if (vertexMap != null) { 116 | return vertexMap.getLong(label); 117 | } else if (vertexMapSmph != null) { 118 | return vertexMapSmph.getLong(label); 119 | } else if (vertexMapMph != null) { 120 | return vertexMapMph.getLong(label); 121 | } else if (vertexMapLmap != null) { 122 | return vertexMapLmap.getLong(label); 123 | } else { 124 | throw new RuntimeException("No vertex map loaded."); 125 | } 126 | } 127 | 128 | public int outdegree(long vertexId) { 129 | return graph.outdegree((int) vertexId); 130 | } 131 | 132 | public int outdegree(String vertexLabel) { 133 | return graph.outdegree((int) vertexLabelToId(vertexLabel)); 134 | } 135 | 136 | public int indegree(long vertexId) { 137 | return graphT.outdegree((int) vertexId); 138 | } 139 | 140 | public int indegree(String vertexLabel) { 141 | return graphT.outdegree((int) vertexLabelToId(vertexLabel)); 142 | } 143 | 144 | public int[] successors(long vertexId) { 145 | return graph.successorArray((int) vertexId); 146 | } 147 | 148 | public int[] successors(String vertexLabel) { 149 | return graph.successorArray((int) vertexLabelToId(vertexLabel)); 150 | } 151 | 152 | public Stream successorStream(String vertexLabel) { 153 | return successorStream(graph, vertexLabelToId(vertexLabel)); 154 | } 155 | 156 | public IntStream successorIntStream(String vertexLabel) { 157 | return successorIntStream(graph, vertexLabelToId(vertexLabel)); 158 | } 159 | 160 | public Stream successorStream(String vertexLabel, String prefix) { 161 | return successorStream(graph, vertexLabelToId(vertexLabel), vertexMap.getInterval(prefix)); 162 | } 163 | 164 | public IntStream successorIntStream(String vertexLabel, String prefix) { 165 | return successorIntStream(graph, vertexLabelToId(vertexLabel), vertexMap.getInterval(prefix)); 166 | } 167 | 168 | public Stream> successorTopLevelDomainCounts(String vertexLabel) { 169 | return successorTopLevelDomainCounts(graph, vertexLabelToId(vertexLabel)); 170 | } 171 | 172 | public Stream successorStream(ImmutableGraph graph, long vertexId) { 173 | return successorIntStream(graph, vertexId).mapToObj(i -> vertexIdToLabel(i)); 174 | } 175 | 176 | public IntStream successorIntStream(ImmutableGraph graph, long vertexId) { 177 | return Arrays.stream(graph.successorArray((int) vertexId)); 178 | } 179 | 180 | private Stream successorStream(ImmutableGraph graph, long vertexId, Interval 
interval) { 181 | return successorIntStream(graph, vertexId, interval).mapToObj(i -> vertexIdToLabel(i)); 182 | } 183 | 184 | public IntStream successorIntStream(ImmutableGraph graph, long vertexId, Interval interval) { 185 | return Arrays.stream(graph.successorArray((int) vertexId)).filter(x -> (interval.compareTo(x) == 0)); 186 | } 187 | 188 | public Stream successorTopLevelDomainStream(ImmutableGraph graph, long vertexId) { 189 | return Arrays.stream(graph.successorArray((int) vertexId)).mapToObj(i -> getTopLevelDomain(vertexIdToLabel(i))); 190 | } 191 | 192 | public Stream> successorTopLevelDomainCounts(ImmutableGraph graph, long vertexId) { 193 | if (vertexMap != null) { 194 | /* 195 | * speed up if we have a prefix map, utilizing the fact that vertex labels are 196 | * lexicographically sorted by reversed domain name 197 | */ 198 | List> res = new LinkedList<>(); 199 | LazyIntIterator iter = graph.successors((int) vertexId); 200 | int curr = iter.nextInt(); 201 | while (curr != LAZY_INT_ITERATOR_EMPTY_VALUE) { 202 | final MutableString currLabel = vertexMap.list().get(curr); 203 | final int pos = currLabel.indexOf('.'); 204 | final MutableString tldPrefix; 205 | final String tld; 206 | if (pos > -1 && (pos + 1) < currLabel.length()) { 207 | tldPrefix = currLabel.substring(0, pos + 1); 208 | tld = tldPrefix.substring(0, pos).toString(); 209 | } else { 210 | tldPrefix = currLabel; 211 | tld = currLabel.toString(); 212 | } 213 | long count = 1; 214 | final Interval interval = vertexMap.getInterval(tldPrefix); 215 | int next; 216 | while ((next = iter.nextInt()) != LAZY_INT_ITERATOR_EMPTY_VALUE) { 217 | if (next > interval.right) { 218 | break; 219 | } 220 | count++; 221 | } 222 | curr = next; 223 | res.add(new SimpleEntry<>(tld, count)); 224 | } 225 | return res.stream().sorted(Collections.reverseOrder(Map.Entry.comparingByValue())); 226 | } 227 | return GraphExplorer.frequencies(successorTopLevelDomainStream(graph, vertexId)); 228 | } 229 | 230 | public Stream> topLevelDomainCounts(IntStream vertexIds) { 231 | if (vertexMap != null) { 232 | List> res = new LinkedList<>(); 233 | PrimitiveIterator.OfInt iter = vertexIds.iterator(); 234 | if (iter.hasNext()) { 235 | int curr = iter.nextInt(); 236 | do { 237 | final MutableString currLabel = vertexMap.list().get(curr); 238 | final int pos = currLabel.indexOf('.'); 239 | final MutableString tldPrefix; 240 | final String tld; 241 | if (pos > -1 && (pos + 1) < currLabel.length()) { 242 | tldPrefix = currLabel.substring(0, pos + 1); 243 | tld = tldPrefix.substring(0, pos).toString(); 244 | } else { 245 | tldPrefix = currLabel; 246 | tld = currLabel.toString(); 247 | } 248 | long count = 1; 249 | final Interval interval = vertexMap.getInterval(tldPrefix); 250 | int next = -1; 251 | while (iter.hasNext()) { 252 | next = iter.nextInt(); 253 | if (next > interval.right) { 254 | break; 255 | } 256 | count++; 257 | } 258 | curr = next; 259 | res.add(new SimpleEntry<>(tld, count)); 260 | } while (curr > -1); 261 | } 262 | return res.stream().sorted(Collections.reverseOrder(Map.Entry.comparingByValue())); 263 | } 264 | return GraphExplorer.frequencies(vertexIds.mapToObj(i -> Graph.getTopLevelDomain(vertexIdToLabel(i)))); 265 | } 266 | 267 | public int[] predecessors(long vertexId) { 268 | return graphT.successorArray((int) vertexId); 269 | } 270 | 271 | public int[] predecessors(String vertexLabel) { 272 | return graphT.successorArray((int) vertexLabelToId(vertexLabel)); 273 | } 274 | 275 | public Stream predecessorStream(String vertexLabel) { 276 | 
return successorStream(graphT, vertexLabelToId(vertexLabel)); 277 | } 278 | 279 | public IntStream predecessorIntStream(String vertexLabel) { 280 | return successorIntStream(graphT, vertexLabelToId(vertexLabel)); 281 | } 282 | 283 | public Stream predecessorStream(String vertexLabel, String prefix) { 284 | return successorStream(graphT, vertexLabelToId(vertexLabel), vertexMap.getInterval(prefix)); 285 | } 286 | 287 | public IntStream predecessorIntStream(String vertexLabel, String prefix) { 288 | return successorIntStream(graphT, vertexLabelToId(vertexLabel), vertexMap.getInterval(prefix)); 289 | } 290 | 291 | public Stream> predecessorTopLevelDomainCounts(String vertexLabel) { 292 | return successorTopLevelDomainCounts(graphT, vertexLabelToId(vertexLabel)); 293 | } 294 | 295 | public long[] sharedPredecessors(long[] vertices) { 296 | return sharedPredecessors(vertices, vertices.length, vertices.length); 297 | } 298 | 299 | public long[] sharedPredecessors(long[] vertices, int minShared, int maxShared) { 300 | return sharedSuccessors(graphT, vertices, minShared, maxShared); 301 | } 302 | 303 | public long[] sharedSuccessors(long[] vertices) { 304 | return sharedSuccessors(vertices, vertices.length, vertices.length); 305 | } 306 | 307 | public long[] sharedSuccessors(long[] vertices, int minShared, int maxShared) { 308 | return sharedSuccessors(graph, vertices, minShared, maxShared); 309 | } 310 | 311 | /** 312 | * Get shared successors (children) of all {@code vertices} in a {@code graph}. 313 | * The parameters {@code minShared} and {@code maxShared} allow to select the 314 | * intersection, the union or a subset with a specific overlap (shared 315 | * successors). If vertex a has the successors d, e, vertex 316 | * b has d, f and vertex c has d, e, g, then 317 | *
318 | * - {@code minShared} = {@code maxShared} = {@code vertices.length} returns
319 | *   the intersection (d)
320 | * - {@code minShared} = 1 and {@code maxShared} = {@code vertices.length}
321 | *   returns the union (d, e, f)
322 | * - {@code minShared} = {@code maxShared} = 2 returns all successors shared
323 | *   by exactly two of the {@code vertices} (e)
324 | * (a short usage sketch follows below)
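/*
 * Added sketch (annotation; the graph base name and vertex labels are made up,
 * the graph files must already exist locally): intersection vs. union of the
 * predecessors of two vertices, jshell-style.
 */
Graph g = new Graph("cc-main-domain-graph");
long[] v = { g.vertexLabelToId("org.example"), g.vertexLabelToId("org.wikipedia") };
long[] linkedFromBoth   = g.sharedPredecessors(v);              // minShared = maxShared = 2: intersection
long[] linkedFromEither = g.sharedPredecessors(v, 1, v.length); // union
for (long id : linkedFromBoth) System.out.println(g.vertexIdToLabel(id));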
325 | * 326 | * @param graph the graph used to access the successors of a vertex (the 327 | * transpose of the graph will give the predecessors of the 328 | * vertex) 329 | * @param vertices list of vertex IDs 330 | * @param minShared the minimum number of shared links to successors 331 | * @param maxShared the minimum number of shared links to successors 332 | * @return shared successors 333 | */ 334 | public long[] sharedSuccessors(ImmutableGraph graph, long[] vertices, int minShared, int maxShared) { 335 | LazyIntIterator[] iters = new LazyIntIterator[vertices.length]; 336 | for (int i = 0; i < vertices.length; i++) { 337 | iters[i] = graph.successors((int) vertices[i]); 338 | } 339 | CountingMergedIntIterator iter = new CountingMergedIntIterator(iters); 340 | LongArrayList res = new LongArrayList(); 341 | int id; 342 | while (iter.hasNext()) { 343 | id = iter.nextInt(); 344 | if (iter.getCount() >= minShared && iter.getCount() <= maxShared) { 345 | res.add(id); 346 | } 347 | } 348 | res.trim(); 349 | return res.elements(); 350 | } 351 | 352 | public static String getTopLevelDomain(String reversedDomainName) { 353 | int dot = reversedDomainName.indexOf('.'); 354 | if (dot < reversedDomainName.length()) { 355 | return reversedDomainName.substring(0, dot); 356 | } 357 | return reversedDomainName; 358 | } 359 | 360 | /** 361 | * Get the registered domain for a host name based on the ICANN section of the 362 | * public suffix list. 363 | * 364 | * @see EffectiveTldFinder 365 | * 366 | * @param hostName host name, e.g. www.example.org.uk 367 | * @param strict if true return null instead of hostName if no 368 | * valid public suffix is detected 369 | * @return the domain name below the public suffix, e.g. 370 | * example.org.uk 371 | */ 372 | public static String getRegisteredDomain(String hostName, boolean strict) { 373 | return EffectiveTldFinder.getAssignedDomain(hostName, strict, true); 374 | } 375 | 376 | /** 377 | * Get the registered domain for a host name, both in 378 | * reverse 379 | * domain name notation. 380 | * 381 | * @see #getRegisteredDomain(String, boolean) 382 | * 383 | * @param reversedHostName host name in reverse domain name notation, e.g. 384 | * uk.ork.example.www 385 | * @param strict if true return null instead of 386 | * reversedHostName if no valid public 387 | * suffix is detected 388 | * @return the domain name below the public suffix, e.g. 389 | * uk.org.example (in reverse domain name notation) 390 | */ 391 | public static String getRegisteredDomainReversed(String reversedHostName, boolean strict) { 392 | String hostName = reverseDomainName(reversedHostName); 393 | String domainName = getRegisteredDomain(hostName, strict); 394 | if (strict && domainName == null) { 395 | return null; 396 | } else if (hostName.equals(domainName)) { 397 | return reversedHostName; 398 | } 399 | return reverseDomainName(domainName); 400 | } 401 | 402 | /** 403 | * Reverse or "unreverse" a host/domain name: com.example.www is 404 | * reversed to www.example.com and vice versa. 
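/*
 * Added examples (annotation): the static helpers above, jshell-style, with the
 * results given in their javadocs; requires the public suffix list bundled with
 * crawler-commons.
 */
Graph.getTopLevelDomain("uk.org.example.www");                 // -> "uk"
Graph.getRegisteredDomain("www.example.org.uk", true);         // -> "example.org.uk"
Graph.getRegisteredDomainReversed("uk.org.example.www", true); // -> "uk.org.example"
Graph.reverseDomainName("com.example.www");                    // -> "www.example.com"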
405 | * 406 | * @param domainName domain name 407 | * @return domain name with reverse 409 | * domain name notation (un)applied 410 | */ 411 | public static String reverseDomainName(String domainName) { 412 | return HostToDomainGraph.reverseHost(domainName); 413 | } 414 | } 415 | -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/webgraph/explore/GraphExplorer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2024 Common Crawl and contributors 4 | */ 5 | package org.commoncrawl.webgraph.explore; 6 | 7 | import java.io.IOException; 8 | import java.io.PrintStream; 9 | import java.nio.charset.StandardCharsets; 10 | import java.nio.file.Files; 11 | import java.nio.file.Paths; 12 | import java.util.Arrays; 13 | import java.util.Comparator; 14 | import java.util.Map.Entry; 15 | import java.util.function.Function; 16 | import java.util.stream.Collectors; 17 | import java.util.stream.IntStream; 18 | import java.util.stream.LongStream; 19 | import java.util.stream.Stream; 20 | 21 | import org.commoncrawl.webgraph.CountingMergedIntIterator; 22 | import org.slf4j.Logger; 23 | import org.slf4j.LoggerFactory; 24 | 25 | import it.unimi.dsi.webgraph.LazyIntIterator; 26 | 27 | /** 28 | * Utility class for graph exploration: load and hold all required web graph 29 | * data structures, provided methods to interactively explore the graph. 30 | */ 31 | public class GraphExplorer { 32 | 33 | private static Logger LOG = LoggerFactory.getLogger(GraphExplorer.class); 34 | 35 | public class Vertex { 36 | private long id; 37 | private String label; 38 | 39 | public Vertex(String label) { 40 | this.label = label; 41 | id = g.vertexLabelToId(label); 42 | } 43 | 44 | public Vertex(long id) { 45 | this.id = id; 46 | label = g.vertexIdToLabel(id); 47 | } 48 | 49 | @Override 50 | public String toString() { 51 | return "#" + id + "\t" + label; 52 | } 53 | 54 | public int outdegree() { 55 | return g.outdegree((int) id); 56 | } 57 | 58 | public int indegree() { 59 | return g.indegree((int) id); 60 | } 61 | 62 | public int[] successors() { 63 | return g.graph.successorArray((int) id); 64 | } 65 | 66 | public int[] predecessors() { 67 | return g.graphT.successorArray((int) id); 68 | } 69 | } 70 | 71 | private Graph g = null; 72 | private Vertex v = null; 73 | 74 | public GraphExplorer(String name) throws Exception { 75 | g = new Graph(name); 76 | } 77 | 78 | public Graph getGraph() { 79 | return g; 80 | } 81 | 82 | public Vertex getVertex(String vertexLabel) { 83 | return new Vertex(vertexLabel); 84 | } 85 | 86 | public Vertex getVertex(long vertexId) { 87 | return new Vertex(vertexId); 88 | } 89 | 90 | public void setVertex(String vertexLabel) { 91 | v = getVertex(vertexLabel); 92 | } 93 | 94 | public void setVertex(long vertexId) { 95 | v = getVertex(vertexId); 96 | } 97 | 98 | /* Reimplementation of commands provided by pywebgraph (cn, pwn, ls, sl) */ 99 | 100 | /** 101 | * Change the current working node / vertex. 102 | * 103 | * @param vertexLabel vertex label (node name) 104 | */ 105 | public void cn(String vertexLabel) { 106 | setVertex(vertexLabel); 107 | pwn(); 108 | } 109 | 110 | /** 111 | * Change the current working node / vertex. 112 | * 113 | * @param vertexId vertex ID 114 | */ 115 | public void cn(long vertexId) { 116 | setVertex(vertexId); 117 | pwn(); 118 | } 119 | 120 | /** 121 | * Print the current working node / vertex. 
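/*
 * Added sketch of an interactive jshell session (annotation; the graph base
 * name and vertex label are made up):
 */
GraphExplorer explorer = new GraphExplorer("cc-main-domain-graph");
explorer.cn("org.commoncrawl");  // change the working node by label and print it
explorer.ls();                   // print its successors (outgoing links)
explorer.sl();                   // print its predecessors (incoming links)
explorer.pwn();                  // print the current working node again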
122 | */ 123 | public void pwn() { 124 | if (v == null) { 125 | throw new NullPointerException("Current working node not set, use cn(...) to define the working node."); 126 | } 127 | print(v.toString()); 128 | } 129 | 130 | /** 131 | * Print the successors (outgoing links) of the current working node / vertex. 132 | */ 133 | public void ls() { 134 | if (v == null) { 135 | throw new NullPointerException("Current working node not set, use cn(...) to define the working node."); 136 | } 137 | ls(v.id); 138 | } 139 | 140 | /** 141 | * Print the successors (outgoing links) of a vertex. 142 | * 143 | * @param vertexId vertex ID 144 | */ 145 | public void ls(long vertexId) { 146 | printVertices(g.graph.successors((int) vertexId)); 147 | } 148 | 149 | /** 150 | * Print the successors (outgoing links) of a vertex. 151 | * 152 | * @param vertexLabel vertex label / vertex name 153 | */ 154 | public void ls(String vertexLabel) { 155 | ls(g.vertexLabelToId(vertexLabel)); 156 | } 157 | 158 | /** 159 | * Print the predecessors (incoming links) of the current working node / vertex. 160 | */ 161 | public void sl() { 162 | if (v == null) { 163 | throw new NullPointerException("Current working node not set, use cn(...) to define the working node."); 164 | } 165 | sl(v.id); 166 | } 167 | 168 | /** 169 | * Print the predecessors (incoming links) of a vertex. 170 | * 171 | * @param vertexId vertex ID 172 | */ 173 | public void sl(long vertexId) { 174 | printVertices(g.graphT.successors((int) vertexId)); 175 | } 176 | 177 | /** 178 | * Print the predecessors (incoming links) of a vertex. 179 | * 180 | * @param vertexLabel vertex label / vertex name 181 | */ 182 | public void sl(String vertexLabel) { 183 | sl(g.vertexLabelToId(vertexLabel)); 184 | } 185 | 186 | /* Utilities */ 187 | 188 | public long[] loadVerticesFromFile(String fileName) { 189 | try (Stream in = Files.lines(Paths.get(fileName), StandardCharsets.UTF_8)) { 190 | return in.mapToLong(label -> g.vertexLabelToId(label)).filter(id -> id > -1).toArray(); 191 | } catch (IOException e) { 192 | LOG.error("Failed to load vertices from file {}", fileName, e); 193 | } 194 | return new long[0]; 195 | } 196 | 197 | public void saveVerticesToFile(long[] vertexIDs, String fileName) { 198 | try (PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(fileName)), false, 199 | StandardCharsets.UTF_8)) { 200 | Arrays.stream(vertexIDs).forEach(id -> out.println(g.vertexIdToLabel(id))); 201 | } catch (IOException e) { 202 | LOG.error("Failed to write vertices to file {}", fileName, e); 203 | } 204 | } 205 | 206 | public void saveVerticesToFile(int[] vertexIDs, String fileName) { 207 | saveVerticesToFile(Arrays.stream(vertexIDs), fileName); 208 | } 209 | 210 | public void saveVerticesToFile(IntStream vertexIDs, String fileName) { 211 | try (PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(fileName)), false, 212 | StandardCharsets.UTF_8)) { 213 | vertexIDs.forEach(id -> out.println(g.vertexIdToLabel(id))); 214 | } catch (IOException e) { 215 | LOG.error("Failed to write vertices to file {}", fileName, e); 216 | } 217 | } 218 | 219 | public void saveVerticesToFile(LongStream vertexIDs, String fileName) { 220 | try (PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(fileName)), false, 221 | StandardCharsets.UTF_8)) { 222 | vertexIDs.forEach(id -> out.println(g.vertexIdToLabel(id))); 223 | } catch (IOException e) { 224 | LOG.error("Failed to write vertices to file {}", fileName, e); 225 | } 226 | } 227 | 228 | public void 
saveToFile(Stream strings, String fileName) { 229 | try (PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(fileName)), false, 230 | StandardCharsets.UTF_8)) { 231 | strings.forEach(out::println); 232 | } catch (IOException e) { 233 | LOG.error("Failed to write strings to file {}", fileName, e); 234 | } 235 | } 236 | 237 | public void saveCountsToFile(Stream> counts, String fileName) { 238 | try (PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(fileName)), false, 239 | StandardCharsets.UTF_8)) { 240 | counts.forEach(c -> { 241 | out.print(c.getValue()); 242 | out.print('\t'); 243 | out.print(c.getKey()); 244 | out.print('\n'); 245 | }); 246 | } catch (IOException e) { 247 | LOG.error("Failed to write counts to file {}", fileName, e); 248 | } 249 | } 250 | 251 | private void print(String s) { 252 | System.out.println(s); 253 | } 254 | 255 | public void printVertices(LazyIntIterator it) { 256 | int next = it.nextInt(); 257 | int i = 0; 258 | while (next != CountingMergedIntIterator.LAZY_INT_ITERATOR_EMPTY_VALUE) { 259 | print(String.format("%d: %s", i, (new Vertex(next)).toString())); 260 | next = it.nextInt(); 261 | i++; 262 | } 263 | } 264 | 265 | public void printVertices(long[] vertexIDs) { 266 | int i = 0; 267 | for (long id : vertexIDs) { 268 | print(String.format("%d: %s", i, (new Vertex(id)).toString())); 269 | i++; 270 | } 271 | } 272 | 273 | public void printVertices(int[] vertexIDs) { 274 | int i = 0; 275 | for (long id : vertexIDs) { 276 | print(String.format("%d: %s", i, (new Vertex(id)).toString())); 277 | i++; 278 | } 279 | } 280 | 281 | /** 282 | * Count strings in a stream. Sort the resulting string-count pairs by 283 | * decreasing count (frequency) and secondarily by string in lexicographic 284 | * order. 285 | * 286 | * @param strings stream of strings 287 | * @return stream of pairs {@code } 288 | */ 289 | public static Stream> frequencies(Stream strings) { 290 | final Comparator> comp = Comparator.comparingLong((Entry e) -> e.getValue()) 291 | .reversed().thenComparing(Comparator.comparing((Entry e) -> e.getKey())); 292 | return strings.collect(Collectors.groupingBy(Function.identity(), Collectors.counting())).entrySet().stream() 293 | .sorted(comp); 294 | } 295 | } 296 | -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/webgraph/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Custom classes to build Common Crawl web graph data sets. Built on software 3 | * from the Laboratory for Web Algorithmics (LAW) at the University of Milano, 4 | * namely the WebGraph framework and 5 | * the LAW library. 6 | */ 7 | package org.commoncrawl.webgraph; -------------------------------------------------------------------------------- /src/main/resources/simplelogger.properties: -------------------------------------------------------------------------------- 1 | # SLF4J's SimpleLogger configuration file 2 | # Simple implementation of Logger that sends all enabled log messages, for all defined loggers, to System.err. 3 | 4 | # Default logging detail level for all instances of SimpleLogger. 5 | # Must be one of ("trace", "debug", "info", "warn", or "error"). 6 | # If not specified, defaults to "info". 7 | org.slf4j.simpleLogger.defaultLogLevel=debug 8 | 9 | # Logging detail level for a SimpleLogger instance named "xxxxx". 10 | # Must be one of ("trace", "debug", "info", "warn", or "error"). 
11 | # If not specified, the default logging detail level is used. 12 | #org.slf4j.simpleLogger.log.xxxxx= 13 | 14 | # Set to true if you want the current date and time to be included in output messages. 15 | # Default is false, and will output the number of milliseconds elapsed since startup. 16 | org.slf4j.simpleLogger.showDateTime=true 17 | 18 | # The date and time format to be used in the output messages. 19 | # The pattern describing the date and time format is the same that is used in java.text.SimpleDateFormat. 20 | # If the format is not specified or is invalid, the default format is used. 21 | # The default format is yyyy-MM-dd HH:mm:ss:SSS Z. 22 | org.slf4j.simpleLogger.dateTimeFormat=yyyy-MM-dd HH:mm:ss:SSS Z 23 | 24 | # Set to true if you want to output the current thread name. 25 | # Defaults to true. 26 | org.slf4j.simpleLogger.showThreadName=true 27 | 28 | # Set to true if you want the Logger instance name to be included in output messages. 29 | # Defaults to true. 30 | org.slf4j.simpleLogger.showLogName=true 31 | 32 | # Set to true if you want the last component of the name to be included in output messages. 33 | # Defaults to false. 34 | org.slf4j.simpleLogger.showShortLogName=true 35 | -------------------------------------------------------------------------------- /src/script/host2domaingraph.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # SPDX-License-Identifier: Apache-2.0 4 | # Copyright (C) 2022 Common Crawl and contributors 5 | 6 | FLAGS=() 7 | PROPERTIES=() 8 | while true; do 9 | case "$1" in 10 | "-D"* ) 11 | PROPERTIES=("${PROPERTIES[@]}" "$1") 12 | shift 13 | ;; 14 | "-"* ) 15 | FLAGS=("${FLAGS[@]}" "$1") 16 | shift 17 | ;; 18 | * ) 19 | break 20 | ;; 21 | esac 22 | done 23 | 24 | JAR=target/cc-webgraph-0.1-SNAPSHOT-jar-with-dependencies.jar 25 | 26 | if [ $# -lt 3 ]; then 27 | echo "$0 [...] []" >&2 28 | if [ ${#FLAGS[@]} -gt 0 ]; then 29 | echo "" 30 | echo "Calling HostToDomainGraph with provided flags (${FLAGS[*]}):" 31 | "$JAVA_HOME"/bin/java -cp "$CLASSPATH":"$JAR" "${PROPERTIES[@]}" \ 32 | org.commoncrawl.webgraph.HostToDomainGraph "${FLAGS[@]}" 33 | fi 34 | exit 1 35 | fi 36 | 37 | SIZE="$1" 38 | INPUTDIR="$2" 39 | OUTPUTDIR="$3" 40 | TMPDIR=${4:-./tmp/} 41 | 42 | MAIN_MEM_GB=16 43 | PARALLEL_SORT_THREADS=2 44 | 45 | # Reduce host-level web graph to domain-level graph 46 | # - running HostToDomainGraph which has low memory requirements 47 | # - requires properly sorted input: 48 | # * reversed host names 49 | # * all hosts/subdomains of one domain following in a single input block 50 | # - approx. memory requirements: 51 | # * for graphs with less than 2^31 vertices 52 | # 2 GB + 4*number_of_vertices Bytes 53 | # * larger graphs 54 | # 8 GB + 10*number_of_vertices Bytes 55 | 56 | # Notes about input sorting: 57 | # 58 | # 1 C locale is mandatory to keep reversed hosts of one domain or top-level domain 59 | # together in a single block: 60 | # echo -e "com.opus\ncom.opera\nco.mopus\nco.mopera" | shuf | LC_ALL=en_US.utf8 sort 61 | # vs. 62 | # echo -e "com.opus\ncom.opera\nco.mopus\nco.mopera" | shuf | LC_ALL=C sort 63 | # This requirement is met by the output of the cc-pyspark job. 
64 | # 65 | # 2 the second problem stems from the fact that a hyphen (valid in host and 66 | # subdomain names) is sorted before the dot: 67 | # ac.gov 68 | # ac.gov.ascension 69 | # ac.gov.ascension-island 70 | # ac.gov.ascension.mail 71 | # Unfortunately the output of the cc-pyspark job does not completely meet this 72 | # sorting criterion. 73 | # The initial solution to ensure that the subdomains of "ac.gov.ascension" are not split 74 | # into two blocks, was to add an artificial dot temporarily to the end of each host 75 | # name during sorting: 76 | # zcat vertices.txt.gz | sed -e 's/$/./' \ 77 | # | sort $SORTOPTS -t$'\t' -k2,2 | sed -e 's/\.$//' 78 | # The domain name "ac.gov.ascension" in the example above becomes temporarily 79 | # "ac.gov.ascension." and is now sorted after "ac.gov.ascension-island." 80 | # 81 | # To avoid this step (re-sorting billions of lines is expensive), the HostToDomainGraph 82 | # class now caches potentially "missorted" candidates and processes them later together 83 | # with the related subdomains / host names. 84 | # 85 | # Note: The final sorting of the domain names is the same as if there would be 86 | # a trailing dot: 87 | # ac.gov.ascension-island 88 | # ac.gov.ascension 89 | 90 | 91 | export LC_ALL=C 92 | 93 | # sort with large buffers, merge sort over many files if possible 94 | SORTOPTS="--batch-size 128 --buffer-size $((1+MAIN_MEM_GB/5))g --parallel=$PARALLEL_SORT_THREADS --temporary-directory $TMPDIR" # --compress-program=gzip 95 | 96 | set -exo pipefail 97 | 98 | test -d "$TMPDIR" || mkdir "$TMPDIR" 99 | 100 | 101 | _EDGES=$INPUTDIR/edges.txt.gz 102 | if [ -e "$_EDGES" ]; then 103 | echo "Found single edges file: $_EDGES" 104 | elif [ -d "$INPUTDIR"/edges/ ]; then 105 | # edges is a directory with multiple edges files 106 | _EDGES="$INPUTDIR/edges/*.gz" 107 | echo "Found edges directory, using: $_EDGES" 108 | else 109 | echo "Input edges file(s) not found" 110 | exit 1 111 | fi 112 | 113 | _VERTICES=$INPUTDIR/vertices.txt.gz 114 | if [ -e "$_VERTICES" ]; then 115 | echo "Found single vertices file: $_VERTICES" 116 | elif [ -d "$INPUTDIR"/vertices/ ]; then 117 | # vertices is a directory with multiple vertices files 118 | echo "Found vertices directory, using: $_VERTICES" 119 | _VERTICES="$INPUTDIR/vertices/*.gz" 120 | else 121 | echo "Input vertices file(s) not found" 122 | exit 1 123 | fi 124 | 125 | 126 | mkdir -p "$OUTPUTDIR/" 127 | 128 | JXMX=$((2+1+5*SIZE/2**30)) 129 | if [ "$SIZE" -gt $((2**31-1024)) ]; then 130 | JXMX=$((8+1+10*SIZE/2**30)) 131 | fi 132 | 133 | "$JAVA_HOME"/bin/java -Xmx${JXMX}g -cp "$CLASSPATH":"$JAR" \ 134 | "${PROPERTIES[@]}" \ 135 | org.commoncrawl.webgraph.HostToDomainGraph \ 136 | "${FLAGS[@]}" \ 137 | $SIZE \ 138 | <(zcat $_VERTICES) \ 139 | >(gzip >"$OUTPUTDIR"/vertices.txt.gz) \ 140 | <(zcat $_EDGES) \ 141 | >(sort $SORTOPTS -t$'\t' -k1,1n -k2,2n -s -u | gzip >"$OUTPUTDIR"/edges.txt.gz) 142 | 143 | wait # for subshells to finish 144 | -------------------------------------------------------------------------------- /src/script/hostgraph/build_hostgraph.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # SPDX-License-Identifier: Apache-2.0 4 | # Copyright (C) 2022 Common Crawl and contributors 5 | 6 | set -e 7 | set -o pipefail 8 | set -x 9 | 10 | # run the webgraph workflow (based on cc-pyspark) 11 | # - extract inter-host links 12 | # - construct the host-level graph 13 | 14 | # installation and execution: 15 | # - install cc-pyspark 16 | # git clone 
https://github.com/commoncrawl/cc-pyspark.git 17 | # - and make it the working directory 18 | # cd cc-pyspark 19 | # - point SPARK_HOME to your installation of Apache Spark (https://spark.apache.org/) 20 | # vi ./spark_env.sh 21 | # and make sure that your Spark cluster (on Hadoop YARN) is running! 22 | # - edit the hostgraph build configuration 23 | # vi .../hostgraph_config.sh 24 | # - run the workflow 25 | # .../build_hostgraph.sh 26 | 27 | # Note: the script is tested using a Hadoop cluster running 28 | # Apache Bigtop 3.x on Ubuntu 22.04. You may need to adapt it 29 | # to run on different Hadoop distributions. 30 | 31 | 32 | SPARK_ON_YARN="--master yarn" 33 | SPARK_HADOOP_OPTS="" 34 | SPARK_EXTRA_OPTS="" 35 | 36 | HOST_LINK_EXTRACTOR=./hostlinks_extract_fastwarc.py 37 | PYFILES_HOST_LINK_EXTRACTOR="sparkcc.py,sparkcc_fastwarc.py,wat_extract_links.py,json_importer.py" 38 | 39 | HOST_LINKS_TO_GRAPH=./hostlinks_to_graph.py 40 | PYFILES_HOST_LINKS_TO_GRAPH="sparkcc.py,iana_tld.py,wat_extract_links.py,json_importer.py" 41 | HOST_LINKS_TO_GRAPH_ARGS=(--validate_host_names) # --normalize_host_names 42 | 43 | 44 | # source library functions 45 | source "$(dirname "$0")"/../workflow_lib.sh 46 | 47 | # source workflow configuration 48 | source "$(dirname "$0")"/hostgraph_config.sh 49 | 50 | # define SPARK_HOME and HADOOP_CONF_DIR 51 | source "$PWD"/spark_env.sh 52 | 53 | 54 | ################################################################################ 55 | 56 | # upload Parquet graph 57 | function upload_parquet() ( 58 | set -xeo pipefail 59 | TABLE=$1 60 | UPLOAD_NAME=$2 61 | UPLOAD_DIR=$S3A_OUTPUT_PREFIX/$UPLOAD_NAME/hostgraph 62 | if hadoop fs -test -d "$UPLOAD_DIR"/vertices; then 63 | echo "Upload $UPLOAD_DIR/vertices already exists, skipping..." 64 | else 65 | hadoop distcp \ 66 | "$HDFS_BASE_DIR"/${TABLE}_vertices \ 67 | "$UPLOAD_DIR"/vertices 68 | fi 69 | if hadoop fs -test -d "$UPLOAD_DIR"/edges; then 70 | echo "Upload "$UPLOAD_DIR"/edges already exists, skipping..." 71 | else 72 | hadoop distcp \ 73 | "$HDFS_BASE_DIR"/${TABLE}_edges \ 74 | "$UPLOAD_DIR"/edges 75 | fi 76 | ) 77 | 78 | function upload_text() ( 79 | set -xeo pipefail 80 | NAME=$1 81 | UPLOAD_NAME=$2 82 | UPLOAD_DIR="$S3A_OUTPUT_PREFIX"/$UPLOAD_NAME/hostgraph/text 83 | PUBLIC=${3:-false} 84 | DISTCP_OPTS="" 85 | if $PUBLIC; then 86 | DISTCP_OPTS="$DISTCP_OPTS -Dfs.s3a.acl.default=PublicRead" 87 | fi 88 | if hadoop fs -test -d "$UPLOAD_DIR"/vertices; then 89 | echo "Upload $UPLOAD_DIR/vertices already exists, skipping..." 90 | else 91 | hadoop fs -rm -f "$HDFS_BASE_DIR"/text/$NAME/vertices/_SUCCESS 92 | hadoop distcp $DISTCP_OPTS \ 93 | "$HDFS_BASE_DIR"/text/$NAME/vertices \ 94 | "$UPLOAD_DIR"/vertices 95 | fi 96 | if hadoop fs -test -d "$UPLOAD_DIR"/edges; then 97 | echo "Upload $UPLOAD_DIR/edges already exists, skipping..." 
98 | else 99 | hadoop fs -rm -f "$HDFS_BASE_DIR"/text/$NAME/edges/_SUCCESS 100 | hadoop distcp $DISTCP_OPTS \ 101 | "$HDFS_BASE_DIR"/text/$NAME/edges \ 102 | "$UPLOAD_DIR"/edges 103 | fi 104 | ) 105 | 106 | # text output 107 | function dump_upload_text() ( 108 | set -xeo pipefail 109 | NAME=$1 110 | UPLOAD_NAME=$2 111 | mkdir -p output/$NAME/hostgraph/tmp_edges/ 112 | mkdir -p output/$NAME/hostgraph/tmp_vertices/ 113 | hadoop fs -copyToLocal "$HDFS_BASE_DIR"/text/$NAME/vertices/*.gz output/$NAME/hostgraph/tmp_vertices/ 114 | n_vertex_files=$(ls output/$NAME/hostgraph/tmp_vertices/*.gz | wc -l) 115 | if [ $n_vertex_files -eq 1 ]; then 116 | mv output/$NAME/hostgraph/tmp_vertices/*.gz output/$NAME/hostgraph/vertices.txt.gz 117 | else 118 | zcat output/$NAME/hostgraph/tmp_vertices/*.gz | gzip >output/$NAME/hostgraph/vertices.txt.gz 119 | fi 120 | aws s3 cp --no-progress output/$NAME/hostgraph/vertices.txt.gz $S3_OUTPUT_PREFIX/$UPLOAD_NAME/hostgraph/ 121 | hadoop fs -copyToLocal "$HDFS_BASE_DIR"/text/$NAME/edges/*.gz output/$NAME/hostgraph/tmp_edges/ 122 | sort_input="" 123 | for e in output/$NAME/hostgraph/tmp_edges/*.gz; do 124 | sort_input="$sort_input <(zcat $e)" 125 | done 126 | mkdir -p tmp 127 | eval "sort --batch-size 96 --buffer-size 4g --parallel 2 --temporary-directory ./tmp/ --compress-program=gzip -t$'\t' -k1,1n -k2,2n --stable --merge $sort_input | gzip >output/$NAME/hostgraph/edges.txt.gz" 128 | aws s3 cp --no-progress output/$NAME/hostgraph/edges.txt.gz $S3_OUTPUT_PREFIX/$UPLOAD_NAME/hostgraph/ 129 | ) 130 | 131 | function create_input_splits() { 132 | CRAWL="$1" 133 | __INPUT_SPLITS=() 134 | if [ -d input/$CRAWL/ ]; then 135 | # input splits are already created locally, read the splits again 136 | # (this might happen if one of the steps/jobs has failed and 137 | # this script is run again) 138 | for split in input/$CRAWL/input_split_*.txt; do 139 | __INPUT_SPLITS=(${__INPUT_SPLITS[@]} "$HDFS_BASE_DIR/$split") 140 | done 141 | 142 | elif hadoop fs -stat "$S3A_OUTPUT_PREFIX"/$CRAWL/hostlinks/ >&2; then 143 | # no local input splits but output on S3 144 | echo "Not creating input split for crawl $CRAWL because output prefix already exists on S3: $S3A_OUTPUT_PREFIX/$CRAWL/hostlinks/" >&2 145 | 146 | else 147 | mkdir -p input/$CRAWL 148 | cd input/$CRAWL 149 | aws s3 cp --quiet --no-progress s3://commoncrawl/crawl-data/$CRAWL/wat.paths.gz . 150 | aws s3 cp --quiet --no-progress s3://commoncrawl/crawl-data/$CRAWL/non200responses.paths.gz . 151 | if $INCLUDE_ROBOTSTXT_SITEMAP_LINKS; then 152 | aws s3 cp --quiet --no-progress s3://commoncrawl/crawl-data/$CRAWL/robotstxt.paths.gz . 
153 | fi 154 | zcat ./*.paths.gz | shuf >input.txt 155 | NUM_INPUT_PATHS=$(wc -l &2 165 | ### copy input to hdfs:// 166 | hadoop fs -mkdir -p "$HDFS_BASE_DIR"/$CRAWL 167 | hadoop fs -mkdir -p "$HDFS_BASE_DIR"/input/$CRAWL/ 168 | hadoop fs -mkdir -p "$HDFS_BASE_DIR"/text/$CRAWL/ 169 | hadoop fs -copyFromLocal -f input/$CRAWL/input.txt "$HDFS_BASE_DIR"/input/$CRAWL/ 170 | for split in input/$CRAWL/input_split_*.txt; do 171 | hadoop fs -copyFromLocal -f $split "$HDFS_BASE_DIR"/input/$CRAWL/ 172 | done 173 | # The input list is considerably small because it only references s3:// paths: 174 | # deploy it on every node to make all tasks NODE_LOCAL 175 | hadoop fs -setrep $((NUM_EXECUTORS+1)) "$HDFS_BASE_DIR"/input/$CRAWL/ >&2 176 | fi 177 | echo "${__INPUT_SPLITS[@]}" 178 | } 179 | 180 | 181 | ################################################################################ 182 | 183 | MERGE_CRAWL_INPUT="" 184 | 185 | for CRAWL in ${CRAWLS[@]}; do 186 | 187 | INPUT_SPLITS=($(create_input_splits $CRAWL)) 188 | 189 | if [ -z "$INPUT_SPLITS" ]; then 190 | # no input splits signals that the crawl has already successfully processed 191 | if hadoop fs -stat "$S3A_OUTPUT_PREFIX"/$CRAWL/hostlinks/; then 192 | echo "Output prefix for crawl $CRAWL already exists on S3: $S3A_OUTPUT_PREFIX/$CRAWL/hostlinks/" 193 | if ! hadoop fs -stat "$S3A_OUTPUT_PREFIX"/$CRAWL/hostlinks/_SUCCESS; then 194 | echo "No success marker found below S3 output prefix: $S3A_OUTPUT_PREFIX/$CRAWL/hostlinks/_SUCCESS" 195 | echo "Please, verify the output and depending on the verification result, manually add the success marker or remove the output. Exiting ..." 196 | exit 1 197 | fi 198 | fi 199 | # add the existing output splits as input for host graph and merged graph 200 | for output_split in $(hadoop fs -ls -C "$S3A_OUTPUT_PREFIX"/$CRAWL/hostlinks/); do 201 | case "$output_split" in 202 | */_SUCCESS ) 203 | continue ;; 204 | esac 205 | if [ -z "$HOSTGRAPH_INPUT" ]; then 206 | HOSTGRAPH_INPUT="$output_split" 207 | else 208 | HOSTGRAPH_INPUT="--add_input $output_split $HOSTGRAPH_INPUT" 209 | fi 210 | if [ -z "$MERGE_CRAWL_INPUT" ]; then 211 | MERGE_CRAWL_INPUT="$output_split" 212 | else 213 | MERGE_CRAWL_INPUT="--add_input $output_split $MERGE_CRAWL_INPUT" 214 | fi 215 | done 216 | 217 | else 218 | echo "Input splits: ""${INPUT_SPLITS[*]}" 219 | 220 | for ((i=0; i<${#INPUT_SPLITS[@]}; i++)); do 221 | INPUT=${INPUT_SPLITS[$i]} 222 | NUM_INPUT_PATHS=$(wc -l $INPUT_PARTITIONS partitions" 225 | 226 | _step hostlinks.$CRAWL.split$i \ 227 | "$SPARK_HOME"/bin/spark-submit \ 228 | $SPARK_ON_YARN \ 229 | $SPARK_HADOOP_OPTS \ 230 | --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ 231 | --conf spark.task.maxFailures=80 \ 232 | --conf spark.executor.memory=$EXECUTOR_MEM \ 233 | --conf spark.driver.memory=6g \ 234 | --conf spark.core.connection.ack.wait.timeout=600s \ 235 | --conf spark.network.timeout=300s \ 236 | --conf spark.shuffle.io.maxRetries=5 \ 237 | --conf spark.shuffle.io.retryWait=30s \ 238 | --conf spark.io.compression.codec=zstd \ 239 | --conf spark.checkpoint.compress=true \ 240 | --conf spark.locality.wait=0s \ 241 | --num-executors $NUM_EXECUTORS \ 242 | --executor-cores $EXECUTOR_CORES \ 243 | --executor-memory $EXECUTOR_MEM \ 244 | --conf spark.sql.warehouse.dir=$WAREHOUSE_DIR/$CRAWL \ 245 | --conf spark.sql.parquet.compression.codec=zstd \ 246 | --py-files "$PYFILES_HOST_LINK_EXTRACTOR" \ 247 | $SPARK_EXTRA_OPTS \ 248 | $HOST_LINK_EXTRACTOR \ 249 | --input_base_url $INPUT_BASE_URL \ 250 | 
--num_input_partitions $INPUT_PARTITIONS \ 251 | --num_output_partitions $OUTPUT_PARTITIONS \ 252 | --local_temp_dir "$TMPDIR" \ 253 | $INPUT hostlinks$i 254 | 255 | _step hostlinks.$CRAWL.split$i.distcp \ 256 | hadoop distcp \ 257 | -Dfs.s3a.connection.timeout=2000 \ 258 | -Dfs.s3a.attempts.maximum=3 \ 259 | "$HDFS_BASE_DIR"/$CRAWL/hostlinks$i \ 260 | "$S3A_OUTPUT_PREFIX"/$CRAWL/hostlinks/$i 261 | 262 | if [ -z "$HOSTGRAPH_INPUT" ]; then 263 | HOSTGRAPH_INPUT="$HDFS_BASE_DIR/$CRAWL/hostlinks$i" 264 | else 265 | HOSTGRAPH_INPUT="--add_input $HDFS_BASE_DIR/$CRAWL/hostlinks$i $HOSTGRAPH_INPUT" 266 | fi 267 | if [ -z "$MERGE_CRAWL_INPUT" ]; then 268 | MERGE_CRAWL_INPUT="$HDFS_BASE_DIR/$CRAWL/hostlinks$i" 269 | else 270 | MERGE_CRAWL_INPUT="--add_input $HDFS_BASE_DIR/$CRAWL/hostlinks$i $MERGE_CRAWL_INPUT" 271 | fi 272 | done # end input splits 273 | 274 | # Create the success marker on S3 275 | hadoop fs -touchz "$S3A_OUTPUT_PREFIX"/$CRAWL/hostlinks/_SUCCESS 276 | 277 | fi 278 | 279 | 280 | if $CONSTRUCT_HOSTGRAPH; then 281 | 282 | if hadoop fs -stat "$S3A_OUTPUT_PREFIX"/$CRAWL/hostgraph/; then 283 | echo "Skipping creation of hostgraph for crawl $CRAWL because output prefix already exists on S3: $S3A_OUTPUT_PREFIX/$CRAWL/hostgraph/" 284 | continue 285 | fi 286 | 287 | VERTEX_IDS="" 288 | if hadoop fs -stat "$HDFS_BASE_DIR"/$CRAWL/hostgraph_vertices; then 289 | VERTEX_IDS="--vertex_ids $HDFS_BASE_DIR/$CRAWL/hostgraph_vertices" 290 | fi 291 | 292 | _step hostgraph.$CRAWL \ 293 | "$SPARK_HOME"/bin/spark-submit \ 294 | $SPARK_ON_YARN \ 295 | $SPARK_HADOOP_OPTS \ 296 | --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ 297 | --conf spark.task.maxFailures=10 \ 298 | --conf spark.executor.memory=$EXECUTOR_MEM \ 299 | --conf spark.driver.memory=6g \ 300 | --conf spark.core.connection.ack.wait.timeout=600s \ 301 | --conf spark.network.timeout=300s \ 302 | --conf spark.shuffle.io.maxRetries=5 \ 303 | --conf spark.shuffle.io.retryWait=30s \ 304 | --conf spark.locality.wait=1s \ 305 | --conf spark.io.compression.codec=zstd \ 306 | --conf spark.checkpoint.compress=true \ 307 | --num-executors $NUM_EXECUTORS \ 308 | --executor-cores $EXECUTOR_CORES \ 309 | --executor-memory $EXECUTOR_MEM \ 310 | --conf spark.sql.warehouse.dir=$WAREHOUSE_DIR/$CRAWL \ 311 | --conf spark.sql.parquet.compression.codec=zstd \ 312 | --py-files "$PYFILES_HOST_LINKS_TO_GRAPH" \ 313 | $SPARK_EXTRA_OPTS \ 314 | $HOST_LINKS_TO_GRAPH \ 315 | "${HOST_LINKS_TO_GRAPH_ARGS[@]}" \ 316 | --save_as_text "$HDFS_BASE_DIR"/text/$CRAWL \ 317 | --num_output_partitions $WEBGRAPH_EDGE_PARTITIONS \ 318 | --local_temp_dir $TMPDIR \ 319 | $VERTEX_IDS \ 320 | $HOSTGRAPH_INPUT hostgraph 321 | 322 | 323 | _step hostgraph.$CRAWL.upload.1 \ 324 | upload_parquet hostgraph $CRAWL 325 | 326 | _step hostgraph.$CRAWL.upload.2 \ 327 | dump_upload_text $CRAWL $CRAWL 328 | fi 329 | 330 | done # CRAWLS 331 | 332 | 333 | 334 | if [ -n "$MERGE_NAME" ]; then 335 | 336 | hadoop fs -mkdir -p "$HDFS_BASE_DIR"/text/$MERGE_NAME 337 | 338 | for INP in "${MERGE_INPUT[@]}"; do 339 | if [ -z "$MERGE_CRAWL_INPUT" ]; then 340 | MERGE_CRAWL_INPUT="$INP" 341 | else 342 | MERGE_CRAWL_INPUT="--add_input $INP $MERGE_CRAWL_INPUT" 343 | fi 344 | done 345 | 346 | VERTEX_IDS="" 347 | if hadoop fs -test -d "$HDFS_BASE_DIR"/hostgraph_merged_vertices; then 348 | VERTEX_IDS="--vertex_ids $HDFS_BASE_DIR/hostgraph_merged_vertices" 349 | fi 350 | 351 | _step hostgraph_merged \ 352 | "$SPARK_HOME"/bin/spark-submit \ 353 | $SPARK_ON_YARN \ 354 | $SPARK_HADOOP_OPTS \ 355 | 
--py-files "$PYFILES_HOST_LINKS_TO_GRAPH" \ 356 | --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ 357 | --conf spark.task.maxFailures=10 \ 358 | --conf spark.executor.memory=$EXECUTOR_MEM \ 359 | --conf spark.driver.memory=6g \ 360 | --conf spark.core.connection.ack.wait.timeout=600s \ 361 | --conf spark.network.timeout=300s \ 362 | --conf spark.shuffle.io.maxRetries=5 \ 363 | --conf spark.shuffle.io.retryWait=30s \ 364 | --conf spark.locality.wait=1s \ 365 | --conf spark.io.compression.codec=zstd \ 366 | --conf spark.checkpoint.compress=true \ 367 | --num-executors $NUM_EXECUTORS \ 368 | --executor-cores $EXECUTOR_CORES \ 369 | --executor-memory $EXECUTOR_MEM \ 370 | --conf spark.sql.warehouse.dir=$WAREHOUSE_DIR \ 371 | --conf spark.sql.parquet.compression.codec=zstd \ 372 | $SPARK_EXTRA_OPTS \ 373 | $HOST_LINKS_TO_GRAPH \ 374 | "${HOST_LINKS_TO_GRAPH_ARGS[@]}" \ 375 | --save_as_text "$HDFS_BASE_DIR"/text/$MERGE_NAME \ 376 | --vertex_partitions $WEBGRAPH_VERTEX_PARTITIONS \ 377 | --num_output_partitions $WEBGRAPH_EDGE_PARTITIONS \ 378 | --local_temp_dir "$TMPDIR" \ 379 | $VERTEX_IDS \ 380 | $MERGE_CRAWL_INPUT hostgraph_merged 381 | 382 | _step hostgraph_merged.upload.1 \ 383 | upload_parquet hostgraph_merged $MERGE_NAME 384 | 385 | _step hostgraph_merged.upload.2 \ 386 | upload_text $MERGE_NAME $MERGE_NAME true 387 | 388 | ### merge (one file for vertices, one for edges) and upload 389 | # _step hostgraph_merged.upload.2 \ 390 | # dump_upload_text $MERGE_NAME $MERGE_NAME 391 | 392 | elif [ -n "$MERGE_INPUT" ]; then 393 | 394 | echo "MERGE_INPUT is defined, but no MERGE_NAME given?" 395 | exit 1 396 | 397 | fi 398 | -------------------------------------------------------------------------------- /src/script/hostgraph/hostgraph_config.sh: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### configuration of Common Crawl webgraph releases 3 | ### (sourced from other scripts) 4 | ################################################################################ 5 | 6 | 7 | ################################################################################ 8 | ### Extraction of inter-host links from 9 | ### - WAT files and 10 | ### - non-200 responses WARC files for redirects 11 | ### - (optionally) sitemap directives in robots.txt files 12 | ### saved as tuples 13 | 14 | # crawls to be processed 15 | CRAWLS=("CC-MAIN-2025-08" "CC-MAIN-2025-13" "CC-MAIN-2025-18") 16 | 17 | INPUT_BASE_URL="s3://commoncrawl/" 18 | 19 | # whether to include links to sitemaps contained in robots.txt files 20 | # Note: often links to sitemaps indicate relations between domain owners. 21 | INCLUDE_ROBOTSTXT_SITEMAP_LINKS=true 22 | 23 | # whether to construct a host-level graph for each input crawl 24 | CONSTRUCT_HOSTGRAPH=false 25 | 26 | # max.
number of input files (WARC/WAT) per Spark job 27 | # - splits hostlink extraction into multiple jobs 28 | # - output is checkpointed on S3 after each job 29 | # (useful if cluster runs on spot instances) 30 | MAX_INPUT_SIZE=64000 31 | 32 | # hdfs:// directory where input and output is kept 33 | HDFS_BASE_DIR=hdfs:///user/ubuntu/webgraph 34 | WAREHOUSE_DIR=$HDFS_BASE_DIR 35 | 36 | # where to keep results on s3:// 37 | # (note: this is a private bucket and needs to be changed) 38 | S3_OUTPUT_PREFIX=s3://commoncrawl-webgraph 39 | S3A_OUTPUT_PREFIX=s3a://commoncrawl-webgraph 40 | 41 | 42 | ################################################################################ 43 | # construct a merged graph of multiple monthly crawls 44 | 45 | MERGE_NAME=cc-main-2025-feb-mar-apr 46 | 47 | # Naming convention should be the three months' crawls that are 48 | # used to generate this graph release. In the event of multiple months 49 | # in a crawl, (e.g August & September, November & December) the first month is 50 | # used (e.g aug-nov). 51 | 52 | # input to construct a merged graph (over multiple months) 53 | # - used in addition to input crawls (see CRAWLS) 54 | # - output directories of hostlinks jobs of prior crawls 55 | # - list of fully-qualified paths: 56 | # ("s3a://.../hostlinks/0/" "s3a://.../hostlinks/1/" ...) 57 | # - ev. copy the data from s3:// to hdfs:// to avoid tasks 58 | # taking long while reading from S3 59 | MERGE_INPUT=() 60 | 61 | 62 | ################################################################################ 63 | # workflow runtime 64 | 65 | # temporary directory 66 | # - must exist on task/compute nodes for buffering data 67 | # - should provide several GBs of free space 68 | TMPDIR=/data/0/tmp 69 | 70 | # where to keep logs for steps 71 | LOGDIR=$PWD 72 | 73 | # file to stop the workflow (stops after a the currently running step(s) are done) 74 | STOP_FILE_=$LOGDIR/$(basename "$0" .sh).stop 75 | 76 | # use Python executable different than default "python" 77 | export PYSPARK_PYTHON=python3 78 | 79 | ################################################################################ 80 | 81 | 82 | ################################################################################ 83 | ### Spark / Yarn cluster configuration 84 | NUM_EXECUTORS=${NUM_EXECUTORS:-16} 85 | EXECUTOR_CONFIG=${EXECUTOR_CONFIG:-"r5.xlarge"} 86 | # NOTE: 87 | # - step 1 (host link extraction) can be run on smaller instances 88 | # or "compute optimized" instance types 89 | # - webgraph construction (esp. 
for merged graphs including multiple monthly crawls) 90 | # needs instances with sufficient amount of RAM (32 GB or more) 91 | # - assigning IDs in multiple partitions 92 | # (see hostlinks_to_graph.py --vertex_partitions) 93 | # reduces the memory requirements significantly 94 | 95 | 96 | case "$EXECUTOR_CONFIG" in 97 | c[5678]*.xlarge ) 98 | EXECUTOR_CORES=3 99 | EXECUTOR_MEM=5g 100 | NODEMANAGER_MEM_MB=$((6*1024)) 101 | ;; 102 | c[5678]*.2xlarge ) 103 | EXECUTOR_CORES=6 104 | EXECUTOR_MEM=10g 105 | NODEMANAGER_MEM_MB=$((11*1024)) 106 | ;; 107 | c[5678]*.4xlarge ) 108 | EXECUTOR_CORES=12 109 | EXECUTOR_MEM=22g 110 | NODEMANAGER_MEM_MB=$((24*1024)) 111 | ;; 112 | r[5678]*.xlarge ) 113 | EXECUTOR_CORES=4 114 | EXECUTOR_MEM=23g 115 | NODEMANAGER_MEM_MB=$((24*1024)) 116 | ;; 117 | r[5678]*.2xlarge ) 118 | EXECUTOR_CORES=7 119 | EXECUTOR_MEM=46g 120 | NODEMANAGER_MEM_MB=$((48*1024)) 121 | ;; 122 | r[5678]*.4xlarge ) 123 | EXECUTOR_CORES=15 124 | EXECUTOR_MEM=94g 125 | NODEMANAGER_MEM_MB=$((96*1024)) 126 | ;; 127 | r[5678]*.8xlarge ) 128 | EXECUTOR_CORES=30 129 | EXECUTOR_MEM=190g 130 | NODEMANAGER_MEM_MB=$((192*1024)) 131 | ;; 132 | m[5678]*.2xlarge ) 133 | EXECUTOR_CORES=8 134 | EXECUTOR_MEM=23g 135 | NODEMANAGER_MEM_MB=$((24*1024)) 136 | ;; 137 | m[5678]*.4xlarge ) 138 | EXECUTOR_CORES=16 139 | EXECUTOR_MEM=46g 140 | NODEMANAGER_MEM_MB=$((48*1024)) 141 | ;; 142 | m[5678]*.8xlarge ) 143 | EXECUTOR_CORES=32 144 | EXECUTOR_MEM=94g 145 | NODEMANAGER_MEM_MB=$((98*1024)) 146 | ;; 147 | "custom" ) 148 | if [ -z "$EXECUTOR_CORES" ] || [ -z "$EXECUTOR_MEM" ]; then 149 | echo "No valid custom executor configuration: must specify EXECUTOR_CORES and EXECUTOR_MEM'" >&2 150 | exit 1 151 | fi 152 | ;; 153 | * ) 154 | echo "No valid executor configuration: '$EXECUTOR_CONFIG'" >&2 155 | exit 1 156 | esac 157 | 158 | SPARK_EXTRA_OPTS="$SPARK_EXTRA_OPTS --conf spark.yarn.nodemanager.resource.memory-mb=$NODEMANAGER_MEM_MB" 159 | 160 | OUTPUT_PARTITIONS=$((NUM_EXECUTORS*EXECUTOR_CORES/2)) 161 | WEBGRAPH_EDGE_PARTITIONS=$((NUM_EXECUTORS*EXECUTOR_CORES/2)) 162 | WEBGRAPH_EDGE_PARTITIONS=$(((WEBGRAPH_EDGE_PARTITIONS " 10 | echo 11 | echo "Build node indexes to interactively explore a Common Crawl webgraph." 12 | echo "The webgraph files are expected to be placed in the current directory." 13 | echo 14 | echo " basename of the graph (without the .graph suffix)" 15 | echo " vertices file name (including the file suffix)" 16 | echo " or directory containing the vertices files" 17 | echo 18 | exit 1 19 | fi 20 | 21 | export LC_ALL=C 22 | 23 | BIN="$(dirname $0)" 24 | WG="$BIN/run_webgraph.sh" 25 | 26 | declare -A suffix_name_map 27 | suffix_name_map=( 28 | graph "webgraph / BVGraph" 29 | properties "webgraph properties" 30 | offsets "webgraph offsets" 31 | iepm "immutable external prefix map" 32 | mph "minimal perfect hash" 33 | fcl "front coded list" 34 | smph "string map perfect hash" 35 | ) 36 | 37 | function list_webgraph_files() { 38 | name="$1"; shift 39 | ok=true 40 | for suffix in "$@"; do 41 | if [ -e $name.$suffix ]; then 42 | printf " .%-10s : %-20s (%s)\n" "$suffix" \ 43 | "${suffix_name_map[$suffix]}" "$name.$suffix" 44 | else 45 | echo -e "Missing $name.$suffix (${suffix_name_map[$suffix]})" 46 | ok=false 47 | fi 48 | done 49 | if ! $ok; then 50 | exit 1 51 | fi 52 | } 53 | 54 | function index_status() { 55 | echo 56 | echo "Prepared webgraph $NAME for look-ups by node label." 
57 | echo "The following files (by file suffix) will be used:" 58 | 59 | echo "Webgraph:" 60 | list_webgraph_files $NAME graph properties offsets 61 | echo "Webgraph (transpose):" 62 | list_webgraph_files $NAME-t graph properties offsets 63 | 64 | echo "Mapping vertex labels to vertex IDs:" 65 | if [ -e $NAME.iepm ]; then 66 | list_webgraph_files $NAME iepm 67 | else 68 | list_webgraph_files $NAME mph fcl smph 69 | fi 70 | } 71 | 72 | 73 | # check for graph files (.graph and .properties), also for the 74 | # transpose of the graph ($NAME-t.$suffix) 75 | echo "Verifying webgraph files:" 76 | list_webgraph_files $NAME graph properties 77 | echo "Verifying webgraph files (transpose of the graph):" 78 | list_webgraph_files $NAME-t graph properties 79 | 80 | # check for the vertices file 81 | if ! [ -e $VERTICES ]; then 82 | echo "Vertices file not found" 83 | exit 1 84 | fi 85 | 86 | 87 | # generate offsets (*.offsets and *.obl) 88 | if ! [ -e $NAME.offsets ]; then 89 | "$WG" it.unimi.dsi.webgraph.BVGraph --offsets --list $NAME 90 | echo "webgraph offsets file created" 91 | fi 92 | if ! [ -e $NAME-t.offsets ]; then 93 | "$WG" it.unimi.dsi.webgraph.BVGraph --offsets --list $NAME-t 94 | echo "webgraph offsets file created (transpose of the graph)" 95 | fi 96 | 97 | 98 | # building `iepm` "immutable external prefix map" 99 | # (https://dsiutils.di.unimi.it/docs/it/unimi/dsi/util/ImmutableExternalPrefixMap.html) 100 | # bidirectional mapping from node names to node IDs 101 | if [ -e $NAME.iepm ]; then 102 | index_status 103 | exit 0 104 | fi 105 | CAT_VERTICES="zcat $VERTICES" 106 | if [ -d $VERTICES ]; then 107 | # host-level webgraph, multiple vertex files 108 | CAT_VERTICES="zcat $VERTICES/*.txt.gz" 109 | fi 110 | if (set -eo pipefail; 111 | eval $CAT_VERTICES \ 112 | | cut -f2 \ 113 | | "$WG" it.unimi.dsi.util.ImmutableExternalPrefixMap --block-size 4Ki $NAME.iepm); then 114 | echo "immutable external prefix map successfully built: $NAME.iepm" 115 | index_status 116 | exit 0 117 | fi 118 | # Note: building the `iepm` may fail for older versions of the domain 119 | # graph (before the graphs of May, June/July and August 2022) because 120 | # the nodes were not properly lexicographically sorted while folding 121 | # host names to domain names. If this is the case, continue to create 122 | # instead mappings which do not depend on proper sorting. 123 | 124 | # build 125 | # - the `mph` (minimal perfect hash) file mapping from node label 126 | # (reversed domain name) to node ID 127 | # - a front coded list to map node IDs to node labels 128 | if ! [ -e $NAME.mph ] || ! [ -e $NAME.fcl ]; then 129 | zcat $VERTICES \ 130 | | cut -f2 \ 131 | | tee >("$WG" it.unimi.dsi.sux4j.mph.GOV4Function $NAME.mph) \ 132 | | "$WG" it.unimi.dsi.util.FrontCodedStringList --utf8 --ratio 32 $NAME.fcl 133 | fi 134 | 135 | # build the `smph` file (string map perfect hash) required to 136 | # determine whether a node label is present in the `mph` file 137 | if ! [ -e $NAME.smph ]; then 138 | zcat $VERTICES \ 139 | | cut -f2 \ 140 | | "$WG" it.unimi.dsi.util.ShiftAddXorSignedStringMap $NAME.mph $NAME.smph 141 | fi 142 | 143 | 144 | index_status 145 | -------------------------------------------------------------------------------- /src/script/webgraph_ranking/graph_explore_download_webgraph.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NAME="$1" 4 | if ! 
shift 1; then 5 | echo "$(basename $0) " 6 | echo 7 | echo "Download all files required to interactively explore a Common Crawl webgraph." 8 | echo "The downloaded files are placed in the current directory." 9 | echo "Wget or curl are required for downloading" 10 | echo 11 | echo " webgraph base name without file suffix, eg. cc-main-2023-mar-may-oct-domain" 12 | echo 13 | exit 1 14 | fi 15 | 16 | export LC_ALL=C 17 | 18 | BIN="$(dirname $0)" 19 | 20 | USING_CURL=false 21 | USING_WGET=false 22 | if command -v curl &>/dev/null; then 23 | USING_CURL=true 24 | elif command -v wget &>/dev/null; then 25 | USING_WGET=true 26 | else 27 | echo "Either curl or wget are required for downloading" >&2 28 | exit 1 29 | fi 30 | 31 | declare -A suffix_name_map 32 | suffix_name_map=( 33 | graph "webgraph / BVGraph" 34 | properties "webgraph properties" 35 | offsets "webgraph offsets" 36 | stats "webgraph statistics" 37 | txt.gz "text file (vertex labels)" 38 | ) 39 | 40 | function list_webgraph_files() { 41 | name="$1"; shift 42 | ok=true 43 | for suffix in "$@"; do 44 | if [ -e $name.$suffix ]; then 45 | printf " .%-10s : %-20s (%s)\n" "$suffix" \ 46 | "${suffix_name_map[$suffix]}" "$name.$suffix" 47 | elif [ -d "$name" ] && [[ "$suffix" =~ ^\*. ]]; then 48 | ls "$name"/* | sed 's/^/\t/' 49 | else 50 | echo -e "Missing $name.$suffix (${suffix_name_map[$suffix]})" 51 | ok=false 52 | fi 53 | done 54 | if ! $ok; then 55 | exit 1 56 | fi 57 | } 58 | 59 | function download_file() { 60 | FILE="$1" 61 | if [ -e "$FILE" ]; then 62 | return # already done 63 | fi 64 | URL="https://data.commoncrawl.org/projects/hyperlinkgraph/$BASE_NAME/$GRAPH_AGGR_LEVEL/$FILE" 65 | echo "Downloading $URL" 66 | 67 | if $USING_CURL; then 68 | 69 | curl --silent --show-error --fail \ 70 | --remote-time -o "$FILE" --time-cond "$FILE" --continue-at - \ 71 | --retry 1000 --retry-delay 1 "$URL" 72 | 73 | elif $USING_WGET; then 74 | 75 | if [ "$(dirname "$FILE")" == "." 
]; then 76 | wget --continue --timestamping --tries=0 --retry-on-http-error=503 --waitretry=1 "$URL" 77 | else 78 | wget --continue --timestamping --directory-prefix="$(dirname "$FILE")" \ 79 | --tries=0 --retry-on-http-error=503 --waitretry=1 "$URL" 80 | fi 81 | 82 | fi 83 | } 84 | 85 | function download_files() { 86 | name="$1"; shift 87 | for suffix in "$@"; do 88 | download_file "$name.$suffix" 89 | done 90 | } 91 | 92 | 93 | BASE_NAME="${NAME%-domain}" 94 | BASE_NAME="${BASE_NAME%-host}" 95 | GRAPH_AGGR_LEVEL="${NAME##*-}" 96 | 97 | 98 | set -e # stop on errors 99 | 100 | download_files "$NAME" graph properties stats 101 | download_files "$NAME-t" graph properties 102 | 103 | if [ "$GRAPH_AGGR_LEVEL" == "domain" ]; then 104 | download_files "$NAME-vertices" txt.gz 105 | else 106 | download_files "$NAME-vertices" paths.gz 107 | zcat "$NAME-vertices".paths.gz \ 108 | | while read path; do 109 | file=${path#projects/hyperlinkgraph/$BASE_NAME/$GRAPH_AGGR_LEVEL/} 110 | mkdir -p $(dirname "$file") 111 | download_file "$file" 112 | done 113 | fi 114 | 115 | echo "Downloaded files" 116 | echo "- webgraph" 117 | list_webgraph_files $NAME graph properties stats 118 | echo "- webgraph (transpose)" 119 | list_webgraph_files $NAME-t graph properties 120 | echo "- webgraph vertices" 121 | if [ "$GRAPH_AGGR_LEVEL" == "domain" ]; then 122 | list_webgraph_files $NAME-vertices txt.gz 123 | else 124 | list_webgraph_files vertices "*.txt.gz" 125 | fi 126 | -------------------------------------------------------------------------------- /src/script/webgraph_ranking/graph_explore_load_graph.jsh: -------------------------------------------------------------------------------- 1 | /open PRINTING 2 | 3 | String graph = System.getProperty("graph") 4 | println("Loading graph " + graph) 5 | 6 | import org.commoncrawl.webgraph.explore.Graph 7 | import org.commoncrawl.webgraph.explore.GraphExplorer 8 | import it.unimi.dsi.webgraph.ImmutableGraph 9 | 10 | GraphExplorer e = new GraphExplorer(graph) 11 | Graph g = e.getGraph() 12 | 13 | println() 14 | println("Graph " + graph + " loaded into GraphExplorer *e*") 15 | println("Type \"e.\" and press to list the public methods of the class GraphExplorer") 16 | println("... or \"g.\" for the graph loaded for exploration") 17 | 18 | /* Define commands provided by pywebgraph (cn, pwn, ls, sl) */ 19 | void cn(String vertexLabel) { e.cn(vertexLabel); } 20 | void cn(long vertexID) { e.cn(vertexID); } 21 | void pwn() { e.pwn(); } 22 | void ls() { e.ls(); } 23 | void ls(long vertexId) { e.ls(vertexId); } 24 | void ls(String vertexLabel) { e.ls(vertexLabel); } 25 | void sl() { e.sl(); } 26 | void sl(long vertexId) { e.sl(vertexId); } 27 | void sl(String vertexLabel) { e.sl(vertexLabel); } 28 | 29 | println() 30 | println("... or use one of the predefined methods:") 31 | /methods cn pwn ls sl 32 | println() -------------------------------------------------------------------------------- /src/script/webgraph_ranking/process_webgraph.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NAME="$1" 4 | VERTICES="$2" 5 | EDGES="$3" 6 | if ! shift 3; then 7 | echo "$(basename $0) []" 8 | exit 1 9 | fi 10 | 11 | if ! [[ "$NAME" =~ ^[a-zA-Z0-9_][a-zA-Z0-9_.-]+[a-zA-Z0-9_]$ ]]; then 12 | echo "Graph should only contain [a-zA-Z0-9_.-] and start and end with [a-zA-Z0-9_]." 13 | echo "The graph name '$NAME' might not be safe as a graph base name (without suffix)" 14 | echo "or directory name to place the graph files into. Exiting..." 
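# (Hypothetical invocation, for illustration only: the script expects a graph
#  base name, a vertices file, an edges file or directory, and optionally an
#  output directory, e.g.
#    ./process_webgraph.sh cc-main-2025-feb-mar-apr-domain vertices.txt.gz edges/ cc-main-2025-feb-mar-apr
#  Names containing white space or starting/ending with '.' or '-' fail the
#  check above.)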
15 | exit 1 16 | fi 17 | 18 | OUTPUTDIR="$NAME" 19 | if [ -n "$1" ]; then 20 | OUTPUTDIR="$1" 21 | shift 22 | case "$OUTPUTDIR" in 23 | *" "* ) 24 | echo "The output directory must not contain white space. Exiting..." 25 | exit 1 26 | ;; 27 | esac 28 | fi 29 | FULLNAME="$OUTPUTDIR/$NAME" 30 | 31 | 32 | set -e # fail if creation of output directory fails 33 | 34 | if [ -d "$OUTPUTDIR" ]; then 35 | echo "Output directory $OUTPUTDIR/ exists" 36 | else 37 | mkdir "$OUTPUTDIR" 38 | fi 39 | 40 | export LC_ALL=C 41 | 42 | BIN=$(dirname $0) 43 | WG=$BIN/run_webgraph.sh 44 | LW=$BIN/run_webgraph.sh 45 | 46 | source $BIN/../workflow_lib.sh 47 | source $BIN/webgraph_config.sh 48 | 49 | 50 | if ! ${USE_WEBGRAPH_BIG:-false} && [ $GRAPH_SIZE_NODES -gt $((0x7ffffff7)) ]; then 51 | echo "Graph has more nodes than max. array size in Java" 52 | echo "Using big version of webgraph framework" 53 | USE_WEBGRAPH_BIG=true 54 | fi 55 | if ${USE_WEBGRAPH_BIG:-false}; then 56 | WGP=it.unimi.dsi.big.webgraph 57 | else 58 | WGP=it.unimi.dsi.webgraph 59 | fi 60 | 61 | 62 | # logging 63 | test -d $OUTPUTDIR/logs || mkdir $OUTPUTDIR/logs 64 | LOGDIR=$OUTPUTDIR/logs 65 | # file to stop workflow 66 | STOP_FILE_=$LOGDIR/$(basename $0 .sh).stop 67 | 68 | function join_rank() ( 69 | set -exo pipefail 70 | _DATA_TYPE=$1 71 | _IN=$2 72 | _VERT=$3 73 | _OUT=$4 74 | _EXTRA_FIELDS="" 75 | if [ -n "$5" ]; then 76 | _EXTRA_FIELDS=",$5" 77 | fi 78 | 79 | 80 | if [ -d $_VERT ]; then 81 | # _VERT is a directory with multiple vertices files 82 | _VERT="${_VERT}/*.gz" 83 | fi 84 | 85 | ### unpack scores with LAW, join node names via paste, 86 | ### assign ranks on sorted lines by nl 87 | $LW it.unimi.dsi.law.io.tool.DataInput2Text --type $_DATA_TYPE $_IN - \ 88 | | paste - <(zcat $_VERT | cut -f2$_EXTRA_FIELDS) \ 89 | | sort --batch-size=$SORT_BATCHES --buffer-size=$SORT_BUFFER_SIZE --compress-program=gzip -t$'\t' -k1,1gr --stable \ 90 | | nl -w1 -nln \ 91 | | gzip >$_OUT 92 | ) 93 | 94 | function join_harmonicc_pagerank() ( 95 | set -exo pipefail 96 | NAME="$1" 97 | _IN_HC="$2" 98 | _IN_PR="$3" 99 | _OUT="$4" 100 | _EXTRA_FIELDS="" 101 | HEADER="#harmonicc_pos\t#harmonicc_val\t#pr_pos\t#pr_val\t#host_rev" 102 | if [ -n "$5" ]; then 103 | _EXTRA_FIELDS=",$5" 104 | HEADER="$HEADER\t$6" 105 | fi 106 | SORTOPTS="$SORT_PARALLEL_THREADS_OPT --batch-size=$SORT_BATCHES --buffer-size=$SORT_BUFFER_SIZE --compress-program=gzip" 107 | (echo -e "$HEADER"; 108 | zcat $_IN_HC | sort $SORTOPTS -t$'\t' -k3,3 --unique --stable \ 109 | | join -a1 -a2 -e'---' -t$'\t' -j3 -o1.1,1.2,2.1,2.2,0$_EXTRA_FIELDS - \ 110 | <(zcat $_IN_PR | sort $SORTOPTS -t$'\t' -k3,3 --unique --stable) \ 111 | | sort $SORTOPTS -t$'\t' -k1,1n -s) \ 112 | | gzip >$_OUT 113 | ) 114 | 115 | function join_ranks_in_memory() ( 116 | set -exo pipefail 117 | _VERT="$1" 118 | _HC="$2" 119 | _PR="$3" 120 | _OUT="$4" 121 | HEADER="#harmonicc_pos\t#harmonicc_val\t#pr_pos\t#pr_val\t#host_rev" 122 | if [ -n "$5" ]; then 123 | HEADER="${HEADER}\t$5" 124 | fi 125 | if [ -d $_VERT ]; then 126 | # _VERT is a directory with multiple vertices files 127 | _VERT="$_VERT/*.gz" 128 | fi 129 | OPTS="" 130 | # heuristics to set Java heap memory 131 | # bytes required per node (in theory, 60% more in practice) 132 | BYTES_MEM_REQUIRED=24 133 | if $USE_WEBGRAPH_BIG; then 134 | OPTS="--big" 135 | BYTES_MEM_REQUIRED=36 136 | fi 137 | BYTES_MEM_REQUIRED=$(($BYTES_MEM_REQUIRED*$GRAPH_SIZE_NODES*16/10)) 138 | JAVA_HEAP_GB=$((($BYTES_MEM_REQUIRED/2**30)+1)) 139 | JAVAOPTS="-Xmx${JAVA_HEAP_GB}g" 140 | 
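# (Worked example of the heap heuristic above, for illustration: with the
#  default GRAPH_SIZE_NODES of 67108864 (64 Mi nodes) and the standard
#  webgraph classes, BYTES_MEM_REQUIRED = 24 * 67108864 * 16 / 10
#  = 2576980377 bytes (~2.4 GiB), hence JAVA_HEAP_GB = 2 + 1 = 3 and
#  JAVAOPTS becomes "-Xmx3g". With --big (36 bytes per node) the same
#  graph size yields "-Xmx4g".)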
SORTOPTS="$SORT_PARALLEL_THREADS_OPT --batch-size=$SORT_BATCHES --buffer-size=$SORT_BUFFER_SIZE --compress-program=gzip" 141 | (echo -e "$HEADER"; 142 | JAVA_OPTS=$JAVA_OPTS $WG org.commoncrawl.webgraph.JoinSortRanks $OPTS <(zcat $_VERT) $_HC $_PR -) \ 143 | | sort $SORTOPTS -t$'\t' -k1,1n --stable | gzip >$_OUT 144 | ) 145 | 146 | function join_degrees() ( 147 | set -exo pipefail 148 | _FULLNAME="$1" 149 | _VERT="$2" 150 | HEADER="#outdegree\t#indegree\t#host_rev" 151 | if [ -n "$3" ]; then 152 | HEADER="$HEADER\t$3" 153 | fi 154 | if [ -d $_VERT ]; then 155 | # _VERT is a directory with multiple vertices files 156 | _VERT="$_VERT/*.gz" 157 | fi 158 | zcat $_VERT \ 159 | | cut -f2- \ 160 | | paste $FULLNAME.outdegrees $FULLNAME.indegrees - \ 161 | | gzip >$FULLNAME-outdegrees-indegrees.txt.gz 162 | # top-N out/indegrees 163 | (echo -e "$HEADER"; 164 | set +o pipefail; 165 | zcat $FULLNAME-outdegrees-indegrees.txt.gz \ 166 | | perl -aF'\t' -lne 'print if $F[0] > 1000' \ 167 | | sort -k1,1nr \ 168 | | head -10000) \ 169 | | gzip >$FULLNAME-outdegrees-indegrees-topout.txt.gz 170 | (echo -e "$HEADER"; 171 | set +o pipefail; 172 | zcat $FULLNAME-outdegrees-indegrees.txt.gz \ 173 | | perl -aF'\t' -lne 'print if $F[1] > 1000' \ 174 | | sort -k2,2nr \ 175 | | head -10000) \ 176 | | gzip >$FULLNAME-outdegrees-indegrees-topin.txt.gz 177 | ) 178 | 179 | function connected_distrib() ( 180 | set -exo pipefail 181 | NUM_NODES=$1 182 | INPUT=$2 183 | OUTPUT=$3 184 | (echo -e "#freq\t#size\t#perc"; \ 185 | $LW it.unimi.dsi.law.io.tool.DataInput2Text --type int $INPUT - \ 186 | | perl -lne '$h{$_}++; END { while (($k,$v)=each %h) { print sprintf("%d\t%d\t%9.6f%%", $v, $k, 100*$k*$v/'$NUM_NODES') } }' \ 187 | | sort -k2,2nr) \ 188 | | gzip >$OUTPUT 189 | ) 190 | 191 | function degree_distrib() ( 192 | set -exo pipefail 193 | TYPE="$1" 194 | NAME="$2" 195 | (echo -e "#arcs\t#nodes"; 196 | perl -lne 'print sprintf("%d\t%s", ($.-1), $_) if $_ ne 0' $NAME.$TYPE) \ 197 | | gzip >$FULLNAME-$TYPE-distrib.txt.gz 198 | ) 199 | 200 | 201 | 202 | set -exo pipefail 203 | 204 | if [ -d $EDGES ]; then 205 | # edges is a directory with multiple files 206 | sort_input="" 207 | for e in $EDGES/part-*.gz; do 208 | sort_input="$sort_input <(zcat $e)" 209 | done 210 | if ${USE_WEBGRAPH_BIG:-false}; then 211 | ## TODO: 212 | ## * option --threads not available in webgraph-big 213 | ## * need to load from stdin 214 | ## (fails to read longs when reading BVGraph from file) 215 | ## Caused by: java.lang.IllegalArgumentException: 4635383979 216 | ## at it.unimi.dsi.big.webgraph.ImmutableGraph$BigImmutableGraphAdapter.check(ImmutableGraph.java:801) 217 | ## at it.unimi.dsi.big.webgraph.ImmutableGraph$BigImmutableGraphAdapter.access$200(ImmutableGraph.java:793) 218 | ## at it.unimi.dsi.big.webgraph.ImmutableGraph$BigImmutableGraphAdapter$1$1.nextInt(ImmutableGraph.java:832) 219 | ## at it.unimi.dsi.webgraph.LazyIntIterators.unwrap(LazyIntIterators.java:51) 220 | ## at it.unimi.dsi.webgraph.NodeIterator.successorArray(NodeIterator.java:70) 221 | ## at it.unimi.dsi.webgraph.ArrayListMutableGraph.(ArrayListMutableGraph.java:114) 222 | ## at it.unimi.dsi.big.webgraph.ArcListASCIIGraph.load(ArcListASCIIGraph.java:283) 223 | ## at it.unimi.dsi.big.webgraph.ArcListASCIIGraph.load(ArcListASCIIGraph.java:279) 224 | ## at it.unimi.dsi.big.webgraph.ArcListASCIIGraph.loadOffline(ArcListASCIIGraph.java:255) 225 | _step bvgraph \ 226 | bash -c "eval \"sort --batch-size=$SORT_BATCHES -t$'\t' -k1,1n -k2,2n --stable --merge $sort_input\" | $WG 
$WGP.BVGraph --once -g $WGP.ArcListASCIIGraph - $FULLNAME" 227 | else 228 | _step bvgraph \ 229 | bash -c "$WG $WGP.BVGraph --threads $THREADS -g $WGP.ArcListASCIIGraph <(eval \"sort --batch-size=$SORT_BATCHES -t$'\t' -k1,1n -k2,2n --stable --merge $sort_input\") $FULLNAME" 230 | fi 231 | else 232 | if ${USE_WEBGRAPH_BIG:-false}; then 233 | _step bvgraph \ 234 | bash -c "zcat $EDGES | $WG $WGP.BVGraph --once -g $WGP.ArcListASCIIGraph - $FULLNAME" 235 | else 236 | _step bvgraph \ 237 | $WG $WGP.BVGraph --threads $THREADS -g $WGP.ArcListASCIIGraph <(zcat $EDGES) $FULLNAME 238 | fi 239 | fi 240 | 241 | if ${USE_WEBGRAPH_BIG:-false}; then 242 | _step transpose \ 243 | $WG $WGP.Transform transposeOffline $FULLNAME $FULLNAME-t 244 | else 245 | # if low memory, add 246 | # --offline, combined with 247 | # -Djava.io.tmpdir=... to point to a temporary directory with free space 2 times the graph size 248 | # (see also run_webgraph.sh) 249 | _step transpose \ 250 | $WG $WGP.Transform transpose $FULLNAME $FULLNAME-t 251 | fi 252 | # _step symmetrize \ 253 | # $WG $WGP.Transform symmetrize $FULLNAME $FULLNAME-t $FULLNAME-sym 254 | 255 | _step hyperball \ 256 | $WG $WGP.algo.HyperBall --threads $THREADS --offline --log2m $HYPERBALL_REGISTERS \ 257 | --harmonic-centrality $FULLNAME-harmonicc.bin $FULLNAME-t $FULLNAME 258 | 259 | if ${USE_WEBGRAPH_BIG:-false}; then 260 | _step pagerank \ 261 | $LW it.unimi.dsi.law.big.rank.PageRankParallelGaussSeidel --mapped --threads $THREADS $FULLNAME-t $FULLNAME-pagerank 262 | else 263 | _step pagerank \ 264 | $LW it.unimi.dsi.law.rank.PageRankParallelGaussSeidel --expand --mapped --threads $THREADS $FULLNAME-t $FULLNAME-pagerank 265 | fi 266 | 267 | _step_bg connected 15 \ 268 | $WG $WGP.algo.ConnectedComponents --threads $THREADS -m --renumber --sizes -t $FULLNAME-t $FULLNAME 269 | connected_pid=$! 270 | _step_bg strongly_connected 15 \ 271 | $WG $WGP.algo.StronglyConnectedComponents --renumber --sizes $FULLNAME 272 | strongly_connected_pid=$! 273 | 274 | EXTRA_FIELDS="" 275 | EXTRA_FIELDS_JOIN="" 276 | EXTRA_FIELDS_HEADER="" 277 | if [ $VERTICES_FIELDS -gt 2 ]; then 278 | EXTRA_FIELDS="3-$VERTICES_FIELDS" 279 | EXTRA_FIELDS_JOIN="1.4" 280 | EXTRA_FIELDS_HEADER="#n_hosts" 281 | for i in $(seq 4 $VERTICES_FIELDS); do 282 | EXTRA_FIELDS_JOIN="${EXTRA_FIELDS_JOIN},1.$(($i+1))" 283 | done 284 | fi 285 | 286 | if ${JOIN_RANKS_IN_MEMORY}; then 287 | _step_bg join_ranks 15 \ 288 | join_ranks_in_memory $VERTICES $FULLNAME-harmonicc.bin $FULLNAME-pagerank.ranks $FULLNAME-ranks.txt.gz "$EXTRA_FIELDS_HEADER" 289 | else 290 | _step_bg join_harmonicc 15 \ 291 | join_rank float $FULLNAME-harmonicc.bin $VERTICES $FULLNAME-harmonic-centrality.txt.gz "$EXTRA_FIELDS" 292 | _step_bg join_pr_gs 15 \ 293 | join_rank double $FULLNAME-pagerank.ranks $VERTICES $FULLNAME-pagerank.txt.gz 294 | wait # until background processes are finished 295 | # join ranks into one file 296 | _step_bg join_harmonicc_pagerank 60 \ 297 | join_harmonicc_pagerank $NAME $FULLNAME-harmonic-centrality.txt.gz $FULLNAME-pagerank.txt.gz $FULLNAME-ranks.txt.gz "$EXTRA_FIELDS_JOIN" "$EXTRA_FIELDS_HEADER" 298 | fi 299 | 300 | # stats use connected components files, wait for these to be finished 301 | if ! kill -0 $connected_pid; then 302 | : # step connected already finished 303 | else 304 | wait $connected_pid 305 | fi 306 | if ! 
kill -0 $strongly_connected_pid; then 307 | : # step strongly_connected already finished 308 | else 309 | wait $strongly_connected_pid 310 | fi 311 | 312 | _step stats \ 313 | $WG $WGP.Stats --save-degrees $FULLNAME 314 | 315 | _step_bg join_degrees 15 \ 316 | join_degrees $FULLNAME $VERTICES "$EXTRA_FIELDS_HEADER" 317 | 318 | NODES=$(perl -lne 'print if s@^nodes=@@' $FULLNAME.stats) 319 | _step connected_distrib \ 320 | connected_distrib $NODES $FULLNAME.wccsizes $FULLNAME-connected-components-distrib.txt.gz 321 | # it.unimi.dsi.webgraph.Stats writes *.sccdistr (but there is no *.wccdistr) 322 | # _step strongly_connected_distrib \ 323 | # connected_distrib $NODES $FULLNAME.sccsizes $FULLNAME-strongly-connected-components-distrib.txt.gz 324 | 325 | _step indegree_distrib \ 326 | degree_distrib indegree $FULLNAME 327 | _step outdegree_distrib \ 328 | degree_distrib outdegree $FULLNAME 329 | 330 | wait # until background processes are finished 331 | -------------------------------------------------------------------------------- /src/script/webgraph_ranking/process_webgraph_degrees.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eo pipefail 4 | 5 | NAME="$1" 6 | TYPE="${2:-domain}" 7 | 8 | if [ -z "$NAME" ]; then 9 | echo "Usage: $(basename $0) []" 10 | echo -e "\tgraph-name\tbase name of the webgraph (without the file suffix .graph)" 11 | echo -e "\ttype\ttype (level) of the graph aggregation: domain (default) or host" 12 | exit 1 13 | fi 14 | 15 | WG=$(dirname $0)/run_webgraph.sh 16 | 17 | if [ -e $NAME.outdegrees ] && [ -e $NAME.indegrees ]; then 18 | : # out/indegrees already done 19 | else 20 | $WG it.unimi.dsi.webgraph.Stats --save-degrees "$NAME" 21 | fi 22 | 23 | 24 | if [ "$TYPE" == "domain" ]; then 25 | zcat $NAME-vertices.txt.gz 26 | else 27 | zcat vertices/*.txt.gz 28 | fi \ 29 | | cut -f2- \ 30 | | paste $NAME.outdegrees $NAME.indegrees - \ 31 | | gzip >$NAME-outdegrees-indegrees.txt.gz 32 | 33 | 34 | HEADER="outdegree\tindegree\tname" 35 | if [ "$TYPE" == "domain" ]; then 36 | HEADER="outdegree\tindegree\tname\tnumsubdomains" 37 | fi 38 | 39 | (echo -e "$HEADER"; 40 | set +o pipefail; 41 | zcat $NAME-outdegrees-indegrees.txt.gz \ 42 | | perl -aF'\t' -lne 'print if $F[0] > 1000' \ 43 | | sort -k1,1nr \ 44 | | head -10000) \ 45 | | gzip >$NAME-outdegrees-indegrees-topout.txt.gz 46 | 47 | (echo -e "$HEADER"; 48 | set +o pipefail; 49 | zcat $NAME-outdegrees-indegrees.txt.gz \ 50 | | perl -aF'\t' -lne 'print if $F[1] > 1000' \ 51 | | sort -k2,2nr \ 52 | | head -10000) \ 53 | | gzip >$NAME-outdegrees-indegrees-topin.txt.gz 54 | 55 | -------------------------------------------------------------------------------- /src/script/webgraph_ranking/run_webgraph.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export LC_ALL=C 4 | 5 | source "$(dirname $0)"/webgraph_config.sh 6 | 7 | CC_WEBGRAPH_JAR="${CC_WEBGRAPH_JAR:-$(dirname $0)/../../../target/cc-webgraph-0.1-SNAPSHOT-jar-with-dependencies.jar}" 8 | if ! [ -e $CC_WEBGRAPH_JAR ]; then 9 | echo "Jar file $CC_WEBGRAPH_JAR not found" 10 | echo "Java project needs to be build by running" 11 | echo " mvn package" 12 | exit 1 13 | fi 14 | 15 | _CLASSPATH="$CC_WEBGRAPH_JAR" 16 | if [ -n "$CLASSPATH" ]; then 17 | _CLASSPATH=$CLASSPATH:$_CLASSPATH 18 | fi 19 | 20 | if ! 
echo "$JAVA_OPTS" | grep -qE -e "-Xmx[0-9]+"; then 21 | # heuristics to run webgraph with 80% of available RAM (or all RAM - 8 GB if this is larger) 22 | MEMMB=$(free -m | perl -ne 'do { $p80 = int($1*.8); $a8 = int($1-8192); $m = $p80; $m = $a8 if $a8 > $p80; print $m; last } if /(\d+)/') 23 | JAVA_OPTS="$JAVA_OPTS -Xmx${MEMMB}m" 24 | fi 25 | 26 | if [ -n "$TMPDIR" ]; then 27 | JAVA_OPTS="$JAVA_OPTS -Djava.io.tmpdir=$TMPDIR" 28 | fi 29 | 30 | case "$1" in 31 | it.unimi.dsi.webgraph.algo.HyperBall \ 32 | | it.unimi.dsi.big.webgraph.algo.HyperBall \ 33 | | it.unimi.dsi.law.rank.PageRankParallelGaussSeidel \ 34 | | it.unimi.dsi.big.law.rank.PageRankParallelGaussSeidel ) 35 | # Java options for HyperBall, recommended in 36 | # https://webgraph.di.unimi.it/docs/it/unimi/dsi/webgraph/algo/HyperBall.html 37 | JAVA_OPTS="$JAVA_OPTS -server -Xss256K -XX:PretenureSizeThreshold=512M -XX:MaxNewSize=$(($MEMMB/3))m \ 38 | -XX:+UseNUMA -XX:+UseTLAB -XX:+ResizeTLAB" 39 | ;; 40 | esac 41 | 42 | set -x 43 | time $JAVA_HOME/bin/java $JAVA_OPTS -cp $_CLASSPATH "$@" 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /src/script/webgraph_ranking/webgraph_config.sh: -------------------------------------------------------------------------------- 1 | # configuration to process web graphs using the webgraph framework 2 | 3 | # size of the graph (default: 64 million nodes) 4 | # - no exact size is needed, just to estimate the required Java heap space 5 | GRAPH_SIZE_NODES=${GRAPH_SIZE_NODES:-67108864} 6 | 7 | # for big graphs with more than 2^31 nodes/vertices 8 | USE_WEBGRAPH_BIG=${USE_WEBGRAPH_BIG:-false} 9 | 10 | # join node names and ranks in memory 11 | JOIN_RANKS_IN_MEMORY=${JOIN_RANKS_IN_MEMORY:-true} 12 | 13 | 14 | # number of registers used for Hyperball / harmonic centrality calculation 15 | # 16 | # The number of Hyperball registers depend on 17 | # - the size of the machine (here EC2 instance) 18 | # - and of the graph to be processed 19 | # => it's an empirically determined value and 20 | # possibly needs to be adjusted 21 | # It can be overridden by the environment variable 22 | # HYPERBALL_REGISTERS, see below. 23 | HYP_REG=12 24 | ## on r8.24.xlarge (768 GB, 96 CPUs) 25 | #HYP_REG=10 (host-level graph, 300M nodes) 26 | #HYP_REG=12 (domain-level graph, 130M nodes) 27 | 28 | HYPERBALL_REGISTERS=${HYPERBALL_REGISTERS:-$HYP_REG} 29 | 30 | # number of threads 31 | # THREAD=0 : let the webgraph tools decide how many threads, 32 | # given the available CPU cores, using 33 | # java.lang.Runtime.availableProcessors() 34 | THREADS=${THREADS:-0} 35 | 36 | 37 | 38 | # number of fields in vertices file(s) 39 | # (default: 2) 40 | # 41 | # (if 3, for domain graphs) 42 | # 43 | VERTICES_FIELDS=${VERTICES_FIELDS:-2} 44 | 45 | 46 | # threads and buffer size used for sorting 47 | export SORT_PARALLEL_THREADS_OPT="" 48 | if echo -e "b\na\nc" | sort --parallel=2 >/dev/null; then 49 | echo "The sort command supports parallel sort threads" >&2 50 | SORT_PARALLEL_THREADS_OPT="--parallel=$((($THREADS > 4) ? ($THREADS/2) : 2))" 51 | fi 52 | 53 | # take 20% of main memory, at least 1 GB, for sorting "chunks" 54 | MEM_20PERC=$(free -g | perl -ne 'do { print 1+int($1*.2), "g"; last } if /(\d+)/') 55 | export SORT_BUFFER_SIZE=${SORT_BUFFER_SIZE:-$MEM_20PERC} 56 | 57 | # max. 
number of merge inputs 58 | # (should be not less than number of vertices / edges files to be merged) 59 | export SORT_BATCHES=${SORT_BATCHES:-240} 60 | 61 | -------------------------------------------------------------------------------- /src/script/workflow_lib.sh: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # Copyright (C) 2022 Common Crawl and contributors 3 | 4 | ### functions used to run webgraph... workflow 5 | 6 | function LOG__() { 7 | echo $(date '+[%Y-%m-%d %H:%M:%S]') "$@" 8 | } 9 | 10 | function _test_step() { 11 | if [ -n "$STOP_FILE_" ] && [ -e "$STOP_FILE_" ]; then 12 | LOG__ INFO "Found stop file: $STOP_FILE_" 13 | exit 0 14 | fi 15 | _STEP__="$1"; shift 16 | if [ -e "$LOGDIR"/"$_STEP__".log ] \ 17 | || [ -e "$LOGDIR"/"$_STEP__".log.xz ] \ 18 | || [ -e "$LOGDIR"/"$_STEP__".log.gz ] \ 19 | || [ -e "$LOGDIR"/"$_STEP__".log.bz2 ]; then 20 | LOG__ INFO "Step $_STEP__ already done, $LOGDIR/$_STEP__.log exists" 21 | return 1 22 | fi 23 | return 0 24 | } 25 | 26 | function _step() { 27 | _STEP__="$1"; shift 28 | if _test_step "$_STEP__"; then 29 | LOG__ INFO "Running step $_STEP__ ..." 30 | if "$@" &>"$LOGDIR"/"$_STEP__".log; then 31 | LOG__ INFO "Step $_STEP__ succeeded." 32 | else 33 | RES=$? 34 | LOG__ ERROR "Step $_STEP__ failed with $RES" 35 | mv "$LOGDIR"/"$_STEP__".log "$LOGDIR"/"$_STEP__".failed.$(date +%Y-%m-%d-%H-%M-%S).log 36 | LOG__ ERROR "Exiting ..." 37 | exit $RES 38 | fi 39 | fi 40 | } 41 | 42 | function _step_bg() { 43 | _STEP__="$1" 44 | _SLEEP_="$2" 45 | shift 2 46 | LOG__ INFO "Running background step $_STEP__ ..." 47 | if ! [ "$_SLEEP_" -eq "$_SLEEP_" ] 2>/dev/null; then 48 | echo "_step_bg ..." 49 | echo " parameter must be an integer" 50 | echo " (sleep seconds after launching command, before executing next step)" 51 | exit 1 52 | fi 53 | if _test_step "$_STEP__"; then 54 | _step "$_STEP__" "$@" & 55 | sleep $_SLEEP_ 56 | fi 57 | } 58 | 59 | -------------------------------------------------------------------------------- /src/test/java/org/commoncrawl/webgraph/TestCountingMergedIntIterator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2022 Common Crawl and contributors 4 | */ 5 | package org.commoncrawl.webgraph; 6 | 7 | import static org.junit.jupiter.api.Assertions.assertEquals; 8 | import static org.junit.jupiter.api.Assertions.assertFalse; 9 | import static org.junit.jupiter.api.Assertions.assertTrue; 10 | 11 | import java.util.Arrays; 12 | 13 | import org.junit.jupiter.api.Test; 14 | import org.slf4j.Logger; 15 | import org.slf4j.LoggerFactory; 16 | 17 | import it.unimi.dsi.webgraph.LazyIntIterator; 18 | import it.unimi.dsi.webgraph.LazyIntIterators; 19 | 20 | public class TestCountingMergedIntIterator { 21 | 22 | protected static Logger LOG = LoggerFactory.getLogger(TestCountingMergedIntIterator.class); 23 | 24 | @Test 25 | void testSimple() { 26 | CountingMergedIntIterator iter = new CountingMergedIntIterator(LazyIntIterators.EMPTY_ITERATOR); 27 | assertFalse(iter.hasNext()); 28 | 29 | int[][][] testArrays = { // 30 | {{0, 1}}, // 31 | {{0}, {1}}, // 32 | {{1}, {0}}, // 33 | {{1}, {0}, {}}, // 34 | {{1}, {0}, {}, {0}, {0}}, // 35 | {{1}, {0}, {}, {0}, {0, 1}}, // 36 | // tests for input arrays with repeating numbers 37 | {{1, 1}, {0, 0}, {}, {0, 0}, {0, 0}}, // 38 | {{1, 1}, {0, 0}, {}, {0}, {0, 1}} // 39 | }; 40 | 41 | for (int[][] tArrays : testArrays) { 42 | 
LazyIntIterator[] tIters = new LazyIntIterator[tArrays.length]; 43 | int totalCountExpected = 0; 44 | for (int i = 0; i < tArrays.length; i++) { 45 | tIters[i] = LazyIntIterators.wrap(tArrays[i]); 46 | totalCountExpected += tArrays[i].length; 47 | } 48 | int totalCount = 0; 49 | iter = new CountingMergedIntIterator(tIters); 50 | assertTrue(iter.hasNext()); 51 | 52 | assertEquals(0, iter.nextInt()); 53 | assertTrue(iter.getCount() > 0); 54 | totalCount += iter.getCount(); 55 | assertTrue(iter.hasNext()); 56 | assertEquals(1, iter.nextInt()); 57 | assertTrue(iter.getCount() > 0); 58 | totalCount += iter.getCount(); 59 | assertFalse(iter.hasNext()); 60 | assertEquals(totalCountExpected, totalCount, 61 | "expected total count for input " + Arrays.deepToString(tArrays) + " is " + totalCountExpected); 62 | } 63 | 64 | // test skip(n) 65 | for (int n = 0; n <= 5; n++) { 66 | for (int[][] tArrays : testArrays) { 67 | LazyIntIterator[] tIters = new LazyIntIterator[tArrays.length]; 68 | for (int i = 0; i < tArrays.length; i++) { 69 | tIters[i] = LazyIntIterators.wrap(tArrays[i]); 70 | } 71 | iter = new CountingMergedIntIterator(tIters); 72 | assertEquals(Math.min(n, 2), iter.skip(n)); 73 | } 74 | } 75 | } 76 | 77 | } 78 | -------------------------------------------------------------------------------- /src/test/java/org/commoncrawl/webgraph/TestHostToDomainGraph.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2022 Common Crawl and contributors 4 | */ 5 | package org.commoncrawl.webgraph; 6 | 7 | import static org.junit.jupiter.api.Assertions.assertArrayEquals; 8 | import static org.junit.jupiter.api.Assertions.assertTrue; 9 | import static org.junit.jupiter.api.Assertions.fail; 10 | 11 | import java.io.ByteArrayOutputStream; 12 | import java.io.PrintStream; 13 | import java.nio.charset.StandardCharsets; 14 | import java.util.Arrays; 15 | 16 | import org.commoncrawl.webgraph.HostToDomainGraph.Domain; 17 | import org.junit.jupiter.api.BeforeEach; 18 | import org.junit.jupiter.api.Test; 19 | import org.slf4j.Logger; 20 | import org.slf4j.LoggerFactory; 21 | 22 | class TestHostToDomainGraph { 23 | 24 | protected static Logger LOG = LoggerFactory.getLogger(TestHostToDomainGraph.class); 25 | 26 | static final int maxGraphNodes = 128; 27 | 28 | HostToDomainGraph converter; 29 | 30 | String[] hostGraphSimple = { // 31 | "0\tcom.example", // 32 | "1\tcom.example.www,", // 33 | "2\tcom.example.xyz,", // 34 | "3\torg.example" // 35 | }; 36 | String[] domainGraphSimple = { // 37 | "0\tcom.example\t3", // 38 | "1\torg.example\t1" // 39 | }; 40 | 41 | String[] hostGraphNamesNotSorted = { // 42 | "0\tcom.example", // 43 | "1\tcom.example.xyz,", // 44 | "2\tcom.example.www,", // 45 | "3\torg.example" // 46 | }; 47 | 48 | String[] hostGraphHyphenatedDomains = { // 49 | "0\tac.e-bike", // 50 | "1\tac.e-bikes", // 51 | "2\tac.e-com", // 52 | "3\tac.e.subdomain", // 53 | "4\tac.eagle", // 54 | "5\tac.gov", // domain name public suffix only 55 | "6\tac.gov.ascension", // 56 | "7\tac.gov.ascension-island", // 57 | "8\tac.gov.ascension.mail", // 58 | "9\tac.gov.conservation-ascension-island", // 59 | "10\tac.gov.postoffice", // 60 | }; 61 | String[] domainGraphHyphenatedDomains = { // 62 | "0\tac.e\t1", // 63 | "1\tac.e-bike\t1", // 64 | "2\tac.e-bikes\t1", // 65 | "3\tac.e-com\t1", // 66 | "4\tac.eagle\t1", // 67 | "5\tac.gov.ascension\t2", // 68 | "6\tac.gov.ascension-island\t1", // 69 | 
"7\tac.gov.conservation-ascension-island\t1", // 70 | "8\tac.gov.postoffice\t1", // 71 | }; 72 | String[] domainGraphHyphenatedDomainsInclMultiPartSuffixes = { // 73 | "0\tac.e\t1", // 74 | "1\tac.e-bike\t1", // 75 | "2\tac.e-bikes\t1", // 76 | "3\tac.e-com\t1", // 77 | "4\tac.eagle\t1", // 78 | "5\tac.gov\t1", // 79 | "6\tac.gov.ascension\t2", // 80 | "7\tac.gov.ascension-island\t1", // 81 | "8\tac.gov.conservation-ascension-island\t1", // 82 | "9\tac.gov.postoffice\t1", // 83 | }; 84 | 85 | String[] hostGraphHyphenatedDomainsSubDomainOnly = { // 86 | "0\tac.gov.ascension-island", // 87 | "1\tac.gov.ascension.mail", // 88 | "2\tac.gov.conservation-ascension-island", // 89 | "3\tac.gov.postoffice", // 90 | }; 91 | String[] domainGraphHyphenatedDomainsSubDomainOnly = { // 92 | "0\tac.gov.ascension\t1", // 93 | "1\tac.gov.ascension-island\t1", // 94 | "2\tac.gov.conservation-ascension-island\t1", // 95 | "3\tac.gov.postoffice\t1", // 96 | }; 97 | 98 | String[] hostGraphDuplicatedDomains = { // 99 | "0\tno.hordaland", // 100 | "1\tno.hordaland-teater", // 101 | "2\tno.hordaland.os", // 102 | "3\tno.hordaland.os.bibliotek", // 103 | "4\tno.hordaland.oygarden", // 104 | "5\tno.hordalandfolkemusikklag", // 105 | }; 106 | String[] domainGraphDuplicatedDomains = { // 107 | "0\tno.hordaland\t2", // 108 | "1\tno.hordaland-teater\t1", // 109 | "2\tno.hordaland.os.bibliotek\t1", // 110 | "3\tno.hordalandfolkemusikklag\t1", // 111 | }; 112 | 113 | /** 114 | * forgot.his.name is in the "private section" of the public suffix 115 | * list, while name is in the ICANN section, see 116 | * {@link HostToDomainGraph#doPrivateDomains(boolean)} 117 | */ 118 | String[] hostGraphPrivateDomains = { // 119 | "0\tname.hiro", // 120 | "1\tname.hiropo", // 121 | "2\tname.his.forgot.adam", // 122 | "3\tname.his.forgot.ben", // 123 | "4\tname.his.forgot.never", // 124 | "5\tname.his.prz", // 125 | "6\tname.hista.tac", // 126 | "7\tname.history", // 127 | "8\tname.history.0.aba", // 128 | "9\tname.hit", // 129 | }; 130 | String[] domainGraphPrivateDomains = { // 131 | "0\tname.hiro\t1", // 132 | "1\tname.hiropo\t1", // 133 | "2\tname.his\t1", // 134 | "3\tname.his.forgot.adam\t1", // 135 | "4\tname.his.forgot.ben\t1", // 136 | "5\tname.his.forgot.never\t1", // 137 | "6\tname.hista\t1", // 138 | "7\tname.history\t2", // 139 | "8\tname.hit\t1", // 140 | }; 141 | 142 | @BeforeEach 143 | void init() { 144 | converter = new HostToDomainGraph(maxGraphNodes); 145 | } 146 | 147 | @Test 148 | void testDomainComparison() { 149 | assertTrue("org.example.".compareTo("org.example-domain.") > 0); 150 | assertTrue(Domain.compareRevDomainsSafe("org.example", "org.example") == 0); 151 | assertTrue(Domain.compareRevDomainsSafe("org.example", "org.exampledomain") < 0); 152 | assertTrue(Domain.compareRevDomainsSafe("org.example", "org.example-domain") > 0); 153 | assertTrue(Domain.compareRevDomainsSafe("org.example", "org.example.domain") > 0); 154 | } 155 | 156 | private String[] convert(HostToDomainGraph converter, String[] hostGraph) { 157 | ByteArrayOutputStream domainBytes = new ByteArrayOutputStream(); 158 | PrintStream domainOut = new PrintStream(domainBytes); 159 | converter.convert(converter::convertNode, Arrays.stream(hostGraph), domainOut); 160 | converter.finishNodes(domainOut); 161 | return new String(domainBytes.toByteArray(), StandardCharsets.UTF_8).split("\n"); 162 | } 163 | 164 | private String[] stripCounts(String[] domainGraph) { 165 | return Arrays.stream(domainGraph).map(s -> s.replaceFirst("\\t[^\\t]*$", 
"")).toArray(String[]::new); 166 | } 167 | 168 | private String[] getNodeNames(String[] graph) { 169 | return Arrays.stream(graph).map(s -> s.split("\t")[1]).toArray(String[]::new); 170 | } 171 | 172 | private long[] getNodeIDs(String[] graph) { 173 | return Arrays.stream(graph).mapToLong(s -> Long.parseLong(s.split("\t")[0])).toArray(); 174 | } 175 | 176 | /** 177 | * test whether node names are properly sorted and IDs are correctly assigned 178 | * (sequentially, strictly monotonically increasing, no gaps) 179 | */ 180 | void testSorted(String[] graph) { 181 | String[] names = getNodeNames(graph); 182 | String[] namesSorted = Arrays.copyOf(names, names.length); 183 | Arrays.sort(namesSorted); 184 | assertArrayEquals(namesSorted, names); 185 | long lastId = -1; 186 | for (long id : getNodeIDs(graph)) { 187 | if ((lastId + 1) != id) { 188 | fail("IDs not correctly assigned: " + lastId + ", " + id); 189 | } 190 | lastId = id; 191 | } 192 | } 193 | 194 | @Test 195 | void testConvertNodesSimple() { 196 | testSorted(hostGraphSimple); 197 | converter.doCount(false); 198 | assertArrayEquals(stripCounts(domainGraphSimple), convert(converter, hostGraphSimple)); 199 | testSorted(domainGraphSimple); 200 | } 201 | 202 | @Test 203 | void testConvertNodesSimpleCount() { 204 | converter.doCount(true); 205 | assertArrayEquals(domainGraphSimple, convert(converter, hostGraphSimple)); 206 | } 207 | 208 | @Test 209 | void testConvertNodesNotSorted() { 210 | try { 211 | convert(converter, hostGraphNamesNotSorted); 212 | fail("Unable to convert to domain graph from not properly sorted input"); 213 | } catch (Exception e) { 214 | LOG.info("Expected exception on input not properly sorted", e.getMessage()); 215 | } 216 | } 217 | 218 | @Test 219 | void testConvertNodesHyphenatedDomains() { 220 | // verify sorting of input and expected output 221 | testSorted(hostGraphHyphenatedDomains); 222 | testSorted(domainGraphHyphenatedDomains); 223 | converter.doCount(true); 224 | assertArrayEquals(domainGraphHyphenatedDomains, convert(converter, hostGraphHyphenatedDomains)); 225 | } 226 | 227 | @Test 228 | void testConvertNodesHyphenatedDomainsSubDomainOnly() { 229 | // verify sorting of input and expected output 230 | testSorted(hostGraphHyphenatedDomainsSubDomainOnly); 231 | testSorted(domainGraphHyphenatedDomains); 232 | converter.doCount(true); 233 | assertArrayEquals(domainGraphHyphenatedDomainsSubDomainOnly, 234 | convert(converter, hostGraphHyphenatedDomainsSubDomainOnly)); 235 | } 236 | 237 | @Test 238 | void testConvertNodesDuplicatedDomain() { 239 | // verify sorting of input and expected output 240 | testSorted(hostGraphDuplicatedDomains); 241 | testSorted(domainGraphDuplicatedDomains); 242 | converter.doCount(true); 243 | assertArrayEquals(domainGraphDuplicatedDomains, convert(converter, hostGraphDuplicatedDomains)); 244 | } 245 | 246 | @Test 247 | void testConvertNodesHyphenatedDomainsIncludingMultiPartSuffixes() { 248 | // verify sorting of input and expected output 249 | testSorted(hostGraphHyphenatedDomains); 250 | testSorted(domainGraphHyphenatedDomainsInclMultiPartSuffixes); 251 | converter.doCount(true); 252 | converter.multiPartSuffixesAsDomains(true); 253 | assertArrayEquals(domainGraphHyphenatedDomainsInclMultiPartSuffixes, 254 | convert(converter, hostGraphHyphenatedDomains)); 255 | } 256 | 257 | @Test 258 | void testConvertPrivateDomain() { 259 | // verify sorting of input and expected output 260 | testSorted(hostGraphPrivateDomains); 261 | testSorted(domainGraphPrivateDomains); 262 | 
converter.doCount(true); 263 | converter.doPrivateDomains(true); 264 | converter.multiPartSuffixesAsDomains(true); 265 | assertArrayEquals(domainGraphPrivateDomains, convert(converter, hostGraphPrivateDomains)); 266 | } 267 | 268 | } 269 | -------------------------------------------------------------------------------- /src/test/java/org/commoncrawl/webgraph/TestJoinSortRanks.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2022 Common Crawl and contributors 4 | */ 5 | package org.commoncrawl.webgraph; 6 | 7 | import static org.junit.jupiter.api.Assertions.assertEquals; 8 | import static org.junit.jupiter.api.Assertions.fail; 9 | 10 | import java.io.File; 11 | import java.io.IOException; 12 | 13 | import org.junit.jupiter.api.Disabled; 14 | import org.junit.jupiter.api.Test; 15 | import org.slf4j.Logger; 16 | import org.slf4j.LoggerFactory; 17 | 18 | import it.unimi.dsi.fastutil.io.BinIO; 19 | 20 | public class TestJoinSortRanks { 21 | 22 | protected static Logger LOG = LoggerFactory.getLogger(TestJoinSortRanks.class); 23 | 24 | /** 25 | * Reproduce issue in fastutil 8.5.8 loading (double) arrays from files of size 26 | * 2^31 bytes or more. 27 | */ 28 | @Disabled("Fixed in fastutil 8.5.9") 29 | @Test 30 | void testLoadingDoubleArray() { 31 | File file; 32 | try { 33 | file = File.createTempFile("test", ".bin"); 34 | } catch (IOException e) { 35 | LOG.error("Skipping test, failed to create temporary file to hold array:", e); 36 | return; 37 | } 38 | long intOverflow = 1L << 31; 39 | int arrSize = (int) (intOverflow / Double.BYTES); 40 | double[] arr = new double[arrSize]; 41 | try { 42 | LOG.info("Storing double array of length {} in file {}", arrSize, file.getAbsolutePath()); 43 | BinIO.storeDoubles(arr, file); 44 | LOG.info("Successfully stored double array of length {} in file {}, resulting file size: {} bytes", arrSize, 45 | file.getAbsolutePath(), file.length()); 46 | assertEquals(intOverflow, file.length()); 47 | LOG.info("Trying to clean up Java heap space..."); 48 | arr = null; 49 | System.gc(); 50 | LOG.info("Loading double array from file {}", file.getAbsolutePath()); 51 | arr = BinIO.loadDoubles(file.getAbsolutePath()); 52 | assertEquals(arrSize, arr.length); 53 | LOG.info("Successfully loaded double array of length {} from file {}", arr.length, file.getAbsolutePath()); 54 | } catch (IOException e) { 55 | fail("Failed to store and load double array: " + e); 56 | } finally { 57 | file.delete(); 58 | } 59 | } 60 | } 61 | --------------------------------------------------------------------------------
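A minimal end-to-end exploration session tying the ranking scripts above together (the graph name, paths and heap size are examples only and depend on the webgraph release and the local setup):

  # download a domain-level graph release into the current directory
  src/script/webgraph_ranking/graph_explore_download_webgraph.sh cc-main-2025-feb-mar-apr-domain
  # build the node-label indexes (iepm, or mph/fcl/smph as a fallback)
  src/script/webgraph_ranking/graph_explore_build_vertex_map.sh cc-main-2025-feb-mar-apr-domain cc-main-2025-feb-mar-apr-domain-vertices.txt.gz
  # load the graph into an interactive JShell session
  jshell --class-path target/cc-webgraph-0.1-SNAPSHOT-jar-with-dependencies.jar \
         -R-Dgraph=cc-main-2025-feb-mar-apr-domain -R-Xmx8g \
         src/script/webgraph_ranking/graph_explore_load_graph.jsh

The first two steps fetch the BVGraph files (including the transpose) and build the node-label indexes; graph_explore_load_graph.jsh then loads the graph into a GraphExplorer and defines the cn, pwn, ls and sl convenience commands.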