├── .github └── workflows │ └── build.yml ├── .gitignore ├── LICENSE ├── README.md ├── graph-exploration-README.md ├── pom.xml └── src ├── main ├── java │ └── org │ │ └── commoncrawl │ │ └── webgraph │ │ ├── CountingMergedIntIterator.java │ │ ├── CreatePreferenceVector.java │ │ ├── HostToDomainGraph.java │ │ ├── JoinSortRanks.java │ │ ├── explore │ │ ├── Graph.java │ │ └── GraphExplorer.java │ │ └── package-info.java └── resources │ └── simplelogger.properties ├── script ├── host2domaingraph.sh ├── hostgraph │ ├── build_hostgraph.sh │ └── hostgraph_config.sh ├── webgraph_ranking │ ├── graph_explore_build_vertex_map.sh │ ├── graph_explore_download_webgraph.sh │ ├── graph_explore_load_graph.jsh │ ├── process_webgraph.sh │ ├── process_webgraph_degrees.sh │ ├── run_webgraph.sh │ └── webgraph_config.sh └── workflow_lib.sh └── test └── java └── org └── commoncrawl └── webgraph ├── TestCountingMergedIntIterator.java ├── TestHostToDomainGraph.java └── TestJoinSortRanks.java /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: cc-webgraph build 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | java: [ 11, 17, 21 ] 17 | name: Java ${{ matrix.java }} 18 | steps: 19 | - uses: actions/checkout@v4 20 | 21 | - name: Setup JDK 22 | uses: actions/setup-java@v4 23 | with: 24 | distribution: 'temurin' 25 | java-version: ${{ matrix.java }} 26 | cache: 'maven' 27 | 28 | - name: Build 29 | run: mvn verify javadoc:aggregate 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.ear 17 | *.zip 18 | *.tar.gz 19 | *.rar 20 | 21 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 22 | hs_err_pid* 23 | 24 | # maven build directory 25 | /target/ 26 | 27 | # Eclipse project files 28 | .project 29 | .classpath 30 | .settings/ 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cc-webgraph 2 | 3 | Tools to construct web graphs from Common Crawl data, and to process and explore them. 4 | 5 | ## Compiling and Packaging Java Tools 6 | 7 | Java 11 or later is required. 8 | 9 | The Java tools are compiled and packaged by [Maven](https://maven.apache.org/). If Maven is installed, just run `mvn package`. The Java tools can then be run via 10 | ``` 11 | java -cp target/cc-webgraph-0.1-SNAPSHOT-jar-with-dependencies.jar ... 12 | ``` 13 | 14 | The assembly jar file also includes the [WebGraph](https://webgraph.di.unimi.it/) and [LAW](https://law.di.unimi.it/software.php) packages required to process the webgraphs and compute [PageRank](https://en.wikipedia.org/wiki/PageRank) or [Harmonic Centrality](https://en.wikipedia.org/wiki/Centrality#Harmonic_centrality). 15 | 16 | 17 | ### Javadocs 18 | 19 | The Javadocs are created by `mvn javadoc:javadoc`. Then open the file `target/site/apidocs/index.html` in a browser. 20 | 21 | 22 | ## Memory and Disk Requirements 23 | 24 | Note that the webgraphs are usually multiple gigabytes in size and require for processing 25 | - a sufficient Java heap size ([Java option](https://docs.oracle.com/en/java/javase/21/docs/specs/man/java.html#extra-options-for-java) `-Xmx`) 26 | - enough disk space to store the graphs and temporary data. 27 | 28 | The exact requirements depend on the graph size and the task – graph exploration or ranking, etc. 29 | 30 | 31 | ## Construction and Ranking of Host- and Domain-Level Web Graphs 32 | 33 | ### Host-Level Web Graph 34 | 35 | The host-level web graph is built with the help of PySpark; the corresponding code is found in the project [cc-pyspark](https://github.com/commoncrawl/cc-pyspark). Instructions are found in the script [build_hostgraph.sh](src/script/hostgraph/build_hostgraph.sh). 36 | 37 | ### Domain-Level Web Graph 38 | 39 | The domain-level web graph is distilled from the host-level graph by mapping host names to domain names. The ID mapping is kept in memory as an int array or [FastUtil's big array](https://fastutil.di.unimi.it/docs/it/unimi/dsi/fastutil/BigArrays.html) if the host-level graph has more vertices than a Java array can hold (around 2³¹). The Java tool to fold the host graph is best run from the script [host2domaingraph.sh](src/script/host2domaingraph.sh). 40 | 41 | ### Processing Graphs using the WebGraph Framework 42 | 43 | To analyze the graph structure and calculate rankings, you may further process the graphs using software from the Laboratory for Web Algorithmics (LAW) at the University of Milano, namely the [WebGraph framework](https://webgraph.di.unimi.it/) and the [LAW library](https://law.di.unimi.it/software.php). 44 | 45 | A couple of scripts that help you run the WebGraph tools to build and process the graphs are provided in [src/script/webgraph_ranking/](src/script/webgraph_ranking/). They're also used to prepare the Common Crawl web graph releases. 46 | 47 | To process a webgraph and rank the nodes, you should first adapt the configuration to your graph and hardware setup: 48 | ``` 49 | vi ./src/script/webgraph_ranking/webgraph_config.sh 50 | ``` 51 | After running 52 | ``` 53 | ./src/script/webgraph_ranking/process_webgraph.sh graph_name vertices.txt.gz edges.txt.gz output_dir 54 | ``` 55 | the `output_dir/` should contain all generated files, e.g.
`graph_name.graph` and `graph_name-ranks.txt.gz`. 56 | 57 | The shell script is easily adapted to your needs. Please refer to the [LAW dataset tutorial](https://law.di.unimi.it/tutorial.php), the [API docs of LAW](https://law.di.unimi.it/software/law-docs/index.html) and [webgraph](https://webgraph.di.unimi.it/docs/) for further information. 58 | 59 | 60 | ## Exploring Webgraph Data Sets 61 | 62 | The Common Crawl webgraph data sets are announced on the [Common Crawl web site](https://commoncrawl.org/tag/webgraph/). 63 | 64 | For instructions on how to explore the webgraphs using the JShell, please see the tutorial [Interactive Graph Exploration](./graph-exploration-README.md). For an older approach using [Jython](https://www.jython.org/) and [pyWebGraph](https://github.com/mapio/py-web-graph), see the [cc-notebooks project](//github.com/commoncrawl/cc-notebooks/tree/master/cc-webgraph-statistics). 65 | 66 | 67 | ## Credits 68 | 69 | Thanks to the authors of the [WebGraph framework](https://webgraph.di.unimi.it/) used to process the graphs and compute PageRank and harmonic centrality. See also Sebastiano Vigna's projects [webgraph](//github.com/vigna/webgraph) and [webgraph-big](//github.com/vigna/webgraph-big). 70 | -------------------------------------------------------------------------------- /graph-exploration-README.md: -------------------------------------------------------------------------------- 1 | # Interactive Graph Exploration 2 | 3 | A tutorial on how to interactively explore the Common Crawl webgraphs – or other graphs using the webgraph format – using the [JShell](https://docs.oracle.com/en/java/javase/21/jshell/index.html) and the [GraphExplorer](src/main/java/org/commoncrawl/webgraph/explore/GraphExplorer.java) class. 4 | 5 | 6 | ## Quick Start 7 | 8 | 1. Change into the "cc-webgraph" project directory, [build the cc-webgraph JAR](README.md#compiling-and-packaging-java-tools) and remember the project directory and the JAR using environment variables: 9 | 10 | ``` 11 | $> cd .../cc-webgraph 12 | 13 | $> mvn clean package 14 | 15 | $> CC_WEBGRAPH="$PWD" 16 | $> CC_WEBGRAPH_JAR=$(ls "$PWD"/target/cc-webgraph-*-jar-with-dependencies.jar) 17 | ``` 18 | 19 | 2. Select a web graph you want to explore, choose a download directory and download the web graph: 20 | 21 | ``` 22 | $> GRAPH=cc-main-2024-feb-apr-may-domain 23 | 24 | $> mkdir .../my-webgraphs/$GRAPH 25 | $> cd .../my-webgraphs/$GRAPH 26 | ``` 27 | 28 | About 15 GiB of disk space is needed to hold all files of a domain-level webgraph. 29 | 30 | ``` 31 | $> "$CC_WEBGRAPH"/src/script/webgraph_ranking/graph_explore_download_webgraph.sh $GRAPH 32 | ``` 33 | 34 | 3. Build the map from vertex label to vertex ID and vice versa. This allows you to look up a reversed domain name (e.g. "org.commoncrawl") and get the corresponding vertex ID. 36 | 37 | ``` 38 | $> "$CC_WEBGRAPH"/src/script/webgraph_ranking/graph_explore_build_vertex_map.sh $GRAPH $GRAPH-vertices.txt.gz 39 | ``` 40 | 41 | 4.
Launch the [JShell](https://docs.oracle.com/en/java/javase/21/jshell/index.html) 41 | 42 | ``` 43 | $> jshell --class-path "$CC_WEBGRAPH_JAR" 44 | | Welcome to JShell -- Version 21.0.3 45 | | For an introduction type: /help intro 46 | 47 | jshell> 48 | ``` 49 | 50 | Now you may play around with the JShell or load the GraphExplorer class and your graph: 51 | 52 | ``` 53 | jshell> import org.commoncrawl.webgraph.explore.GraphExplorer 54 | 55 | jshell> GraphExplorer e = new GraphExplorer("cc-main-2024-feb-apr-may-domain") 56 | 2024-06-23 13:38:51:084 +0200 [main] INFO Graph - Loading graph cc-main-2024-feb-apr-may-domain.graph 57 | 2024-06-23 13:38:51:193 +0200 [main] INFO Graph - Loading transpose of the graph cc-main-2024-feb-apr-may-domain-t.graph 58 | 2024-06-23 13:38:51:279 +0200 [main] INFO Graph - Loading vertex map cc-main-2024-feb-apr-may-domain.iepm (ImmutableExternalPrefixMap) 59 | 2024-06-23 13:38:52:356 +0200 [main] INFO Graph - Loaded graph cc-main-2024-feb-apr-may-domain.graph 60 | e ==> org.commoncrawl.webgraph.explore.GraphExplorer@4cc0edeb 61 | ``` 62 | 63 | But for now exit the JShell 64 | ``` 65 | jshell> /exit 66 | | Goodbye 67 | ``` 68 | 69 | To make the loading easier, you may use the load script [graph_explore_load_graph.jsh](src/script/webgraph_ranking/graph_explore_load_graph.jsh) and pass the graph name as a Java property to the JShell via command-line option `-R-Dgraph=$GRAPH` 70 | 71 | ``` 72 | $> jshell --class-path "$CC_WEBGRAPH_JAR" \ 73 | -R-Dgraph=$GRAPH \ 74 | "$CC_WEBGRAPH"/src/script/webgraph_ranking/graph_explore_load_graph.jsh 75 | Loading graph cc-main-2024-feb-apr-may-domain 76 | 2024-06-23 13:30:14:134 +0200 [main] INFO Graph - Loading graph cc-main-2024-feb-apr-may-domain.graph 77 | 2024-06-23 13:30:14:340 +0200 [main] INFO Graph - Loading transpose of the graph cc-main-2024-feb-apr-may-domain-t.graph 78 | 2024-06-23 13:30:14:439 +0200 [main] INFO Graph - Loading vertex map cc-main-2024-feb-apr-may-domain.iepm (ImmutableExternalPrefixMap) 79 | 2024-06-23 13:30:15:595 +0200 [main] INFO Graph - Loaded graph cc-main-2024-feb-apr-may-domain.graph 80 | 81 | Graph cc-main-2024-feb-apr-may-domain loaded into GraphExplorer *e* 82 | Type "e." and press to list the public methods of the class GraphExplorer 83 | ... or "g." for the graph loaded for exploration 84 | 85 | ... or use one of the predefined methods: 86 | void cn(String) 87 | void cn(long) 88 | void pwn() 89 | void ls() 90 | void ls(long) 91 | void ls(String) 92 | void sl() 93 | void sl(long) 94 | void sl(String) 95 | 96 | | Welcome to JShell -- Version 21.0.3 97 | | For an introduction type: /help intro 98 | 99 | jshell> 100 | ``` 101 | 102 | The predefined methods are those provided by [pyWebGraph](https://github.com/mapio/py-web-graph). 
103 | 104 | ``` 105 | jshell> cn("org.commoncrawl") 106 | #111997321 org.commoncrawl 107 | 108 | jshell> pwn() 109 | #111997321 org.commoncrawl 110 | 111 | jshell> ls() // list successors (vertices linked from the domain commoncrawl.org or one of its subdomains) 112 | 113 | jshell> sl() // list predecessors (vertices connected via incoming links) 114 | ``` 115 | 116 | 117 | ## Using the Java Classes 118 | 119 | The Java classes "GraphExplorer" and "Graph" bundle a set of methods which help with exploring the graphs: 120 | - load the webgraph, its transpose and the vertex map 121 | - access the vertices and their successors or predecessors 122 | - utilities to import or export a list of vertices or counts from or into a file 123 | 124 | The methods are bundled in the classes of the Java package `org.commoncrawl.webgraph.explore`. To get an overview of all provided methods, inspect the source code or see the section [Javadocs](README.md#javadocs) in the main README for how to read the Javadocs. Only a few examples are presented here. 125 | 126 | We start again with launching the JShell and loading a webgraph: 127 | 128 | ``` 129 | $> jshell --class-path "$CC_WEBGRAPH_JAR" \ 130 | -R-Dgraph=$GRAPH \ 131 | "$CC_WEBGRAPH"/src/script/webgraph_ranking/graph_explore_load_graph.jsh 132 | jshell> 133 | ``` 134 | 135 | Two classes are already instantiated – the *GraphExplorer* `e` and the *Graph* `g`; the former holds a reference to the latter: 136 | 137 | ``` 138 | jshell> /vars 139 | | String graph = "cc-main-2024-feb-apr-may-domain" 140 | | GraphExplorer e = org.commoncrawl.webgraph.explore.GraphExplorer@7dc7cbad 141 | | Graph g = org.commoncrawl.webgraph.explore.Graph@4f933fd1 142 | 143 | jshell> e.getGraph() 144 | $45 ==> org.commoncrawl.webgraph.explore.Graph@4f933fd1 145 | ``` 146 | 147 | First, the vertices in the webgraphs are represented by numbers. So, we need to translate between vertex label and ID: 148 | 149 | ``` 150 | jshell> g.vertexLabelToId("org.wikipedia") 151 | $46 ==> 115107569 152 | 153 | jshell> g.vertexIdToLabel(115107569) 154 | $47 ==> "org.wikipedia" 155 | ``` 156 | 157 | One important note: Common Crawl's webgraphs list the host or domain names in [reverse domain name notation](https://en.wikipedia.org/wiki/Reverse_domain_name_notation). The vertex lists are sorted by the reversed names in lexicographic order and then numbered continuously. This gives a close-to-perfect compression of the webgraph itself. Most of the arcs are close in terms of locality because subdomains or sites of the same region (by country-code top-level domain) are listed in one continuous block. Cf. the paper [The WebGraph Framework I: Compression Techniques](https://vigna.di.unimi.it/ftp/papers/WebGraphI.pdf) by Paolo Boldi and Sebastiano Vigna. 158 | 159 | Now, let's look at how many other domains are linked from Wikipedia: 160 | 161 | ``` 162 | jshell> g.outdegree("org.wikipedia") 163 | $46 ==> 2106338 164 | ``` 165 | 166 | Another note: Common Crawl's webgraphs are based on sample crawls of the web. Like the crawls, the webgraphs are not complete, and Wikipedia may in reality link to far more domains. But 2 million linked domains is already not a small sample.
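The same lookups compose with plain Java collections and streams, so you can check several domains in one go. A small sketch using only the methods shown above (the two domain labels are just examples; any labels present in the graph work, and the printed numbers depend on the graph release):

```
jshell> java.util.stream.Stream.of("org.wikipedia", "org.commoncrawl").forEach(d -> System.out.println(d + "\t" + g.outdegree(d)))
```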
167 | 168 | The Graph class also gives you access to the successors of a vertex, as an array or stream of integers, but also as a stream of strings (vertex labels): 169 | 170 | ``` 171 | jshell> g.successors("org.wikipedia").length 172 | $48 ==> 2106338 173 | 174 | jshell> g.successorIntStream("org.wikipedia").count() 175 | $49 ==> 2106338 176 | 177 | jshell> g.successorStream("org.wikipedia").limit(10).forEach(System.out::println) 178 | abb.global 179 | abb.nic 180 | abbott.cardiovascular 181 | abbott.globalpointofcare 182 | abbott.molecular 183 | abbott.pk 184 | abc.www 185 | abudhabi.gov 186 | abudhabi.mediaoffice 187 | abudhabi.tamm 188 | ``` 189 | 190 | Using Java streams, it's easy to translate between the two representations: 191 | 192 | ``` 193 | jshell> g.successorIntStream("org.wikipedia").limit(5).mapToObj(i -> g.vertexIdToLabel(i)).forEach(System.out::println) 194 | abb.global 195 | abb.nic 196 | abbott.cardiovascular 197 | abbott.globalpointofcare 198 | abbott.molecular 199 | ``` 200 | 201 | Successors represent outgoing links to other domains. We can do the same for predecessors, that is, incoming links from other domains: 202 | 203 | ``` 204 | jshell> g.indegree("org.wikipedia") 205 | $50 ==> 2752391 206 | 207 | jshell> g.predecessorIntStream("org.wikipedia").count() 208 | $51 ==> 2752391 209 | 210 | jshell> g.predecessorStream("org.wikipedia").limit(5).forEach(System.out::println) 211 | abogado.fabiobalbuena 212 | abogado.jacksonville 213 | abogado.jaskot 214 | abogado.super 215 | ac.789bet 216 | ``` 217 | 218 | Technically, webgraphs only store successor lists. But the Graph class also holds two graphs: the "original" one and its transpose. In the transposed graph "successors" are "predecessors", and "outdegree" means "indegree". Some lower-level methods take one of the two webgraphs as an argument; there it makes a difference whether you pass `g.graph` or `g.graphT`. Here we pass each to a method which translates vertex IDs to labels and extracts the top-level domain: 219 | 220 | ``` 221 | jshell> g.successorTopLevelDomainStream(g.graph, g.vertexLabelToId("org.wikipedia")).limit(5).forEach(System.out::println) 222 | abb 223 | abb 224 | abbott 225 | abbott 226 | abbott 227 | 228 | jshell> g.successorTopLevelDomainStream(g.graphT, g.vertexLabelToId("org.wikipedia")).limit(5).forEach(System.out::println) 229 | abogado 230 | abogado 231 | abogado 232 | abogado 233 | ac 234 | ``` 235 | 236 | The top-level domains repeat, and you may want to count the occurrences and create a frequency list. There is a predefined method to perform this: 237 | 238 | ``` 239 | jshell> g.successorTopLevelDomainCounts("org.wikipedia").filter(e -> e.getKey().startsWith("abb")).forEach(e -> System.out.printf("%8d\t%s\n", e.getValue(), e.getKey())) 240 | 4 abbott 241 | 2 abb 242 | 243 | jshell> g.successorTopLevelDomainCounts("org.wikipedia").limit(10).forEach(e -> System.out.printf("%8d\t%s\n", e.getValue(), e.getKey())) 244 | 706707 com 245 | 213406 org 246 | 117042 de 247 | 86684 net 248 | 65906 ru 249 | 55914 fr 250 | 53628 uk 251 | 52828 it 252 | 51622 jp 253 | 33729 br 254 | ``` 255 | 256 | The same can be done for predecessors using the method "Graph::predecessorTopLevelDomainCounts". 257 | 258 | Dealing with large successor or predecessor lists can be painful, and viewing them in a terminal window is practically impossible. We've already discussed how to compress the list to top-level domain counts. Alternatively, you could select the labels by prefix...
259 | 260 | ``` 261 | jshell> g.successorStream("org.wikipedia", "za.org.").limit(10).forEach(System.out::println) 262 | za.org.61mech 263 | za.org.aadp 264 | za.org.aag 265 | za.org.abc 266 | za.org.acaparty 267 | za.org.acbio 268 | za.org.accord 269 | za.org.acd 270 | za.org.acdp 271 | za.org.acjr 272 | ``` 273 | 274 | ... but even then the list may be huge. Then the best option is to write the stream output (vertex labels or top-level domain frequencies) into a file and view it later using a file viewer or use any other tool for further processing: 275 | 276 | ``` 277 | jshell> e.saveVerticesToFile(g.successors("org.wikipedia"), "org-wikipedia-successors.txt") 278 | 279 | jshell> e.saveCountsToFile(g.successorTopLevelDomainCounts("org.wikipedia"), "org-wikipedia-successors-tld-counts.txt") 280 | ``` 281 | 282 | ## Final Remarks 283 | 284 | We hope these few examples will support either to have fun exploring the graphs or to develop your own pipeline to extract insights from the graphs. 285 | 286 | Finally, thanks to the authors of the [WebGraph framework](https://webgraph.di.unimi.it/) and of [pyWebGraph](https://github.com/mapio/py-web-graph) for their work on these powerful tools and for any inspiration taken into these examples. 287 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | org.commoncrawl 6 | cc-webgraph 7 | 0.1-SNAPSHOT 8 | jar 9 | 10 | cc-webgraph 11 | https://github.com/commoncrawl/cc-webgraph 12 | 13 | 14 | UTF-8 15 | 11 16 | 17 | 3.6.10 18 | 3.7.0 19 | 2.7.2 20 | 8.5.15 21 | 1.4 22 | 23 | 2.0.16 24 | 25 | 5.11.2 26 | 27 | 28 | 29 | 30 | 31 | src/main/resources 32 | 33 | 34 | 35 | 36 | maven-compiler-plugin 37 | 3.14.0 38 | 39 | ${java.version} 40 | ${java.version} 41 | 42 | 43 | 44 | maven-assembly-plugin 45 | 3.7.1 46 | 47 | 48 | jar-with-dependencies 49 | 50 | cc-webgraph-${project.version} 51 | 52 | 53 | 54 | package 55 | 56 | single 57 | 58 | 59 | 60 | 61 | 62 | maven-surefire-plugin 63 | 3.5.2 64 | 65 | 66 | org.apache.maven.plugins 67 | maven-enforcer-plugin 68 | 3.5.0 69 | 70 | 71 | enforce-maven 72 | 73 | enforce 74 | 75 | 76 | 77 | 78 | 3.6.3 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | org.junit 94 | junit-bom 95 | ${junit.version} 96 | pom 97 | import 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | org.slf4j 106 | slf4j-api 107 | ${slf4j-api.version} 108 | 109 | 110 | 111 | com.github.crawler-commons 112 | crawler-commons 113 | ${crawler.commons.version} 114 | 115 | 116 | 117 | it.unimi.dsi 118 | fastutil-core 119 | ${fastutil.version} 120 | 121 | 122 | 123 | commons-cli 124 | commons-cli 125 | 1.5.0 126 | 127 | 128 | 131 | 132 | it.unimi.dsi 133 | webgraph 134 | ${webgraph.version} 135 | 136 | 137 | net.sf.jung 138 | jung-api 139 | 140 | 141 | net.sf.jung 142 | jung-io 143 | 144 | 145 | ch.qos.logback 146 | logback-classic 147 | 148 | 149 | 150 | 151 | 152 | it.unimi.dsi 153 | webgraph-big 154 | ${webgraph.big.version} 155 | 156 | 157 | ch.qos.logback 158 | logback-classic 159 | 160 | 161 | 162 | 163 | 164 | it.unimi.dsi 165 | law 166 | ${law.version} 167 | 168 | 169 | net.sf.jung 170 | jung-api 171 | 172 | 173 | net.sf.jung 174 | jung-io 175 | 176 | 177 | org.apache.httpcomponents 178 | httpclient 179 | 180 | 181 | org.apache.httpcomponents 182 | httpasyncclient 183 | 184 | 185 | org.eclipse.jetty.aggregate 186 | jetty-all 187 | 188 | 189 | org.softee 190 | pojo-mbean 191 | 192 | 193 | 
com.fasterxml.jackson 194 | jackson-bom 195 | 196 | 197 | it.unimi.di 198 | mg4j 199 | 200 | 201 | it.unimi.di 202 | mg4j-big 203 | 204 | 205 | org.wikidata.wdtk 206 | wdtk-dumpfiles 207 | 208 | 209 | info.bliki.wiki 210 | bliki-core 211 | 212 | 213 | it.unimi.di.law 214 | jericho-html-dev 215 | 216 | 217 | ch.qos.logback 218 | logback-classic 219 | 220 | 221 | org.slf4j 222 | log4j-over-slf4j 223 | 224 | 225 | org.slf4j 226 | jcl-over-slf4j 227 | 228 | 229 | 230 | 231 | 232 | org.apache.commons 233 | commons-configuration2 234 | 2.10.1 235 | runtime 236 | 237 | 238 | org.slf4j 239 | slf4j-simple 240 | ${slf4j-api.version} 241 | 242 | 243 | 244 | 245 | org.junit.jupiter 246 | junit-jupiter 247 | test 248 | 249 | 250 | 251 | 252 | -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/webgraph/CountingMergedIntIterator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2024 Common Crawl and contributors 4 | */ 5 | package org.commoncrawl.webgraph; 6 | 7 | import java.util.PriorityQueue; 8 | 9 | import it.unimi.dsi.fastutil.ints.IntIterator; 10 | import it.unimi.dsi.webgraph.LazyIntIterator; 11 | import it.unimi.dsi.webgraph.LazyIntIterators; 12 | 13 | /** 14 | * An iterator counting the integers returned by multiple 15 | * {@link LazyIntIterator}s. The input iterators must return integers in a 16 | * monotonically non-decreasing order. The resulting iterator returns the 17 | * unified input integers in strictly non-decreasing order. The method 18 | * {@link getCount()} is used to access the count of the integer returned last 19 | * by {@link nextInt()}. The count equals the number of times any of the 20 | * iterators returned the current integer value. See also 21 | * {@link it.unimi.dsi.webgraph.MergedIntIterator}. 22 | */ 23 | public class CountingMergedIntIterator implements IntIterator { 24 | 25 | protected class QueuedIterator implements Comparable { 26 | LazyIntIterator iter; 27 | int value; 28 | 29 | public QueuedIterator(LazyIntIterator iterator) { 30 | iter = iterator; 31 | value = iterator.nextInt(); 32 | } 33 | 34 | @Override 35 | public int compareTo(QueuedIterator o) { 36 | if (value < o.value) { 37 | return -1; 38 | } 39 | if (value > o.value) { 40 | return 1; 41 | } 42 | return 0; 43 | } 44 | } 45 | 46 | public static int LAZY_INT_ITERATOR_EMPTY_VALUE = LazyIntIterators.EMPTY_ITERATOR.nextInt(); 47 | 48 | private final PriorityQueue iters = new PriorityQueue<>(); 49 | private int currentCount = 0; 50 | 51 | /** 52 | * @param iterators input iterators 53 | */ 54 | public CountingMergedIntIterator(LazyIntIterator... iterators) { 55 | for (final LazyIntIterator iter : iterators) { 56 | final QueuedIterator qiter = new QueuedIterator(iter); 57 | if (qiter.value != LAZY_INT_ITERATOR_EMPTY_VALUE) { 58 | iters.add(qiter); 59 | } 60 | } 61 | } 62 | 63 | /** 64 | * {@inheritDoc} 65 | */ 66 | @Override 67 | public boolean hasNext() { 68 | return iters.size() > 0; 69 | } 70 | 71 | /** 72 | * {@inheritDoc} 73 | * 74 | * @deprecated Please use {@link nextInt()} instead. 
75 | */ 76 | @Deprecated 77 | @Override 78 | public Integer next() { 79 | return Integer.valueOf(nextInt()); 80 | } 81 | 82 | /** 83 | * {@inheritDoc} 84 | */ 85 | @Override 86 | public int nextInt() { 87 | QueuedIterator qiter = iters.peek(); 88 | final int value = qiter.value; 89 | int count = 1; 90 | while (true) { 91 | iters.remove(); 92 | int val; 93 | while ((val = qiter.iter.nextInt()) == value) { 94 | count++; 95 | } 96 | if (val != LAZY_INT_ITERATOR_EMPTY_VALUE) { 97 | qiter.value = val; 98 | iters.add(qiter); 99 | } 100 | if (iters.isEmpty()) { 101 | break; 102 | } 103 | qiter = iters.peek(); 104 | if (qiter.value == value) { 105 | count++; 106 | } else { 107 | break; 108 | } 109 | } 110 | currentCount = count; 111 | return value; 112 | } 113 | 114 | /** 115 | * @return the count how often the last integer (returned by {@link nextInt()}) 116 | * was seen in the input iterators 117 | */ 118 | public int getCount() { 119 | return currentCount; 120 | } 121 | 122 | /** 123 | * {@inheritDoc} 124 | */ 125 | @Override 126 | public int skip(int n) { 127 | int i = 0; 128 | while (i < n && hasNext()) { 129 | nextInt(); 130 | i++; 131 | } 132 | return i; 133 | } 134 | 135 | } 136 | -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/webgraph/CreatePreferenceVector.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2022 Common Crawl and contributors 4 | */ 5 | package org.commoncrawl.webgraph; 6 | 7 | import java.io.DataOutputStream; 8 | import java.io.IOException; 9 | import java.nio.file.Files; 10 | import java.nio.file.Paths; 11 | import java.util.Iterator; 12 | import java.util.Objects; 13 | import java.util.stream.Stream; 14 | 15 | import org.slf4j.Logger; 16 | import org.slf4j.LoggerFactory; 17 | 18 | import it.unimi.dsi.fastutil.longs.LongArrayList; 19 | import it.unimi.dsi.fastutil.longs.LongList; 20 | 21 | /** 22 | * Create a preference vector used for PageRank calculations, e.g., 23 | * (Anti)TrustRank. See PageRank.buildProperties(...). 
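 *
 * The output (see showHelp() below for the exact input and output formats) is
 * a binary vector of doubles, one value per vertex in vertex-ID order:
 * preferred vertices get the preference value (1/n for n preferred vertices,
 * or the value given via --value), all other vertices get 0.0. The resulting
 * file can be passed to the LAW PageRank classes via their
 * --preference-vector option.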
25 | */ 26 | public class CreatePreferenceVector { 27 | 28 | protected static Logger LOG = LoggerFactory.getLogger(CreatePreferenceVector.class); 29 | 30 | private long lastId = 0; 31 | private Iterator preferenceIterator; 32 | private LongList preferenceIds = new LongArrayList(); 33 | private double defaultPreferenceValue; 34 | private String nextPreferenceName; 35 | long recordsProcessed; 36 | long preferenceNamesFound; 37 | 38 | 39 | public CreatePreferenceVector(double defVal) { 40 | defaultPreferenceValue = defVal; 41 | } 42 | 43 | private boolean nextPreferenceElement() { 44 | if (preferenceIterator.hasNext()) { 45 | nextPreferenceName = preferenceIterator.next(); 46 | return true; 47 | } else { 48 | nextPreferenceName = null; 49 | return false; 50 | } 51 | } 52 | 53 | private void setPrefSet(Stream pref) { 54 | preferenceIterator = pref.iterator(); 55 | nextPreferenceElement(); 56 | } 57 | 58 | private void logProgress() { 59 | LOG.info("Processed {} nodes, found {} preference elements", recordsProcessed, preferenceNamesFound); 60 | } 61 | 62 | private long readJoinNode(String line) { 63 | int sep1 = line.indexOf('\t'); 64 | if (sep1 == -1) { 65 | return -1; 66 | } 67 | lastId = Long.parseLong(line.substring(0, sep1)); 68 | sep1++; 69 | int sep2 = line.indexOf('\t', sep1); 70 | if (sep2 == -1) { 71 | sep2 = line.length(); 72 | } 73 | String name = line.substring(sep1, sep2); 74 | long res = -1; 75 | if (nextPreferenceName != null) { 76 | int c = name.compareTo(nextPreferenceName); 77 | while (c > 0 && nextPreferenceElement()) { 78 | c = name.compareTo(nextPreferenceName); 79 | } 80 | if (c == 0) { 81 | preferenceNamesFound++; 82 | nextPreferenceElement(); 83 | res = lastId; 84 | } 85 | } 86 | recordsProcessed++; 87 | if ((recordsProcessed % 1000000) == 0) { 88 | logProgress(); 89 | } 90 | return res; 91 | } 92 | 93 | private Double convertNode(String line) { 94 | if (readJoinNode(line) < 0) { 95 | return 0.0; 96 | } 97 | return defaultPreferenceValue; 98 | } 99 | 100 | private void read(Stream in) { 101 | in.map(this::readJoinNode).forEach(id -> { 102 | if (id >= 0) { 103 | preferenceIds.add((long) id); 104 | } 105 | }); 106 | } 107 | 108 | private void write(DataOutputStream out) throws IOException { 109 | long id = 0; 110 | Iterator prefIdIter = preferenceIds.iterator(); 111 | long nextPrefId = Long.MAX_VALUE; 112 | if (prefIdIter.hasNext()) { 113 | nextPrefId = prefIdIter.next(); 114 | } 115 | defaultPreferenceValue = 1.0 / preferenceIds.size(); 116 | LOG.info("Preference value = {}", defaultPreferenceValue); 117 | while (id <= lastId) { 118 | double res = 0.0; 119 | if (id == nextPrefId) { 120 | res = defaultPreferenceValue; 121 | if (prefIdIter.hasNext()) { 122 | nextPrefId = prefIdIter.next(); 123 | } else { 124 | nextPrefId = Long.MAX_VALUE; 125 | } 126 | } 127 | out.writeDouble(res); 128 | id++; 129 | if ((id % 1000000) == 0) { 130 | LOG.info("{}% of preference vector written", String.format("%.2f", (100.0 * id / lastId))); 131 | } 132 | } 133 | } 134 | 135 | private void convert(Stream in, DataOutputStream out) { 136 | in.map(this::convertNode).filter(Objects::nonNull).forEach(t -> { 137 | try { 138 | out.writeDouble(t); 139 | } catch (IOException e) { 140 | LOG.error("Failed to write preference vector:", e); 141 | System.exit(1); 142 | } 143 | }); 144 | } 145 | 146 | /** 147 | * Check preference vector whether values sum up to 1.0, see isStochastic() 149 | */ 150 | private boolean validatePreferenceVector() { 151 | double sumPreferenceValues = preferenceNamesFound * 
defaultPreferenceValue; 152 | if (Math.abs(sumPreferenceValues - 1.0) > 1E-6) { 153 | LOG.error("Sum of preference values not within tolerance: abs({} - 1.0) > {}", sumPreferenceValues, 1E-6); 154 | return false; 155 | } 156 | return true; 157 | } 158 | 159 | private static void showHelp() { 160 | System.err.println( 161 | "CreatePreferenceVector [--value ] "); 162 | System.err.println(""); 163 | System.err.println("Options:"); 164 | System.err.println(" --value \tprecalculated preference value"); 165 | System.err.println(" \t1/n for n preferred vertices)\");"); 166 | System.err.println("If no preference value is given, the preference set is kept"); 167 | System.err.println("in memory, and the preference value is calculated using"); 168 | System.err.println("the number of found preference elements"); 169 | System.err.println(""); 170 | System.err.println("Input / output parameters"); 171 | System.err.println(" \tvertices file with format:"); 172 | System.err.println(" \t \\t "); 173 | System.err.println(" \tfile containing set of \"preferred\" vertices,"); 174 | System.err.println(" \tone vertex per line"); 175 | System.err.println(" \toutput file, binary preference vector,"); 176 | System.err.println(" \tused as \"--preference-vector\""); 177 | System.err.println(" \tfor the LAW PageRank classes"); 178 | System.err.println("Both input files, vertices and preference set, must be sorted"); 179 | System.err.println("lexicographically by vertex names, vertex ids are assigned"); 180 | System.err.println("in sequential order starting from 0."); 181 | System.err.println(""); 182 | } 183 | 184 | public static void main(String[] args) { 185 | double defaultPrefVal = 0.0; 186 | boolean inMemory = true; 187 | int argpos = 0; 188 | while (argpos < args.length && args[argpos].startsWith("-")) { 189 | switch (args[argpos]) { 190 | case "--value": 191 | try { 192 | defaultPrefVal = Double.parseDouble(args[++argpos]); 193 | } catch (NumberFormatException e) { 194 | LOG.error("Invalid number: " + args[argpos]); 195 | System.exit(1); 196 | } 197 | inMemory = false; 198 | break; 199 | default: 200 | System.err.println("Unknown option " + args[argpos]); 201 | showHelp(); 202 | System.exit(1); 203 | } 204 | argpos++; 205 | } 206 | 207 | if (args.length < 3) { 208 | showHelp(); 209 | System.exit(1); 210 | } 211 | String nodesIn = args[argpos++]; 212 | String prefSet = args[argpos++]; 213 | String prefOut = args[argpos++]; 214 | 215 | CreatePreferenceVector converter = new CreatePreferenceVector(defaultPrefVal); 216 | 217 | try (Stream in = Files.lines(Paths.get(nodesIn)); 218 | Stream pref = Files.lines(Paths.get(prefSet))) { 219 | DataOutputStream out; 220 | if (prefOut.equals("-")) { 221 | out = new DataOutputStream(System.out); 222 | } else { 223 | out = new DataOutputStream(Files.newOutputStream(Paths.get(prefOut))); 224 | } 225 | converter.setPrefSet(pref); 226 | if (inMemory) { 227 | LOG.info("Reading preference vector..."); 228 | converter.read(in); 229 | LOG.info("Writing preference vector..."); 230 | converter.write(out); 231 | } else { 232 | LOG.info("Converting preference vector..."); 233 | converter.convert(in, out); 234 | } 235 | converter.logProgress(); 236 | if (!converter.validatePreferenceVector()) { 237 | System.exit(2); 238 | } 239 | } catch (IOException e) { 240 | LOG.error("Failed to create preference vector:", e); 241 | System.exit(1); 242 | } 243 | } 244 | 245 | } -------------------------------------------------------------------------------- 
/src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2022 Common Crawl and contributors 4 | */ 5 | package org.commoncrawl.webgraph; 6 | 7 | import java.io.IOException; 8 | import java.io.PrintStream; 9 | import java.nio.charset.StandardCharsets; 10 | import java.nio.file.Files; 11 | import java.nio.file.Paths; 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | import java.util.Objects; 15 | import java.util.TreeMap; 16 | import java.util.function.Consumer; 17 | import java.util.function.Function; 18 | import java.util.regex.Pattern; 19 | import java.util.stream.Stream; 20 | 21 | import org.slf4j.Logger; 22 | import org.slf4j.LoggerFactory; 23 | 24 | import crawlercommons.domains.EffectiveTldFinder; 25 | import it.unimi.dsi.fastutil.Arrays; 26 | import it.unimi.dsi.fastutil.BigArrays; 27 | import it.unimi.dsi.fastutil.longs.LongBigArrays; 28 | 29 | /** 30 | * Convert host-level webgraph to domain-level webgraph. A webgraph is 31 | * represented by two text files/streams with tab-separated columns 32 | *
vertices: <id, revName>
 * edges: <fromId, toId>
 *
 * Host or domain names are reversed (www.example.com is written as
 * com.example.www). The vertices file is sorted lexicographically by host
 * name in reverse domain name notation. IDs (0,1,...,n) are assigned in this
 * sort order. The edges file is sorted numerically, first by fromId, second
 * by toId. These sorting restrictions allow converting large host graphs with
 * acceptable memory requirements (number of hosts × 4 bytes, plus some memory
 * to queue domains until all hosts under a domain are processed).
 *
 * Notes, assumptions and preconditions:
 *
 * - host vertices must be sorted lexicographically by reversed host name,
 *   see above
 * - the host-domain map is held as an array. To overcome Java's max array
 *   size (approx. 2^31 or {@link Arrays#MAX_ARRAY_SIZE}),
 *   {@link HostToDomainGraphBig} (based on fastutil's {@link BigArrays}) is
 *   used if the array size limit is hit by the number of hosts. This number
 *   (or an estimate) needs to be known ahead of time.
 * - the number of resulting domains is limited by Java's max. array size.
 *   This shouldn't be a problem.
 * - the number of hosts per domain is also limited by Java's max. array size.
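 *
 * Illustrative example (hypothetical input, added for clarity): given the
 * sorted host vertices "0 com.example.api", "1 com.example.www" and
 * "2 org.wikipedia.en", the folded domain vertices are "0 com.example" and
 * "1 org.wikipedia" (with host counts 2 and 1 if -c is given), and the host
 * edge 1 -> 2 becomes the domain edge 0 -> 1.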
64 | */ 65 | public class HostToDomainGraph { 66 | 67 | protected static Logger LOG = LoggerFactory.getLogger(HostToDomainGraph.class); 68 | 69 | protected boolean countHosts = false; 70 | protected boolean privateDomains = false; 71 | protected boolean includeMultiPartSuffixes = false; 72 | 73 | protected long maxSize; 74 | private int[] ids; 75 | protected long currentId = -1; 76 | protected long lastFromId = -1; 77 | protected long lastToId = -1; 78 | private long numInputLinesNodes = 0; 79 | private long numInputLinesEdges = 0; 80 | protected String lastRevHost = null; 81 | protected Domain lastDomain = null; 82 | private TreeMap domainQueue = new TreeMap<>(); 83 | private int maxQueueUsed = 0; 84 | 85 | private static Pattern SPLIT_HOST_PATTERN = Pattern.compile("\\."); 86 | 87 | private Consumer reporterInputNodes = (String line) -> { 88 | if ((numInputLinesNodes % 500000) != 0 || numInputLinesNodes == 0) { 89 | return; 90 | } 91 | LOG.info("Processed {} node input lines, mapped to {} domains, domain queue usage: {} (max. {})", 92 | numInputLinesNodes, (currentId + 1), domainQueue.size(), maxQueueUsed); 93 | }; 94 | 95 | private Consumer reporterInputEdges = (String line) -> { 96 | if ((numInputLinesEdges % 5000000) != 0 || numInputLinesEdges == 0) { 97 | return; 98 | } 99 | LOG.info("Processed {} edge input lines, last edge from node id = {}", numInputLinesEdges, lastFromId); 100 | }; 101 | 102 | private void reportConfig() { 103 | LOG.info("{} with {} host vertices", this.getClass().getSimpleName(), maxSize); 104 | LOG.info(" - map to {} domains", (privateDomains ? "private" : "ICANN")); 105 | LOG.info(" - {}multi-part public suffixes as domains", (includeMultiPartSuffixes ? "" : "no ")); 106 | } 107 | 108 | /** 109 | * Representation of a domain as a result of folding one or more host names to a 110 | * domain name. Holds all information for the given domain to convert host 111 | * vertices and associated edges into a domain graph. 112 | */ 113 | protected static class Domain implements Comparable { 114 | final static char HYPHEN = '-'; 115 | final static char DOT = '.'; 116 | String name; 117 | String revName; 118 | long id; 119 | long numberOfHosts; 120 | List ids = new ArrayList<>(); 121 | 122 | public Domain(String name, String revName, long id, long numberOfHosts) { 123 | this.name = name; 124 | this.revName = revName; 125 | this.id = id; 126 | this.numberOfHosts = numberOfHosts; 127 | } 128 | 129 | public Domain(String name, long id, long numberOfHosts) { 130 | this(name, reverseHost(name), id, numberOfHosts); 131 | } 132 | 133 | public Domain(String name) { 134 | this(name, -1, 0); 135 | } 136 | 137 | public Domain(String name, String revName) { 138 | this(name, revName, -1, 0); 139 | } 140 | 141 | public Domain(String name, long hostId) { 142 | this(name, -1, 0); 143 | add(hostId); 144 | } 145 | 146 | public void add(long hostId) { 147 | ids.add(hostId); 148 | numberOfHosts++; 149 | } 150 | 151 | @Override 152 | public String toString() { 153 | return name; 154 | } 155 | 156 | @Override 157 | public int compareTo(Domain o) { 158 | return revName.compareTo(o.revName); 159 | } 160 | 161 | /** 162 | * Whether the domain is safe to output given the reversed domain name seen 163 | * next. 
164 | * 165 | * @param nextDomainRevName next name in lexicographically sorted list of 166 | * reversed domain names 167 | * @return true if the domain is safe to output, that is from a list of sorted 168 | * host names no host later in this list may fold to this domain name 169 | */ 170 | public boolean isSafeToOutput(String nextDomainRevName) { 171 | return isSafeToOutput(this.revName, nextDomainRevName); 172 | } 173 | 174 | public static boolean isSafeToOutput(String domainRevName, String nextDomainRevName) { 175 | return compareRevDomainsSafe(domainRevName, nextDomainRevName) < 0; 176 | } 177 | 178 | public static int compareRevDomainsSafe(String d1, String d2) { 179 | int l1 = d1.length(); 180 | int l2 = d2.length(); 181 | int l = Math.min(l1, l2); 182 | int dots = 0; 183 | for (int i = 0; i < l; i++) { 184 | char c1 = d1.charAt(i); 185 | char c2 = d2.charAt(i); 186 | if (c1 != c2) { 187 | return c1 - c2; 188 | } else if (c1 == HYPHEN) { 189 | /* 190 | * cannot finish "org.example-domain" unless "org.example" is done 191 | */ 192 | return 0; 193 | } else if (c1 == DOT) { 194 | dots++; 195 | if (dots > 1) { 196 | /* 197 | * cannot finish "name.his.forgot.foobar" unless "name.his" is done 198 | * 199 | * This is a special case of multi-part suffixes with more than two parts when 200 | * the first part is also a public suffix, e.g. (in reversed domain name 201 | * notation) if "a" and "a.b.c" are public suffixes, and the input hosts are 202 | * (sorted): "a.b.c.d", "a.b.c.e" and "a.b.f", then we need to delay the output 203 | * of "a.b.c.*" until "a.b" is done. 204 | */ 205 | return 0; 206 | } 207 | } 208 | } 209 | if (l1 == l2) { 210 | return 0; 211 | } 212 | if (l1 > l2) { 213 | char c1 = d1.charAt(l2); 214 | switch (c1) { 215 | case HYPHEN: 216 | /* 217 | * cannot finish "org.example-domain" unless "org.example" is done 218 | */ 219 | case DOT: 220 | // cannot finish "tld.suffix.suffix2.domain" unless "tld.suffix" is done 221 | return 1; 222 | } 223 | return c1 - DOT; 224 | } 225 | char c2 = d2.charAt(l1); 226 | if (c2 == HYPHEN || c2 == DOT) 227 | return 1; 228 | return DOT - c2; 229 | } 230 | } 231 | 232 | private HostToDomainGraph() { 233 | } 234 | 235 | public HostToDomainGraph(int maxSize) { 236 | this.maxSize = maxSize; 237 | ids = new int[maxSize]; 238 | } 239 | 240 | /** 241 | * @param countHosts if true count the number of hosts per domain 242 | */ 243 | public void doCount(boolean countHosts) { 244 | this.countHosts = countHosts; 245 | } 246 | 247 | /** 248 | * @param privateDomains if true map host to domain names using also the 249 | * suffixes from the subdivision 251 | * of "private domains" in the public suffix list in 252 | * addition to the "ICANN domains" used otherwise 253 | */ 254 | public void doPrivateDomains(boolean privateDomains) { 255 | this.privateDomains = privateDomains; 256 | } 257 | 258 | /** 259 | * deprecated, use {@link #multiPartSuffixesAsDomains(boolean)} instead (note 260 | * that this requires to invert boolean parameter) 261 | * 262 | * @param strict if false map host names equal to any multi-part public suffix 263 | * (the suffix contains a dot) (eg. gov.uk or 264 | * freight.aero) one by one to domain names. 265 | */ 266 | @Deprecated 267 | public void setStrictDomainValidate(boolean strict) { 268 | this.includeMultiPartSuffixes = !strict; 269 | } 270 | 271 | /** 272 | * @param include if true map host names equal to any multi-part public suffix 273 | * (the suffix contains a dot) (eg. gov.uk or 274 | * freight.aero) one by one to domain names. 
275 | */ 276 | public void multiPartSuffixesAsDomains(boolean include) { 277 | this.includeMultiPartSuffixes = include; 278 | } 279 | 280 | /** 281 | * Reverse host name, eg. www.example.com is reversed to 282 | * com.example.www. Can also be used to "unreverse" a reversed host 283 | * name. 284 | * 285 | * @param host name 286 | * @return host in reverse 288 | * domain name notation 289 | */ 290 | public static String reverseHost(String host) { 291 | String[] rev = SPLIT_HOST_PATTERN.split(host); 292 | for (int i = 0; i < (rev.length / 2); i++) { 293 | String temp = rev[i]; 294 | rev[i] = rev[rev.length - i - 1]; 295 | rev[rev.length - i - 1] = temp; 296 | } 297 | return String.join(".", rev); 298 | } 299 | 300 | protected void setValue(long id, long value) { 301 | ids[(int) id] = (int) value; 302 | } 303 | 304 | protected long getValue(long id) { 305 | return ids[(int) id]; 306 | } 307 | 308 | public String convertNode(String line) { 309 | numInputLinesNodes++; 310 | int sep = line.indexOf('\t'); 311 | if (sep == -1) { 312 | LOG.warn("Skipping invalid line: <{}>", line); 313 | return ""; 314 | } 315 | long id = Long.parseLong(line.substring(0, sep)); 316 | String revHost = line.substring(sep + 1); 317 | if (lastRevHost != null) { 318 | if (lastRevHost.compareTo(revHost) >= 0) { 319 | String msg = "Reversed host names in input are not properly sorted: " + lastRevHost + " <> " + revHost; 320 | LOG.error(msg); 321 | throw new RuntimeException(msg); 322 | } 323 | } 324 | lastRevHost = revHost; 325 | String host = reverseHost(revHost); 326 | String domain = EffectiveTldFinder.getAssignedDomain(host, true, !privateDomains); 327 | StringBuilder sb = new StringBuilder(); 328 | if (domain == null && includeMultiPartSuffixes) { 329 | if (EffectiveTldFinder.getEffectiveTLDs().containsKey(host) && host.indexOf('.') != -1) { 330 | LOG.info("Accepting public suffix (containing dot) as domain: {}", host); 331 | } 332 | domain = host; 333 | } 334 | if (domain == null) { 335 | LOG.warn("No domain for host: {}", host); 336 | setValue(id, -1); 337 | return null; 338 | } 339 | if (lastDomain != null && domain.equals(lastDomain.name)) { 340 | // short cut for the common case of many subsequent subdomains of the same domain 341 | lastDomain.add(id); 342 | return null; 343 | } 344 | lastDomain = queueDomain(sb, domain); 345 | if (lastDomain != null) { 346 | lastDomain.add(id); 347 | } 348 | if (sb.length() == 0) { 349 | return null; 350 | } 351 | return sb.toString(); 352 | } 353 | 354 | /** 355 | * Add the domain name to the queue if it is not already queued. Flush the 356 | * queue, assuming properly sorted input. 357 | * 358 | * @param sb domains which are safe to print are added to this 359 | * StringBuilder. 360 | * @param domainName domain name to be queued 361 | * @return the queued domain object 362 | */ 363 | private Domain queueDomain(StringBuilder sb, String domainName) { 364 | String revDomainName = reverseHost(domainName); 365 | Domain domain = null; 366 | // first, poll all queued domains safe to output 367 | while (!domainQueue.isEmpty()) { 368 | String firstDomain = domainQueue.firstKey(); 369 | if (!Domain.isSafeToOutput(firstDomain, revDomainName)) { 370 | /* 371 | * queued domains are sorted lexicographically: if the first/current domain 372 | * cannot be safely dequeued and written to output, this is also the case for 373 | * the following ones. 
374 | */ 375 | break; 376 | } 377 | Domain d = domainQueue.pollFirstEntry().getValue(); 378 | d.id = ++currentId; 379 | getNodeLine(sb, d); 380 | } 381 | if (domainQueue.containsKey(revDomainName)) { 382 | domain = domainQueue.get(revDomainName); 383 | } else { 384 | domain = new Domain(domainName); 385 | domainQueue.put(revDomainName, domain); 386 | if (domainQueue.size() > maxQueueUsed) { 387 | maxQueueUsed = domainQueue.size(); 388 | } 389 | } 390 | return domain; 391 | } 392 | 393 | private String getNodeLine(Domain domain) { 394 | StringBuilder b = new StringBuilder(); 395 | getNodeLine(b, domain); 396 | return b.toString(); 397 | } 398 | 399 | private void getNodeLine(StringBuilder b, Domain domain) { 400 | if (domain == null) 401 | return; 402 | if (domain.id >= 0 && domain.name != null) { 403 | if (b.length() > 0) { 404 | b.append('\n'); 405 | } 406 | b.append(domain.id); 407 | b.append('\t'); 408 | b.append(reverseHost(domain.name)); 409 | if (countHosts) { 410 | b.append('\t'); 411 | b.append(domain.numberOfHosts); 412 | } 413 | } 414 | for (Long hostId : domain.ids) { 415 | setValue(hostId.longValue(), domain.id); 416 | } 417 | } 418 | 419 | public String convertEdge(String line) { 420 | numInputLinesEdges++; 421 | int sep = line.indexOf('\t'); 422 | if (sep == -1) { 423 | return ""; 424 | } 425 | long fromId = Long.parseLong(line.substring(0, sep)); 426 | long toId = Long.parseLong(line.substring(sep + 1)); 427 | fromId = getValue(fromId); 428 | toId = getValue(toId); 429 | if (fromId == toId || fromId == -1 || toId == -1 || (lastFromId == fromId && lastToId == toId)) { 430 | return null; 431 | } 432 | lastFromId = fromId; 433 | lastToId = toId; 434 | return fromId + "\t" + toId; 435 | } 436 | 437 | public void convert(Function func, Stream in, PrintStream out) { 438 | in.map(func).filter(Objects::nonNull).forEach(out::println); 439 | } 440 | 441 | public void convert(Function func, Stream in, PrintStream out, 442 | Consumer reporter) { 443 | convert(func, in.peek(reporter), out); 444 | } 445 | 446 | public void finishNodes(PrintStream out) { 447 | for (Domain domain : domainQueue.values()) { 448 | domain.id = ++currentId; 449 | out.println(getNodeLine(domain)); 450 | } 451 | out.flush(); 452 | domainQueue.clear(); 453 | LOG.info("Number of input lines: {}", numInputLinesNodes); 454 | LOG.info("Number of domain nodes: {}", currentId + 1); 455 | LOG.info("Max. domain queue usage: {}", maxQueueUsed); 456 | } 457 | 458 | /** 459 | * Holds a host to domain graph mapping if the size of the host graph exceeds 460 | * {@link Arrays#MAX_ARRAY_SIZE}. 461 | */ 462 | public static class HostToDomainGraphBig extends HostToDomainGraph { 463 | 464 | private long[][] ids; 465 | 466 | public HostToDomainGraphBig(long maxSize) { 467 | this.maxSize = maxSize; 468 | ids = LongBigArrays.newBigArray(maxSize); 469 | } 470 | 471 | @Override 472 | protected void setValue(long id, long value) { 473 | BigArrays.set(ids, id, value); 474 | } 475 | 476 | @Override 477 | protected long getValue(long id) { 478 | return BigArrays.get(ids, id); 479 | } 480 | } 481 | 482 | private static void showHelp() { 483 | System.err.println("HostToDomainGraph [options]... 
"); 484 | System.err.println(""); 485 | System.err.println("Convert host-level webgraph to domain-level webgraph."); 486 | System.err.println("Both input and output must be UTF-8 or ASCII, the input is required"); 487 | System.err.println("to be sorted lexicographically by node labels given in reversed domain name notation."); 488 | System.err.println(""); 489 | System.err.println("Options:"); 490 | System.err.println(" -h\t(also -? or --help) show usage message and exit"); 491 | System.err.println(" -c\tcount hosts per domain (additional column in "); 492 | System.err.println(" --private-domains\tconvert to private domains (include suffixes from the"); 493 | System.err.println(" \tPRIVATE domains subdivision of the public suffix list,"); 494 | System.err.println(" \tsee https://github.com/publicsuffix/list/wiki/Format#divisions"); 495 | System.err.println(" --multipart-suffixes-as-domains\toutput host names which are equal to multi-part"); 496 | System.err.println(" \tpublic suffixes (the suffix contains a dot) as domain"); 497 | System.err.println(" \tnames, eg. `gov.uk', `freight.aero' or `altoadige.it'."); 498 | System.err.println(" \tNo further validation (DNS lookup) is performed."); 499 | } 500 | 501 | public static void main(String[] args) { 502 | boolean countHosts = false; 503 | boolean includeMultiPartSuffixes = false; 504 | boolean privateDomains = false; 505 | int argpos = 0; 506 | while (argpos < args.length && args[argpos].startsWith("-")) { 507 | switch (args[argpos]) { 508 | case "-?": 509 | case "-h": 510 | case "--help": 511 | showHelp(); 512 | System.exit(0); 513 | case "-c": 514 | countHosts = true; 515 | break; 516 | case "--multipart-suffixes-as-domains": 517 | case "--no-strict-domain-validate": // back-ward compatibility 518 | includeMultiPartSuffixes = true; 519 | break; 520 | case "--private-domains": 521 | case "--private": // back-ward compatibility 522 | privateDomains = true; 523 | break; 524 | default: 525 | System.err.println("Unknown option " + args[argpos]); 526 | showHelp(); 527 | System.exit(1); 528 | } 529 | argpos++; 530 | } 531 | if ((args.length - argpos) < 5) { 532 | showHelp(); 533 | System.exit(1); 534 | } 535 | long maxSize = 0; 536 | try { 537 | maxSize = Long.parseLong(args[argpos + 0]); 538 | } catch (NumberFormatException e) { 539 | LOG.error("Invalid number: " + args[argpos + 0]); 540 | System.exit(1); 541 | } 542 | HostToDomainGraph converter; 543 | if (maxSize <= Arrays.MAX_ARRAY_SIZE) { 544 | converter = new HostToDomainGraph((int) maxSize); 545 | } else { 546 | converter = new HostToDomainGraphBig(maxSize); 547 | } 548 | converter.doCount(countHosts); 549 | converter.multiPartSuffixesAsDomains(includeMultiPartSuffixes); 550 | converter.doPrivateDomains(privateDomains); 551 | converter.reportConfig(); 552 | String nodesIn = args[argpos + 1]; 553 | String nodesOut = args[argpos + 2]; 554 | try (Stream in = Files.lines(Paths.get(nodesIn), StandardCharsets.UTF_8); 555 | PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(nodesOut)), false, 556 | StandardCharsets.UTF_8)) { 557 | converter.convert(converter::convertNode, in, out, converter.reporterInputNodes); 558 | converter.finishNodes(out); 559 | LOG.info("Finished conversion of nodes/vertices"); 560 | } catch (IOException e) { 561 | LOG.error("Failed to convert nodes", e); 562 | System.exit(1); 563 | } 564 | String edgesIn = args[argpos + 3]; 565 | String edgesOut = args[argpos + 4]; 566 | try (Stream in = Files.lines(Paths.get(edgesIn), StandardCharsets.UTF_8); 567 | 
PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(edgesOut)), false, 568 | StandardCharsets.UTF_8)) { 569 | converter.convert(converter::convertEdge, in, out, converter.reporterInputEdges); 570 | LOG.info("Finished conversion of edges"); 571 | } catch (IOException e) { 572 | LOG.error("Failed to convert edges", e); 573 | System.exit(1); 574 | } 575 | } 576 | 577 | } 578 | -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/webgraph/JoinSortRanks.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2022 Common Crawl and contributors 4 | */ 5 | package org.commoncrawl.webgraph; 6 | 7 | import java.io.IOException; 8 | import java.io.OutputStream; 9 | import java.io.PrintStream; 10 | import java.nio.charset.StandardCharsets; 11 | import java.nio.file.Files; 12 | import java.nio.file.Paths; 13 | import java.util.function.Function; 14 | import java.util.stream.Stream; 15 | 16 | import org.slf4j.Logger; 17 | import org.slf4j.LoggerFactory; 18 | 19 | import it.unimi.dsi.fastutil.Arrays; 20 | import it.unimi.dsi.fastutil.BigArrays; 21 | import it.unimi.dsi.fastutil.ints.IntArrays; 22 | import it.unimi.dsi.fastutil.ints.IntComparator; 23 | import it.unimi.dsi.fastutil.io.BinIO; 24 | import it.unimi.dsi.fastutil.longs.LongBigArrays; 25 | import it.unimi.dsi.fastutil.longs.LongComparator; 26 | 27 | /** 28 | * Assign ranks to harmonic centrality and page rank values, join ranks with 29 | * node names and sort by decreasing harmonic centrality rank/score. 30 | * 31 | * Sorting and joining is done in memory. For a graph with n nodes, the 32 | * required memory is 24 * n bytes, resp. 36 * n bytes if n 33 | * > {@link Arrays#MAX_ARRAY_SIZE}. In practice, the requirements are higher 34 | * by about 50%. 
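 *
 * Added worked example (annotation, not part of the original source): the 24
 * resp. 36 bytes per node correspond to the per-node arrays held on the heap:
 *   standard arrays (n not larger than Arrays.MAX_ARRAY_SIZE):
 *     4 (float HC value) + 8 (double PR value) + 4 (int HC rank)
 *       + 4 (int PR rank) + 4 (int sort permutation) = 24 bytes per node
 *   big arrays: 4 + 8 + 8 (long HC rank) + 8 (long PR rank) + 8 (long permutation)
 *       = 36 bytes per node
 * For example, a graph with 100 million nodes needs about 2.4 GB, i.e. plan for
 * roughly 3.6 GB of Java heap including the ~50% practical overhead.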
35 | */ 36 | public class JoinSortRanks { 37 | 38 | protected static Logger LOG = LoggerFactory.getLogger(JoinSortRanks.class); 39 | 40 | private float[] harmonicCentralityValues; 41 | private double[] pageRankValues; 42 | 43 | private int[] harmonicCentralityRanks; 44 | private int[] pageRankRanks; 45 | private int[] indirectSortPerm; 46 | 47 | public void loadHarmonicCentrality(String ranksHC) throws IOException { 48 | harmonicCentralityValues = BinIO.loadFloats(ranksHC); 49 | harmonicCentralityRanks = new int[harmonicCentralityValues.length]; 50 | } 51 | 52 | public void loadPageRank(String ranksPR) throws IOException { 53 | pageRankValues = BinIO.loadDoubles(ranksPR); 54 | pageRankRanks = new int[pageRankValues.length]; 55 | } 56 | 57 | private int compareHarmonicCentralityIndirect(int k1, int k2) { 58 | k1 = indirectSortPerm[k1]; 59 | k2 = indirectSortPerm[k2]; 60 | float f1 = harmonicCentralityValues[k1]; 61 | float f2 = harmonicCentralityValues[k2]; 62 | // sort in reverse order, higher values first 63 | if (f1 < f2) { 64 | return 1; 65 | } 66 | if (f1 > f2) { 67 | return -1; 68 | } 69 | // secondary sorting by original order (node IDs) 70 | return Integer.compare(k1, k2); 71 | } 72 | 73 | private int comparePageRankIndirect(int k1, int k2) { 74 | k1 = indirectSortPerm[k1]; 75 | k2 = indirectSortPerm[k2]; 76 | double f1 = pageRankValues[k1]; 77 | double f2 = pageRankValues[k2]; 78 | // sort in reverse order, higher values first 79 | if (f1 < f2) { 80 | return 1; 81 | } 82 | if (f1 > f2) { 83 | return -1; 84 | } 85 | // secondary sorting by original order (node IDs) 86 | return Integer.compare(k1, k2); 87 | } 88 | 89 | private void swapIndirect(int k1, int k2) { 90 | IntArrays.swap(indirectSortPerm, k1, k2); 91 | } 92 | 93 | private void assignRank(int[] ranks, IntComparator comp) { 94 | int length = ranks.length; 95 | indirectSortPerm = new int[length]; 96 | for (int i = 0; i < length; i++) { 97 | indirectSortPerm[i] = i; 98 | } 99 | Arrays.parallelQuickSort(0, length, comp, this::swapIndirect); 100 | for (int i = 0; i < length; ) { 101 | ranks[indirectSortPerm[i]] = ++i; 102 | } 103 | indirectSortPerm = null; 104 | } 105 | 106 | public void assignHarmonicCentralityRank() { 107 | assignRank(harmonicCentralityRanks, this::compareHarmonicCentralityIndirect); 108 | } 109 | 110 | public void assignPageRankRank() { 111 | assignRank(pageRankRanks, this::comparePageRankIndirect); 112 | } 113 | 114 | protected float getHarmonicCentralityValue(long id) { 115 | return harmonicCentralityValues[(int) id]; 116 | } 117 | 118 | protected long getHarmonicCentralityRank(long id) { 119 | return harmonicCentralityRanks[(int) id]; 120 | } 121 | 122 | protected double getPageRankValue(long id) { 123 | return pageRankValues[(int) id]; 124 | } 125 | 126 | protected long getPageRankRank(long id) { 127 | return pageRankRanks[(int) id]; 128 | } 129 | 130 | public void convert(Function func, Stream in, PrintStream out) { 131 | in.map(func).forEach(out::println); 132 | } 133 | 134 | public String addRanks(String line) { 135 | int sep = line.indexOf('\t'); 136 | if (sep == -1) { 137 | return ""; 138 | } 139 | long id = Long.parseLong(line.substring(0, sep)); 140 | // check whether new line is already contained 141 | int end = line.lastIndexOf('\n'); 142 | String revHost = line.substring(sep+1); 143 | float hcv = getHarmonicCentralityValue(id); 144 | long hcr = getHarmonicCentralityRank(id); 145 | double prv = getPageRankValue(id); 146 | long prr = getPageRankRank(id); 147 | StringBuilder sb = new 
StringBuilder(); 148 | sb.append(hcr); 149 | sb.append('\t'); 150 | sb.append(hcv); 151 | sb.append('\t'); 152 | sb.append(prr); 153 | sb.append('\t'); 154 | sb.append(prv); 155 | sb.append('\t'); 156 | sb.append(revHost); 157 | if (end != -1) { 158 | sb.append('\n'); 159 | } 160 | return sb.toString(); 161 | } 162 | 163 | 164 | /** 165 | * Implementation of {@link JoinSortRanks} for lists exceeding 166 | * {@link Arrays#MAX_ARRAY_SIZE}. 167 | */ 168 | public static class JoinSortRanksBig extends JoinSortRanks { 169 | 170 | private float[][] harmonicCentralityValues; 171 | private double[][] pageRankValues; 172 | 173 | private long[][] harmonicCentralityRanks; 174 | private long[][] pageRankRanks; 175 | private long[][] indirectSortPerm; 176 | 177 | public void loadHarmonicCentrality(String ranksFile) throws IOException { 178 | harmonicCentralityValues = BinIO.loadFloatsBig(ranksFile); 179 | long length = BigArrays.length(harmonicCentralityValues); 180 | harmonicCentralityRanks = LongBigArrays.newBigArray(length); 181 | } 182 | 183 | public void loadPageRank(String ranksFile) throws IOException { 184 | pageRankValues = BinIO.loadDoublesBig(ranksFile); 185 | long length = BigArrays.length(pageRankValues); 186 | pageRankRanks = LongBigArrays.newBigArray(length); 187 | } 188 | 189 | private int compareHarmonicCentralityIndirect(long k1, long k2) { 190 | k1 = BigArrays.get(indirectSortPerm, k1); 191 | k2 = BigArrays.get(indirectSortPerm, k2); 192 | float f1 = BigArrays.get(harmonicCentralityValues, k1); 193 | float f2 = BigArrays.get(harmonicCentralityValues, k2); 194 | // sort in reverse order, higher values first 195 | if (f1 < f2) { 196 | return 1; 197 | } 198 | if (f1 > f2) { 199 | return -1; 200 | } 201 | // secondary sorting by original order (node IDs) 202 | return Long.compare(k1, k2); 203 | } 204 | 205 | private int comparePageRankIndirect(long k1, long k2) { 206 | k1 = BigArrays.get(indirectSortPerm, k1); 207 | k2 = BigArrays.get(indirectSortPerm, k2); 208 | double f1 = BigArrays.get(pageRankValues, k1); 209 | double f2 = BigArrays.get(pageRankValues, k2); 210 | // sort in reverse order, higher values first 211 | if (f1 < f2) { 212 | return 1; 213 | } 214 | if (f1 > f2) { 215 | return -1; 216 | } 217 | // secondary sorting by original order (node IDs) 218 | return Long.compare(k1, k2); 219 | } 220 | 221 | private void swapIndirect(long k1, long k2) { 222 | BigArrays.swap(indirectSortPerm, k1, k2); 223 | } 224 | 225 | private void assignRank(long[][] ranks, LongComparator comp) { 226 | long length = BigArrays.length(ranks); 227 | indirectSortPerm = LongBigArrays.newBigArray(length); 228 | for (long i = 0; i < length; i++) { 229 | BigArrays.set(indirectSortPerm, i, i); 230 | } 231 | BigArrays.quickSort(0, length, comp, this::swapIndirect); 232 | for (long i = 0; i < length; ) { 233 | BigArrays.set(ranks, BigArrays.get(indirectSortPerm, i), ++i); 234 | } 235 | indirectSortPerm = null; 236 | } 237 | 238 | public void assignHarmonicCentralityRank() { 239 | assignRank(harmonicCentralityRanks, this::compareHarmonicCentralityIndirect); 240 | } 241 | 242 | public void assignPageRankRank() { 243 | assignRank(pageRankRanks, this::comparePageRankIndirect); 244 | } 245 | 246 | protected float getHarmonicCentralityValue(long id) { 247 | return BigArrays.get(harmonicCentralityValues, id); 248 | } 249 | 250 | protected long getHarmonicCentralityRank(long id) { 251 | return BigArrays.get(harmonicCentralityRanks, id); 252 | } 253 | 254 | protected double getPageRankValue(long id) { 255 | return 
BigArrays.get(pageRankValues, id); 256 | } 257 | 258 | protected long getPageRankRank(long id) { 259 | return BigArrays.get(pageRankRanks, id); 260 | } 261 | 262 | } 263 | 264 | private static void showHelp() { 265 | System.err.println("JoinSortRanks [--big] "); 266 | System.err.println(""); 267 | System.err.println("Assign ranks to harmonic centrality and page rank values,"); 268 | System.err.println("and join ranks with node names."); 269 | System.err.println(""); 270 | System.err.println("Options:"); 271 | System.err.println(" --big\tgraphs are \"big\" (more than 2^31 nodes)"); 272 | System.err.println(""); 273 | System.err.println("Input / output parameters (text must be UTF-8)"); 274 | System.err.println(" \tvertices file with format:"); 275 | System.err.println(" \t \\t [ \\t ]..."); 276 | System.err.println(" \tharmonic centrality values, binary floats"); 277 | System.err.println(" \tpage rank values, binary doubles"); 278 | System.err.println(" \tranks output, tab-separated:"); 279 | System.err.println(" \t ..."); 280 | System.err.println(""); 281 | } 282 | 283 | public static void main(String[] args) { 284 | boolean useBigGraph = false; 285 | int argpos = 0; 286 | while (argpos < args.length && args[argpos].startsWith("-")) { 287 | switch (args[argpos]) { 288 | case "--big": 289 | useBigGraph = true; 290 | break; 291 | default: 292 | System.err.println("Unknown option " + args[argpos]); 293 | showHelp(); 294 | System.exit(1); 295 | } 296 | argpos++; 297 | } 298 | if ((args.length - argpos) < 4) { 299 | showHelp(); 300 | System.exit(1); 301 | } 302 | JoinSortRanks converter; 303 | if (useBigGraph) { 304 | converter = new JoinSortRanksBig(); 305 | } else { 306 | converter = new JoinSortRanks(); 307 | } 308 | 309 | String nodesIn = args[argpos++]; 310 | String ranksHC = args[argpos++]; 311 | String ranksPR = args[argpos++]; 312 | String ranksOut = args[argpos++]; 313 | try (Stream in = Files.lines(Paths.get(nodesIn), StandardCharsets.UTF_8)) { 314 | OutputStream ranksOutStream; 315 | if (ranksOut.equals("-")) { 316 | ranksOutStream = System.out; 317 | } else { 318 | ranksOutStream = Files.newOutputStream(Paths.get(ranksOut)); 319 | } 320 | PrintStream out = new PrintStream(ranksOutStream, false, StandardCharsets.UTF_8); 321 | LOG.info("Loading harmonic centrality values from {}", ranksHC); 322 | converter.loadHarmonicCentrality(ranksHC); 323 | LOG.info("Loading page rank values from {}", ranksPR); 324 | converter.loadPageRank(ranksPR); 325 | LOG.info("Assigning harmonic centrality ranks"); 326 | converter.assignHarmonicCentralityRank(); 327 | LOG.info("Assigning page rank ranks"); 328 | converter.assignPageRankRank(); 329 | LOG.info("Joining ranks"); 330 | converter.convert(converter::addRanks, in, out); 331 | LOG.info("Finished joining ranks"); 332 | } catch (IOException e) { 333 | LOG.error("Failed to join ranks:", e); 334 | System.exit(1); 335 | } 336 | } 337 | 338 | } 339 | -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/webgraph/explore/Graph.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2024 Common Crawl and contributors 4 | */ 5 | package org.commoncrawl.webgraph.explore; 6 | 7 | import java.io.IOException; 8 | import java.nio.file.Files; 9 | import java.nio.file.Paths; 10 | import java.util.AbstractMap.SimpleEntry; 11 | import java.util.Arrays; 12 | import java.util.Collections; 13 | import 
java.util.LinkedList; 14 | import java.util.List; 15 | import java.util.Map; 16 | import java.util.Map.Entry; 17 | import java.util.PrimitiveIterator; 18 | import java.util.stream.IntStream; 19 | import java.util.stream.Stream; 20 | 21 | import org.commoncrawl.webgraph.CountingMergedIntIterator; 22 | import org.commoncrawl.webgraph.HostToDomainGraph; 23 | import org.slf4j.Logger; 24 | import org.slf4j.LoggerFactory; 25 | 26 | import crawlercommons.domains.EffectiveTldFinder; 27 | import it.unimi.dsi.fastutil.io.BinIO; 28 | import it.unimi.dsi.fastutil.longs.LongArrayList; 29 | import it.unimi.dsi.lang.MutableString; 30 | import it.unimi.dsi.sux4j.mph.GOV4Function; 31 | import it.unimi.dsi.util.FrontCodedStringList; 32 | import it.unimi.dsi.util.ImmutableExternalPrefixMap; 33 | import it.unimi.dsi.util.Interval; 34 | import it.unimi.dsi.util.LiterallySignedStringMap; 35 | import it.unimi.dsi.util.ShiftAddXorSignedStringMap; 36 | import it.unimi.dsi.webgraph.ImmutableGraph; 37 | import it.unimi.dsi.webgraph.LazyIntIterator; 38 | import it.unimi.dsi.webgraph.LazyIntIterators; 39 | 40 | /** 41 | * Holds webgraph-related data structures and access methods for graph 42 | * exploration. 43 | */ 44 | public class Graph { 45 | 46 | private static Logger LOG = LoggerFactory.getLogger(Graph.class); 47 | 48 | /** The base name of the graph */ 49 | public String name; 50 | /** The graph */ 51 | public ImmutableGraph graph; 52 | /** The transpose of the graph */ 53 | public ImmutableGraph graphT; 54 | 55 | /* Maps to translate between vertex label an ID */ 56 | protected ImmutableExternalPrefixMap vertexMap; 57 | protected FrontCodedStringList vertexMapFcl; 58 | protected ShiftAddXorSignedStringMap vertexMapSmph; 59 | protected GOV4Function vertexMapMph; 60 | protected LiterallySignedStringMap vertexMapLmap; 61 | 62 | private static int LAZY_INT_ITERATOR_EMPTY_VALUE = LazyIntIterators.EMPTY_ITERATOR.nextInt(); 63 | 64 | public Graph(String name) throws Exception { 65 | this.name = name; 66 | try { 67 | LOG.info("Loading graph {}.graph", name); 68 | graph = ImmutableGraph.loadMapped(name); 69 | LOG.info("Loading transpose of the graph {}-t.graph", name); 70 | graphT = ImmutableGraph.loadMapped(name + "-t"); 71 | if (Files.exists(Paths.get(name + ".iepm"))) { 72 | LOG.info("Loading vertex map {}.iepm (ImmutableExternalPrefixMap)", name); 73 | vertexMap = (ImmutableExternalPrefixMap) BinIO.loadObject(name + ".iepm"); 74 | } else if (Files.exists(Paths.get(name + ".fcl"))) { 75 | LOG.info("Loading vertex map {}.fcl (FrontCodedStringList, maps vertex IDs to labels)", name); 76 | vertexMapFcl = (FrontCodedStringList) BinIO.loadObject(name + ".fcl"); 77 | if (Files.exists(Paths.get(name + ".smph"))) { 78 | LOG.info("Loading vertex map {}.smph (string map perfect hash, maps vertex labels to IDs)", name); 79 | vertexMapSmph = (ShiftAddXorSignedStringMap) BinIO.loadObject(name + ".smph"); 80 | } else if (Files.exists(Paths.get(name + ".mph"))) { 81 | LOG.info("Loading vertex map {}.mph (minimal perfect hash, maps vertex labels to IDs)", name); 82 | vertexMapMph = (GOV4Function) BinIO.loadObject(name + ".mph"); 83 | LOG.warn( 84 | "Using a minimal perfect hash as vertex map does not allow to verify that a vertex label exists. 
" 85 | + "Non-existant labels are mapped to quasi-random IDs."); 86 | } else { 87 | LOG.error("No vertex mapping found, cannot translate from vertex names to IDs."); 88 | } 89 | } else if (Files.exists(Paths.get(name + ".lmap"))) { 90 | LOG.info("Loading vertex map {}.lmap (LiterallySignedStringMap)", name); 91 | vertexMapLmap = (LiterallySignedStringMap) BinIO.loadObject(name + ".lmap"); 92 | } else { 93 | LOG.error("No vertex mapping found, cannot translate from vertex names to IDs."); 94 | } 95 | } catch (IOException | ClassNotFoundException e) { 96 | LOG.error("Failed to load graph {}:", name, e); 97 | throw e; 98 | } 99 | LOG.info("Loaded graph {}.graph", name); 100 | } 101 | 102 | public String vertexIdToLabel(long id) { 103 | if (vertexMap != null) { 104 | return vertexMap.list().get((int) id).toString(); 105 | } else if (vertexMapFcl != null) { 106 | return vertexMapFcl.get((int) id).toString(); 107 | } else if (vertexMapLmap != null) { 108 | return vertexMapLmap.list().get((int) id).toString(); 109 | } else { 110 | throw new RuntimeException("No vertex map loaded."); 111 | } 112 | } 113 | 114 | public long vertexLabelToId(String label) { 115 | if (vertexMap != null) { 116 | return vertexMap.getLong(label); 117 | } else if (vertexMapSmph != null) { 118 | return vertexMapSmph.getLong(label); 119 | } else if (vertexMapMph != null) { 120 | return vertexMapMph.getLong(label); 121 | } else if (vertexMapLmap != null) { 122 | return vertexMapLmap.getLong(label); 123 | } else { 124 | throw new RuntimeException("No vertex map loaded."); 125 | } 126 | } 127 | 128 | public int outdegree(long vertexId) { 129 | return graph.outdegree((int) vertexId); 130 | } 131 | 132 | public int outdegree(String vertexLabel) { 133 | return graph.outdegree((int) vertexLabelToId(vertexLabel)); 134 | } 135 | 136 | public int indegree(long vertexId) { 137 | return graphT.outdegree((int) vertexId); 138 | } 139 | 140 | public int indegree(String vertexLabel) { 141 | return graphT.outdegree((int) vertexLabelToId(vertexLabel)); 142 | } 143 | 144 | public int[] successors(long vertexId) { 145 | return graph.successorArray((int) vertexId); 146 | } 147 | 148 | public int[] successors(String vertexLabel) { 149 | return graph.successorArray((int) vertexLabelToId(vertexLabel)); 150 | } 151 | 152 | public Stream successorStream(String vertexLabel) { 153 | return successorStream(graph, vertexLabelToId(vertexLabel)); 154 | } 155 | 156 | public IntStream successorIntStream(String vertexLabel) { 157 | return successorIntStream(graph, vertexLabelToId(vertexLabel)); 158 | } 159 | 160 | public Stream successorStream(String vertexLabel, String prefix) { 161 | return successorStream(graph, vertexLabelToId(vertexLabel), vertexMap.getInterval(prefix)); 162 | } 163 | 164 | public IntStream successorIntStream(String vertexLabel, String prefix) { 165 | return successorIntStream(graph, vertexLabelToId(vertexLabel), vertexMap.getInterval(prefix)); 166 | } 167 | 168 | public Stream> successorTopLevelDomainCounts(String vertexLabel) { 169 | return successorTopLevelDomainCounts(graph, vertexLabelToId(vertexLabel)); 170 | } 171 | 172 | public Stream successorStream(ImmutableGraph graph, long vertexId) { 173 | return successorIntStream(graph, vertexId).mapToObj(i -> vertexIdToLabel(i)); 174 | } 175 | 176 | public IntStream successorIntStream(ImmutableGraph graph, long vertexId) { 177 | return Arrays.stream(graph.successorArray((int) vertexId)); 178 | } 179 | 180 | private Stream successorStream(ImmutableGraph graph, long vertexId, Interval 
interval) { 181 | return successorIntStream(graph, vertexId, interval).mapToObj(i -> vertexIdToLabel(i)); 182 | } 183 | 184 | public IntStream successorIntStream(ImmutableGraph graph, long vertexId, Interval interval) { 185 | return Arrays.stream(graph.successorArray((int) vertexId)).filter(x -> (interval.compareTo(x) == 0)); 186 | } 187 | 188 | public Stream successorTopLevelDomainStream(ImmutableGraph graph, long vertexId) { 189 | return Arrays.stream(graph.successorArray((int) vertexId)).mapToObj(i -> getTopLevelDomain(vertexIdToLabel(i))); 190 | } 191 | 192 | public Stream> successorTopLevelDomainCounts(ImmutableGraph graph, long vertexId) { 193 | if (vertexMap != null) { 194 | /* 195 | * speed up if we have a prefix map, utilizing the fact that vertex labels are 196 | * lexicographically sorted by reversed domain name 197 | */ 198 | List> res = new LinkedList<>(); 199 | LazyIntIterator iter = graph.successors((int) vertexId); 200 | int curr = iter.nextInt(); 201 | while (curr != LAZY_INT_ITERATOR_EMPTY_VALUE) { 202 | final MutableString currLabel = vertexMap.list().get(curr); 203 | final int pos = currLabel.indexOf('.'); 204 | final MutableString tldPrefix; 205 | final String tld; 206 | if (pos > -1 && (pos + 1) < currLabel.length()) { 207 | tldPrefix = currLabel.substring(0, pos + 1); 208 | tld = tldPrefix.substring(0, pos).toString(); 209 | } else { 210 | tldPrefix = currLabel; 211 | tld = currLabel.toString(); 212 | } 213 | long count = 1; 214 | final Interval interval = vertexMap.getInterval(tldPrefix); 215 | int next; 216 | while ((next = iter.nextInt()) != LAZY_INT_ITERATOR_EMPTY_VALUE) { 217 | if (next > interval.right) { 218 | break; 219 | } 220 | count++; 221 | } 222 | curr = next; 223 | res.add(new SimpleEntry<>(tld, count)); 224 | } 225 | return res.stream().sorted(Collections.reverseOrder(Map.Entry.comparingByValue())); 226 | } 227 | return GraphExplorer.frequencies(successorTopLevelDomainStream(graph, vertexId)); 228 | } 229 | 230 | public Stream> topLevelDomainCounts(IntStream vertexIds) { 231 | if (vertexMap != null) { 232 | List> res = new LinkedList<>(); 233 | PrimitiveIterator.OfInt iter = vertexIds.iterator(); 234 | if (iter.hasNext()) { 235 | int curr = iter.nextInt(); 236 | do { 237 | final MutableString currLabel = vertexMap.list().get(curr); 238 | final int pos = currLabel.indexOf('.'); 239 | final MutableString tldPrefix; 240 | final String tld; 241 | if (pos > -1 && (pos + 1) < currLabel.length()) { 242 | tldPrefix = currLabel.substring(0, pos + 1); 243 | tld = tldPrefix.substring(0, pos).toString(); 244 | } else { 245 | tldPrefix = currLabel; 246 | tld = currLabel.toString(); 247 | } 248 | long count = 1; 249 | final Interval interval = vertexMap.getInterval(tldPrefix); 250 | int next = -1; 251 | while (iter.hasNext()) { 252 | next = iter.nextInt(); 253 | if (next > interval.right) { 254 | break; 255 | } 256 | count++; 257 | } 258 | curr = next; 259 | res.add(new SimpleEntry<>(tld, count)); 260 | } while (curr > -1); 261 | } 262 | return res.stream().sorted(Collections.reverseOrder(Map.Entry.comparingByValue())); 263 | } 264 | return GraphExplorer.frequencies(vertexIds.mapToObj(i -> Graph.getTopLevelDomain(vertexIdToLabel(i)))); 265 | } 266 | 267 | public int[] predecessors(long vertexId) { 268 | return graphT.successorArray((int) vertexId); 269 | } 270 | 271 | public int[] predecessors(String vertexLabel) { 272 | return graphT.successorArray((int) vertexLabelToId(vertexLabel)); 273 | } 274 | 275 | public Stream predecessorStream(String vertexLabel) { 276 | 
return successorStream(graphT, vertexLabelToId(vertexLabel)); 277 | } 278 | 279 | public IntStream predecessorIntStream(String vertexLabel) { 280 | return successorIntStream(graphT, vertexLabelToId(vertexLabel)); 281 | } 282 | 283 | public Stream predecessorStream(String vertexLabel, String prefix) { 284 | return successorStream(graphT, vertexLabelToId(vertexLabel), vertexMap.getInterval(prefix)); 285 | } 286 | 287 | public IntStream predecessorIntStream(String vertexLabel, String prefix) { 288 | return successorIntStream(graphT, vertexLabelToId(vertexLabel), vertexMap.getInterval(prefix)); 289 | } 290 | 291 | public Stream> predecessorTopLevelDomainCounts(String vertexLabel) { 292 | return successorTopLevelDomainCounts(graphT, vertexLabelToId(vertexLabel)); 293 | } 294 | 295 | public long[] sharedPredecessors(long[] vertices) { 296 | return sharedPredecessors(vertices, vertices.length, vertices.length); 297 | } 298 | 299 | public long[] sharedPredecessors(long[] vertices, int minShared, int maxShared) { 300 | return sharedSuccessors(graphT, vertices, minShared, maxShared); 301 | } 302 | 303 | public long[] sharedSuccessors(long[] vertices) { 304 | return sharedSuccessors(vertices, vertices.length, vertices.length); 305 | } 306 | 307 | public long[] sharedSuccessors(long[] vertices, int minShared, int maxShared) { 308 | return sharedSuccessors(graph, vertices, minShared, maxShared); 309 | } 310 | 311 | /** 312 | * Get shared successors (children) of all {@code vertices} in a {@code graph}. 313 | * The parameters {@code minShared} and {@code maxShared} allow to select the 314 | * intersection, the union or a subset with a specific overlap (shared 315 | * successors). If vertex a has the successors d, e, vertex 316 | * b has d, f and vertex c has d, e, g, then 317 | *
318 | * - {@code minShared} = {@code maxShared} = {@code vertices.length} returns
319 | *   the intersection (d)
320 | * - {@code minShared} = 1 and {@code maxShared} = {@code vertices.length}
321 | *   returns the union (d, e, f)
322 | * - {@code minShared} = {@code maxShared} = 2 returns all successors shared
323 | *   by exactly two of the {@code vertices} (e)
324 | * (a short usage sketch follows below)
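/*
 * Added sketch (annotation; the graph base name and vertex labels are made up,
 * the graph files must already exist locally): intersection vs. union of the
 * predecessors of two vertices, jshell-style.
 */
Graph g = new Graph("cc-main-domain-graph");
long[] v = { g.vertexLabelToId("org.example"), g.vertexLabelToId("org.wikipedia") };
long[] linkedFromBoth   = g.sharedPredecessors(v);              // minShared = maxShared = 2: intersection
long[] linkedFromEither = g.sharedPredecessors(v, 1, v.length); // union
for (long id : linkedFromBoth) System.out.println(g.vertexIdToLabel(id));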
325 | * 326 | * @param graph the graph used to access the successors of a vertex (the 327 | * transpose of the graph will give the predecessors of the 328 | * vertex) 329 | * @param vertices list of vertex IDs 330 | * @param minShared the minimum number of shared links to successors 331 | * @param maxShared the minimum number of shared links to successors 332 | * @return shared successors 333 | */ 334 | public long[] sharedSuccessors(ImmutableGraph graph, long[] vertices, int minShared, int maxShared) { 335 | LazyIntIterator[] iters = new LazyIntIterator[vertices.length]; 336 | for (int i = 0; i < vertices.length; i++) { 337 | iters[i] = graph.successors((int) vertices[i]); 338 | } 339 | CountingMergedIntIterator iter = new CountingMergedIntIterator(iters); 340 | LongArrayList res = new LongArrayList(); 341 | int id; 342 | while (iter.hasNext()) { 343 | id = iter.nextInt(); 344 | if (iter.getCount() >= minShared && iter.getCount() <= maxShared) { 345 | res.add(id); 346 | } 347 | } 348 | res.trim(); 349 | return res.elements(); 350 | } 351 | 352 | public static String getTopLevelDomain(String reversedDomainName) { 353 | int dot = reversedDomainName.indexOf('.'); 354 | if (dot < reversedDomainName.length()) { 355 | return reversedDomainName.substring(0, dot); 356 | } 357 | return reversedDomainName; 358 | } 359 | 360 | /** 361 | * Get the registered domain for a host name based on the ICANN section of the 362 | * public suffix list. 363 | * 364 | * @see EffectiveTldFinder 365 | * 366 | * @param hostName host name, e.g. www.example.org.uk 367 | * @param strict if true return null instead of hostName if no 368 | * valid public suffix is detected 369 | * @return the domain name below the public suffix, e.g. 370 | * example.org.uk 371 | */ 372 | public static String getRegisteredDomain(String hostName, boolean strict) { 373 | return EffectiveTldFinder.getAssignedDomain(hostName, strict, true); 374 | } 375 | 376 | /** 377 | * Get the registered domain for a host name, both in 378 | * reverse 379 | * domain name notation. 380 | * 381 | * @see #getRegisteredDomain(String, boolean) 382 | * 383 | * @param reversedHostName host name in reverse domain name notation, e.g. 384 | * uk.ork.example.www 385 | * @param strict if true return null instead of 386 | * reversedHostName if no valid public 387 | * suffix is detected 388 | * @return the domain name below the public suffix, e.g. 389 | * uk.org.example (in reverse domain name notation) 390 | */ 391 | public static String getRegisteredDomainReversed(String reversedHostName, boolean strict) { 392 | String hostName = reverseDomainName(reversedHostName); 393 | String domainName = getRegisteredDomain(hostName, strict); 394 | if (strict && domainName == null) { 395 | return null; 396 | } else if (hostName.equals(domainName)) { 397 | return reversedHostName; 398 | } 399 | return reverseDomainName(domainName); 400 | } 401 | 402 | /** 403 | * Reverse or "unreverse" a host/domain name: com.example.www is 404 | * reversed to www.example.com and vice versa. 
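/*
 * Added examples (annotation): the static helpers above, jshell-style, with the
 * results given in their javadocs; requires the public suffix list bundled with
 * crawler-commons.
 */
Graph.getTopLevelDomain("uk.org.example.www");                 // -> "uk"
Graph.getRegisteredDomain("www.example.org.uk", true);         // -> "example.org.uk"
Graph.getRegisteredDomainReversed("uk.org.example.www", true); // -> "uk.org.example"
Graph.reverseDomainName("com.example.www");                    // -> "www.example.com"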
405 | * 406 | * @param domainName domain name 407 | * @return domain name with reverse 409 | * domain name notation (un)applied 410 | */ 411 | public static String reverseDomainName(String domainName) { 412 | return HostToDomainGraph.reverseHost(domainName); 413 | } 414 | } 415 | -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/webgraph/explore/GraphExplorer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2024 Common Crawl and contributors 4 | */ 5 | package org.commoncrawl.webgraph.explore; 6 | 7 | import java.io.IOException; 8 | import java.io.PrintStream; 9 | import java.nio.charset.StandardCharsets; 10 | import java.nio.file.Files; 11 | import java.nio.file.Paths; 12 | import java.util.Arrays; 13 | import java.util.Comparator; 14 | import java.util.Map.Entry; 15 | import java.util.function.Function; 16 | import java.util.stream.Collectors; 17 | import java.util.stream.IntStream; 18 | import java.util.stream.LongStream; 19 | import java.util.stream.Stream; 20 | 21 | import org.commoncrawl.webgraph.CountingMergedIntIterator; 22 | import org.slf4j.Logger; 23 | import org.slf4j.LoggerFactory; 24 | 25 | import it.unimi.dsi.webgraph.LazyIntIterator; 26 | 27 | /** 28 | * Utility class for graph exploration: load and hold all required web graph 29 | * data structures, provided methods to interactively explore the graph. 30 | */ 31 | public class GraphExplorer { 32 | 33 | private static Logger LOG = LoggerFactory.getLogger(GraphExplorer.class); 34 | 35 | public class Vertex { 36 | private long id; 37 | private String label; 38 | 39 | public Vertex(String label) { 40 | this.label = label; 41 | id = g.vertexLabelToId(label); 42 | } 43 | 44 | public Vertex(long id) { 45 | this.id = id; 46 | label = g.vertexIdToLabel(id); 47 | } 48 | 49 | @Override 50 | public String toString() { 51 | return "#" + id + "\t" + label; 52 | } 53 | 54 | public int outdegree() { 55 | return g.outdegree((int) id); 56 | } 57 | 58 | public int indegree() { 59 | return g.indegree((int) id); 60 | } 61 | 62 | public int[] successors() { 63 | return g.graph.successorArray((int) id); 64 | } 65 | 66 | public int[] predecessors() { 67 | return g.graphT.successorArray((int) id); 68 | } 69 | } 70 | 71 | private Graph g = null; 72 | private Vertex v = null; 73 | 74 | public GraphExplorer(String name) throws Exception { 75 | g = new Graph(name); 76 | } 77 | 78 | public Graph getGraph() { 79 | return g; 80 | } 81 | 82 | public Vertex getVertex(String vertexLabel) { 83 | return new Vertex(vertexLabel); 84 | } 85 | 86 | public Vertex getVertex(long vertexId) { 87 | return new Vertex(vertexId); 88 | } 89 | 90 | public void setVertex(String vertexLabel) { 91 | v = getVertex(vertexLabel); 92 | } 93 | 94 | public void setVertex(long vertexId) { 95 | v = getVertex(vertexId); 96 | } 97 | 98 | /* Reimplementation of commands provided by pywebgraph (cn, pwn, ls, sl) */ 99 | 100 | /** 101 | * Change the current working node / vertex. 102 | * 103 | * @param vertexLabel vertex label (node name) 104 | */ 105 | public void cn(String vertexLabel) { 106 | setVertex(vertexLabel); 107 | pwn(); 108 | } 109 | 110 | /** 111 | * Change the current working node / vertex. 112 | * 113 | * @param vertexId vertex ID 114 | */ 115 | public void cn(long vertexId) { 116 | setVertex(vertexId); 117 | pwn(); 118 | } 119 | 120 | /** 121 | * Print the current working node / vertex. 
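/*
 * Added sketch of an interactive jshell session (annotation; the graph base
 * name and vertex label are made up):
 */
GraphExplorer explorer = new GraphExplorer("cc-main-domain-graph");
explorer.cn("org.commoncrawl");  // change the working node by label and print it
explorer.ls();                   // print its successors (outgoing links)
explorer.sl();                   // print its predecessors (incoming links)
explorer.pwn();                  // print the current working node again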
122 | */ 123 | public void pwn() { 124 | if (v == null) { 125 | throw new NullPointerException("Current working node not set, use cn(...) to define the working node."); 126 | } 127 | print(v.toString()); 128 | } 129 | 130 | /** 131 | * Print the successors (outgoing links) of the current working node / vertex. 132 | */ 133 | public void ls() { 134 | if (v == null) { 135 | throw new NullPointerException("Current working node not set, use cn(...) to define the working node."); 136 | } 137 | ls(v.id); 138 | } 139 | 140 | /** 141 | * Print the successors (outgoing links) of a vertex. 142 | * 143 | * @param vertexId vertex ID 144 | */ 145 | public void ls(long vertexId) { 146 | printVertices(g.graph.successors((int) vertexId)); 147 | } 148 | 149 | /** 150 | * Print the successors (outgoing links) of a vertex. 151 | * 152 | * @param vertexLabel vertex label / vertex name 153 | */ 154 | public void ls(String vertexLabel) { 155 | ls(g.vertexLabelToId(vertexLabel)); 156 | } 157 | 158 | /** 159 | * Print the predecessors (incoming links) of the current working node / vertex. 160 | */ 161 | public void sl() { 162 | if (v == null) { 163 | throw new NullPointerException("Current working node not set, use cn(...) to define the working node."); 164 | } 165 | sl(v.id); 166 | } 167 | 168 | /** 169 | * Print the predecessors (incoming links) of a vertex. 170 | * 171 | * @param vertexId vertex ID 172 | */ 173 | public void sl(long vertexId) { 174 | printVertices(g.graphT.successors((int) vertexId)); 175 | } 176 | 177 | /** 178 | * Print the predecessors (incoming links) of a vertex. 179 | * 180 | * @param vertexLabel vertex label / vertex name 181 | */ 182 | public void sl(String vertexLabel) { 183 | sl(g.vertexLabelToId(vertexLabel)); 184 | } 185 | 186 | /* Utilities */ 187 | 188 | public long[] loadVerticesFromFile(String fileName) { 189 | try (Stream in = Files.lines(Paths.get(fileName), StandardCharsets.UTF_8)) { 190 | return in.mapToLong(label -> g.vertexLabelToId(label)).filter(id -> id > -1).toArray(); 191 | } catch (IOException e) { 192 | LOG.error("Failed to load vertices from file {}", fileName, e); 193 | } 194 | return new long[0]; 195 | } 196 | 197 | public void saveVerticesToFile(long[] vertexIDs, String fileName) { 198 | try (PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(fileName)), false, 199 | StandardCharsets.UTF_8)) { 200 | Arrays.stream(vertexIDs).forEach(id -> out.println(g.vertexIdToLabel(id))); 201 | } catch (IOException e) { 202 | LOG.error("Failed to write vertices to file {}", fileName, e); 203 | } 204 | } 205 | 206 | public void saveVerticesToFile(int[] vertexIDs, String fileName) { 207 | saveVerticesToFile(Arrays.stream(vertexIDs), fileName); 208 | } 209 | 210 | public void saveVerticesToFile(IntStream vertexIDs, String fileName) { 211 | try (PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(fileName)), false, 212 | StandardCharsets.UTF_8)) { 213 | vertexIDs.forEach(id -> out.println(g.vertexIdToLabel(id))); 214 | } catch (IOException e) { 215 | LOG.error("Failed to write vertices to file {}", fileName, e); 216 | } 217 | } 218 | 219 | public void saveVerticesToFile(LongStream vertexIDs, String fileName) { 220 | try (PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(fileName)), false, 221 | StandardCharsets.UTF_8)) { 222 | vertexIDs.forEach(id -> out.println(g.vertexIdToLabel(id))); 223 | } catch (IOException e) { 224 | LOG.error("Failed to write vertices to file {}", fileName, e); 225 | } 226 | } 227 | 228 | public void 
saveToFile(Stream strings, String fileName) { 229 | try (PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(fileName)), false, 230 | StandardCharsets.UTF_8)) { 231 | strings.forEach(out::println); 232 | } catch (IOException e) { 233 | LOG.error("Failed to write strings to file {}", fileName, e); 234 | } 235 | } 236 | 237 | public void saveCountsToFile(Stream> counts, String fileName) { 238 | try (PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(fileName)), false, 239 | StandardCharsets.UTF_8)) { 240 | counts.forEach(c -> { 241 | out.print(c.getValue()); 242 | out.print('\t'); 243 | out.print(c.getKey()); 244 | out.print('\n'); 245 | }); 246 | } catch (IOException e) { 247 | LOG.error("Failed to write counts to file {}", fileName, e); 248 | } 249 | } 250 | 251 | private void print(String s) { 252 | System.out.println(s); 253 | } 254 | 255 | public void printVertices(LazyIntIterator it) { 256 | int next = it.nextInt(); 257 | int i = 0; 258 | while (next != CountingMergedIntIterator.LAZY_INT_ITERATOR_EMPTY_VALUE) { 259 | print(String.format("%d: %s", i, (new Vertex(next)).toString())); 260 | next = it.nextInt(); 261 | i++; 262 | } 263 | } 264 | 265 | public void printVertices(long[] vertexIDs) { 266 | int i = 0; 267 | for (long id : vertexIDs) { 268 | print(String.format("%d: %s", i, (new Vertex(id)).toString())); 269 | i++; 270 | } 271 | } 272 | 273 | public void printVertices(int[] vertexIDs) { 274 | int i = 0; 275 | for (long id : vertexIDs) { 276 | print(String.format("%d: %s", i, (new Vertex(id)).toString())); 277 | i++; 278 | } 279 | } 280 | 281 | /** 282 | * Count strings in a stream. Sort the resulting string-count pairs by 283 | * decreasing count (frequency) and secondarily by string in lexicographic 284 | * order. 285 | * 286 | * @param strings stream of strings 287 | * @return stream of pairs {@code } 288 | */ 289 | public static Stream> frequencies(Stream strings) { 290 | final Comparator> comp = Comparator.comparingLong((Entry e) -> e.getValue()) 291 | .reversed().thenComparing(Comparator.comparing((Entry e) -> e.getKey())); 292 | return strings.collect(Collectors.groupingBy(Function.identity(), Collectors.counting())).entrySet().stream() 293 | .sorted(comp); 294 | } 295 | } 296 | -------------------------------------------------------------------------------- /src/main/java/org/commoncrawl/webgraph/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Custom classes to build Common Crawl web graph data sets. Built on software 3 | * from the Laboratory for Web Algorithmics (LAW) at the University of Milano, 4 | * namely the WebGraph framework and 5 | * the LAW library. 6 | */ 7 | package org.commoncrawl.webgraph; -------------------------------------------------------------------------------- /src/main/resources/simplelogger.properties: -------------------------------------------------------------------------------- 1 | # SLF4J's SimpleLogger configuration file 2 | # Simple implementation of Logger that sends all enabled log messages, for all defined loggers, to System.err. 3 | 4 | # Default logging detail level for all instances of SimpleLogger. 5 | # Must be one of ("trace", "debug", "info", "warn", or "error"). 6 | # If not specified, defaults to "info". 7 | org.slf4j.simpleLogger.defaultLogLevel=debug 8 | 9 | # Logging detail level for a SimpleLogger instance named "xxxxx". 10 | # Must be one of ("trace", "debug", "info", "warn", or "error"). 
11 | # If not specified, the default logging detail level is used. 12 | #org.slf4j.simpleLogger.log.xxxxx= 13 | 14 | # Set to true if you want the current date and time to be included in output messages. 15 | # Default is false, and will output the number of milliseconds elapsed since startup. 16 | org.slf4j.simpleLogger.showDateTime=true 17 | 18 | # The date and time format to be used in the output messages. 19 | # The pattern describing the date and time format is the same that is used in java.text.SimpleDateFormat. 20 | # If the format is not specified or is invalid, the default format is used. 21 | # The default format is yyyy-MM-dd HH:mm:ss:SSS Z. 22 | org.slf4j.simpleLogger.dateTimeFormat=yyyy-MM-dd HH:mm:ss:SSS Z 23 | 24 | # Set to true if you want to output the current thread name. 25 | # Defaults to true. 26 | org.slf4j.simpleLogger.showThreadName=true 27 | 28 | # Set to true if you want the Logger instance name to be included in output messages. 29 | # Defaults to true. 30 | org.slf4j.simpleLogger.showLogName=true 31 | 32 | # Set to true if you want the last component of the name to be included in output messages. 33 | # Defaults to false. 34 | org.slf4j.simpleLogger.showShortLogName=true 35 | -------------------------------------------------------------------------------- /src/script/host2domaingraph.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # SPDX-License-Identifier: Apache-2.0 4 | # Copyright (C) 2022 Common Crawl and contributors 5 | 6 | FLAGS=() 7 | PROPERTIES=() 8 | while true; do 9 | case "$1" in 10 | "-D"* ) 11 | PROPERTIES=("${PROPERTIES[@]}" "$1") 12 | shift 13 | ;; 14 | "-"* ) 15 | FLAGS=("${FLAGS[@]}" "$1") 16 | shift 17 | ;; 18 | * ) 19 | break 20 | ;; 21 | esac 22 | done 23 | 24 | JAR=target/cc-webgraph-0.1-SNAPSHOT-jar-with-dependencies.jar 25 | 26 | if [ $# -lt 3 ]; then 27 | echo "$0 [...] []" >&2 28 | if [ ${#FLAGS[@]} -gt 0 ]; then 29 | echo "" 30 | echo "Calling HostToDomainGraph with provided flags (${FLAGS[*]}):" 31 | "$JAVA_HOME"/bin/java -cp "$CLASSPATH":"$JAR" "${PROPERTIES[@]}" \ 32 | org.commoncrawl.webgraph.HostToDomainGraph "${FLAGS[@]}" 33 | fi 34 | exit 1 35 | fi 36 | 37 | SIZE="$1" 38 | INPUTDIR="$2" 39 | OUTPUTDIR="$3" 40 | TMPDIR=${4:-./tmp/} 41 | 42 | MAIN_MEM_GB=16 43 | PARALLEL_SORT_THREADS=2 44 | 45 | # Reduce host-level web graph to domain-level graph 46 | # - running HostToDomainGraph which has low memory requirements 47 | # - requires properly sorted input: 48 | # * reversed host names 49 | # * all hosts/subdomains of one domain following in a single input block 50 | # - approx. memory requirements: 51 | # * for graphs with less than 2^31 vertices 52 | # 2 GB + 4*number_of_vertices Bytes 53 | # * larger graphs 54 | # 8 GB + 10*number_of_vertices Bytes 55 | 56 | # Notes about input sorting: 57 | # 58 | # 1 C locale is mandatory to keep reversed hosts of one domain or top-level domain 59 | # together in a single block: 60 | # echo -e "com.opus\ncom.opera\nco.mopus\nco.mopera" | shuf | LC_ALL=en_US.utf8 sort 61 | # vs. 62 | # echo -e "com.opus\ncom.opera\nco.mopus\nco.mopera" | shuf | LC_ALL=C sort 63 | # This requirement is met by the output of the cc-pyspark job. 
64 | # 65 | # 2 the second problem stems from the fact that a hyphen (valid in host and 66 | # subdomain names) is sorted before the dot: 67 | # ac.gov 68 | # ac.gov.ascension 69 | # ac.gov.ascension-island 70 | # ac.gov.ascension.mail 71 | # Unfortunately the output of the cc-pyspark job does not completely meet this 72 | # sorting criterion. 73 | # The initial solution to ensure that the subdomains of "ac.gov.ascension" are not split 74 | # into two blocks, was to add an artificial dot temporarily to the end of each host 75 | # name during sorting: 76 | # zcat vertices.txt.gz | sed -e 's/$/./' \ 77 | # | sort $SORTOPTS -t$'\t' -k2,2 | sed -e 's/\.$//' 78 | # The domain name "ac.gov.ascension" in the example above becomes temporarily 79 | # "ac.gov.ascension." and is now sorted after "ac.gov.ascension-island." 80 | # 81 | # To avoid this step (re-sorting billions of lines is expensive), the HostToDomainGraph 82 | # class now caches potentially "missorted" candidates and processes them later together 83 | # with the related subdomains / host names. 84 | # 85 | # Note: The final sorting of the domain names is the same as if there would be 86 | # a trailing dot: 87 | # ac.gov.ascension-island 88 | # ac.gov.ascension 89 | 90 | 91 | export LC_ALL=C 92 | 93 | # sort with large buffers, merge sort over many files if possible 94 | SORTOPTS="--batch-size 128 --buffer-size $((1+MAIN_MEM_GB/5))g --parallel=$PARALLEL_SORT_THREADS --temporary-directory $TMPDIR" # --compress-program=gzip 95 | 96 | set -exo pipefail 97 | 98 | test -d "$TMPDIR" || mkdir "$TMPDIR" 99 | 100 | 101 | _EDGES=$INPUTDIR/edges.txt.gz 102 | if [ -e "$_EDGES" ]; then 103 | echo "Found single edges file: $_EDGES" 104 | elif [ -d "$INPUTDIR"/edges/ ]; then 105 | # edges is a directory with multiple edges files 106 | _EDGES="$INPUTDIR/edges/*.gz" 107 | echo "Found edges directory, using: $_EDGES" 108 | else 109 | echo "Input edges file(s) not found" 110 | exit 1 111 | fi 112 | 113 | _VERTICES=$INPUTDIR/vertices.txt.gz 114 | if [ -e "$_VERTICES" ]; then 115 | echo "Found single vertices file: $_VERTICES" 116 | elif [ -d "$INPUTDIR"/vertices/ ]; then 117 | # vertices is a directory with multiple vertices files 118 | echo "Found vertices directory, using: $_VERTICES" 119 | _VERTICES="$INPUTDIR/vertices/*.gz" 120 | else 121 | echo "Input vertices file(s) not found" 122 | exit 1 123 | fi 124 | 125 | 126 | mkdir -p "$OUTPUTDIR/" 127 | 128 | JXMX=$((2+1+5*SIZE/2**30)) 129 | if [ "$SIZE" -gt $((2**31-1024)) ]; then 130 | JXMX=$((8+1+10*SIZE/2**30)) 131 | fi 132 | 133 | "$JAVA_HOME"/bin/java -Xmx${JXMX}g -cp "$CLASSPATH":"$JAR" \ 134 | "${PROPERTIES[@]}" \ 135 | org.commoncrawl.webgraph.HostToDomainGraph \ 136 | "${FLAGS[@]}" \ 137 | $SIZE \ 138 | <(zcat $_VERTICES) \ 139 | >(gzip >"$OUTPUTDIR"/vertices.txt.gz) \ 140 | <(zcat $_EDGES) \ 141 | >(sort $SORTOPTS -t$'\t' -k1,1n -k2,2n -s -u | gzip >"$OUTPUTDIR"/edges.txt.gz) 142 | 143 | wait # for subshells to finish 144 | -------------------------------------------------------------------------------- /src/script/hostgraph/build_hostgraph.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # SPDX-License-Identifier: Apache-2.0 4 | # Copyright (C) 2022 Common Crawl and contributors 5 | 6 | set -e 7 | set -o pipefail 8 | set -x 9 | 10 | # run the webgraph workflow (based on cc-pyspark) 11 | # - extract inter-host links 12 | # - construct the host-level graph 13 | 14 | # installation and execution: 15 | # - install cc-pyspark 16 | # git clone 
https://github.com/commoncrawl/cc-pyspark.git 17 | # - and make it the working directory 18 | # cd cc-pyspark 19 | # - point SPARK_HOME to your installation of Apache Spark (https://spark.apache.org/) 20 | # vi ./spark_env.sh 21 | # and make sure that your Spark cluster (on Hadoop YARN) is running! 22 | # - edit the hostgraph build configuration 23 | # vi .../hostgraph_config.sh 24 | # - run the workflow 25 | # .../build_hostgraph.sh 26 | 27 | # Note: the script is tested using a Hadoop cluster running 28 | # Apache Bigtop 3.x on Ubuntu 22.04. You may need to adapt it 29 | # to run on different Hadoop distributions. 30 | 31 | 32 | SPARK_ON_YARN="--master yarn" 33 | SPARK_HADOOP_OPTS="" 34 | SPARK_EXTRA_OPTS="" 35 | 36 | HOST_LINK_EXTRACTOR=./hostlinks_extract_fastwarc.py 37 | PYFILES_HOST_LINK_EXTRACTOR="sparkcc.py,sparkcc_fastwarc.py,wat_extract_links.py,json_importer.py" 38 | 39 | HOST_LINKS_TO_GRAPH=./hostlinks_to_graph.py 40 | PYFILES_HOST_LINKS_TO_GRAPH="sparkcc.py,iana_tld.py,wat_extract_links.py,json_importer.py" 41 | HOST_LINKS_TO_GRAPH_ARGS=(--validate_host_names) # --normalize_host_names 42 | 43 | 44 | # source library functions 45 | source "$(dirname "$0")"/../workflow_lib.sh 46 | 47 | # source workflow configuration 48 | source "$(dirname "$0")"/hostgraph_config.sh 49 | 50 | # define SPARK_HOME and HADOOP_CONF_DIR 51 | source "$PWD"/spark_env.sh 52 | 53 | 54 | ################################################################################ 55 | 56 | # upload Parquet graph 57 | function upload_parquet() ( 58 | set -xeo pipefail 59 | TABLE=$1 60 | UPLOAD_NAME=$2 61 | UPLOAD_DIR=$S3A_OUTPUT_PREFIX/$UPLOAD_NAME/hostgraph 62 | if hadoop fs -test -d "$UPLOAD_DIR"/vertices; then 63 | echo "Upload $UPLOAD_DIR/vertices already exists, skipping..." 64 | else 65 | hadoop distcp \ 66 | "$HDFS_BASE_DIR"/${TABLE}_vertices \ 67 | "$UPLOAD_DIR"/vertices 68 | fi 69 | if hadoop fs -test -d "$UPLOAD_DIR"/edges; then 70 | echo "Upload "$UPLOAD_DIR"/edges already exists, skipping..." 71 | else 72 | hadoop distcp \ 73 | "$HDFS_BASE_DIR"/${TABLE}_edges \ 74 | "$UPLOAD_DIR"/edges 75 | fi 76 | ) 77 | 78 | function upload_text() ( 79 | set -xeo pipefail 80 | NAME=$1 81 | UPLOAD_NAME=$2 82 | UPLOAD_DIR="$S3A_OUTPUT_PREFIX"/$UPLOAD_NAME/hostgraph/text 83 | PUBLIC=${3:-false} 84 | DISTCP_OPTS="" 85 | if $PUBLIC; then 86 | DISTCP_OPTS="$DISTCP_OPTS -Dfs.s3a.acl.default=PublicRead" 87 | fi 88 | if hadoop fs -test -d "$UPLOAD_DIR"/vertices; then 89 | echo "Upload $UPLOAD_DIR/vertices already exists, skipping..." 90 | else 91 | hadoop fs -rm -f "$HDFS_BASE_DIR"/text/$NAME/vertices/_SUCCESS 92 | hadoop distcp $DISTCP_OPTS \ 93 | "$HDFS_BASE_DIR"/text/$NAME/vertices \ 94 | "$UPLOAD_DIR"/vertices 95 | fi 96 | if hadoop fs -test -d "$UPLOAD_DIR"/edges; then 97 | echo "Upload $UPLOAD_DIR/edges already exists, skipping..." 
98 | else 99 | hadoop fs -rm -f "$HDFS_BASE_DIR"/text/$NAME/edges/_SUCCESS 100 | hadoop distcp $DISTCP_OPTS \ 101 | "$HDFS_BASE_DIR"/text/$NAME/edges \ 102 | "$UPLOAD_DIR"/edges 103 | fi 104 | ) 105 | 106 | # text output 107 | function dump_upload_text() ( 108 | set -xeo pipefail 109 | NAME=$1 110 | UPLOAD_NAME=$2 111 | mkdir -p output/$NAME/hostgraph/tmp_edges/ 112 | mkdir -p output/$NAME/hostgraph/tmp_vertices/ 113 | hadoop fs -copyToLocal "$HDFS_BASE_DIR"/text/$NAME/vertices/*.gz output/$NAME/hostgraph/tmp_vertices/ 114 | n_vertex_files=$(ls output/$NAME/hostgraph/tmp_vertices/*.gz | wc -l) 115 | if [ $n_vertex_files -eq 1 ]; then 116 | mv output/$NAME/hostgraph/tmp_vertices/*.gz output/$NAME/hostgraph/vertices.txt.gz 117 | else 118 | zcat output/$NAME/hostgraph/tmp_vertices/*.gz | gzip >output/$NAME/hostgraph/vertices.txt.gz 119 | fi 120 | aws s3 cp --no-progress output/$NAME/hostgraph/vertices.txt.gz $S3_OUTPUT_PREFIX/$UPLOAD_NAME/hostgraph/ 121 | hadoop fs -copyToLocal "$HDFS_BASE_DIR"/text/$NAME/edges/*.gz output/$NAME/hostgraph/tmp_edges/ 122 | sort_input="" 123 | for e in output/$NAME/hostgraph/tmp_edges/*.gz; do 124 | sort_input="$sort_input <(zcat $e)" 125 | done 126 | mkdir -p tmp 127 | eval "sort --batch-size 96 --buffer-size 4g --parallel 2 --temporary-directory ./tmp/ --compress-program=gzip -t$'\t' -k1,1n -k2,2n --stable --merge $sort_input | gzip >output/$NAME/hostgraph/edges.txt.gz" 128 | aws s3 cp --no-progress output/$NAME/hostgraph/edges.txt.gz $S3_OUTPUT_PREFIX/$UPLOAD_NAME/hostgraph/ 129 | ) 130 | 131 | function create_input_splits() { 132 | CRAWL="$1" 133 | __INPUT_SPLITS=() 134 | if [ -d input/$CRAWL/ ]; then 135 | # input splits are already created locally, read the splits again 136 | # (this might happen if one of the steps/jobs has failed and 137 | # this script is run again) 138 | for split in input/$CRAWL/input_split_*.txt; do 139 | __INPUT_SPLITS=(${__INPUT_SPLITS[@]} "$HDFS_BASE_DIR/$split") 140 | done 141 | 142 | elif hadoop fs -stat "$S3A_OUTPUT_PREFIX"/$CRAWL/hostlinks/ >&2; then 143 | # no local input splits but output on S3 144 | echo "Not creating input split for crawl $CRAWL because output prefix already exists on S3: $S3A_OUTPUT_PREFIX/$CRAWL/hostlinks/" >&2 145 | 146 | else 147 | mkdir -p input/$CRAWL 148 | cd input/$CRAWL 149 | aws s3 cp --quiet --no-progress s3://commoncrawl/crawl-data/$CRAWL/wat.paths.gz . 150 | aws s3 cp --quiet --no-progress s3://commoncrawl/crawl-data/$CRAWL/non200responses.paths.gz . 151 | if $INCLUDE_ROBOTSTXT_SITEMAP_LINKS; then 152 | aws s3 cp --quiet --no-progress s3://commoncrawl/crawl-data/$CRAWL/robotstxt.paths.gz . 
153 | fi 154 | zcat ./*.paths.gz | shuf >input.txt 155 | NUM_INPUT_PATHS=$(wc -l &2 165 | ### copy input to hdfs:// 166 | hadoop fs -mkdir -p "$HDFS_BASE_DIR"/$CRAWL 167 | hadoop fs -mkdir -p "$HDFS_BASE_DIR"/input/$CRAWL/ 168 | hadoop fs -mkdir -p "$HDFS_BASE_DIR"/text/$CRAWL/ 169 | hadoop fs -copyFromLocal -f input/$CRAWL/input.txt "$HDFS_BASE_DIR"/input/$CRAWL/ 170 | for split in input/$CRAWL/input_split_*.txt; do 171 | hadoop fs -copyFromLocal -f $split "$HDFS_BASE_DIR"/input/$CRAWL/ 172 | done 173 | # The input list is considerably small because it only references s3:// paths: 174 | # deploy it on every node to make all tasks NODE_LOCAL 175 | hadoop fs -setrep $((NUM_EXECUTORS+1)) "$HDFS_BASE_DIR"/input/$CRAWL/ >&2 176 | fi 177 | echo "${__INPUT_SPLITS[@]}" 178 | } 179 | 180 | 181 | ################################################################################ 182 | 183 | MERGE_CRAWL_INPUT="" 184 | 185 | for CRAWL in ${CRAWLS[@]}; do 186 | 187 | INPUT_SPLITS=($(create_input_splits $CRAWL)) 188 | 189 | if [ -z "$INPUT_SPLITS" ]; then 190 | # no input splits signals that the crawl has already successfully processed 191 | if hadoop fs -stat "$S3A_OUTPUT_PREFIX"/$CRAWL/hostlinks/; then 192 | echo "Output prefix for crawl $CRAWL already exists on S3: $S3A_OUTPUT_PREFIX/$CRAWL/hostlinks/" 193 | if ! hadoop fs -stat "$S3A_OUTPUT_PREFIX"/$CRAWL/hostlinks/_SUCCESS; then 194 | echo "No success marker found below S3 output prefix: $S3A_OUTPUT_PREFIX/$CRAWL/hostlinks/_SUCCESS" 195 | echo "Please, verify the output and depending on the verification result, manually add the success marker or remove the output. Exiting ..." 196 | exit 1 197 | fi 198 | fi 199 | # add the existing output splits as input for host graph and merged graph 200 | for output_split in $(hadoop fs -ls -C "$S3A_OUTPUT_PREFIX"/$CRAWL/hostlinks/); do 201 | case "$output_split" in 202 | */_SUCCESS ) 203 | continue ;; 204 | esac 205 | if [ -z "$HOSTGRAPH_INPUT" ]; then 206 | HOSTGRAPH_INPUT="$output_split" 207 | else 208 | HOSTGRAPH_INPUT="--add_input $output_split $HOSTGRAPH_INPUT" 209 | fi 210 | if [ -z "$MERGE_CRAWL_INPUT" ]; then 211 | MERGE_CRAWL_INPUT="$output_split" 212 | else 213 | MERGE_CRAWL_INPUT="--add_input $output_split $MERGE_CRAWL_INPUT" 214 | fi 215 | done 216 | 217 | else 218 | echo "Input splits: ""${INPUT_SPLITS[*]}" 219 | 220 | for ((i=0; i<${#INPUT_SPLITS[@]}; i++)); do 221 | INPUT=${INPUT_SPLITS[$i]} 222 | NUM_INPUT_PATHS=$(wc -l $INPUT_PARTITIONS partitions" 225 | 226 | _step hostlinks.$CRAWL.split$i \ 227 | "$SPARK_HOME"/bin/spark-submit \ 228 | $SPARK_ON_YARN \ 229 | $SPARK_HADOOP_OPTS \ 230 | --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ 231 | --conf spark.task.maxFailures=80 \ 232 | --conf spark.executor.memory=$EXECUTOR_MEM \ 233 | --conf spark.driver.memory=6g \ 234 | --conf spark.core.connection.ack.wait.timeout=600s \ 235 | --conf spark.network.timeout=300s \ 236 | --conf spark.shuffle.io.maxRetries=5 \ 237 | --conf spark.shuffle.io.retryWait=30s \ 238 | --conf spark.io.compression.codec=zstd \ 239 | --conf spark.checkpoint.compress=true \ 240 | --conf spark.locality.wait=0s \ 241 | --num-executors $NUM_EXECUTORS \ 242 | --executor-cores $EXECUTOR_CORES \ 243 | --executor-memory $EXECUTOR_MEM \ 244 | --conf spark.sql.warehouse.dir=$WAREHOUSE_DIR/$CRAWL \ 245 | --conf spark.sql.parquet.compression.codec=zstd \ 246 | --py-files "$PYFILES_HOST_LINK_EXTRACTOR" \ 247 | $SPARK_EXTRA_OPTS \ 248 | $HOST_LINK_EXTRACTOR \ 249 | --input_base_url $INPUT_BASE_URL \ 250 | 
--num_input_partitions $INPUT_PARTITIONS \ 251 | --num_output_partitions $OUTPUT_PARTITIONS \ 252 | --local_temp_dir "$TMPDIR" \ 253 | $INPUT hostlinks$i 254 | 255 | _step hostlinks.$CRAWL.split$i.distcp \ 256 | hadoop distcp \ 257 | -Dfs.s3a.connection.timeout=2000 \ 258 | -Dfs.s3a.attempts.maximum=3 \ 259 | "$HDFS_BASE_DIR"/$CRAWL/hostlinks$i \ 260 | "$S3A_OUTPUT_PREFIX"/$CRAWL/hostlinks/$i 261 | 262 | if [ -z "$HOSTGRAPH_INPUT" ]; then 263 | HOSTGRAPH_INPUT="$HDFS_BASE_DIR/$CRAWL/hostlinks$i" 264 | else 265 | HOSTGRAPH_INPUT="--add_input $HDFS_BASE_DIR/$CRAWL/hostlinks$i $HOSTGRAPH_INPUT" 266 | fi 267 | if [ -z "$MERGE_CRAWL_INPUT" ]; then 268 | MERGE_CRAWL_INPUT="$HDFS_BASE_DIR/$CRAWL/hostlinks$i" 269 | else 270 | MERGE_CRAWL_INPUT="--add_input $HDFS_BASE_DIR/$CRAWL/hostlinks$i $MERGE_CRAWL_INPUT" 271 | fi 272 | done # end input splits 273 | 274 | # Create the success marker on S3 275 | hadoop fs -touchz "$S3A_OUTPUT_PREFIX"/$CRAWL/hostlinks/_SUCCESS 276 | 277 | fi 278 | 279 | 280 | if $CONSTRUCT_HOSTGRAPH; then 281 | 282 | if hadoop fs -stat "$S3A_OUTPUT_PREFIX"/$CRAWL/hostgraph/; then 283 | echo "Skipping creation of hostgraph for crawl $CRAWL because output prefix already exists on S3: $S3A_OUTPUT_PREFIX/$CRAWL/hostgraph/" 284 | continue 285 | fi 286 | 287 | VERTEX_IDS="" 288 | if hadoop fs -stat "$HDFS_BASE_DIR"/$CRAWL/hostgraph_vertices; then 289 | VERTEX_IDS="--vertex_ids $HDFS_BASE_DIR/$CRAWL/hostgraph_vertices" 290 | fi 291 | 292 | _step hostgraph.$CRAWL \ 293 | "$SPARK_HOME"/bin/spark-submit \ 294 | $SPARK_ON_YARN \ 295 | $SPARK_HADOOP_OPTS \ 296 | --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ 297 | --conf spark.task.maxFailures=10 \ 298 | --conf spark.executor.memory=$EXECUTOR_MEM \ 299 | --conf spark.driver.memory=6g \ 300 | --conf spark.core.connection.ack.wait.timeout=600s \ 301 | --conf spark.network.timeout=300s \ 302 | --conf spark.shuffle.io.maxRetries=5 \ 303 | --conf spark.shuffle.io.retryWait=30s \ 304 | --conf spark.locality.wait=1s \ 305 | --conf spark.io.compression.codec=zstd \ 306 | --conf spark.checkpoint.compress=true \ 307 | --num-executors $NUM_EXECUTORS \ 308 | --executor-cores $EXECUTOR_CORES \ 309 | --executor-memory $EXECUTOR_MEM \ 310 | --conf spark.sql.warehouse.dir=$WAREHOUSE_DIR/$CRAWL \ 311 | --conf spark.sql.parquet.compression.codec=zstd \ 312 | --py-files "$PYFILES_HOST_LINKS_TO_GRAPH" \ 313 | $SPARK_EXTRA_OPTS \ 314 | $HOST_LINKS_TO_GRAPH \ 315 | "${HOST_LINKS_TO_GRAPH_ARGS[@]}" \ 316 | --save_as_text "$HDFS_BASE_DIR"/text/$CRAWL \ 317 | --num_output_partitions $WEBGRAPH_EDGE_PARTITIONS \ 318 | --local_temp_dir $TMPDIR \ 319 | $VERTEX_IDS \ 320 | $HOSTGRAPH_INPUT hostgraph 321 | 322 | 323 | _step hostgraph.$CRAWL.upload.1 \ 324 | upload_parquet hostgraph $CRAWL 325 | 326 | _step hostgraph.$CRAWL.upload.2 \ 327 | dump_upload_text $CRAWL $CRAWL 328 | fi 329 | 330 | done # CRAWLS 331 | 332 | 333 | 334 | if [ -n "$MERGE_NAME" ]; then 335 | 336 | hadoop fs -mkdir -p "$HDFS_BASE_DIR"/text/$MERGE_NAME 337 | 338 | for INP in "${MERGE_INPUT[@]}"; do 339 | if [ -z "$MERGE_CRAWL_INPUT" ]; then 340 | MERGE_CRAWL_INPUT="$INP" 341 | else 342 | MERGE_CRAWL_INPUT="--add_input $INP $MERGE_CRAWL_INPUT" 343 | fi 344 | done 345 | 346 | VERTEX_IDS="" 347 | if hadoop fs -test -d "$HDFS_BASE_DIR"/hostgraph_merged_vertices; then 348 | VERTEX_IDS="--vertex_ids $HDFS_BASE_DIR/hostgraph_merged_vertices" 349 | fi 350 | 351 | _step hostgraph_merged \ 352 | "$SPARK_HOME"/bin/spark-submit \ 353 | $SPARK_ON_YARN \ 354 | $SPARK_HADOOP_OPTS \ 355 | 
--py-files "$PYFILES_HOST_LINKS_TO_GRAPH" \ 356 | --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ 357 | --conf spark.task.maxFailures=10 \ 358 | --conf spark.executor.memory=$EXECUTOR_MEM \ 359 | --conf spark.driver.memory=6g \ 360 | --conf spark.core.connection.ack.wait.timeout=600s \ 361 | --conf spark.network.timeout=300s \ 362 | --conf spark.shuffle.io.maxRetries=5 \ 363 | --conf spark.shuffle.io.retryWait=30s \ 364 | --conf spark.locality.wait=1s \ 365 | --conf spark.io.compression.codec=zstd \ 366 | --conf spark.checkpoint.compress=true \ 367 | --num-executors $NUM_EXECUTORS \ 368 | --executor-cores $EXECUTOR_CORES \ 369 | --executor-memory $EXECUTOR_MEM \ 370 | --conf spark.sql.warehouse.dir=$WAREHOUSE_DIR \ 371 | --conf spark.sql.parquet.compression.codec=zstd \ 372 | $SPARK_EXTRA_OPTS \ 373 | $HOST_LINKS_TO_GRAPH \ 374 | "${HOST_LINKS_TO_GRAPH_ARGS[@]}" \ 375 | --save_as_text "$HDFS_BASE_DIR"/text/$MERGE_NAME \ 376 | --vertex_partitions $WEBGRAPH_VERTEX_PARTITIONS \ 377 | --num_output_partitions $WEBGRAPH_EDGE_PARTITIONS \ 378 | --local_temp_dir "$TMPDIR" \ 379 | $VERTEX_IDS \ 380 | $MERGE_CRAWL_INPUT hostgraph_merged 381 | 382 | _step hostgraph_merged.upload.1 \ 383 | upload_parquet hostgraph_merged $MERGE_NAME 384 | 385 | _step hostgraph_merged.upload.2 \ 386 | upload_text $MERGE_NAME $MERGE_NAME true 387 | 388 | ### merge (one file for vertices, one for edges) and upload 389 | # _step hostgraph_merged.upload.2 \ 390 | # dump_upload_text $MERGE_NAME $MERGE_NAME 391 | 392 | elif [ -n "$MERGE_INPUT" ]; then 393 | 394 | echo "MERGE_INPUT is defined, but no MERGE_NAME given?" 395 | exit 1 396 | 397 | fi 398 | -------------------------------------------------------------------------------- /src/script/hostgraph/hostgraph_config.sh: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### configuration of Common Crawl webgraph releases 3 | ### (sourced from other scripts) 4 | ################################################################################ 5 | 6 | 7 | ################################################################################ 8 | ### Extraction of inter-host links from 9 | ### - WAT files and 10 | ### - non-200 responses WARC files for redirects 11 | ### - (optionally) sitemap directives in robots.txt files 12 | ### saved as tuples 13 | 14 | # crawls to be processed 15 | CRAWLS=("CC-MAIN-2025-08" "CC-MAIN-2025-13" "CC-MAIN-2025-18") 16 | 17 | INPUT_BASE_URL="s3://commoncrawl/" 18 | 19 | # whether to include links to sitemaps contained in robots.txt files 20 | # Note: often links to sitemaps indicate relations between domain owners. 21 | INCLUDE_ROBOTSTXT_SITEMAP_LINKS=true 22 | 23 | # whether to construct a host-level graph for each input crawl 24 | CONSTRUCT_HOSTGRAPH=false 25 | 26 | # max.
number of input files (WARC/WAT) per Spark job 27 | # - splits hostlink extraction into multiple jobs 28 | # - output is checkpointed on S3 after each job 29 | # (useful if cluster runs on spot instances) 30 | MAX_INPUT_SIZE=64000 31 | 32 | # hdfs:// directory where input and output is kept 33 | HDFS_BASE_DIR=hdfs:///user/ubuntu/webgraph 34 | WAREHOUSE_DIR=$HDFS_BASE_DIR 35 | 36 | # where to keep results on s3:// 37 | # (note: this is a private bucket and needs to be changed) 38 | S3_OUTPUT_PREFIX=s3://commoncrawl-webgraph 39 | S3A_OUTPUT_PREFIX=s3a://commoncrawl-webgraph 40 | 41 | 42 | ################################################################################ 43 | # construct a merged graph of multiple monthly crawls 44 | 45 | MERGE_NAME=cc-main-2025-feb-mar-apr 46 | 47 | # Naming convention should be the three months' crawls that are 48 | # used to generate this graph release. In the event of multiple months 49 | # in a crawl, (e.g August & September, November & December) the first month is 50 | # used (e.g aug-nov). 51 | 52 | # input to construct a merged graph (over multiple months) 53 | # - used in addition to input crawls (see CRAWLS) 54 | # - output directories of hostlinks jobs of prior crawls 55 | # - list of fully-qualified paths: 56 | # ("s3a://.../hostlinks/0/" "s3a://.../hostlinks/1/" ...) 57 | # - ev. copy the data from s3:// to hdfs:// to avoid tasks 58 | # taking long while reading from S3 59 | MERGE_INPUT=() 60 | 61 | 62 | ################################################################################ 63 | # workflow runtime 64 | 65 | # temporary directory 66 | # - must exist on task/compute nodes for buffering data 67 | # - should provide several GBs of free space 68 | TMPDIR=/data/0/tmp 69 | 70 | # where to keep logs for steps 71 | LOGDIR=$PWD 72 | 73 | # file to stop the workflow (stops after a the currently running step(s) are done) 74 | STOP_FILE_=$LOGDIR/$(basename "$0" .sh).stop 75 | 76 | # use Python executable different than default "python" 77 | export PYSPARK_PYTHON=python3 78 | 79 | ################################################################################ 80 | 81 | 82 | ################################################################################ 83 | ### Spark / Yarn cluster configuration 84 | NUM_EXECUTORS=${NUM_EXECUTORS:-16} 85 | EXECUTOR_CONFIG=${EXECUTOR_CONFIG:-"r5.xlarge"} 86 | # NOTE: 87 | # - step 1 (host link extraction) can be run on smaller instances 88 | # or "compute optimized" instance types 89 | # - webgraph construction (esp. 
for merged graphs including multiple monthly crawls) 90 | # needs instances with sufficient amount of RAM (32 GB or more) 91 | # - assigning IDs in multiple partitions 92 | # (see hostlinks_to_graph.py --vertex_partitions) 93 | # reduces the memory requirements significantly 94 | 95 | 96 | case "$EXECUTOR_CONFIG" in 97 | c[5678]*.xlarge ) 98 | EXECUTOR_CORES=3 99 | EXECUTOR_MEM=5g 100 | NODEMANAGER_MEM_MB=$((6*1024)) 101 | ;; 102 | c[5678]*.2xlarge ) 103 | EXECUTOR_CORES=6 104 | EXECUTOR_MEM=10g 105 | NODEMANAGER_MEM_MB=$((11*1024)) 106 | ;; 107 | c[5678]*.4xlarge ) 108 | EXECUTOR_CORES=12 109 | EXECUTOR_MEM=22g 110 | NODEMANAGER_MEM_MB=$((24*1024)) 111 | ;; 112 | r[5678]*.xlarge ) 113 | EXECUTOR_CORES=4 114 | EXECUTOR_MEM=23g 115 | NODEMANAGER_MEM_MB=$((24*1024)) 116 | ;; 117 | r[5678]*.2xlarge ) 118 | EXECUTOR_CORES=7 119 | EXECUTOR_MEM=46g 120 | NODEMANAGER_MEM_MB=$((48*1024)) 121 | ;; 122 | r[5678]*.4xlarge ) 123 | EXECUTOR_CORES=15 124 | EXECUTOR_MEM=94g 125 | NODEMANAGER_MEM_MB=$((96*1024)) 126 | ;; 127 | r[5678]*.8xlarge ) 128 | EXECUTOR_CORES=30 129 | EXECUTOR_MEM=190g 130 | NODEMANAGER_MEM_MB=$((192*1024)) 131 | ;; 132 | m[5678]*.2xlarge ) 133 | EXECUTOR_CORES=8 134 | EXECUTOR_MEM=23g 135 | NODEMANAGER_MEM_MB=$((24*1024)) 136 | ;; 137 | m[5678]*.4xlarge ) 138 | EXECUTOR_CORES=16 139 | EXECUTOR_MEM=46g 140 | NODEMANAGER_MEM_MB=$((48*1024)) 141 | ;; 142 | m[5678]*.8xlarge ) 143 | EXECUTOR_CORES=32 144 | EXECUTOR_MEM=94g 145 | NODEMANAGER_MEM_MB=$((98*1024)) 146 | ;; 147 | "custom" ) 148 | if [ -z "$EXECUTOR_CORES" ] || [ -z "$EXECUTOR_MEM" ]; then 149 | echo "No valid custom executor configuration: must specify EXECUTOR_CORES and EXECUTOR_MEM'" >&2 150 | exit 1 151 | fi 152 | ;; 153 | * ) 154 | echo "No valid executor configuration: '$EXECUTOR_CONFIG'" >&2 155 | exit 1 156 | esac 157 | 158 | SPARK_EXTRA_OPTS="$SPARK_EXTRA_OPTS --conf spark.yarn.nodemanager.resource.memory-mb=$NODEMANAGER_MEM_MB" 159 | 160 | OUTPUT_PARTITIONS=$((NUM_EXECUTORS*EXECUTOR_CORES/2)) 161 | WEBGRAPH_EDGE_PARTITIONS=$((NUM_EXECUTORS*EXECUTOR_CORES/2)) 162 | WEBGRAPH_EDGE_PARTITIONS=$(((WEBGRAPH_EDGE_PARTITIONS " 10 | echo 11 | echo "Build node indexes to interactively explore a Common Crawl webgraph." 12 | echo "The webgraph files are expected to be placed in the current directory." 13 | echo 14 | echo " basename of the graph (without the .graph suffix)" 15 | echo " vertices file name (including the file suffix)" 16 | echo " or directory containing the vertices files" 17 | echo 18 | exit 1 19 | fi 20 | 21 | export LC_ALL=C 22 | 23 | BIN="$(dirname $0)" 24 | WG="$BIN/run_webgraph.sh" 25 | 26 | declare -A suffix_name_map 27 | suffix_name_map=( 28 | graph "webgraph / BVGraph" 29 | properties "webgraph properties" 30 | offsets "webgraph offsets" 31 | iepm "immutable external prefix map" 32 | mph "minimal perfect hash" 33 | fcl "front coded list" 34 | smph "string map perfect hash" 35 | ) 36 | 37 | function list_webgraph_files() { 38 | name="$1"; shift 39 | ok=true 40 | for suffix in "$@"; do 41 | if [ -e $name.$suffix ]; then 42 | printf " .%-10s : %-20s (%s)\n" "$suffix" \ 43 | "${suffix_name_map[$suffix]}" "$name.$suffix" 44 | else 45 | echo -e "Missing $name.$suffix (${suffix_name_map[$suffix]})" 46 | ok=false 47 | fi 48 | done 49 | if ! $ok; then 50 | exit 1 51 | fi 52 | } 53 | 54 | function index_status() { 55 | echo 56 | echo "Prepared webgraph $NAME for look-ups by node label." 
57 | echo "The following files (by file suffix) will be used:" 58 | 59 | echo "Webgraph:" 60 | list_webgraph_files $NAME graph properties offsets 61 | echo "Webgraph (transpose):" 62 | list_webgraph_files $NAME-t graph properties offsets 63 | 64 | echo "Mapping vertex labels to vertex IDs:" 65 | if [ -e $NAME.iepm ]; then 66 | list_webgraph_files $NAME iepm 67 | else 68 | list_webgraph_files $NAME mph fcl smph 69 | fi 70 | } 71 | 72 | 73 | # check for graph files (.graph and .properties), also for the 74 | # transpose of the graph ($NAME-t.$suffix) 75 | echo "Verifying webgraph files:" 76 | list_webgraph_files $NAME graph properties 77 | echo "Verifying webgraph files (transpose of the graph):" 78 | list_webgraph_files $NAME-t graph properties 79 | 80 | # check for the vertices file 81 | if ! [ -e $VERTICES ]; then 82 | echo "Vertices file not found" 83 | exit 1 84 | fi 85 | 86 | 87 | # generate offsets (*.offsets and *.obl) 88 | if ! [ -e $NAME.offsets ]; then 89 | "$WG" it.unimi.dsi.webgraph.BVGraph --offsets --list $NAME 90 | echo "webgraph offsets file created" 91 | fi 92 | if ! [ -e $NAME-t.offsets ]; then 93 | "$WG" it.unimi.dsi.webgraph.BVGraph --offsets --list $NAME-t 94 | echo "webgraph offsets file created (transpose of the graph)" 95 | fi 96 | 97 | 98 | # building `iepm` "immutable external prefix map" 99 | # (https://dsiutils.di.unimi.it/docs/it/unimi/dsi/util/ImmutableExternalPrefixMap.html) 100 | # bidirectional mapping from node names to node IDs 101 | if [ -e $NAME.iepm ]; then 102 | index_status 103 | exit 0 104 | fi 105 | CAT_VERTICES="zcat $VERTICES" 106 | if [ -d $VERTICES ]; then 107 | # host-level webgraph, multiple vertex files 108 | CAT_VERTICES="zcat $VERTICES/*.txt.gz" 109 | fi 110 | if (set -eo pipefail; 111 | eval $CAT_VERTICES \ 112 | | cut -f2 \ 113 | | "$WG" it.unimi.dsi.util.ImmutableExternalPrefixMap --block-size 4Ki $NAME.iepm); then 114 | echo "immutable external prefix map successfully built: $NAME.iepm" 115 | index_status 116 | exit 0 117 | fi 118 | # Note: building the `iepm` may fail for older versions of the domain 119 | # graph (before the graphs of May, June/July and August 2022) because 120 | # the nodes were not properly lexicographically sorted while folding 121 | # host names to domain names. If this is the case, continue to create 122 | # instead mappings which do not depend on proper sorting. 123 | 124 | # build 125 | # - the `mph` (minimal perfect hash) file mapping from node label 126 | # (reversed domain name) to node ID 127 | # - a front coded list to map node IDs to node labels 128 | if ! [ -e $NAME.mph ] || ! [ -e $NAME.fcl ]; then 129 | zcat $VERTICES \ 130 | | cut -f2 \ 131 | | tee >("$WG" it.unimi.dsi.sux4j.mph.GOV4Function $NAME.mph) \ 132 | | "$WG" it.unimi.dsi.util.FrontCodedStringList --utf8 --ratio 32 $NAME.fcl 133 | fi 134 | 135 | # build the `smph` file (string map perfect hash) required to 136 | # determine whether a node label is present in the `mph` file 137 | if ! [ -e $NAME.smph ]; then 138 | zcat $VERTICES \ 139 | | cut -f2 \ 140 | | "$WG" it.unimi.dsi.util.ShiftAddXorSignedStringMap $NAME.mph $NAME.smph 141 | fi 142 | 143 | 144 | index_status 145 | -------------------------------------------------------------------------------- /src/script/webgraph_ranking/graph_explore_download_webgraph.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NAME="$1" 4 | if ! 
shift 1; then 5 | echo "$(basename $0) " 6 | echo 7 | echo "Download all files required to interactively explore a Common Crawl webgraph." 8 | echo "The downloaded files are placed in the current directory." 9 | echo "Wget or curl are required for downloading" 10 | echo 11 | echo " webgraph base name without file suffix, eg. cc-main-2023-mar-may-oct-domain" 12 | echo 13 | exit 1 14 | fi 15 | 16 | export LC_ALL=C 17 | 18 | BIN="$(dirname $0)" 19 | 20 | USING_CURL=false 21 | USING_WGET=false 22 | if command -v curl &>/dev/null; then 23 | USING_CURL=true 24 | elif command -v wget &>/dev/null; then 25 | USING_WGET=true 26 | else 27 | echo "Either curl or wget are required for downloading" >&2 28 | exit 1 29 | fi 30 | 31 | declare -A suffix_name_map 32 | suffix_name_map=( 33 | graph "webgraph / BVGraph" 34 | properties "webgraph properties" 35 | offsets "webgraph offsets" 36 | stats "webgraph statistics" 37 | txt.gz "text file (vertex labels)" 38 | ) 39 | 40 | function list_webgraph_files() { 41 | name="$1"; shift 42 | ok=true 43 | for suffix in "$@"; do 44 | if [ -e $name.$suffix ]; then 45 | printf " .%-10s : %-20s (%s)\n" "$suffix" \ 46 | "${suffix_name_map[$suffix]}" "$name.$suffix" 47 | elif [ -d "$name" ] && [[ "$suffix" =~ ^\*. ]]; then 48 | ls "$name"/* | sed 's/^/\t/' 49 | else 50 | echo -e "Missing $name.$suffix (${suffix_name_map[$suffix]})" 51 | ok=false 52 | fi 53 | done 54 | if ! $ok; then 55 | exit 1 56 | fi 57 | } 58 | 59 | function download_file() { 60 | FILE="$1" 61 | if [ -e "$FILE" ]; then 62 | return # already done 63 | fi 64 | URL="https://data.commoncrawl.org/projects/hyperlinkgraph/$BASE_NAME/$GRAPH_AGGR_LEVEL/$FILE" 65 | echo "Downloading $URL" 66 | 67 | if $USING_CURL; then 68 | 69 | curl --silent --show-error --fail \ 70 | --remote-time -o "$FILE" --time-cond "$FILE" --continue-at - \ 71 | --retry 1000 --retry-delay 1 "$URL" 72 | 73 | elif $USING_WGET; then 74 | 75 | if [ "$(dirname "$FILE")" == "." 
]; then 76 | wget --continue --timestamping --tries=0 --retry-on-http-error=503 --waitretry=1 "$URL" 77 | else 78 | wget --continue --timestamping --directory-prefix="$(dirname "$FILE")" \ 79 | --tries=0 --retry-on-http-error=503 --waitretry=1 "$URL" 80 | fi 81 | 82 | fi 83 | } 84 | 85 | function download_files() { 86 | name="$1"; shift 87 | for suffix in "$@"; do 88 | download_file "$name.$suffix" 89 | done 90 | } 91 | 92 | 93 | BASE_NAME="${NAME%-domain}" 94 | BASE_NAME="${BASE_NAME%-host}" 95 | GRAPH_AGGR_LEVEL="${NAME##*-}" 96 | 97 | 98 | set -e # stop on errors 99 | 100 | download_files "$NAME" graph properties stats 101 | download_files "$NAME-t" graph properties 102 | 103 | if [ "$GRAPH_AGGR_LEVEL" == "domain" ]; then 104 | download_files "$NAME-vertices" txt.gz 105 | else 106 | download_files "$NAME-vertices" paths.gz 107 | zcat "$NAME-vertices".paths.gz \ 108 | | while read path; do 109 | file=${path#projects/hyperlinkgraph/$BASE_NAME/$GRAPH_AGGR_LEVEL/} 110 | mkdir -p $(dirname "$file") 111 | download_file "$file" 112 | done 113 | fi 114 | 115 | echo "Downloaded files" 116 | echo "- webgraph" 117 | list_webgraph_files $NAME graph properties stats 118 | echo "- webgraph (transpose)" 119 | list_webgraph_files $NAME-t graph properties 120 | echo "- webgraph vertices" 121 | if [ "$GRAPH_AGGR_LEVEL" == "domain" ]; then 122 | list_webgraph_files $NAME-vertices txt.gz 123 | else 124 | list_webgraph_files vertices "*.txt.gz" 125 | fi 126 | -------------------------------------------------------------------------------- /src/script/webgraph_ranking/graph_explore_load_graph.jsh: -------------------------------------------------------------------------------- 1 | /open PRINTING 2 | 3 | String graph = System.getProperty("graph") 4 | println("Loading graph " + graph) 5 | 6 | import org.commoncrawl.webgraph.explore.Graph 7 | import org.commoncrawl.webgraph.explore.GraphExplorer 8 | import it.unimi.dsi.webgraph.ImmutableGraph 9 | 10 | GraphExplorer e = new GraphExplorer(graph) 11 | Graph g = e.getGraph() 12 | 13 | println() 14 | println("Graph " + graph + " loaded into GraphExplorer *e*") 15 | println("Type \"e.\" and press to list the public methods of the class GraphExplorer") 16 | println("... or \"g.\" for the graph loaded for exploration") 17 | 18 | /* Define commands provided by pywebgraph (cn, pwn, ls, sl) */ 19 | void cn(String vertexLabel) { e.cn(vertexLabel); } 20 | void cn(long vertexID) { e.cn(vertexID); } 21 | void pwn() { e.pwn(); } 22 | void ls() { e.ls(); } 23 | void ls(long vertexId) { e.ls(vertexId); } 24 | void ls(String vertexLabel) { e.ls(vertexLabel); } 25 | void sl() { e.sl(); } 26 | void sl(long vertexId) { e.sl(vertexId); } 27 | void sl(String vertexLabel) { e.sl(vertexLabel); } 28 | 29 | println() 30 | println("... or use one of the predefined methods:") 31 | /methods cn pwn ls sl 32 | println() -------------------------------------------------------------------------------- /src/script/webgraph_ranking/process_webgraph.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NAME="$1" 4 | VERTICES="$2" 5 | EDGES="$3" 6 | if ! shift 3; then 7 | echo "$(basename $0) []" 8 | exit 1 9 | fi 10 | 11 | if ! [[ "$NAME" =~ ^[a-zA-Z0-9_][a-zA-Z0-9_.-]+[a-zA-Z0-9_]$ ]]; then 12 | echo "Graph should only contain [a-zA-Z0-9_.-] and start and end with [a-zA-Z0-9_]." 13 | echo "The graph name '$NAME' might not be safe as a graph base name (without suffix)" 14 | echo "or directory name to place the graph files into. Exiting..." 
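# (Hypothetical invocation, for illustration only: the script expects a graph
#  base name, a vertices file, an edges file or directory, and optionally an
#  output directory, e.g.
#    ./process_webgraph.sh cc-main-2025-feb-mar-apr-domain vertices.txt.gz edges/ cc-main-2025-feb-mar-apr
#  Names containing white space or starting/ending with '.' or '-' fail the
#  check above.)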
15 | exit 1 16 | fi 17 | 18 | OUTPUTDIR="$NAME" 19 | if [ -n "$1" ]; then 20 | OUTPUTDIR="$1" 21 | shift 22 | case "$OUTPUTDIR" in 23 | *" "* ) 24 | echo "The output directory must not contain white space. Exiting..." 25 | exit 1 26 | ;; 27 | esac 28 | fi 29 | FULLNAME="$OUTPUTDIR/$NAME" 30 | 31 | 32 | set -e # fail if creation of output directory fails 33 | 34 | if [ -d "$OUTPUTDIR" ]; then 35 | echo "Output directory $OUTPUTDIR/ exists" 36 | else 37 | mkdir "$OUTPUTDIR" 38 | fi 39 | 40 | export LC_ALL=C 41 | 42 | BIN=$(dirname $0) 43 | WG=$BIN/run_webgraph.sh 44 | LW=$BIN/run_webgraph.sh 45 | 46 | source $BIN/../workflow_lib.sh 47 | source $BIN/webgraph_config.sh 48 | 49 | 50 | if ! ${USE_WEBGRAPH_BIG:-false} && [ $GRAPH_SIZE_NODES -gt $((0x7ffffff7)) ]; then 51 | echo "Graph has more nodes than max. array size in Java" 52 | echo "Using big version of webgraph framework" 53 | USE_WEBGRAPH_BIG=true 54 | fi 55 | if ${USE_WEBGRAPH_BIG:-false}; then 56 | WGP=it.unimi.dsi.big.webgraph 57 | else 58 | WGP=it.unimi.dsi.webgraph 59 | fi 60 | 61 | 62 | # logging 63 | test -d $OUTPUTDIR/logs || mkdir $OUTPUTDIR/logs 64 | LOGDIR=$OUTPUTDIR/logs 65 | # file to stop workflow 66 | STOP_FILE_=$LOGDIR/$(basename $0 .sh).stop 67 | 68 | function join_rank() ( 69 | set -exo pipefail 70 | _DATA_TYPE=$1 71 | _IN=$2 72 | _VERT=$3 73 | _OUT=$4 74 | _EXTRA_FIELDS="" 75 | if [ -n "$5" ]; then 76 | _EXTRA_FIELDS=",$5" 77 | fi 78 | 79 | 80 | if [ -d $_VERT ]; then 81 | # _VERT is a directory with multiple vertices files 82 | _VERT="${_VERT}/*.gz" 83 | fi 84 | 85 | ### unpack scores with LAW, join node names via paste, 86 | ### assign ranks on sorted lines by nl 87 | $LW it.unimi.dsi.law.io.tool.DataInput2Text --type $_DATA_TYPE $_IN - \ 88 | | paste - <(zcat $_VERT | cut -f2$_EXTRA_FIELDS) \ 89 | | sort --batch-size=$SORT_BATCHES --buffer-size=$SORT_BUFFER_SIZE --compress-program=gzip -t$'\t' -k1,1gr --stable \ 90 | | nl -w1 -nln \ 91 | | gzip >$_OUT 92 | ) 93 | 94 | function join_harmonicc_pagerank() ( 95 | set -exo pipefail 96 | NAME="$1" 97 | _IN_HC="$2" 98 | _IN_PR="$3" 99 | _OUT="$4" 100 | _EXTRA_FIELDS="" 101 | HEADER="#harmonicc_pos\t#harmonicc_val\t#pr_pos\t#pr_val\t#host_rev" 102 | if [ -n "$5" ]; then 103 | _EXTRA_FIELDS=",$5" 104 | HEADER="$HEADER\t$6" 105 | fi 106 | SORTOPTS="$SORT_PARALLEL_THREADS_OPT --batch-size=$SORT_BATCHES --buffer-size=$SORT_BUFFER_SIZE --compress-program=gzip" 107 | (echo -e "$HEADER"; 108 | zcat $_IN_HC | sort $SORTOPTS -t$'\t' -k3,3 --unique --stable \ 109 | | join -a1 -a2 -e'---' -t$'\t' -j3 -o1.1,1.2,2.1,2.2,0$_EXTRA_FIELDS - \ 110 | <(zcat $_IN_PR | sort $SORTOPTS -t$'\t' -k3,3 --unique --stable) \ 111 | | sort $SORTOPTS -t$'\t' -k1,1n -s) \ 112 | | gzip >$_OUT 113 | ) 114 | 115 | function join_ranks_in_memory() ( 116 | set -exo pipefail 117 | _VERT="$1" 118 | _HC="$2" 119 | _PR="$3" 120 | _OUT="$4" 121 | HEADER="#harmonicc_pos\t#harmonicc_val\t#pr_pos\t#pr_val\t#host_rev" 122 | if [ -n "$5" ]; then 123 | HEADER="${HEADER}\t$5" 124 | fi 125 | if [ -d $_VERT ]; then 126 | # _VERT is a directory with multiple vertices files 127 | _VERT="$_VERT/*.gz" 128 | fi 129 | OPTS="" 130 | # heuristics to set Java heap memory 131 | # bytes required per node (in theory, 60% more in practice) 132 | BYTES_MEM_REQUIRED=24 133 | if $USE_WEBGRAPH_BIG; then 134 | OPTS="--big" 135 | BYTES_MEM_REQUIRED=36 136 | fi 137 | BYTES_MEM_REQUIRED=$(($BYTES_MEM_REQUIRED*$GRAPH_SIZE_NODES*16/10)) 138 | JAVA_HEAP_GB=$((($BYTES_MEM_REQUIRED/2**30)+1)) 139 | JAVAOPTS="-Xmx${JAVA_HEAP_GB}g" 140 | 
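# (Worked example of the heap heuristic above, for illustration: with the
#  default GRAPH_SIZE_NODES of 67108864 (64 Mi nodes) and the standard
#  webgraph classes, BYTES_MEM_REQUIRED = 24 * 67108864 * 16 / 10
#  = 2576980377 bytes (~2.4 GiB), hence JAVA_HEAP_GB = 2 + 1 = 3 and
#  JAVAOPTS becomes "-Xmx3g". With --big (36 bytes per node) the same
#  graph size yields "-Xmx4g".)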
SORTOPTS="$SORT_PARALLEL_THREADS_OPT --batch-size=$SORT_BATCHES --buffer-size=$SORT_BUFFER_SIZE --compress-program=gzip" 141 | (echo -e "$HEADER"; 142 | JAVA_OPTS=$JAVA_OPTS $WG org.commoncrawl.webgraph.JoinSortRanks $OPTS <(zcat $_VERT) $_HC $_PR -) \ 143 | | sort $SORTOPTS -t$'\t' -k1,1n --stable | gzip >$_OUT 144 | ) 145 | 146 | function join_degrees() ( 147 | set -exo pipefail 148 | _FULLNAME="$1" 149 | _VERT="$2" 150 | HEADER="#outdegree\t#indegree\t#host_rev" 151 | if [ -n "$3" ]; then 152 | HEADER="$HEADER\t$3" 153 | fi 154 | if [ -d $_VERT ]; then 155 | # _VERT is a directory with multiple vertices files 156 | _VERT="$_VERT/*.gz" 157 | fi 158 | zcat $_VERT \ 159 | | cut -f2- \ 160 | | paste $FULLNAME.outdegrees $FULLNAME.indegrees - \ 161 | | gzip >$FULLNAME-outdegrees-indegrees.txt.gz 162 | # top-N out/indegrees 163 | (echo -e "$HEADER"; 164 | set +o pipefail; 165 | zcat $FULLNAME-outdegrees-indegrees.txt.gz \ 166 | | perl -aF'\t' -lne 'print if $F[0] > 1000' \ 167 | | sort -k1,1nr \ 168 | | head -10000) \ 169 | | gzip >$FULLNAME-outdegrees-indegrees-topout.txt.gz 170 | (echo -e "$HEADER"; 171 | set +o pipefail; 172 | zcat $FULLNAME-outdegrees-indegrees.txt.gz \ 173 | | perl -aF'\t' -lne 'print if $F[1] > 1000' \ 174 | | sort -k2,2nr \ 175 | | head -10000) \ 176 | | gzip >$FULLNAME-outdegrees-indegrees-topin.txt.gz 177 | ) 178 | 179 | function connected_distrib() ( 180 | set -exo pipefail 181 | NUM_NODES=$1 182 | INPUT=$2 183 | OUTPUT=$3 184 | (echo -e "#freq\t#size\t#perc"; \ 185 | $LW it.unimi.dsi.law.io.tool.DataInput2Text --type int $INPUT - \ 186 | | perl -lne '$h{$_}++; END { while (($k,$v)=each %h) { print sprintf("%d\t%d\t%9.6f%%", $v, $k, 100*$k*$v/'$NUM_NODES') } }' \ 187 | | sort -k2,2nr) \ 188 | | gzip >$OUTPUT 189 | ) 190 | 191 | function degree_distrib() ( 192 | set -exo pipefail 193 | TYPE="$1" 194 | NAME="$2" 195 | (echo -e "#arcs\t#nodes"; 196 | perl -lne 'print sprintf("%d\t%s", ($.-1), $_) if $_ ne 0' $NAME.$TYPE) \ 197 | | gzip >$FULLNAME-$TYPE-distrib.txt.gz 198 | ) 199 | 200 | 201 | 202 | set -exo pipefail 203 | 204 | if [ -d $EDGES ]; then 205 | # edges is a directory with multiple files 206 | sort_input="" 207 | for e in $EDGES/part-*.gz; do 208 | sort_input="$sort_input <(zcat $e)" 209 | done 210 | if ${USE_WEBGRAPH_BIG:-false}; then 211 | ## TODO: 212 | ## * option --threads not available in webgraph-big 213 | ## * need to load from stdin 214 | ## (fails to read longs when reading BVGraph from file) 215 | ## Caused by: java.lang.IllegalArgumentException: 4635383979 216 | ## at it.unimi.dsi.big.webgraph.ImmutableGraph$BigImmutableGraphAdapter.check(ImmutableGraph.java:801) 217 | ## at it.unimi.dsi.big.webgraph.ImmutableGraph$BigImmutableGraphAdapter.access$200(ImmutableGraph.java:793) 218 | ## at it.unimi.dsi.big.webgraph.ImmutableGraph$BigImmutableGraphAdapter$1$1.nextInt(ImmutableGraph.java:832) 219 | ## at it.unimi.dsi.webgraph.LazyIntIterators.unwrap(LazyIntIterators.java:51) 220 | ## at it.unimi.dsi.webgraph.NodeIterator.successorArray(NodeIterator.java:70) 221 | ## at it.unimi.dsi.webgraph.ArrayListMutableGraph.(ArrayListMutableGraph.java:114) 222 | ## at it.unimi.dsi.big.webgraph.ArcListASCIIGraph.load(ArcListASCIIGraph.java:283) 223 | ## at it.unimi.dsi.big.webgraph.ArcListASCIIGraph.load(ArcListASCIIGraph.java:279) 224 | ## at it.unimi.dsi.big.webgraph.ArcListASCIIGraph.loadOffline(ArcListASCIIGraph.java:255) 225 | _step bvgraph \ 226 | bash -c "eval \"sort --batch-size=$SORT_BATCHES -t$'\t' -k1,1n -k2,2n --stable --merge $sort_input\" | $WG 
$WGP.BVGraph --once -g $WGP.ArcListASCIIGraph - $FULLNAME" 227 | else 228 | _step bvgraph \ 229 | bash -c "$WG $WGP.BVGraph --threads $THREADS -g $WGP.ArcListASCIIGraph <(eval \"sort --batch-size=$SORT_BATCHES -t$'\t' -k1,1n -k2,2n --stable --merge $sort_input\") $FULLNAME" 230 | fi 231 | else 232 | if ${USE_WEBGRAPH_BIG:-false}; then 233 | _step bvgraph \ 234 | bash -c "zcat $EDGES | $WG $WGP.BVGraph --once -g $WGP.ArcListASCIIGraph - $FULLNAME" 235 | else 236 | _step bvgraph \ 237 | $WG $WGP.BVGraph --threads $THREADS -g $WGP.ArcListASCIIGraph <(zcat $EDGES) $FULLNAME 238 | fi 239 | fi 240 | 241 | if ${USE_WEBGRAPH_BIG:-false}; then 242 | _step transpose \ 243 | $WG $WGP.Transform transposeOffline $FULLNAME $FULLNAME-t 244 | else 245 | # if low memory, add 246 | # --offline, combined with 247 | # -Djava.io.tmpdir=... to point to a temporary directory with free space 2 times the graph size 248 | # (see also run_webgraph.sh) 249 | _step transpose \ 250 | $WG $WGP.Transform transpose $FULLNAME $FULLNAME-t 251 | fi 252 | # _step symmetrize \ 253 | # $WG $WGP.Transform symmetrize $FULLNAME $FULLNAME-t $FULLNAME-sym 254 | 255 | _step hyperball \ 256 | $WG $WGP.algo.HyperBall --threads $THREADS --offline --log2m $HYPERBALL_REGISTERS \ 257 | --harmonic-centrality $FULLNAME-harmonicc.bin $FULLNAME-t $FULLNAME 258 | 259 | if ${USE_WEBGRAPH_BIG:-false}; then 260 | _step pagerank \ 261 | $LW it.unimi.dsi.law.big.rank.PageRankParallelGaussSeidel --mapped --threads $THREADS $FULLNAME-t $FULLNAME-pagerank 262 | else 263 | _step pagerank \ 264 | $LW it.unimi.dsi.law.rank.PageRankParallelGaussSeidel --expand --mapped --threads $THREADS $FULLNAME-t $FULLNAME-pagerank 265 | fi 266 | 267 | _step_bg connected 15 \ 268 | $WG $WGP.algo.ConnectedComponents --threads $THREADS -m --renumber --sizes -t $FULLNAME-t $FULLNAME 269 | connected_pid=$! 270 | _step_bg strongly_connected 15 \ 271 | $WG $WGP.algo.StronglyConnectedComponents --renumber --sizes $FULLNAME 272 | strongly_connected_pid=$! 273 | 274 | EXTRA_FIELDS="" 275 | EXTRA_FIELDS_JOIN="" 276 | EXTRA_FIELDS_HEADER="" 277 | if [ $VERTICES_FIELDS -gt 2 ]; then 278 | EXTRA_FIELDS="3-$VERTICES_FIELDS" 279 | EXTRA_FIELDS_JOIN="1.4" 280 | EXTRA_FIELDS_HEADER="#n_hosts" 281 | for i in $(seq 4 $VERTICES_FIELDS); do 282 | EXTRA_FIELDS_JOIN="${EXTRA_FIELDS_JOIN},1.$(($i+1))" 283 | done 284 | fi 285 | 286 | if ${JOIN_RANKS_IN_MEMORY}; then 287 | _step_bg join_ranks 15 \ 288 | join_ranks_in_memory $VERTICES $FULLNAME-harmonicc.bin $FULLNAME-pagerank.ranks $FULLNAME-ranks.txt.gz "$EXTRA_FIELDS_HEADER" 289 | else 290 | _step_bg join_harmonicc 15 \ 291 | join_rank float $FULLNAME-harmonicc.bin $VERTICES $FULLNAME-harmonic-centrality.txt.gz "$EXTRA_FIELDS" 292 | _step_bg join_pr_gs 15 \ 293 | join_rank double $FULLNAME-pagerank.ranks $VERTICES $FULLNAME-pagerank.txt.gz 294 | wait # until background processes are finished 295 | # join ranks into one file 296 | _step_bg join_harmonicc_pagerank 60 \ 297 | join_harmonicc_pagerank $NAME $FULLNAME-harmonic-centrality.txt.gz $FULLNAME-pagerank.txt.gz $FULLNAME-ranks.txt.gz "$EXTRA_FIELDS_JOIN" "$EXTRA_FIELDS_HEADER" 298 | fi 299 | 300 | # stats use connected components files, wait for these to be finished 301 | if ! kill -0 $connected_pid; then 302 | : # step connected already finished 303 | else 304 | wait $connected_pid 305 | fi 306 | if ! 
kill -0 $strongly_connected_pid; then 307 | : # step strongly_connected already finished 308 | else 309 | wait $strongly_connected_pid 310 | fi 311 | 312 | _step stats \ 313 | $WG $WGP.Stats --save-degrees $FULLNAME 314 | 315 | _step_bg join_degrees 15 \ 316 | join_degrees $FULLNAME $VERTICES "$EXTRA_FIELDS_HEADER" 317 | 318 | NODES=$(perl -lne 'print if s@^nodes=@@' $FULLNAME.stats) 319 | _step connected_distrib \ 320 | connected_distrib $NODES $FULLNAME.wccsizes $FULLNAME-connected-components-distrib.txt.gz 321 | # it.unimi.dsi.webgraph.Stats writes *.sccdistr (but there is no *.wccdistr) 322 | # _step strongly_connected_distrib \ 323 | # connected_distrib $NODES $FULLNAME.sccsizes $FULLNAME-strongly-connected-components-distrib.txt.gz 324 | 325 | _step indegree_distrib \ 326 | degree_distrib indegree $FULLNAME 327 | _step outdegree_distrib \ 328 | degree_distrib outdegree $FULLNAME 329 | 330 | wait # until background processes are finished 331 | -------------------------------------------------------------------------------- /src/script/webgraph_ranking/process_webgraph_degrees.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eo pipefail 4 | 5 | NAME="$1" 6 | TYPE="${2:-domain}" 7 | 8 | if [ -z "$NAME" ]; then 9 | echo "Usage: $(basename $0) []" 10 | echo -e "\tgraph-name\tbase name of the webgraph (without the file suffix .graph)" 11 | echo -e "\ttype\ttype (level) of the graph aggregation: domain (default) or host" 12 | exit 1 13 | fi 14 | 15 | WG=$(dirname $0)/run_webgraph.sh 16 | 17 | if [ -e $NAME.outdegrees ] && [ -e $NAME.indegrees ]; then 18 | : # out/indegrees already done 19 | else 20 | $WG it.unimi.dsi.webgraph.Stats --save-degrees "$NAME" 21 | fi 22 | 23 | 24 | if [ "$TYPE" == "domain" ]; then 25 | zcat $NAME-vertices.txt.gz 26 | else 27 | zcat vertices/*.txt.gz 28 | fi \ 29 | | cut -f2- \ 30 | | paste $NAME.outdegrees $NAME.indegrees - \ 31 | | gzip >$NAME-outdegrees-indegrees.txt.gz 32 | 33 | 34 | HEADER="outdegree\tindegree\tname" 35 | if [ "$TYPE" == "domain" ]; then 36 | HEADER="outdegree\tindegree\tname\tnumsubdomains" 37 | fi 38 | 39 | (echo -e "$HEADER"; 40 | set +o pipefail; 41 | zcat $NAME-outdegrees-indegrees.txt.gz \ 42 | | perl -aF'\t' -lne 'print if $F[0] > 1000' \ 43 | | sort -k1,1nr \ 44 | | head -10000) \ 45 | | gzip >$NAME-outdegrees-indegrees-topout.txt.gz 46 | 47 | (echo -e "$HEADER"; 48 | set +o pipefail; 49 | zcat $NAME-outdegrees-indegrees.txt.gz \ 50 | | perl -aF'\t' -lne 'print if $F[1] > 1000' \ 51 | | sort -k2,2nr \ 52 | | head -10000) \ 53 | | gzip >$NAME-outdegrees-indegrees-topin.txt.gz 54 | 55 | -------------------------------------------------------------------------------- /src/script/webgraph_ranking/run_webgraph.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export LC_ALL=C 4 | 5 | source "$(dirname $0)"/webgraph_config.sh 6 | 7 | CC_WEBGRAPH_JAR="${CC_WEBGRAPH_JAR:-$(dirname $0)/../../../target/cc-webgraph-0.1-SNAPSHOT-jar-with-dependencies.jar}" 8 | if ! [ -e $CC_WEBGRAPH_JAR ]; then 9 | echo "Jar file $CC_WEBGRAPH_JAR not found" 10 | echo "Java project needs to be build by running" 11 | echo " mvn package" 12 | exit 1 13 | fi 14 | 15 | _CLASSPATH="$CC_WEBGRAPH_JAR" 16 | if [ -n "$CLASSPATH" ]; then 17 | _CLASSPATH=$CLASSPATH:$_CLASSPATH 18 | fi 19 | 20 | if ! 
echo "$JAVA_OPTS" | grep -qE -e "-Xmx[0-9]+"; then 21 | # heuristics to run webgraph with 80% of available RAM (or all RAM - 8 GB if this is larger) 22 | MEMMB=$(free -m | perl -ne 'do { $p80 = int($1*.8); $a8 = int($1-8192); $m = $p80; $m = $a8 if $a8 > $p80; print $m; last } if /(\d+)/') 23 | JAVA_OPTS="$JAVA_OPTS -Xmx${MEMMB}m" 24 | fi 25 | 26 | if [ -n "$TMPDIR" ]; then 27 | JAVA_OPTS="$JAVA_OPTS -Djava.io.tmpdir=$TMPDIR" 28 | fi 29 | 30 | case "$1" in 31 | it.unimi.dsi.webgraph.algo.HyperBall \ 32 | | it.unimi.dsi.big.webgraph.algo.HyperBall \ 33 | | it.unimi.dsi.law.rank.PageRankParallelGaussSeidel \ 34 | | it.unimi.dsi.big.law.rank.PageRankParallelGaussSeidel ) 35 | # Java options for HyperBall, recommended in 36 | # https://webgraph.di.unimi.it/docs/it/unimi/dsi/webgraph/algo/HyperBall.html 37 | JAVA_OPTS="$JAVA_OPTS -server -Xss256K -XX:PretenureSizeThreshold=512M -XX:MaxNewSize=$(($MEMMB/3))m \ 38 | -XX:+UseNUMA -XX:+UseTLAB -XX:+ResizeTLAB" 39 | ;; 40 | esac 41 | 42 | set -x 43 | time $JAVA_HOME/bin/java $JAVA_OPTS -cp $_CLASSPATH "$@" 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /src/script/webgraph_ranking/webgraph_config.sh: -------------------------------------------------------------------------------- 1 | # configuration to process web graphs using the webgraph framework 2 | 3 | # size of the graph (default: 64 million nodes) 4 | # - no exact size is needed, just to estimate the required Java heap space 5 | GRAPH_SIZE_NODES=${GRAPH_SIZE_NODES:-67108864} 6 | 7 | # for big graphs with more than 2^31 nodes/vertices 8 | USE_WEBGRAPH_BIG=${USE_WEBGRAPH_BIG:-false} 9 | 10 | # join node names and ranks in memory 11 | JOIN_RANKS_IN_MEMORY=${JOIN_RANKS_IN_MEMORY:-true} 12 | 13 | 14 | # number of registers used for Hyperball / harmonic centrality calculation 15 | # 16 | # The number of Hyperball registers depend on 17 | # - the size of the machine (here EC2 instance) 18 | # - and of the graph to be processed 19 | # => it's an empirically determined value and 20 | # possibly needs to be adjusted 21 | # It can be overridden by the environment variable 22 | # HYPERBALL_REGISTERS, see below. 23 | HYP_REG=12 24 | ## on r8.24.xlarge (768 GB, 96 CPUs) 25 | #HYP_REG=10 (host-level graph, 300M nodes) 26 | #HYP_REG=12 (domain-level graph, 130M nodes) 27 | 28 | HYPERBALL_REGISTERS=${HYPERBALL_REGISTERS:-$HYP_REG} 29 | 30 | # number of threads 31 | # THREAD=0 : let the webgraph tools decide how many threads, 32 | # given the available CPU cores, using 33 | # java.lang.Runtime.availableProcessors() 34 | THREADS=${THREADS:-0} 35 | 36 | 37 | 38 | # number of fields in vertices file(s) 39 | # (default: 2) 40 | # 41 | # (if 3, for domain graphs) 42 | # 43 | VERTICES_FIELDS=${VERTICES_FIELDS:-2} 44 | 45 | 46 | # threads and buffer size used for sorting 47 | export SORT_PARALLEL_THREADS_OPT="" 48 | if echo -e "b\na\nc" | sort --parallel=2 >/dev/null; then 49 | echo "The sort command supports parallel sort threads" >&2 50 | SORT_PARALLEL_THREADS_OPT="--parallel=$((($THREADS > 4) ? ($THREADS/2) : 2))" 51 | fi 52 | 53 | # take 20% of main memory, at least 1 GB, for sorting "chunks" 54 | MEM_20PERC=$(free -g | perl -ne 'do { print 1+int($1*.2), "g"; last } if /(\d+)/') 55 | export SORT_BUFFER_SIZE=${SORT_BUFFER_SIZE:-$MEM_20PERC} 56 | 57 | # max. 
number of merge inputs 58 | # (should be not less than number of vertices / edges files to be merged) 59 | export SORT_BATCHES=${SORT_BATCHES:-240} 60 | 61 | -------------------------------------------------------------------------------- /src/script/workflow_lib.sh: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # Copyright (C) 2022 Common Crawl and contributors 3 | 4 | ### functions used to run webgraph... workflow 5 | 6 | function LOG__() { 7 | echo $(date '+[%Y-%m-%d %H:%M:%S]') "$@" 8 | } 9 | 10 | function _test_step() { 11 | if [ -n "$STOP_FILE_" ] && [ -e "$STOP_FILE_" ]; then 12 | LOG__ INFO "Found stop file: $STOP_FILE_" 13 | exit 0 14 | fi 15 | _STEP__="$1"; shift 16 | if [ -e "$LOGDIR"/"$_STEP__".log ] \ 17 | || [ -e "$LOGDIR"/"$_STEP__".log.xz ] \ 18 | || [ -e "$LOGDIR"/"$_STEP__".log.gz ] \ 19 | || [ -e "$LOGDIR"/"$_STEP__".log.bz2 ]; then 20 | LOG__ INFO "Step $_STEP__ already done, $LOGDIR/$_STEP__.log exists" 21 | return 1 22 | fi 23 | return 0 24 | } 25 | 26 | function _step() { 27 | _STEP__="$1"; shift 28 | if _test_step "$_STEP__"; then 29 | LOG__ INFO "Running step $_STEP__ ..." 30 | if "$@" &>"$LOGDIR"/"$_STEP__".log; then 31 | LOG__ INFO "Step $_STEP__ succeeded." 32 | else 33 | RES=$? 34 | LOG__ ERROR "Step $_STEP__ failed with $RES" 35 | mv "$LOGDIR"/"$_STEP__".log "$LOGDIR"/"$_STEP__".failed.$(date +%Y-%m-%d-%H-%M-%S).log 36 | LOG__ ERROR "Exiting ..." 37 | exit $RES 38 | fi 39 | fi 40 | } 41 | 42 | function _step_bg() { 43 | _STEP__="$1" 44 | _SLEEP_="$2" 45 | shift 2 46 | LOG__ INFO "Running background step $_STEP__ ..." 47 | if ! [ "$_SLEEP_" -eq "$_SLEEP_" ] 2>/dev/null; then 48 | echo "_step_bg ..." 49 | echo " parameter must be an integer" 50 | echo " (sleep seconds after launching command, before executing next step)" 51 | exit 1 52 | fi 53 | if _test_step "$_STEP__"; then 54 | _step "$_STEP__" "$@" & 55 | sleep $_SLEEP_ 56 | fi 57 | } 58 | 59 | -------------------------------------------------------------------------------- /src/test/java/org/commoncrawl/webgraph/TestCountingMergedIntIterator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2022 Common Crawl and contributors 4 | */ 5 | package org.commoncrawl.webgraph; 6 | 7 | import static org.junit.jupiter.api.Assertions.assertEquals; 8 | import static org.junit.jupiter.api.Assertions.assertFalse; 9 | import static org.junit.jupiter.api.Assertions.assertTrue; 10 | 11 | import java.util.Arrays; 12 | 13 | import org.junit.jupiter.api.Test; 14 | import org.slf4j.Logger; 15 | import org.slf4j.LoggerFactory; 16 | 17 | import it.unimi.dsi.webgraph.LazyIntIterator; 18 | import it.unimi.dsi.webgraph.LazyIntIterators; 19 | 20 | public class TestCountingMergedIntIterator { 21 | 22 | protected static Logger LOG = LoggerFactory.getLogger(TestCountingMergedIntIterator.class); 23 | 24 | @Test 25 | void testSimple() { 26 | CountingMergedIntIterator iter = new CountingMergedIntIterator(LazyIntIterators.EMPTY_ITERATOR); 27 | assertFalse(iter.hasNext()); 28 | 29 | int[][][] testArrays = { // 30 | {{0, 1}}, // 31 | {{0}, {1}}, // 32 | {{1}, {0}}, // 33 | {{1}, {0}, {}}, // 34 | {{1}, {0}, {}, {0}, {0}}, // 35 | {{1}, {0}, {}, {0}, {0, 1}}, // 36 | // tests for input arrays with repeating numbers 37 | {{1, 1}, {0, 0}, {}, {0, 0}, {0, 0}}, // 38 | {{1, 1}, {0, 0}, {}, {0}, {0, 1}} // 39 | }; 40 | 41 | for (int[][] tArrays : testArrays) { 42 | 
LazyIntIterator[] tIters = new LazyIntIterator[tArrays.length]; 43 | int totalCountExpected = 0; 44 | for (int i = 0; i < tArrays.length; i++) { 45 | tIters[i] = LazyIntIterators.wrap(tArrays[i]); 46 | totalCountExpected += tArrays[i].length; 47 | } 48 | int totalCount = 0; 49 | iter = new CountingMergedIntIterator(tIters); 50 | assertTrue(iter.hasNext()); 51 | 52 | assertEquals(0, iter.nextInt()); 53 | assertTrue(iter.getCount() > 0); 54 | totalCount += iter.getCount(); 55 | assertTrue(iter.hasNext()); 56 | assertEquals(1, iter.nextInt()); 57 | assertTrue(iter.getCount() > 0); 58 | totalCount += iter.getCount(); 59 | assertFalse(iter.hasNext()); 60 | assertEquals(totalCountExpected, totalCount, 61 | "expected total count for input " + Arrays.deepToString(tArrays) + " is " + totalCountExpected); 62 | } 63 | 64 | // test skip(n) 65 | for (int n = 0; n <= 5; n++) { 66 | for (int[][] tArrays : testArrays) { 67 | LazyIntIterator[] tIters = new LazyIntIterator[tArrays.length]; 68 | for (int i = 0; i < tArrays.length; i++) { 69 | tIters[i] = LazyIntIterators.wrap(tArrays[i]); 70 | } 71 | iter = new CountingMergedIntIterator(tIters); 72 | assertEquals(Math.min(n, 2), iter.skip(n)); 73 | } 74 | } 75 | } 76 | 77 | } 78 | -------------------------------------------------------------------------------- /src/test/java/org/commoncrawl/webgraph/TestHostToDomainGraph.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2022 Common Crawl and contributors 4 | */ 5 | package org.commoncrawl.webgraph; 6 | 7 | import static org.junit.jupiter.api.Assertions.assertArrayEquals; 8 | import static org.junit.jupiter.api.Assertions.assertTrue; 9 | import static org.junit.jupiter.api.Assertions.fail; 10 | 11 | import java.io.ByteArrayOutputStream; 12 | import java.io.PrintStream; 13 | import java.nio.charset.StandardCharsets; 14 | import java.util.Arrays; 15 | 16 | import org.commoncrawl.webgraph.HostToDomainGraph.Domain; 17 | import org.junit.jupiter.api.BeforeEach; 18 | import org.junit.jupiter.api.Test; 19 | import org.slf4j.Logger; 20 | import org.slf4j.LoggerFactory; 21 | 22 | class TestHostToDomainGraph { 23 | 24 | protected static Logger LOG = LoggerFactory.getLogger(TestHostToDomainGraph.class); 25 | 26 | static final int maxGraphNodes = 128; 27 | 28 | HostToDomainGraph converter; 29 | 30 | String[] hostGraphSimple = { // 31 | "0\tcom.example", // 32 | "1\tcom.example.www,", // 33 | "2\tcom.example.xyz,", // 34 | "3\torg.example" // 35 | }; 36 | String[] domainGraphSimple = { // 37 | "0\tcom.example\t3", // 38 | "1\torg.example\t1" // 39 | }; 40 | 41 | String[] hostGraphNamesNotSorted = { // 42 | "0\tcom.example", // 43 | "1\tcom.example.xyz,", // 44 | "2\tcom.example.www,", // 45 | "3\torg.example" // 46 | }; 47 | 48 | String[] hostGraphHyphenatedDomains = { // 49 | "0\tac.e-bike", // 50 | "1\tac.e-bikes", // 51 | "2\tac.e-com", // 52 | "3\tac.e.subdomain", // 53 | "4\tac.eagle", // 54 | "5\tac.gov", // domain name public suffix only 55 | "6\tac.gov.ascension", // 56 | "7\tac.gov.ascension-island", // 57 | "8\tac.gov.ascension.mail", // 58 | "9\tac.gov.conservation-ascension-island", // 59 | "10\tac.gov.postoffice", // 60 | }; 61 | String[] domainGraphHyphenatedDomains = { // 62 | "0\tac.e\t1", // 63 | "1\tac.e-bike\t1", // 64 | "2\tac.e-bikes\t1", // 65 | "3\tac.e-com\t1", // 66 | "4\tac.eagle\t1", // 67 | "5\tac.gov.ascension\t2", // 68 | "6\tac.gov.ascension-island\t1", // 69 | 
"7\tac.gov.conservation-ascension-island\t1", // 70 | "8\tac.gov.postoffice\t1", // 71 | }; 72 | String[] domainGraphHyphenatedDomainsInclMultiPartSuffixes = { // 73 | "0\tac.e\t1", // 74 | "1\tac.e-bike\t1", // 75 | "2\tac.e-bikes\t1", // 76 | "3\tac.e-com\t1", // 77 | "4\tac.eagle\t1", // 78 | "5\tac.gov\t1", // 79 | "6\tac.gov.ascension\t2", // 80 | "7\tac.gov.ascension-island\t1", // 81 | "8\tac.gov.conservation-ascension-island\t1", // 82 | "9\tac.gov.postoffice\t1", // 83 | }; 84 | 85 | String[] hostGraphHyphenatedDomainsSubDomainOnly = { // 86 | "0\tac.gov.ascension-island", // 87 | "1\tac.gov.ascension.mail", // 88 | "2\tac.gov.conservation-ascension-island", // 89 | "3\tac.gov.postoffice", // 90 | }; 91 | String[] domainGraphHyphenatedDomainsSubDomainOnly = { // 92 | "0\tac.gov.ascension\t1", // 93 | "1\tac.gov.ascension-island\t1", // 94 | "2\tac.gov.conservation-ascension-island\t1", // 95 | "3\tac.gov.postoffice\t1", // 96 | }; 97 | 98 | String[] hostGraphDuplicatedDomains = { // 99 | "0\tno.hordaland", // 100 | "1\tno.hordaland-teater", // 101 | "2\tno.hordaland.os", // 102 | "3\tno.hordaland.os.bibliotek", // 103 | "4\tno.hordaland.oygarden", // 104 | "5\tno.hordalandfolkemusikklag", // 105 | }; 106 | String[] domainGraphDuplicatedDomains = { // 107 | "0\tno.hordaland\t2", // 108 | "1\tno.hordaland-teater\t1", // 109 | "2\tno.hordaland.os.bibliotek\t1", // 110 | "3\tno.hordalandfolkemusikklag\t1", // 111 | }; 112 | 113 | /** 114 | * forgot.his.name is in the "private section" of the public suffix 115 | * list, while name is in the ICANN section, see 116 | * {@link HostToDomainGraph#doPrivateDomains(boolean)} 117 | */ 118 | String[] hostGraphPrivateDomains = { // 119 | "0\tname.hiro", // 120 | "1\tname.hiropo", // 121 | "2\tname.his.forgot.adam", // 122 | "3\tname.his.forgot.ben", // 123 | "4\tname.his.forgot.never", // 124 | "5\tname.his.prz", // 125 | "6\tname.hista.tac", // 126 | "7\tname.history", // 127 | "8\tname.history.0.aba", // 128 | "9\tname.hit", // 129 | }; 130 | String[] domainGraphPrivateDomains = { // 131 | "0\tname.hiro\t1", // 132 | "1\tname.hiropo\t1", // 133 | "2\tname.his\t1", // 134 | "3\tname.his.forgot.adam\t1", // 135 | "4\tname.his.forgot.ben\t1", // 136 | "5\tname.his.forgot.never\t1", // 137 | "6\tname.hista\t1", // 138 | "7\tname.history\t2", // 139 | "8\tname.hit\t1", // 140 | }; 141 | 142 | @BeforeEach 143 | void init() { 144 | converter = new HostToDomainGraph(maxGraphNodes); 145 | } 146 | 147 | @Test 148 | void testDomainComparison() { 149 | assertTrue("org.example.".compareTo("org.example-domain.") > 0); 150 | assertTrue(Domain.compareRevDomainsSafe("org.example", "org.example") == 0); 151 | assertTrue(Domain.compareRevDomainsSafe("org.example", "org.exampledomain") < 0); 152 | assertTrue(Domain.compareRevDomainsSafe("org.example", "org.example-domain") > 0); 153 | assertTrue(Domain.compareRevDomainsSafe("org.example", "org.example.domain") > 0); 154 | } 155 | 156 | private String[] convert(HostToDomainGraph converter, String[] hostGraph) { 157 | ByteArrayOutputStream domainBytes = new ByteArrayOutputStream(); 158 | PrintStream domainOut = new PrintStream(domainBytes); 159 | converter.convert(converter::convertNode, Arrays.stream(hostGraph), domainOut); 160 | converter.finishNodes(domainOut); 161 | return new String(domainBytes.toByteArray(), StandardCharsets.UTF_8).split("\n"); 162 | } 163 | 164 | private String[] stripCounts(String[] domainGraph) { 165 | return Arrays.stream(domainGraph).map(s -> s.replaceFirst("\\t[^\\t]*$", 
"")).toArray(String[]::new); 166 | } 167 | 168 | private String[] getNodeNames(String[] graph) { 169 | return Arrays.stream(graph).map(s -> s.split("\t")[1]).toArray(String[]::new); 170 | } 171 | 172 | private long[] getNodeIDs(String[] graph) { 173 | return Arrays.stream(graph).mapToLong(s -> Long.parseLong(s.split("\t")[0])).toArray(); 174 | } 175 | 176 | /** 177 | * test whether node names are properly sorted and IDs are correctly assigned 178 | * (sequentially, strictly monotonically increasing, no gaps) 179 | */ 180 | void testSorted(String[] graph) { 181 | String[] names = getNodeNames(graph); 182 | String[] namesSorted = Arrays.copyOf(names, names.length); 183 | Arrays.sort(namesSorted); 184 | assertArrayEquals(namesSorted, names); 185 | long lastId = -1; 186 | for (long id : getNodeIDs(graph)) { 187 | if ((lastId + 1) != id) { 188 | fail("IDs not correctly assigned: " + lastId + ", " + id); 189 | } 190 | lastId = id; 191 | } 192 | } 193 | 194 | @Test 195 | void testConvertNodesSimple() { 196 | testSorted(hostGraphSimple); 197 | converter.doCount(false); 198 | assertArrayEquals(stripCounts(domainGraphSimple), convert(converter, hostGraphSimple)); 199 | testSorted(domainGraphSimple); 200 | } 201 | 202 | @Test 203 | void testConvertNodesSimpleCount() { 204 | converter.doCount(true); 205 | assertArrayEquals(domainGraphSimple, convert(converter, hostGraphSimple)); 206 | } 207 | 208 | @Test 209 | void testConvertNodesNotSorted() { 210 | try { 211 | convert(converter, hostGraphNamesNotSorted); 212 | fail("Unable to convert to domain graph from not properly sorted input"); 213 | } catch (Exception e) { 214 | LOG.info("Expected exception on input not properly sorted", e.getMessage()); 215 | } 216 | } 217 | 218 | @Test 219 | void testConvertNodesHyphenatedDomains() { 220 | // verify sorting of input and expected output 221 | testSorted(hostGraphHyphenatedDomains); 222 | testSorted(domainGraphHyphenatedDomains); 223 | converter.doCount(true); 224 | assertArrayEquals(domainGraphHyphenatedDomains, convert(converter, hostGraphHyphenatedDomains)); 225 | } 226 | 227 | @Test 228 | void testConvertNodesHyphenatedDomainsSubDomainOnly() { 229 | // verify sorting of input and expected output 230 | testSorted(hostGraphHyphenatedDomainsSubDomainOnly); 231 | testSorted(domainGraphHyphenatedDomains); 232 | converter.doCount(true); 233 | assertArrayEquals(domainGraphHyphenatedDomainsSubDomainOnly, 234 | convert(converter, hostGraphHyphenatedDomainsSubDomainOnly)); 235 | } 236 | 237 | @Test 238 | void testConvertNodesDuplicatedDomain() { 239 | // verify sorting of input and expected output 240 | testSorted(hostGraphDuplicatedDomains); 241 | testSorted(domainGraphDuplicatedDomains); 242 | converter.doCount(true); 243 | assertArrayEquals(domainGraphDuplicatedDomains, convert(converter, hostGraphDuplicatedDomains)); 244 | } 245 | 246 | @Test 247 | void testConvertNodesHyphenatedDomainsIncludingMultiPartSuffixes() { 248 | // verify sorting of input and expected output 249 | testSorted(hostGraphHyphenatedDomains); 250 | testSorted(domainGraphHyphenatedDomainsInclMultiPartSuffixes); 251 | converter.doCount(true); 252 | converter.multiPartSuffixesAsDomains(true); 253 | assertArrayEquals(domainGraphHyphenatedDomainsInclMultiPartSuffixes, 254 | convert(converter, hostGraphHyphenatedDomains)); 255 | } 256 | 257 | @Test 258 | void testConvertPrivateDomain() { 259 | // verify sorting of input and expected output 260 | testSorted(hostGraphPrivateDomains); 261 | testSorted(domainGraphPrivateDomains); 262 | 
converter.doCount(true); 263 | converter.doPrivateDomains(true); 264 | converter.multiPartSuffixesAsDomains(true); 265 | assertArrayEquals(domainGraphPrivateDomains, convert(converter, hostGraphPrivateDomains)); 266 | } 267 | 268 | } 269 | -------------------------------------------------------------------------------- /src/test/java/org/commoncrawl/webgraph/TestJoinSortRanks.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2022 Common Crawl and contributors 4 | */ 5 | package org.commoncrawl.webgraph; 6 | 7 | import static org.junit.jupiter.api.Assertions.assertEquals; 8 | import static org.junit.jupiter.api.Assertions.fail; 9 | 10 | import java.io.File; 11 | import java.io.IOException; 12 | 13 | import org.junit.jupiter.api.Disabled; 14 | import org.junit.jupiter.api.Test; 15 | import org.slf4j.Logger; 16 | import org.slf4j.LoggerFactory; 17 | 18 | import it.unimi.dsi.fastutil.io.BinIO; 19 | 20 | public class TestJoinSortRanks { 21 | 22 | protected static Logger LOG = LoggerFactory.getLogger(TestJoinSortRanks.class); 23 | 24 | /** 25 | * Reproduce issue in fastutil 8.5.8 loading (double) arrays from files of size 26 | * 2^31 bytes or more. 27 | */ 28 | @Disabled("Fixed in fastutil 8.5.9") 29 | @Test 30 | void testLoadingDoubleArray() { 31 | File file; 32 | try { 33 | file = File.createTempFile("test", ".bin"); 34 | } catch (IOException e) { 35 | LOG.error("Skipping test, failed to create temporary file to hold array:", e); 36 | return; 37 | } 38 | long intOverflow = 1L << 31; 39 | int arrSize = (int) (intOverflow / Double.BYTES); 40 | double[] arr = new double[arrSize]; 41 | try { 42 | LOG.info("Storing double array of length {} in file {}", arrSize, file.getAbsolutePath()); 43 | BinIO.storeDoubles(arr, file); 44 | LOG.info("Successfully stored double array of length {} in file {}, resulting file size: {} bytes", arrSize, 45 | file.getAbsolutePath(), file.length()); 46 | assertEquals(intOverflow, file.length()); 47 | LOG.info("Trying to clean up Java heap space..."); 48 | arr = null; 49 | System.gc(); 50 | LOG.info("Loading double array from file {}", file.getAbsolutePath()); 51 | arr = BinIO.loadDoubles(file.getAbsolutePath()); 52 | assertEquals(arrSize, arr.length); 53 | LOG.info("Successfully loaded double array of length {} from file {}", arr.length, file.getAbsolutePath()); 54 | } catch (IOException e) { 55 | fail("Failed to store and load double array: " + e); 56 | } finally { 57 | file.delete(); 58 | } 59 | } 60 | } 61 | --------------------------------------------------------------------------------
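A minimal end-to-end exploration session tying the ranking scripts above together (the graph name, paths and heap size are examples only and depend on the webgraph release and the local setup):

  # download a domain-level graph release into the current directory
  src/script/webgraph_ranking/graph_explore_download_webgraph.sh cc-main-2025-feb-mar-apr-domain
  # build the node-label indexes (iepm, or mph/fcl/smph as a fallback)
  src/script/webgraph_ranking/graph_explore_build_vertex_map.sh cc-main-2025-feb-mar-apr-domain cc-main-2025-feb-mar-apr-domain-vertices.txt.gz
  # load the graph into an interactive JShell session
  jshell --class-path target/cc-webgraph-0.1-SNAPSHOT-jar-with-dependencies.jar \
         -R-Dgraph=cc-main-2025-feb-mar-apr-domain -R-Xmx8g \
         src/script/webgraph_ranking/graph_explore_load_graph.jsh

The first two steps fetch the BVGraph files (including the transpose) and build the node-label indexes; graph_explore_load_graph.jsh then loads the graph into a GraphExplorer and defines the cn, pwn, ls and sl convenience commands.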