├── .DS_Store ├── .gitignore ├── LICENSE ├── README.md ├── assets └── demo.png ├── crawler_input └── .keep ├── lang_dump_confidence └── .keep ├── metadata ├── filterlist.txt ├── glot500_iso_code.json ├── iso_list.json ├── language_speakers_data.csv ├── languoid.csv ├── linguameta.tsv ├── madlad_aplha_3.json └── randomsample.txt ├── misc-code ├── crawling-with-seeds │ ├── config.yaml │ ├── input │ │ └── place_json_here │ ├── output │ │ └── .gitignore │ ├── readme.md │ └── seedcrawler_multithreaded.py ├── create_analysis_csv.py ├── filter_sites_and_lang.py ├── filterlist.txt ├── formatting_glot500_langs.ipynb ├── get_webinfo.py ├── glosslm │ ├── create_glotto_to_iso_mapping.py │ ├── glosslm_csv_creation.py │ ├── glosslm_dump.py │ ├── glosslm_processing.py │ ├── initial_text_seed_from_glosslm.py │ └── searxng_search.py ├── glot500_iso_code.json ├── glot500_raw.csv ├── glot500_raw.txt ├── iso_list.json ├── keeplist.txt ├── madlad.tsv ├── madlad_alpha_3_code_extraction.ipynb ├── seedcrawler.py └── trafilatura_filter ├── modeling └── create_language_modeling_files.py ├── output ├── .keep └── crawled │ └── .keep ├── pipeline ├── config.yaml ├── filter_config.yaml ├── language_filter.py ├── search_config.yaml ├── search_service.py ├── seedcrawler_alpha.py └── seedcrawler_beta.py ├── requirements.txt ├── result_filtering ├── final_filtering.py ├── formatted_output_robots_filtering.py ├── http_merge_2.py ├── output_formatter.py ├── remove_all_hash.py └── remove_hash.py ├── search_dump └── .keep └── searxng ├── settings.yml └── uwsgi.ini /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cisnlp/GlotWeb/921fbf2f5e307f1f3e0927d078fa5afc95b74673/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | model_v3.bin 2 | logs/ 3 | misc-code/.DS_Store 4 | .DS_Store 5 | output.zip 6 | .DS_Store 7 | output/ 8 | crawler_input/ 9 | lang_dump_confidence/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 
20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). 
Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 
122 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # GlotWeb
2 | ## About GlotWeb
3 | 
4 | GlotWeb is an advanced web indexing system specifically designed to address the digital resource gap for minority languages. Our system:
5 | 
6 | - **Identifies** web content in 402+ languages through multi-source aggregation
7 | - **Validates** linguistic accuracy using GlotLID language identification
8 | - **Filters** content to ensure quality while minimizing religious bias
9 | - **Compiles** 169,155+ verified web links (47% in languages absent from major datasets)
10 | 
11 | ### Key Features
12 | ✔ Covers languages missing from FLORES-200, MADLAD-400, and Glot500
13 | ✔ Open-source pipeline with reproducible results
14 | ✔ Interactive demo showcasing language resources (click the image below to access)
15 | 
16 | [![Image](./assets/demo.png)](https://huggingface.co/spaces/cis-lmu/GlotWeb)
17 | 
18 | ## Getting Started
19 | 
20 | This documentation walks through GlotWeb's 4-step pipeline:
21 | 
22 | 1. **Search Setup**: Configure and run web searches
23 | 2. **Seed Generation**: Filter initial results
24 | 3. **Crawling**: Expand and validate links
25 | 4. **Cleaning**: Deduplicate and finalize outputs
26 | 
27 | ### How to Use This Guide
28 | 1. Follow steps sequentially (1 → 4)
29 | 2. Each section includes:
30 | - Purpose explanation
31 | - Configuration options
32 | - Execution commands
33 | - Expected outputs
34 | 3. Requires basic Python/Docker knowledge
35 | 
36 | > **Tip**: For quick setup, clone the repository and use the provided configuration files as templates.
37 | 
38 | Ready to begin? Proceed to [Step 1: Set up SearXNG and perform search](#step-1-set-up-searxng-and-perform-search).
39 | 
40 | # Step 1: Set up SearXNG and perform search
41 | 
42 | ## SearXNG
43 | 
44 | ### Install
45 | ```bash
46 | 
47 | export PORT=8080
48 | 
49 | docker pull searxng/searxng
50 | docker run --rm \
51 | -d -p ${PORT}:8080 \
52 | -v "${PWD}/searxng:/etc/searxng" \
53 | -e "BASE_URL=http://localhost:${PORT}/" \
54 | -e "INSTANCE_NAME=my-instance" \
55 | searxng/searxng
56 | ```
57 | 
58 | **Add JSON output format:**
59 | 
60 | In the `${PWD}/searxng` directory, there is a file named `settings.yml`. In that file, you need to enable the JSON output format in the SearX configuration under the `search.formats` key like this:
61 | 
62 | ```yml
63 | search:
64 |   formats:
65 |     - html
66 |     - json
67 | ```
68 | 
69 | **Modify uwsgi.ini:**
70 | In the `${PWD}/searxng` directory, there is a file named `uwsgi.ini`. In that file, you need to increase the `buffer-size` setting. The default is 8k; increasing it to 9k sometimes helps with 'Internal Error 500' responses.
71 | 
72 | Default value:
73 | ```ini
74 | buffer-size = 8192
75 | ```
76 | Change to:
77 | ```ini
78 | buffer-size = 9216
79 | ```
80 | 
81 | ## Search Service Script: search_service.py
82 | 
83 | This is an object-oriented Python script that leverages the Searx API to perform searches and save the results to JSON files. The script is configurable using a YAML configuration file called `search_config.yaml`.
84 | 
85 | ### Features
86 | 
87 | - Uses SearxSearchWrapper for querying multiple search engines.
88 | - Handles retries for failed requests.
89 | - Configurable search parameters through a YAML file.
90 | - Configurable input file, search range, output directory, and other parameters.
91 | - Automatically saves results in a structured JSON format.
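Under the hood, each query boils down to a call to the instance's JSON endpoint. The snippet below is a minimal sketch of that request, assuming a local SearXNG instance at `http://127.0.0.1:8080` with the JSON format enabled as shown above; the function and output file names here are illustrative and not part of the pipeline itself (the actual script drives this through SearxSearchWrapper and the YAML config described next).

```python
# Minimal sketch of the request the search service automates.
# Assumes a local SearXNG instance with 'json' in search.formats.
import json
import requests

def searx_search(query: str, engines: list, num_results: int = 50) -> list:
    """Query the local SearXNG JSON API and return up to num_results hits."""
    response = requests.get(
        "http://127.0.0.1:8080/search",
        params={
            "q": query,
            "format": "json",            # only works if 'json' is enabled in settings.yml
            "engines": ",".join(engines),
        },
        timeout=10,
    )
    response.raise_for_status()
    return response.json().get("results", [])[:num_results]

if __name__ == "__main__":
    hits = searx_search("Itiyobbiyah agattinoona sittal", ["bing", "duckduckgo"])
    # Dump the raw hits so they can be filtered in Step 2.
    with open("search_dump/results_example.json", "w", encoding="utf-8") as f:
        json.dump(hits, f, ensure_ascii=False, indent=4)
```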
92 | 
93 | ### Configuration file parameters:
94 | 
95 | ```yml
96 | searx_host: "http://127.0.0.1:8080" # Searx instance URL
97 | engines:
98 | - "bing"
99 | - "yahoo"
100 | - "qwant"
101 | - "duckduckgo" # Search engines to be used
102 | num_results: 50 # Number of results to fetch for each query
103 | max_retries: 3 # Maximum number of retries for failed requests
104 | retry_wait_time: 2 # Wait time (in seconds) between retries
105 | output_file_prefix: "results" # Prefix for output file names
106 | output_directory: "search_dump" # Directory to save output files
107 | input_file: "input.txt" # Path to the input file containing search queries
108 | start_index: 0 # Start index for queries to process
109 | end_index: 10 # End index for queries to process
110 | ```
111 | ### Input file format:
112 | The input file should be a tab-separated file where each line contains an ISO code and a sentence for search:
113 | ```txt
114 | ISO_CODE_1 Search query 1
115 | ISO_CODE_2 Search query 2
116 | ```
117 | 
118 | ```txt
119 | aa Itiyobbiyah agattinoona sittal xayyossa yangalen qaadoodih baari gablusaanamah angicille le.
120 | aai Baise orot ta’ita’imon matah toniwa’an bar hinanutitiy gewas hinawowab.
121 | aak O xewanɨŋo na'nɨ re rɨnɨŋɨnigɨnɨ, ‘A'mɨna' sea'yɨ e imo'nɨŋa' wonɨrɨnɨ.’
122 | ```
123 | The ISO code in the input text file can be given in either the two-letter or the three-letter format.
124 | 
125 | ### Usage
126 | From the root of the repository, run the script using:
127 | 
128 | ```bash
129 | python pipeline/search_service.py
130 | ```
131 | The search results will be saved in the specified output directory (e.g., search_dump) as JSON files named according to the specified prefix and index range, e.g., results_0-10.json.
132 | 
133 | ### Customization
134 | You can easily adjust the following parameters in the search_config.yaml file:
135 | 
136 | - Search engines: Add or remove engines in the engines list.
137 | - Search range: Modify start_index and end_index to control which lines in the input file are processed.
138 | - Output directory: Change output_directory to save results in a different location.
139 | 
140 | # Step 2: Filter and generate seeds
141 | 
142 | ## Overview
143 | This script filters the web search results (the dump from Step 1) based on domain restrictions, scrapes the web pages, and performs language identification using a FastText model, for which we chose GlotLID. The processed data is stored in JSON format, categorized by predicted language.
144 | 
145 | ## Prerequisites
146 | ### Dependencies
147 | Ensure you have the following Python packages installed:
148 | 
149 | ```bash
150 | pip install fasttext trafilatura urllib3 tqdm pyyaml
151 | ```
152 | A configuration file is already provided in the repository and must be adjusted to your preferences. Example:
153 | 
154 | ```yaml
155 | model_path: "path/to/fasttext/model"
156 | domain_file: "path/to/domain_filter.txt"
157 | json_filename: "path/to/input.json"
158 | iso_list_file: "path/to/iso_list.json"
159 | output_directory: "path/to/output"
160 | ```
161 | 
162 | ## Running the Script
163 | From the root of the repository, execute the script with:
164 | 
165 | ```bash
166 | python pipeline/language_filter.py
167 | ```
168 | 
169 | # Step 3: Search and scrape with seeds
170 | 
171 | ## Overview
172 | This step takes the filtered seed URLs from Step 2 and performs deep crawling to discover additional web pages in the target languages.
It includes: 173 | - Web crawling from seed URLs 174 | - Language detection using FastText (GlotLID) 175 | - Domain filtering 176 | - Parallel processing for efficiency 177 | - Comprehensive logging and metadata collection 178 | 179 | ## Prerequisites 180 | ### Dependencies 181 | Ensure you have the following Python packages installed: 182 | ```bash 183 | pip install fasttext beautifulsoup4 requests trafilatura tqdm pyyaml urllib3 184 | ``` 185 | 186 | ### Configuration 187 | The script uses config.yaml with these key parameters: 188 | 189 | ```yaml 190 | seed_crawler: 191 | max_pages: 100 # Maximum pages to crawl per language 192 | max_time: 3600 # Maximum crawling time in seconds 193 | crawl_delay: 1 # Delay between requests 194 | to_visit_growth_factor: 50 # Threshold for detecting circular links 195 | max_workers: 4 # Threads for parallel processing 196 | 197 | url_settings: 198 | request_timeout: 10 # Timeout for web requests 199 | max_url_length: 65000 # Maximum URL length to consider 200 | 201 | language_detector: 202 | model_path: "path/to/model" # Path to FastText model 203 | minimum_confidence: 0.7 # Minimum language confidence score 204 | desired_language: "bpy_Beng" # Target language code 205 | save_text: False # Whether to save scraped text 206 | 207 | output: 208 | directory: "output" # Output directory 209 | output_file_name: "{language}_filtered.json" # Output filename pattern 210 | 211 | batch_processing: 212 | enabled: False # Enable batch mode 213 | input_labels: [] # List of language codes for batch 214 | cooldown_between_languages: 60 # Cool-down between languages 215 | ``` 216 | 217 | ### Input Requirements 218 | Input JSON files from Step 2 (named as [LANGUAGECODE_SCRIPT].json) 219 | 220 | Each JSON file should contain entries with: 221 | 222 | - link: URL string 223 | 224 | - lid_confidence: Confidence score (float) 225 | 226 | - predicted_lid: Language code 227 | 228 | ### Output 229 | For each processed language, the script generates: 230 | 231 | [LANGUAGE_CODE]_filtered.json - Filtered URLs with metadata 232 | 233 | meta_data/[LANGUAGE_CODE]_meta_data.json - Crawling statistics including: 234 | 235 | Seed URLs used 236 | 237 | All discovered links 238 | 239 | Filtered links 240 | 241 | Unique new links 242 | 243 | Rejected links 244 | 245 | ### Usage 246 | Single Language Processing 247 | ```bash 248 | python pipeline/seed_crawler.py 249 | ``` 250 | Configure desired_language in config.yaml first. 
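For intuition, the core accept/reject check applied to each discovered URL looks roughly like the sketch below. It is condensed from `misc-code/crawling-with-seeds/seedcrawler_multithreaded.py`; the model filename and the default values simply mirror the configuration example above and are assumptions, not fixed requirements.

```python
# Condensed sketch of the per-URL language check used during crawling.
# Assumes the GlotLID FastText model has been downloaded as model_v3.bin.
import re
import fasttext
from trafilatura import extract, fetch_url

model = fasttext.load_model("model_v3.bin")

def keep_url(url: str, desired_language: str = "bpy_Beng",
             minimum_confidence: float = 0.7) -> bool:
    """Return True if the page's main text is detected as the target language."""
    downloaded = fetch_url(url)                         # fetch the raw page
    text = extract(downloaded) if downloaded else None  # strip boilerplate
    if not text:
        return False
    labels, scores = model.predict(text.replace("\n", " "))
    match = re.search(r"__label__([a-zA-Z]+_[a-zA-Z]+)", labels[0])
    predicted = match.group(1) if match else None
    return predicted == desired_language and scores[0] >= minimum_confidence
```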
251 | 
252 | #### Batch Processing:
253 | Enable batch mode in config.yaml:
254 | 
255 | ```yaml
256 | batch_processing:
257 |   enabled: True
258 |   input_labels: ["syl_Sylo", "bpy_Beng", "akh_Latn"] # Your target languages
259 | ```
260 | Then run:
261 | 
262 | ```bash
263 | python pipeline/seed_crawler_beta.py
264 | ```
265 | ### Customization Options
266 | - Crawling Behavior:
267 | Adjust max_pages and max_time to control crawling scope
268 | Modify crawl_delay to be more/less aggressive
269 | - Language Detection:
270 | Change minimum_confidence for stricter/looser filtering
271 | Set save_text: True to store scraped content
272 | - Performance:
273 | Increase max_workers for faster processing (requires more CPU)
274 | Adjust cooldown_between_languages for batch processing
275 | 
276 | ### Output
277 | 
278 | - Change output directory and filename patterns
279 | - Metadata collection is always enabled
280 | 
281 | ### Notes
282 | - The script automatically skips domains listed in your domain filter file
283 | - Progress bars are enabled by default (can be disabled in config)
284 | - Comprehensive logging helps troubleshoot issues
285 | 
286 | # Step 4: Filtering and Deduplication
287 | 
288 | ## Step 4.1: Domain Filtering
289 | 
290 | ### Purpose
291 | This script performs final domain filtering on crawled results to exclude unwanted domains from both the main output and metadata files.
292 | 
293 | ### Key Features
294 | - Loads crawled data and metadata JSON files
295 | - Applies domain filtering using the configured domain blocklist
296 | - Updates all metadata statistics after filtering
297 | - Handles both single-language and batch processing modes
298 | 
299 | ### Why Use It
300 | - Ensures final outputs comply with domain restrictions
301 | - Maintains consistency between data files and their metadata
302 | - Prepares clean data for subsequent deduplication steps
303 | 
304 | ### Usage
305 | Configure `domain_file` path in `config.yaml` and run:
306 | ```bash
307 | python result_filtering/final_domain_filter.py
308 | ```
309 | 
310 | ### Configuration
311 | Uses these key config parameters:
312 | 
313 | ```yaml
314 | domain_file: "path/to/domain_filter.txt" # List of domains to exclude
315 | output:
316 |   directory: "output" # Where to find/save files
317 | ```
318 | ### Output
319 | Updates both:
320 | 
321 | [LANGUAGE]_filtered.json - With domain-filtered results
322 | 
323 | meta_data/[LANGUAGE]_meta_data.json - With filtered statistics
324 | 
325 | ## Step 4.2: Formatting Output for GlotWeb
326 | 
327 | ### Purpose
328 | Transforms crawled language data into a structured format suitable for GlotWeb visualization, enriching it with metadata and linguistic information.
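As a rough illustration of the dataset-compatibility part of that enrichment, the sketch below checks a language code against the lists shipped in `metadata/`. The flag names are hypothetical, and the actual formatter may compute and store these flags differently.

```python
# Illustrative only: compare a language code against the Glot500 and
# MADLAD-400 code lists that ship with the repository (run from repo root).
import json

def dataset_flags(lang_code: str) -> dict:
    """Return rough inclusion flags for a code such as 'bpy_Beng'."""
    with open("metadata/glot500_iso_code.json", encoding="utf-8") as f:
        glot500 = set(json.load(f))                       # entries like "bpy_Beng"
    with open("metadata/madlad_aplha_3.json", encoding="utf-8") as f:
        madlad = {code for code in json.load(f) if code}  # bare alpha-3 codes, skip nulls
    alpha3 = lang_code.split("_")[0]
    return {
        "in_glot500": lang_code in glot500,
        "in_madlad400": alpha3 in madlad,
    }

print(dataset_flags("bpy_Beng"))
```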
329 | 330 | ### Key Features 331 | - Extracts language metadata (speaker counts, language family) 332 | - Checks inclusion in major multilingual datasets (MADLAD-400, Flores, Glot500) 333 | - Organizes URLs by domain with site categorization 334 | - Handles both single-language and batch processing 335 | 336 | ### Why Use It 337 | - Creates standardized format for GlotWeb frontend 338 | - Enriches raw data with valuable linguistic metadata 339 | - Provides domain-level organization of web resources 340 | - Generates compatibility flags for popular multilingual datasets 341 | 342 | ### Configuration 343 | ```yaml 344 | output: 345 | formated_directory: "formatted_output" # Output directory 346 | formated_file_name: "{language}_formatted.json" # Output filename pattern 347 | ``` 348 | ### Usage 349 | ```bash 350 | python result_filtering/format_for_glotweb.py 351 | ``` 352 | 353 | ## Step 4.3: Robots.txt Compliance Filtering 354 | 355 | ### Purpose 356 | Filters out domains that explicitly block Common Crawl's CCBot in their robots.txt file, ensuring compliance with website crawling policies. 357 | 358 | ### Key Features 359 | - Checks each domain's robots.txt for CCBot restrictions 360 | - Removes entire domains if they block CCBot 361 | - Preserves all other metadata while filtering 362 | - Handles both single-language and batch processing 363 | 364 | ### Why Use It 365 | - Ensures ethical web scraping compliance 366 | - Prevents potential legal issues 367 | - Maintains good web citizenship by respecting robots.txt 368 | - Filters before final dataset compilation 369 | 370 | ### Configuration 371 | ```yaml 372 | output: 373 | formated_directory: "formatted_output" # Input directory (from Step 4.2) 374 | cleaned_directory: "cleaned_output" # Output directory for filtered data 375 | ``` 376 | ### Usage 377 | ```bash 378 | python result_filtering/robots_compliance_filter.py 379 | ``` 380 | ### Process Flow 381 | - Loads formatted JSON from Step 4.2 382 | - For each domain: 383 | - Fetches robots.txt 384 | - Checks for CCBot restrictions 385 | - Saves cleaned version with compliant domains only 386 | 387 | ### Output 388 | - Maintains same structure as input 389 | - Only contains domains that allow CCBot 390 | - Saved as [LANGUAGE].json in cleaned directory 391 | 392 | ### Notes 393 | - If robots.txt is inaccessible, assumes crawling is allowed 394 | - Only checks for explicit CCBot blocks (not general User-agent: *) 395 | - Processes domains sequentially with 5-second timeout 396 | - Preserves all non-URL metadata (speaker counts, language family etc.) 397 | 398 | ## Step 4.4: Final Cleaning and Deduplication 399 | 400 | ### Purpose 401 | Performs final data cleaning through URL normalization and deduplication to create a polished dataset. 402 | 403 | ### Process Overview 404 | 1. **HTTP/HTTPS Merging** (`http_merge_2.py`): 405 | - Combines duplicate sites with different protocols (http/https) 406 | - Standardizes www/non-www variants 407 | - Preserves all unique links 408 | 409 | 2. 
**Hash Fragment Removal** (`remove_all_hash.py`): 410 | - Removes URL fragments (#section) 411 | - Deduplicates URLs that only differ by fragments 412 | 413 | ### Configuration 414 | ```yaml 415 | output: 416 | robots_filtered: "output/robots_filtered" # Input from Step 4.3 417 | http_merged: "output/http_merged" # Intermediate output 418 | deduplication: "output/deduplication" # Final output 419 | ``` 420 | ### Usage 421 | ```bash 422 | # Run protocol merging first 423 | python result_filtering/http_merge_2.py 424 | 425 | # Then run hash removal 426 | python result_filtering/remove_all_hash.py 427 | ``` 428 | 429 | ### Key Features 430 | - Protocol-agnostic site merging 431 | - Consistent URL normalization 432 | - Fragment removal while preserving query parameters 433 | - Order-preserving deduplication 434 | 435 | ### Output 436 | Cleaned JSON files with: 437 | 438 | - Unified site entries 439 | - Normalized URLs 440 | - No duplicate content 441 | 442 | ✅ **Processed datasets** in `output/cleaned_output/[LANG].json` containing: 443 | - Verified web links 444 | - Language metadata (speakers, family) 445 | - Domain categorization 446 | - Compatibility flags (FLORES/MADLAD/Glot500) 447 | 448 | ✅ **Metadata reports** in `output/meta_data/` with: 449 | - Crawling statistics 450 | - Domain distributions 451 | - Filtering metrics 452 | 453 | # Step 5: Dataset Validation & Community Contribution 454 | 455 | ## Community Auditing Request 456 | We urgently need **native speakers** and **linguists** to validate results: 457 | 458 | ### How to Audit 459 | 1. **Explore your language** in the [GlotWeb Demo](https://huggingface.co/spaces/cis-lmu/GlotWeb) 460 | 2. **Check 10-20 random links** for: 461 | - Actual language content (not machine translation) 462 | - Cultural/educational value 463 | - Flag as religious if content is from religious scriptures 464 | - Correct language/dialect labeling 465 | 3. **Report issues** via: 466 | - [GitHub Issues](https://github.com/cisnlp/GlotWeb/issues) 467 | - Email: in.sefat@tum.de 468 | 469 | ### Why This Matters 470 | | Impact Area | Community Role | 471 | |------------|---------------| 472 | | **Data Quality** | Remove spam/misclassified content | 473 | | **Language Preservation** | Identify valuable resources | 474 | | **NLP Development** | Improve training data for LLMs | 475 | 476 | ## Get Involved 477 | - **Speakers**: Join us in Language auditing. 478 | - **Researchers**: Use data with citation (BibTeX forthcoming) 479 | 480 | ## **Native speakers** of underrepresented languages are especially needed! 
481 | 482 | -------------------------------------------------------------------------------- /assets/demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cisnlp/GlotWeb/921fbf2f5e307f1f3e0927d078fa5afc95b74673/assets/demo.png -------------------------------------------------------------------------------- /crawler_input/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cisnlp/GlotWeb/921fbf2f5e307f1f3e0927d078fa5afc95b74673/crawler_input/.keep -------------------------------------------------------------------------------- /lang_dump_confidence/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cisnlp/GlotWeb/921fbf2f5e307f1f3e0927d078fa5afc95b74673/lang_dump_confidence/.keep -------------------------------------------------------------------------------- /metadata/filterlist.txt: -------------------------------------------------------------------------------- 1 | ebible.org 2 | png.bible 3 | bible.com 4 | jw.org 5 | breakeveryyoke.com 6 | scriptureearth.org 7 | live.bible.is 8 | bible.is 9 | faithcomesbyhearing.com 10 | download.sabda.org/yesusmesias/__ebible/html/ 11 | alkitab.mobi 12 | mykitabsuci.org 13 | aboriginalbibles.org.au 14 | youtube.com 15 | gospelgo.com 16 | tiktok.com 17 | wikiislam.net 18 | stepbible.org 19 | classicistranieri.com 20 | e-alkitab.org 21 | festvox.org/cmu_wilderness/ 22 | biblerevelation.org 23 | biblehub.com 24 | biblegateway.com 25 | newchristianbiblestudy.org 26 | bibliaplus.org 27 | greekroom.org/wildebeest/ebible 28 | bibleforchildren.org 29 | al-islam.org 30 | google.com 31 | cyber.bible 32 | divinerevelations.info 33 | holy-bhagavad-gita.org 34 | realtor.com 35 | surinamebiblesoc 36 | lyricsmint 37 | quran411.com 38 | jesusforafrica 39 | espn.com 40 | wordproject.org/bibles 41 | https://karaoke.sabda.org 42 | manybooks.net 43 | rc.org 44 | slideshare.net 45 | timeanddate.com 46 | omniglot.com 47 | kitab.nur-az.com 48 | islamquest.net 49 | sourachristianteam.wordpress.com 50 | jw.1eye.us 51 | marysrosaries.com 52 | zori-islama.com 53 | https://gladtidings-bs.com/ 54 | papuanewguinean.bible 55 | adibaptistunion.org 56 | baptist 57 | divinerevelations.info 58 | beblia.com 59 | breakeveryyoke.com/ 60 | churchofjesuschrist.org 61 | www.simplybible.com 62 | newchristianbiblestudy.org 63 | bibel.simalungun.net 64 | chatinodenopala.org 65 | justus.anglican.org 66 | https://africa.coffeecup.com/Abuja/LukpaBible-NewTestament(Benin) 67 | https://tyndalebible.com/ 68 | www.polarbibblo.se 69 | www.bahaiprayers.io/ 70 | gullahbible.com/ 71 | truebiblecode.com 72 | consexual.mx 73 | biblehub.com 74 | joraibibleassociation.org 75 | bible 76 | indiansexstories 77 | biblica.com 78 | fsm.bible/ 79 | gospelweb.net 80 | catholicsabah.com 81 | glosbe.com 82 | ladakhibible.org 83 | islamworld 84 | islamiwazaif 85 | ummat-e-nabi.com 86 | islam 87 | biblia 88 | globalrecordings 89 | sunnatonline 90 | sunnionline 91 | New_Testament -------------------------------------------------------------------------------- /metadata/glot500_iso_code.json: -------------------------------------------------------------------------------- 1 | [ 2 | "hbs_Latn", 3 | "vec_Latn", 4 | "swh_Latn", 5 | "mal_Mlym", 6 | "jpn_Jpan", 7 | "alt_Cyrl", 8 | "aze_Latn", 9 | "lus_Latn", 10 | "rmn_Grek", 11 | "guj_Gujr", 12 | "crs_Latn", 13 | "miq_Latn", 14 | "ben_Beng", 15 | "kqn_Latn", 16 | 
"kaa_Cyrl", 17 | "kan_Knda", 18 | "ndo_Latn", 19 | "kos_Latn", 20 | "tel_Telu", 21 | "snd_Arab", 22 | "grn_Latn", 23 | "mlt_Latn", 24 | "yue_Hani", 25 | "lhu_Latn", 26 | "fra_Latn", 27 | "tiv_Latn", 28 | "lzh_Hani", 29 | "spa_Latn", 30 | "kua_Latn", 31 | "ajp_Arab", 32 | "eng_Latn", 33 | "kwy_Latn", 34 | "cmn_Hani", 35 | "fil_Latn", 36 | "hin_Latn", 37 | "gcf_Latn", 38 | "nob_Latn", 39 | "iku_Cans", 40 | "rmn_Cyrl", 41 | "rus_Cyrl", 42 | "kal_Latn", 43 | "kjh_Cyrl", 44 | "deu_Latn", 45 | "tdt_Latn", 46 | "rng_Latn", 47 | "tur_Latn", 48 | "gsw_Latn", 49 | "mgh_Latn", 50 | "pan_Guru", 51 | "mfe_Latn", 52 | "xmv_Latn", 53 | "mar_Deva", 54 | "swc_Latn", 55 | "ige_Latn", 56 | "por_Latn", 57 | "mon_Latn", 58 | "rmy_Latn", 59 | "nld_Latn", 60 | "mos_Latn", 61 | "srm_Latn", 62 | "ara_Arab", 63 | "kik_Latn", 64 | "bak_Latn", 65 | "zho_Hani", 66 | "cnh_Latn", 67 | "gur_Latn", 68 | "ita_Latn", 69 | "gil_Latn", 70 | "idu_Latn", 71 | "ind_Latn", 72 | "pon_Latn", 73 | "yom_Latn", 74 | "ell_Grek", 75 | "umb_Latn", 76 | "tdx_Latn", 77 | "bul_Cyrl", 78 | "lvs_Latn", 79 | "mzn_Arab", 80 | "swe_Latn", 81 | "sco_Latn", 82 | "cfm_Latn", 83 | "ces_Latn", 84 | "ori_Orya", 85 | "zpa_Latn", 86 | "isl_Latn", 87 | "arg_Latn", 88 | "kbd_Cyrl", 89 | "pol_Latn", 90 | "kur_Latn", 91 | "lao_Laoo", 92 | "ron_Latn", 93 | "dhv_Latn", 94 | "nap_Latn", 95 | "dan_Latn", 96 | "luo_Latn", 97 | "qub_Latn", 98 | "hun_Latn", 99 | "lun_Latn", 100 | "oke_Latn", 101 | "tgk_Cyrl", 102 | "nzi_Latn", 103 | "ote_Latn", 104 | "srp_Latn", 105 | "gug_Latn", 106 | "bsb_Latn", 107 | "fas_Arab", 108 | "bar_Latn", 109 | "ogo_Latn", 110 | "ceb_Latn", 111 | "bci_Latn", 112 | "abn_Latn", 113 | "heb_Hebr", 114 | "chk_Latn", 115 | "ldi_Latn", 116 | "hrv_Latn", 117 | "roh_Latn", 118 | "ayr_Latn", 119 | "glg_Latn", 120 | "aym_Latn", 121 | "gom_Deva", 122 | "fin_Latn", 123 | "yap_Latn", 124 | "bba_Latn", 125 | "slv_Latn", 126 | "ssw_Latn", 127 | "aln_Latn", 128 | "vie_Latn", 129 | "quz_Latn", 130 | "leh_Latn", 131 | "mkd_Cyrl", 132 | "sah_Cyrl", 133 | "ban_Latn", 134 | "slk_Latn", 135 | "tsn_Latn", 136 | "ace_Latn", 137 | "nor_Latn", 138 | "lmo_Latn", 139 | "pes_Arab", 140 | "est_Latn", 141 | "ido_Latn", 142 | "skg_Latn", 143 | "ltz_Latn", 144 | "abk_Cyrl", 145 | "ary_Arab", 146 | "eus_Latn", 147 | "zne_Latn", 148 | "hus_Latn", 149 | "lit_Latn", 150 | "quy_Latn", 151 | "glv_Latn", 152 | "kaz_Cyrl", 153 | "kam_Latn", 154 | "fat_Latn", 155 | "lav_Latn", 156 | "bbc_Latn", 157 | "frr_Latn", 158 | "bos_Latn", 159 | "vol_Latn", 160 | "mwn_Latn", 161 | "epo_Latn", 162 | "wal_Latn", 163 | "mai_Deva", 164 | "cat_Latn", 165 | "uig_Arab", 166 | "dua_Latn", 167 | "tha_Thai", 168 | "vmw_Latn", 169 | "dzo_Tibt", 170 | "ukr_Cyrl", 171 | "kwn_Latn", 172 | "ctd_Latn", 173 | "tgl_Latn", 174 | "pam_Latn", 175 | "nnb_Latn", 176 | "sin_Sinh", 177 | "seh_Latn", 178 | "sxn_Latn", 179 | "gle_Latn", 180 | "tsc_Latn", 181 | "mps_Latn", 182 | "hin_Deva", 183 | "nyk_Latn", 184 | "mny_Latn", 185 | "kor_Hang", 186 | "kmb_Latn", 187 | "gkp_Latn", 188 | "ory_Orya", 189 | "zai_Latn", 190 | "kat_Latn", 191 | "urd_Arab", 192 | "gym_Latn", 193 | "bjn_Latn", 194 | "swa_Latn", 195 | "bod_Tibt", 196 | "acr_Latn", 197 | "sqi_Latn", 198 | "nde_Latn", 199 | "dtp_Latn", 200 | "bel_Cyrl", 201 | "fon_Latn", 202 | "lam_Latn", 203 | "afr_Latn", 204 | "ber_Latn", 205 | "bik_Latn", 206 | "nno_Latn", 207 | "nbl_Latn", 208 | "poh_Latn", 209 | "tat_Cyrl", 210 | "kmr_Latn", 211 | "phm_Latn", 212 | "ast_Latn", 213 | "guc_Latn", 214 | "hrx_Latn", 215 | "mon_Cyrl", 216 | "mam_Latn", 217 | "quh_Latn", 218 | 
"hbs_Cyrl", 219 | "nia_Latn", 220 | "hyw_Cyrl", 221 | "hau_Latn", 222 | "nyn_Latn", 223 | "rue_Cyrl", 224 | "sna_Latn", 225 | "cab_Latn", 226 | "eml_Latn", 227 | "msa_Latn", 228 | "top_Latn", 229 | "acm_Arab", 230 | "som_Latn", 231 | "tog_Latn", 232 | "tob_Latn", 233 | "srp_Cyrl", 234 | "mco_Latn", 235 | "ach_Latn", 236 | "mlg_Latn", 237 | "tzh_Latn", 238 | "vep_Latn", 239 | "zul_Latn", 240 | "pms_Latn", 241 | "npi_Deva", 242 | "arz_Arab", 243 | "wuu_Hani", 244 | "tok_Latn", 245 | "nya_Latn", 246 | "plt_Latn", 247 | "sgs_Latn", 248 | "tam_Taml", 249 | "yid_Hebr", 250 | "lij_Latn", 251 | "hat_Latn", 252 | "ada_Latn", 253 | "myv_Cyrl", 254 | "uzb_Latn", 255 | "iba_Latn", 256 | "tih_Latn", 257 | "sot_Latn", 258 | "kek_Latn", 259 | "tat_Latn", 260 | "uzb_Cyrl", 261 | "koo_Latn", 262 | "lfn_Latn", 263 | "cos_Latn", 264 | "sop_Latn", 265 | "cgg_Latn", 266 | "als_Latn", 267 | "kac_Latn", 268 | "ful_Latn", 269 | "amh_Ethi", 270 | "qvi_Latn", 271 | "gor_Latn", 272 | "sun_Latn", 273 | "cak_Latn", 274 | "ile_Latn", 275 | "war_Latn", 276 | "kbp_Latn", 277 | "ium_Latn", 278 | "div_Thaa", 279 | "ctu_Latn", 280 | "teo_Latn", 281 | "yor_Latn", 282 | "kri_Latn", 283 | "kia_Latn", 284 | "fao_Latn", 285 | "mau_Latn", 286 | "crh_Cyrl", 287 | "uzn_Cyrl", 288 | "scn_Latn", 289 | "crh_Latn", 290 | "smo_Latn", 291 | "tyv_Cyrl", 292 | "enm_Latn", 293 | "bak_Cyrl", 294 | "ina_Latn", 295 | "sat_Olck", 296 | "ilo_Latn", 297 | "btx_Latn", 298 | "mad_Latn", 299 | "tso_Latn", 300 | "nch_Latn", 301 | "cac_Latn", 302 | "mri_Latn", 303 | "ncj_Latn", 304 | "hnj_Latn", 305 | "hmn_Latn", 306 | "pau_Latn", 307 | "ksh_Latn", 308 | "asm_Beng", 309 | "toj_Latn", 310 | "ikk_Latn", 311 | "hil_Latn", 312 | "pcm_Latn", 313 | "sba_Latn", 314 | "nso_Latn", 315 | "dyu_Latn", 316 | "zom_Latn", 317 | "ibo_Latn", 318 | "kss_Latn", 319 | "bqc_Latn", 320 | "kin_Latn", 321 | "afb_Arab", 322 | "bim_Latn", 323 | "hye_Armn", 324 | "urh_Latn", 325 | "mdy_Ethi", 326 | "oci_Latn", 327 | "quc_Latn", 328 | "bts_Latn", 329 | "lin_Latn", 330 | "new_Deva", 331 | "gya_Latn", 332 | "tpi_Latn", 333 | "yao_Latn", 334 | "ajg_Latn", 335 | "twi_Latn", 336 | "ngl_Latn", 337 | "agw_Latn", 338 | "kir_Cyrl", 339 | "nyu_Latn", 340 | "kom_Cyrl", 341 | "pap_Latn", 342 | "kab_Latn", 343 | "knv_Latn", 344 | "nep_Deva", 345 | "tuk_Cyrl", 346 | "giz_Latn", 347 | "azj_Latn", 348 | "xmf_Geor", 349 | "hui_Latn", 350 | "bcl_Latn", 351 | "ndc_Latn", 352 | "kpg_Latn", 353 | "xho_Latn", 354 | "san_Deva", 355 | "zea_Latn", 356 | "cym_Latn", 357 | "nba_Latn", 358 | "aoj_Latn", 359 | "gaa_Latn", 360 | "bpy_Beng", 361 | "csy_Latn", 362 | "ton_Latn", 363 | "ncx_Latn", 364 | "azb_Arab", 365 | "tah_Latn", 366 | "qug_Latn", 367 | "csb_Latn", 368 | "lat_Latn", 369 | "rmn_Latn", 370 | "tpm_Latn", 371 | "srn_Latn", 372 | "cjk_Latn", 373 | "quw_Latn", 374 | "ewe_Latn", 375 | "arb_Arab", 376 | "rmy_Cyrl", 377 | "bem_Latn", 378 | "kea_Latn", 379 | "ixl_Latn", 380 | "efi_Latn", 381 | "mck_Latn", 382 | "mbb_Latn", 383 | "bis_Latn", 384 | "arn_Latn", 385 | "pfl_Latn", 386 | "orm_Latn", 387 | "pdt_Latn", 388 | "pcd_Latn", 389 | "haw_Latn", 390 | "her_Latn", 391 | "tlh_Latn", 392 | "hmo_Latn", 393 | "gla_Latn", 394 | "suz_Deva", 395 | "kat_Geor", 396 | "kmr_Cyrl", 397 | "gcr_Latn", 398 | "pag_Latn", 399 | "mwl_Latn", 400 | "jbo_Latn", 401 | "loz_Latn", 402 | "nav_Latn", 403 | "tbz_Latn", 404 | "fry_Latn", 405 | "ksw_Mymr", 406 | "bam_Latn", 407 | "mya_Mymr", 408 | "mxv_Latn", 409 | "prk_Latn", 410 | "nds_Latn", 411 | "hif_Latn", 412 | "jam_Latn", 413 | "run_Latn", 414 | "wol_Latn", 415 | 
"twx_Latn", 416 | "pnb_Arab", 417 | "sme_Latn", 418 | "nmf_Latn", 419 | "rar_Latn", 420 | "gom_Latn", 421 | "caq_Latn", 422 | "fij_Latn", 423 | "bum_Latn", 424 | "rop_Latn", 425 | "wls_Latn", 426 | "mgr_Latn", 427 | "tca_Latn", 428 | "ckb_Arab", 429 | "ahk_Latn", 430 | "yan_Latn", 431 | "ven_Latn", 432 | "kur_Arab", 433 | "xav_Latn", 434 | "zsm_Latn", 435 | "bas_Latn", 436 | "bih_Deva", 437 | "chv_Cyrl", 438 | "bin_Latn", 439 | "cuk_Latn", 440 | "lua_Latn", 441 | "tsz_Latn", 442 | "kjb_Latn", 443 | "que_Latn", 444 | "sid_Latn", 445 | "hne_Deva", 446 | "sag_Latn", 447 | "diq_Latn", 448 | "wbm_Latn", 449 | "guw_Latn", 450 | "srd_Latn", 451 | "zlm_Latn", 452 | "bre_Latn", 453 | "tcf_Latn", 454 | "tui_Latn", 455 | "toi_Latn", 456 | "bzj_Latn", 457 | "ifb_Latn", 458 | "pus_Arab", 459 | "udm_Cyrl", 460 | "izz_Latn", 461 | "che_Cyrl", 462 | "cce_Latn", 463 | "rug_Latn", 464 | "pis_Latn", 465 | "meu_Latn", 466 | "aka_Latn", 467 | "kon_Latn", 468 | "chw_Latn", 469 | "pxm_Latn", 470 | "oss_Cyrl", 471 | "cbk_Latn", 472 | "kmm_Latn", 473 | "hyw_Armn", 474 | "ibg_Latn", 475 | "mcn_Latn", 476 | "iso_Latn", 477 | "bhw_Latn", 478 | "ifa_Latn", 479 | "nan_Latn", 480 | "ngu_Latn", 481 | "dln_Latn", 482 | "lub_Latn", 483 | "nyy_Latn", 484 | "ext_Latn", 485 | "lim_Latn", 486 | "szl_Latn", 487 | "ksd_Latn", 488 | "tuk_Latn", 489 | "ish_Latn", 490 | "mzh_Latn", 491 | "tir_Ethi", 492 | "naq_Latn", 493 | "llb_Latn", 494 | "tgk_Latn", 495 | "toh_Latn", 496 | "hra_Latn", 497 | "yua_Latn", 498 | "ttj_Latn", 499 | "mwm_Latn", 500 | "min_Latn", 501 | "nse_Latn", 502 | "krc_Cyrl", 503 | "lue_Latn", 504 | "hsb_Latn", 505 | "tuc_Latn", 506 | "khm_Khmr", 507 | "ami_Latn", 508 | "mrw_Latn", 509 | "tum_Latn", 510 | "alz_Latn", 511 | "pls_Latn", 512 | "tll_Latn", 513 | "apc_Arab", 514 | "rap_Latn", 515 | "ekk_Latn", 516 | "vls_Latn", 517 | "fur_Latn", 518 | "lug_Latn", 519 | "mhr_Cyrl", 520 | "kaa_Latn", 521 | "niu_Latn", 522 | "djk_Latn", 523 | "prs_Arab", 524 | "tzo_Latn", 525 | "wes_Latn", 526 | "san_Latn", 527 | "mah_Latn", 528 | "gkn_Latn", 529 | "som_Arab", 530 | "tvl_Latn", 531 | "grc_Grek", 532 | "uig_Latn", 533 | "jav_Latn", 534 | "hbo_Hebr", 535 | "hau_Arab" 536 | ] 537 | -------------------------------------------------------------------------------- /metadata/iso_list.json: -------------------------------------------------------------------------------- 1 | [ 2 | "cmn_Hani", "spa_Latn", "eng_Latn", "hin_Deva", "ben_Beng", 3 | "por_Latn", "rus_Cyrl", "jpn_Jpan", "pnb_Arab", "mar_Deva", 4 | "tel_Telu", "wuu_Hani", "tur_Latn", "kor_Hang", "fra_Latn", 5 | "deu_Latn", "vie_Latn", "tam_Taml", "urd_Arab", "jav_Latn", 6 | "ita_Latn", "guj_Gujr", "fas_Arab", "bho_Deva", "nan_Hani", 7 | "hak_Hani", "cjy_Hani", "hau_Latn", "kan_Knda", "ind_Latn", 8 | "pol_Latn", "yor_Latn", "hsn_Hani", "mal_Mlym", "ory_Orya", 9 | "mai_Deva", "mya_Mymr", "sun_Latn", "apd_Arab", "arq_Arab", 10 | "tha_Thai", "hne_Deva", "som_Latn", "pan_Guru", "nld_Latn", 11 | "amh_Ethi", "ukr_Cyrl", "sin_Sinh", "ctg_Beng", "tgl_Latn", 12 | "ron_Latn", "aze_Latn", "ell_Grek", "kaz_Cyrl", "zha_Latn", 13 | "hun_Latn", "kin_Latn", "sna_Latn", "ceb_Latn", "mad_Latn", 14 | "dcc_Arab", "hat_Latn", "khm_Khmr", "asm_Beng", "nep_Deva", 15 | "ssw_Latn", "zul_Latn", "ful_Latn", "tsn_Latn", "sot_Latn", 16 | "run_Latn", "tir_Ethi", "hbs_Latn", "uzb_Latn", "pus_Arab", 17 | "mos_Latn", "ces_Latn", "nld_Latn", "swe_Latn", "aka_Latn", 18 | "nya_Latn", "bam_Latn", "kok_Deva", "bel_Cyrl", "tgk_Cyrl", 19 | "bcc_Arab", "hil_Latn", "run_Latn", "mos_Latn", "hbs_Cyrl", 20 | 
"ful_Latn", "ssw_Latn", "kok_Deva", "dzo_Tibt", "sot_Latn", 21 | "tir_Ethi", "sat_Olck", "zul_Latn", "ful_Latn", "mkd_Cyrl", 22 | "mon_Cyrl", "xho_Latn", "dan_Latn", "heb_Hebr", "slk_Latn", 23 | "gle_Latn", "fin_Latn", "nor_Latn", "cym_Latn", "twi_Latn", 24 | "kur_Arab", "ltz_Latn", "bak_Cyrl", "sag_Latn", "gla_Latn", 25 | "lav_Latn", "lit_Latn", "isl_Latn", "chv_Cyrl", "tat_Cyrl", 26 | "glg_Latn", "bak_Cyrl", "cos_Latn", "bre_Latn", "bis_Latn", 27 | "sme_Latn", "ast_Latn", "oci_Latn", "ava_Cyrl", "yid_Hebr", 28 | "fao_Latn", "csb_Latn", "lad_Latn", "kom_Cyrl", "mhr_Cyrl", 29 | "mrj_Cyrl", "orm_Latn", "sms_Latn", "fry_Latn", "iku_Latn", 30 | "glv_Latn", "kal_Latn", "cor_Latn", "epo_Latn", "nav_Latn", 31 | "srd_Latn", "tum_Latn", "pau_Latn", "mri_Latn", "tpi_Latn", 32 | "tet_Latn", "gil_Latn", "fij_Latn", "mah_Latn", "nya_Latn", 33 | "ven_Latn", "nde_Latn", "ton_Latn", "smo_Latn", "tvl_Latn", 34 | "tso_Latn", "orm_Latn", "swa_Latn", "kur_Latn", "lao_Laoo", 35 | "snd_Arab", "que_Latn", "aym_Latn", "grn_Latn", "arn_Latn", 36 | "quc_Latn", "iku_Cans", "xho_Latn", "sna_Latn", "zul_Latn", 37 | "nya_Latn", "tsn_Latn", "sot_Latn", "fan_Latn", "wol_Latn", 38 | "bem_Latn", "lin_Latn", "twi_Latn", "aka_Latn", "sna_Latn", 39 | "loz_Latn", "ndc_Latn", "lub_Latn", "kik_Latn", "swa_Latn", 40 | "zul_Latn", "nde_Latn", "nya_Latn", "orm_Latn", "amh_Ethi", 41 | "tir_Ethi", "som_Latn", "aar_Latn", "kam_Latn" 42 | ] 43 | -------------------------------------------------------------------------------- /metadata/madlad_aplha_3.json: -------------------------------------------------------------------------------- 1 | [ 2 | "eng", 3 | "rus", 4 | "spa", 5 | "deu", 6 | "fra", 7 | "ita", 8 | "por", 9 | "pol", 10 | "nld", 11 | "tur", 12 | "vie", 13 | "ces", 14 | "ind", 15 | "ron", 16 | "swe", 17 | "hun", 18 | "ukr", 19 | "fas", 20 | "jpn", 21 | "ell", 22 | "fin", 23 | "zho", 24 | "dan", 25 | "tha", 26 | "nor", 27 | "bul", 28 | "kor", 29 | "ara", 30 | "slk", 31 | "cat", 32 | "lit", 33 | "heb", 34 | "slv", 35 | "est", 36 | "lav", 37 | "hin", 38 | "sqi", 39 | "aze", 40 | "hrv", 41 | "tam", 42 | "msa", 43 | "mal", 44 | "srp", 45 | "kaz", 46 | "tel", 47 | "mar", 48 | "isl", 49 | "bos", 50 | "mkd", 51 | "glg", 52 | "eus", 53 | "ben", 54 | "bel", 55 | "kat", 56 | "fil", 57 | "mon", 58 | "afr", 59 | "uzb", 60 | "guj", 61 | "kan", 62 | "kaa", 63 | "swa", 64 | "urd", 65 | "nep", 66 | "cym", 67 | "hye", 68 | "kir", 69 | "sin", 70 | "tat", 71 | "tgk", 72 | "lat", 73 | "som", 74 | "gle", 75 | "khm", 76 | "mlt", 77 | "epo", 78 | "pus", 79 | "kin", 80 | "kur", 81 | "lao", 82 | "fry", 83 | "hau", 84 | "mya", 85 | "div", 86 | "pan", 87 | "ckb", 88 | "ltz", 89 | "mlg", 90 | "hat", 91 | "uig", 92 | "amh", 93 | "ori", 94 | "fao", 95 | "gla", 96 | "bak", 97 | "tuk", 98 | "mri", 99 | "hmn", 100 | "grc", 101 | "jav", 102 | "ceb", 103 | "snd", 104 | "yid", 105 | "kaa", 106 | "sna", 107 | "cos", 108 | "sun", 109 | "pap", 110 | "ibo", 111 | "zul", 112 | "xho", 113 | "smo", 114 | "nya", 115 | "yor", 116 | "chv", 117 | "ell", 118 | "kal", 119 | "haw", 120 | "gsw", 121 | "tet", 122 | "sot", 123 | "lus", 124 | "oci", 125 | "asm", 126 | "roh", 127 | "bre", 128 | "sah", 129 | "hin", 130 | "sme", 131 | "cnh", 132 | "orm", 133 | "che", 134 | "udm", 135 | "lug", 136 | "oss", 137 | "nav", 138 | "kha", 139 | "ilo", 140 | "ctd", 141 | "vec", 142 | "hil", 143 | "tyv", 144 | "iba", 145 | "rus", 146 | "kbd", 147 | "tir", 148 | "san", 149 | "ava", 150 | "bod", 151 | "zza", 152 | null, 153 | "otq", 154 | "tel", 155 | "bua", 156 | "tso", 157 | "cfm", 158 | 
"tsn", 159 | "krc", 160 | "aka", 161 | "meo", 162 | "chm", 163 | "ton", 164 | "ewe", 165 | "nso", 166 | "ady", 167 | "rom", 168 | "bho", 169 | "ltg", 170 | "fij", 171 | "yua", 172 | "grn", 173 | "aze", 174 | "lin", 175 | "ada", 176 | "myv", 177 | "bik", 178 | "tlh", 179 | "kbp", 180 | "war", 181 | "wln", 182 | "bew", 183 | "rcf", 184 | "tam", 185 | "kac", 186 | "iku", 187 | "aym", 188 | "kum", 189 | "que", 190 | "bgp", 191 | "hif", 192 | "cor", 193 | "nan", 194 | "srn", 195 | "tly", 196 | "sag", 197 | "gom", 198 | "mal", 199 | "kua", 200 | "ksd", 201 | "dzo", 202 | "kom", 203 | "msi", 204 | "ven", 205 | "zap", 206 | "zxx", 207 | "meu", 208 | "iso", 209 | "ium", 210 | "nhe", 211 | "tyz", 212 | "hui", 213 | "new", 214 | "mdf", 215 | "pag", 216 | "glv", 217 | "gag", 218 | "ngu", 219 | "quc", 220 | "mam", 221 | "min", 222 | "hmo", 223 | "pon", 224 | "mrj", 225 | "lub", 226 | "gom", 227 | "alt", 228 | "nzi", 229 | "tzo", 230 | "bci", 231 | "dtp", 232 | "abt", 233 | "bbc", 234 | "pck", 235 | "mai", 236 | "mps", 237 | "emp", 238 | "mgh", 239 | "tab", 240 | "crh", 241 | "tbz", 242 | "ssw", 243 | "chk", 244 | "bru", 245 | "nnb", 246 | "fon", 247 | "ppk", 248 | "tiv", 249 | "btx", 250 | "bul", 251 | "mbt", 252 | "ace", 253 | "tvl", 254 | "dov", 255 | "ach", 256 | "xal", 257 | "cuk", 258 | "kos", 259 | "crs", 260 | "wol", 261 | "bts", 262 | "ubu", 263 | "gym", 264 | "ibb", 265 | "ape", 266 | "stq", 267 | "ang", 268 | "enq", 269 | "tsg", 270 | "shn", 271 | "kri", 272 | "kek", 273 | "rmc", 274 | "acf", 275 | "fip", 276 | "syr", 277 | "qub", 278 | "bam", 279 | "tzh", 280 | "jiv", 281 | "kan", 282 | "kjh", 283 | "yap", 284 | "ban", 285 | "tuc", 286 | "tcy", 287 | "cab", 288 | "cak", 289 | "din", 290 | "zho", 291 | "arn", 292 | "lrc", 293 | "rwo", 294 | "hus", 295 | "bum", 296 | "mak", 297 | "frp", 298 | "seh", 299 | "twu", 300 | "kmb", 301 | "ksw", 302 | "sja", 303 | "amu", 304 | "mad", 305 | "quh", 306 | "dyu", 307 | "toj", 308 | "cha", 309 | "sus", 310 | "nog", 311 | "jam", 312 | "gui", 313 | "nia", 314 | "mas", 315 | "bzj", 316 | "mkn", 317 | "lhu", 318 | "ctu", 319 | "kon", 320 | "inb", 321 | "guh", 322 | "run", 323 | "bus", 324 | "mfe", 325 | "sda", 326 | "bis", 327 | "cre", 328 | "gor", 329 | "jac", 330 | "chr", 331 | "mah", 332 | "mni", 333 | "wal", 334 | "teo", 335 | "gub", 336 | "qvi", 337 | "tdx", 338 | "rki", 339 | "djk", 340 | "nbl", 341 | "zne", 342 | "izz", 343 | "noa", 344 | "bqc", 345 | "srm", 346 | "niq", 347 | "bas", 348 | "dwr", 349 | "guc", 350 | "jvn", 351 | "hvn", 352 | "sxn", 353 | "koi", 354 | "alz", 355 | "nyu", 356 | "ben", 357 | "suz", 358 | "pau", 359 | "nij", 360 | "sat", 361 | "guj", 362 | "msm", 363 | "maz", 364 | "qxr", 365 | "shp", 366 | "hne", 367 | "ktu", 368 | "laj", 369 | "pis", 370 | "mag", 371 | "gbm", 372 | "tzj", 373 | "oji", 374 | "ndc", 375 | "tks", 376 | "awa", 377 | "gvl", 378 | "knj", 379 | "spp", 380 | "mqy", 381 | "tca", 382 | "cce", 383 | "skr", 384 | "kmz", 385 | "dje", 386 | "gof", 387 | "agr", 388 | "qvz", 389 | "adh", 390 | "quf", 391 | "kjg", 392 | "tsc", 393 | null, 394 | "ify", 395 | "cbk", 396 | "quy", 397 | "ahk", 398 | "cac", 399 | "akb", 400 | "nut", 401 | "ffm", 402 | "taj", 403 | "msa", 404 | "brx", 405 | "ann", 406 | "qup", 407 | "msa", 408 | "miq", 409 | "msb", 410 | "bim", 411 | "raj", 412 | "kwi", 413 | "tll", 414 | "trp", 415 | "smt", 416 | "mrw", 417 | "dln", 418 | "qvc", 419 | "doi", 420 | "ful" 421 | ] 422 | -------------------------------------------------------------------------------- /misc-code/crawling-with-seeds/config.yaml: 
-------------------------------------------------------------------------------- 1 | # YAML configuration for SeedReader, SeedCrawler, and LanguageDetector 2 | 3 | # SeedReader configuration 4 | seed_reader: 5 | input_directory: "input" # Directory containing the input JSON files 6 | json_file_name: "urd_Latn.json" # Name of the JSON file to read data from 7 | 8 | # SeedCrawler configuration 9 | seed_crawler: 10 | max_pages: 100 # Maximum number of pages to crawl 11 | max_workers: 10 # Maximum number of threads for crawling 12 | crawl_delay: 0.1 # Delay between each crawling request in seconds 13 | 14 | # LanguageDetector configuration 15 | language_detector: 16 | model_path: "model_v3.bin" # Path to the FastText model file 17 | desired_language: "urd_Latn" # Target language code 18 | minimum_confidence: 0.8 # Minimum confidence score for language detection 19 | 20 | # Output configuration 21 | output: 22 | directory: "output/out_file" # Directory to save output JSON files 23 | output_file_name: "{language}_crawled_output.json" # Output file pattern 24 | 25 | # Logging configuration 26 | logging: 27 | level: "INFO" # Logging level: DEBUG, INFO, WARNING, ERROR 28 | file_path: "logs/application.log" # Path to the log file 29 | 30 | # Progress bar configuration 31 | progress_bar: 32 | enabled: true # Enable or disable tqdm progress bar 33 | 34 | # ThreadPoolExecutor configuration 35 | executor: 36 | max_workers_reader: 5 # Maximum number of threads for reading data 37 | 38 | # URL settings 39 | url_settings: 40 | max_url_length: 65536 # Maximum URL length allowed 41 | request_timeout: 10 # Timeout for HTTP requests in seconds 42 | -------------------------------------------------------------------------------- /misc-code/crawling-with-seeds/input/place_json_here: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /misc-code/crawling-with-seeds/output/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /misc-code/crawling-with-seeds/readme.md: -------------------------------------------------------------------------------- 1 | # Crawling with Seeds 2 | 3 | This project is designed to read seed URLs from a JSON file, crawl those websites, and detect specific languages in the content. The script leverages GlotLID FastText model for language detection and uses Trafilatura for content extraction from web pages. 4 | 5 | - Place json input seeds in /input. 6 | - Download and place model file here. 7 | - Modify the YAML file accordingly. 8 | - Run the script. 
9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /misc-code/crawling-with-seeds/seedcrawler_multithreaded.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import requests 4 | import yaml 5 | from bs4 import BeautifulSoup 6 | from urllib.parse import urljoin, urlparse 7 | import time 8 | import json 9 | from typing import List, Dict, Any 10 | from tqdm import tqdm 11 | from trafilatura import extract, fetch_url 12 | import fasttext 13 | import urllib3 14 | from concurrent.futures import ThreadPoolExecutor, as_completed 15 | 16 | # Load YAML configuration 17 | with open('config.yaml', 'r') as config_file: 18 | config = yaml.safe_load(config_file) 19 | 20 | # Extract configuration settings 21 | seed_reader_config = config['seed_reader'] 22 | seed_crawler_config = config['seed_crawler'] 23 | language_detector_config = config['language_detector'] 24 | output_config = config['output'] 25 | logging_config = config['logging'] 26 | progress_bar_config = config['progress_bar'] 27 | executor_config = config['executor'] 28 | url_settings = config['url_settings'] 29 | 30 | # Increase max URL length if needed 31 | urllib3.util.url.MAX_URL_LENGTH = url_settings['max_url_length'] 32 | 33 | class SeedReader: 34 | def __init__(self, json_file_path: str): 35 | """ 36 | Initializes the SeedReader with the path to a JSON file. 37 | 38 | :param json_file_path: The path to the JSON file containing the data. 39 | """ 40 | self.json_file_path = json_file_path 41 | self.data = self.read_json_file() 42 | 43 | def read_json_file(self) -> List[Dict[str, Any]]: 44 | """ 45 | Reads the JSON file and returns a list of dictionaries containing the data. 46 | 47 | :return: A list of dictionaries with keys: "snippet", "title", "link", "engines", 48 | "category", "predicted_lid", "lid_confidence". 49 | """ 50 | try: 51 | with open(self.json_file_path, 'r', encoding='utf-8') as file: 52 | data = json.load(file) 53 | # Validate that each item in the list is a dictionary with required keys 54 | for item in data: 55 | if not all(key in item for key in ["snippet", "title", "link", "engines", 56 | "category", "predicted_lid", "lid_confidence"]): 57 | raise ValueError("Missing one or more required keys in the JSON data") 58 | return data 59 | except FileNotFoundError: 60 | print(f"File not found: {self.json_file_path}") 61 | return [] 62 | except json.JSONDecodeError: 63 | print(f"Error decoding JSON from file: {self.json_file_path}") 64 | return [] 65 | 66 | def get_data(self) -> List[Dict[str, Any]]: 67 | """ 68 | Returns the list of dictionaries containing the data. 69 | 70 | :return: List of data entries as dictionaries. 71 | """ 72 | return self.data 73 | 74 | def get_entry_by_index(self, index: int) -> Dict[str, Any]: 75 | """ 76 | Returns a specific entry by its index in the data list. 77 | 78 | :param index: The index of the entry to retrieve. 79 | :return: A dictionary representing the entry at the specified index. 80 | :raises IndexError: If the index is out of range. 81 | """ 82 | try: 83 | return self.data[index] 84 | except IndexError: 85 | print(f"Index out of range: {index}") 86 | return {} 87 | 88 | def filter_by_key_value(self, key: str, value: Any) -> List[Dict[str, Any]]: 89 | """ 90 | Filters the data based on a specified key-value pair. 91 | 92 | :param key: The key to filter by. 93 | :param value: The value to match. 
94 | :return: A list of dictionaries where the specified key has the given value. 95 | """ 96 | if not self.data: 97 | print("No data available for filtering.") 98 | return [] 99 | 100 | filtered_data = [entry for entry in self.data if entry.get(key) == value] 101 | 102 | if not filtered_data: 103 | print(f"No entries found with {key} = {value}") 104 | 105 | return filtered_data 106 | 107 | 108 | class SeedCrawler: 109 | def __init__(self, seed_url, max_pages=seed_crawler_config['max_pages']): 110 | """ 111 | Initializes the SeedCrawler with a seed URL and maximum pages to crawl. 112 | 113 | :param seed_url: The URL to start crawling from. 114 | :param max_pages: The maximum number of pages to crawl. 115 | """ 116 | self.seed_url = seed_url 117 | self.max_pages = max_pages 118 | self.domain = urlparse(seed_url).netloc # Extract domain from seed URL 119 | self.visited = set() 120 | self.to_visit = [seed_url] 121 | self.all_links = set() 122 | 123 | def get_links(self, url): 124 | """ 125 | Fetches all links from a webpage belonging to the specified domain. 126 | 127 | :param url: The URL of the webpage to fetch links from. 128 | :return: A set of links belonging to the specified domain. 129 | """ 130 | try: 131 | response = requests.get(url, timeout=url_settings['request_timeout']) 132 | response.raise_for_status() # Raise an error for bad responses 133 | except (requests.RequestException, requests.HTTPError) as e: 134 | print(f"Error fetching {url}: {e}") 135 | return set() 136 | 137 | soup = BeautifulSoup(response.text, 'html.parser') 138 | links = set() 139 | 140 | # Find all anchor tags with href attributes 141 | for anchor in soup.find_all('a', href=True): 142 | # Resolve relative URLs 143 | link = urljoin(url, anchor['href']) 144 | # Parse the URL to check its domain 145 | parsed_link = urlparse(link) 146 | 147 | # Check if the link belongs to the specified domain 148 | if parsed_link.netloc.endswith(self.domain): 149 | links.add(link) 150 | 151 | return links 152 | 153 | def crawl_website(self): 154 | """ 155 | Crawls the entire website starting from the seed URL. 156 | 157 | :return: A set of all found links belonging to the specified domain. 158 | """ 159 | print(f"Crawling links from: {self.seed_url}") 160 | with ThreadPoolExecutor(max_workers=seed_crawler_config['max_workers']) as executor: 161 | futures = {} 162 | 163 | while self.to_visit and len(self.visited) < self.max_pages: 164 | current_url = self.to_visit.pop(0) 165 | 166 | if current_url in self.visited: 167 | continue 168 | 169 | future = executor.submit(self.get_links, current_url) 170 | futures[future] = current_url 171 | 172 | # Sleep after submitting the request to be polite 173 | time.sleep(seed_crawler_config['crawl_delay']) 174 | 175 | for future in as_completed(futures): 176 | current_url = futures[future] 177 | try: 178 | links = future.result() 179 | self.all_links.update(links) 180 | 181 | # Add new links to the to_visit list 182 | for link in links: 183 | if link not in self.visited and link not in self.to_visit: 184 | self.to_visit.append(link) 185 | 186 | self.visited.add(current_url) 187 | except Exception as e: 188 | print(f"Exception occurred while crawling {current_url}: {e}") 189 | 190 | print(f"Finished crawling links from: {self.seed_url}") 191 | return self.all_links 192 | 193 | 194 | class LanguageDetector: 195 | def __init__(self, model): 196 | """ 197 | Initialize the LanguageDetector class with a FastText model. 198 | 199 | Args: 200 | model: The loaded FastText model. 
201 | """ 202 | self.model = model # Store the model reference instead of loading again. 203 | 204 | @staticmethod 205 | def extract_language_code(input_str): 206 | """ 207 | Extract the language code from a FastText prediction string. 208 | 209 | Args: 210 | input_str (str): The input prediction string from FastText. 211 | 212 | Returns: 213 | str or None: Extracted language code or None if not found. 214 | """ 215 | pattern = r'__(label)__([a-zA-Z]+_[a-zA-Z]+)' 216 | match = re.search(pattern, input_str) 217 | if match: 218 | return match.group(2) 219 | else: 220 | return None 221 | 222 | @staticmethod 223 | def trafilatura_scrape(url): 224 | """ 225 | Scrape text content from a given URL using Trafilatura. 226 | 227 | Args: 228 | url (str): The URL to scrape. 229 | 230 | Returns: 231 | str: Extracted text content from the URL. 232 | """ 233 | document = fetch_url(url) 234 | text = extract(document) 235 | return text 236 | 237 | def language_predict(self, scraped_text): 238 | """ 239 | Predict the language of the given text using the FastText model. 240 | 241 | Args: 242 | scraped_text (str): The text content to predict language for. 243 | 244 | Returns: 245 | tuple: (language label, confidence score) or (None, None) if text is empty. 246 | """ 247 | if scraped_text is not None: 248 | lid_label_script = self.model.predict(scraped_text.replace('\n', '')) 249 | lid_label = self.extract_language_code(lid_label_script[0][0]) 250 | lid_confidence = lid_label_script[1][0] 251 | return lid_label, lid_confidence 252 | return None, None 253 | 254 | def filter_seeds(self, links, input_label, confidence): 255 | """ 256 | Filter URLs based on language prediction and confidence score. 257 | 258 | Args: 259 | links (list): List of URLs to filter. 260 | input_label (str): The desired language label. 261 | confidence (float): The minimum confidence score required. 262 | 263 | Returns: 264 | list: Filtered list of URLs matching the language criteria. 
265 | """ 266 | new_list = [] 267 | 268 | def filter_link(link): 269 | scraped_text = self.trafilatura_scrape(link) 270 | lid_label, lid_confidence = self.language_predict(scraped_text) 271 | if (lid_label == input_label) and (lid_confidence >= confidence): 272 | return {"link": link, "lid_label": lid_label, "lid_confidence": lid_confidence} 273 | return None 274 | 275 | with ThreadPoolExecutor(max_workers=seed_crawler_config['max_workers']) as executor: 276 | futures = {executor.submit(filter_link, link): link for link in links} 277 | 278 | # Use tqdm to add a progress bar to the loop 279 | for future in tqdm(as_completed(futures), desc="Filtering scraped links", unit="link", total=len(futures)): 280 | result = future.result() 281 | if result: 282 | new_list.append(result) 283 | 284 | return new_list 285 | 286 | 287 | # Example usage 288 | if __name__ == "__main__": 289 | input_label = language_detector_config['desired_language'] # Replace with your JSON file name 290 | json_file_path = os.path.join(seed_reader_config['input_directory'], seed_reader_config['json_file_name']) 291 | input_confidence = language_detector_config['minimum_confidence'] 292 | model_path = language_detector_config['model_path'] 293 | 294 | # Load the model once 295 | model = fasttext.load_model(model_path) 296 | 297 | reader = SeedReader(json_file_path) 298 | 299 | all_data = reader.get_data() 300 | 301 | final_list = [] 302 | lang_detector = LanguageDetector(model) # Initialize once and reuse 303 | 304 | with ThreadPoolExecutor(max_workers=executor_config['max_workers_reader']) as executor: 305 | futures = [] 306 | 307 | for entry in all_data: 308 | # If confidence level threshold condition met. 309 | # Initialize a big list of websites and put the seed in it. 310 | 311 | if entry['lid_confidence'] > input_confidence: 312 | seed_url = entry['link'] 313 | crawler = SeedCrawler(seed_url, max_pages=seed_crawler_config['max_pages']) 314 | futures.append(executor.submit(crawler.crawl_website)) 315 | 316 | for future in as_completed(futures): 317 | all_website_links = future.result() 318 | # Use the same LanguageDetector instance 319 | filtered_links = lang_detector.filter_seeds(all_website_links, input_label, input_confidence) 320 | final_list.extend(filtered_links) 321 | print(filtered_links) 322 | 323 | # Create directory if it doesn't exist 324 | output_dir = output_config['directory'] 325 | os.makedirs(output_dir, exist_ok=True) 326 | 327 | # Save the final list as a JSON file 328 | output_file = os.path.join(output_dir, output_config['output_file_name'].format(language=input_label)) 329 | 330 | with open(output_file, 'w', encoding='utf-8') as file: 331 | json.dump(final_list, file, ensure_ascii=False, indent=4) 332 | 333 | print(f"Final list saved to {output_file}") 334 | print(final_list) -------------------------------------------------------------------------------- /misc-code/create_analysis_csv.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import yaml 4 | import json 5 | from typing import List, Dict, Any 6 | from tqdm import tqdm 7 | 8 | # Load configuration from a YAML file 9 | def load_config(config_file: str) -> Dict[str, Any]: 10 | with open(config_file, 'r') as file: 11 | return yaml.safe_load(file) 12 | 13 | 14 | def create_csv(code_list, formatted_output_path, text_files_path, meta_data_path): 15 | """ 16 | Creates a CSV file containing data extracted from JSON and text files. 
17 | 18 | Args: 19 | code_list (list): List of codes to process. 20 | formatted_output_path (str): Path where formatted JSON files are stored. 21 | text_files_path (str): Path where text files are stored. 22 | meta_data_path (str): Path where metadata JSON files are stored. 23 | """ 24 | # Define the CSV column names 25 | columns = ['alpha_3_code','language_name', 'num_speakers', 'family', 'madlad', 'flores', 'glot500', 'text_len', 'active_seed_urls', 'acquired_urls', 'newly_discovered_urls'] 26 | output_csv_path = 'collected_language_data.csv' 27 | 28 | # Open the CSV file for writing 29 | with open(output_csv_path, mode='w', newline='', encoding='utf-8') as csv_file: 30 | writer = csv.DictWriter(csv_file, fieldnames=columns) 31 | writer.writeheader() # Write the header row 32 | 33 | # Process each code 34 | for code in code_list: 35 | row = {} 36 | 37 | # Read the JSON file for the current code 38 | json_file_path = os.path.join(formatted_output_path, f"{code}.json") 39 | with open(json_file_path, 'r', encoding='utf-8') as json_file: 40 | data = json.load(json_file) 41 | 42 | # Populate the CSV row with data 43 | row['alpha_3_code'] = code 44 | row['language_name'] = data.get('Language Name', '') 45 | row['num_speakers'] = data.get('Number of Speakers', '') 46 | row['family'] = data.get('Family','') 47 | row['madlad'] = data.get('Supported by allenai/MADLAD-400', '') 48 | row['flores'] = data.get('Supported by facebook/flores', '') 49 | row['glot500'] = data.get('Supported by cis-lmu/Glot500','') 50 | 51 | # Read the text file for the current code 52 | text_file_path = os.path.join(text_files_path, f"{code}.txt") 53 | with open(text_file_path, 'r', encoding='utf-8') as text_file: 54 | text_content = text_file.read() 55 | row['text_len'] = len(text_content) 56 | 57 | # Read the metadata JSON file for the current code 58 | meta_data_file_path = os.path.join(meta_data_path, f"{code}_meta_data.json") 59 | with open(meta_data_file_path, 'r', encoding='utf-8') as meta_data_file: 60 | meta_data = json.load(meta_data_file) 61 | 62 | # Populate the CSV row with metadata 63 | row['active_seed_urls'] = meta_data.get('active_seed_urls_len', '') 64 | row['acquired_urls'] = meta_data.get('filtered_links_len', '') 65 | row['newly_discovered_urls'] = meta_data.get('unique_links_len', '') 66 | 67 | # Write the row to the CSV 68 | writer.writerow(row) 69 | 70 | print(f"CSV saved to {output_csv_path}") 71 | 72 | if __name__ == "__main__": 73 | config = load_config('config.yaml') 74 | formatted_output_path = config['output']['formated_directory'] 75 | text_files_path = config['output']['text_files_directory'] 76 | meta_data_path = os.path.join(config['output']['directory'], "meta_data") 77 | 78 | code = config['language_detector']['desired_language'] 79 | # Determine input labels 80 | if config['batch_processing']['enabled']: 81 | code_list = config['batch_processing']['input_labels'] 82 | else: 83 | code_list = [code] 84 | 85 | create_csv(code_list,formatted_output_path,text_files_path,meta_data_path) 86 | 87 | -------------------------------------------------------------------------------- /misc-code/filter_sites_and_lang.py: -------------------------------------------------------------------------------- 1 | #import dependencies 2 | import json 3 | import re 4 | import fasttext 5 | from huggingface_hub import hf_hub_download 6 | import requests 7 | from bs4 import BeautifulSoup 8 | import urllib3 9 | 10 | 11 | #FUNCTIONS 12 | 13 | def remove_html_tags(text): 14 | # Remove HTML tags using BeautifulSoup 15 
| soup = BeautifulSoup(text, "html.parser") 16 | cleaned_text = soup.get_text(separator=" ") 17 | 18 | # Remove extra whitespaces and artifacts 19 | cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip() 20 | 21 | return cleaned_text 22 | 23 | def read_website_text(url): 24 | # Send a GET request to the URL 25 | response = requests.get(url, verify=False) 26 | 27 | # Check if the request was successful (status code 200) 28 | if response.status_code == 200: 29 | # Remove HTML tags and artifacts from the webpage text 30 | cleaned_text = remove_html_tags(response.text) 31 | 32 | return cleaned_text 33 | else: 34 | print("Failed to retrieve the webpage. Status code:", response.status_code) 35 | return None 36 | 37 | def remove_entries_with_domains(json_data, domain_file): 38 | with open(domain_file, 'r') as f: 39 | domains = [line.strip() for line in f.readlines()] 40 | for iso_key, entries in json_data.items(): 41 | json_data[iso_key] = [entry for entry in entries if not any(domain in entry['link'] for domain in domains)] 42 | return json_data 43 | 44 | def extract_language_code(input_str): 45 | pattern = r'__(label)__([a-zA-Z]+_[a-zA-Z]+)' 46 | match = re.search(pattern, input_str) 47 | if match: 48 | return match.group(2) 49 | else: 50 | return None 51 | 52 | def lang_filter(filtered_data, model, iso_list): 53 | lang_filtered_data = {} 54 | for key in filtered_data: 55 | entry = filtered_data[key] 56 | for item in entry: 57 | lid_label = model.predict(item['snippet']) 58 | lid_label = extract_language_code(lid_label[0][0]) 59 | if lid_label not in iso_list: 60 | new_entry = item 61 | if key not in lang_filtered_data: 62 | lang_filtered_data[key]=[] 63 | lang_filtered_data[key].append(item) 64 | 65 | return lang_filtered_data 66 | 67 | def scraped_lang_filter(filtered_data, model, iso_list): 68 | lang_filtered_data = {} 69 | #lid_prediction_list = [] #optional line for comparison. Must be removed in final build 70 | for key in filtered_data: 71 | entry = filtered_data[key] 72 | for item in entry: 73 | link = item['link'] 74 | scraped_text = read_website_text(link) 75 | if(scraped_text != None): 76 | lid_label_script = model.predict(scraped_text) 77 | 78 | lid_label = extract_language_code(lid_label_script[0][0]) 79 | if lid_label not in iso_list: 80 | #lid_prediction_list.append(lid_label) #optional line for comparison. Must be removed in final build 81 | new_entry = item 82 | new_entry['glotLID'] = lid_label 83 | if key not in lang_filtered_data: 84 | lang_filtered_data[key]=[] 85 | lang_filtered_data[key].append(item) 86 | #for lpl in lid_prediction_list: #optional line for comparison. Must be removed in final build 87 | # print(lpl) #optional line for comparison. Must be removed in final build 88 | return lang_filtered_data 89 | 90 | ### Download LID Model for Language Filtering 91 | model_path = hf_hub_download(repo_id="cis-lmu/glotlid", filename="model.bin") 92 | model = fasttext.load_model(model_path) 93 | 94 | ### Load list of High resource language iso codes. 95 | with open('iso_list.json', 'r') as f: 96 | iso_list = json.load(f) 97 | 98 | ### Load Json dump of web search. 
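# Assumed structure of the dump (illustrative sketch; field values are placeholders):
#   {
#       "<iso_code>": [
#           {"snippet": "...", "title": "...", "link": "https://example.org/page", ...},
#           ...
#       ]
#   }
# i.e. a dict keyed by language code whose values are lists of search-result entries;
# remove_entries_with_domains reads the 'link' field and lang_filter reads 'snippet'.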
99 | with open('now.json', 'r') as json_file: 100 | data = json.load(json_file) 101 | 102 | ### Filter searches using filterlist.txt 103 | filtered_data = remove_entries_with_domains(data, 'filterlist.txt') 104 | ### Filter searches to remove High resource languages 105 | lang_filtered_data = lang_filter(filtered_data, model, iso_list) 106 | ### Disable wanrings for unverified certificate 107 | urllib3.disable_warnings() 108 | ### Lang filter using scraped web content 109 | scraped_lang_filtered_data = scraped_lang_filter(filtered_data, model, iso_list) 110 | -------------------------------------------------------------------------------- /misc-code/filterlist.txt: -------------------------------------------------------------------------------- 1 | scribd.com 2 | ebible.org 3 | png.bible 4 | bible.com 5 | jw.org 6 | breakeveryyoke.com 7 | scriptureearth.org 8 | live.bible.is 9 | bible.is 10 | faithcomesbyhearing.com 11 | download.sabda.org/yesusmesias/__ebible/html/ 12 | alkitab.mobi 13 | mykitabsuci.org 14 | aboriginalbibles.org.au 15 | youtube.com 16 | genius.com 17 | gospelgo.com 18 | linkedin.com 19 | tiktok.com 20 | wikiislam.net 21 | stepbible.org 22 | wikipedia.org 23 | wikimedia.org 24 | classicistranieri.com 25 | e-alkitab.org 26 | festvox.org/cmu_wilderness/ 27 | biblerevelation.org 28 | biblehub.com 29 | biblegateway.com 30 | newchristianbiblestudy.org 31 | bibliaplus.org 32 | greekroom.org/wildebeest/ebible 33 | bibleforchildren.org 34 | al-islam.org 35 | translate.google 36 | google.com 37 | cyber.bible 38 | divinerevelations.info 39 | wikisource.org 40 | holy-bhagavad-gita.org 41 | realtor.com 42 | surinamebiblesoc 43 | lyricsmint 44 | quran411.com 45 | tripadvisor 46 | jesusforafrica 47 | zlibrary 48 | espn.com 49 | wordproject.org/bibles 50 | https://karaoke.sabda.org 51 | manybooks.net 52 | -------------------------------------------------------------------------------- /misc-code/formatting_glot500_langs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "id": "43154a26-c2b3-42ab-b5cc-0c800d0d5ab6", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import csv\n", 11 | "import json\n", 12 | "\n", 13 | "def find_underscores_in_csv(file_path):\n", 14 | " output_list = []\n", 15 | " \n", 16 | " with open(file_path, newline='', encoding='utf-8') as csvfile:\n", 17 | " reader = csv.reader(csvfile)\n", 18 | " \n", 19 | " for row in reader:\n", 20 | " for cell in row:\n", 21 | " if '_' in cell:\n", 22 | " output_list.append(cell)\n", 23 | " \n", 24 | " return output_list" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 8, 30 | "id": "da507292-803f-4f2b-9f9e-5dfa3de46bf2", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# Example usage\n", 35 | "file_path = 'glot500_raw.csv' # Replace with your actual CSV file path\n", 36 | "glot500_iso_code = find_underscores_in_csv(file_path)\n" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 9, 42 | "id": "99404a8c-7967-4ee7-ac2d-e03b84fa0ad0", 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "data": { 47 | "text/plain": [ 48 | "534" 49 | ] 50 | }, 51 | "execution_count": 9, 52 | "metadata": {}, 53 | "output_type": "execute_result" 54 | } 55 | ], 56 | "source": [ 57 | "len(glot500_iso_code)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 10, 63 | "id": "641c442d-fff6-4ea3-b0ce-105872daa9dc", 64 | "metadata": {}, 65 | "outputs": 
[], 66 | "source": [ 67 | "with open('glot500_iso_code.json', 'w', encoding='utf-8') as json_file:\n", 68 | " json.dump(glot500_iso_code, json_file, ensure_ascii=False, indent=4)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "7772feb0-b988-4bc8-b9d1-1f274e62ab18", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "Python 3", 83 | "language": "python", 84 | "name": "python3" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 3 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython3", 96 | "version": "3.8.8" 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 5 101 | } 102 | -------------------------------------------------------------------------------- /misc-code/get_webinfo.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | 4 | def get_webpage_metadata(url): 5 | try: 6 | # Fetch the webpage 7 | response = requests.get(url) 8 | response.raise_for_status() # Raise an exception for 4xx or 5xx status codes 9 | html_content = response.content 10 | 11 | # Parse HTML 12 | soup = BeautifulSoup(html_content, 'html.parser') 13 | 14 | # Get metadata 15 | metadata = {} 16 | 17 | # Content language 18 | lang_attribute = soup.find('html').get('lang') 19 | if lang_attribute: 20 | metadata['Content Language'] = lang_attribute 21 | 22 | # Content-Language header 23 | content_language_header = response.headers.get('Content-Language') 24 | if content_language_header: 25 | metadata['Content-Language Header'] = content_language_header 26 | 27 | # Title 28 | title = soup.find('title') 29 | if title: 30 | metadata['Title'] = title.text.strip() 31 | 32 | # Description 33 | meta_description = soup.find('meta', attrs={'name': 'description'}) 34 | if meta_description and meta_description.get('content'): 35 | metadata['Description'] = meta_description.get('content') 36 | 37 | # Keywords 38 | meta_keywords = soup.find('meta', attrs={'name': 'keywords'}) 39 | if meta_keywords and meta_keywords.get('content'): 40 | metadata['Keywords'] = meta_keywords.get('content') 41 | 42 | return metadata 43 | 44 | except Exception as e: 45 | print("Error:", e) 46 | return None 47 | 48 | # Example usage 49 | url = "https://Akhaliterature.com" 50 | metadata = get_webpage_metadata(url) 51 | if metadata: 52 | print("Metadata:") 53 | for key, value in metadata.items(): 54 | print(f"{key}: {value}") 55 | else: 56 | print("Failed to retrieve metadata.") 57 | -------------------------------------------------------------------------------- /misc-code/glosslm/create_glotto_to_iso_mapping.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | # Function to read JSON file 4 | def read_json_file(file_path): 5 | with open(file_path, 'r', encoding='utf-8') as file: 6 | return json.load(file) 7 | 8 | # Function to transform the JSON entries into a dictionary 9 | def transform_entries_to_dict(entries): 10 | if not isinstance(entries, list): 11 | raise TypeError("The input data must be a list of dictionaries.") 12 | 13 | transformed_dict = {} 14 | for entry in entries: 15 | if not isinstance(entry, dict): 16 | raise TypeError("Each entry must be a dictionary.") 17 | 18 | glottocode = entry['id'] 19 | isocode = 
next((identifier['identifier'] for identifier in entry['identifiers'] if identifier['type'] == 'iso639-3'), None) 20 | transformed_dict[glottocode] = isocode 21 | return transformed_dict 22 | 23 | # Function to write JSON to file 24 | def write_json_file(data, file_path): 25 | with open(file_path, 'w', encoding='utf-8') as file: 26 | json.dump(data, file, indent=4) 27 | 28 | # Main process 29 | # We used https://glottolog.org/resourcemap.json?rsc=language as our source and saved it as 'resourcemap.json' 30 | input_file_path = 'resourcemap.json' # Change this to your input file path 31 | output_file_path = 'glotto_iso.json' # Change this to your desired output file path 32 | 33 | 34 | # Read the JSON file 35 | data = read_json_file(input_file_path) 36 | 37 | entries = data["resources"] 38 | 39 | # Transform the entries into a dictionary 40 | transformed_dict = transform_entries_to_dict(entries) 41 | 42 | # Write the transformed dictionary to a new JSON file 43 | write_json_file(transformed_dict, output_file_path) 44 | 45 | print(f"Transformed data has been written to {output_file_path}") 46 | -------------------------------------------------------------------------------- /misc-code/glosslm/glosslm_csv_creation.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | import pandas as pd 3 | 4 | dataset = load_dataset("lecslab/glosslm-corpus-split") 5 | df_train = pd.DataFrame(dataset['train']) 6 | 7 | # Save the DataFrame as a CSV file 8 | df_train.to_csv('glosslm-corpus-split.csv', index=False) 9 | -------------------------------------------------------------------------------- /misc-code/glosslm/glosslm_dump.py: -------------------------------------------------------------------------------- 1 | #IMPORTS 2 | 3 | import os 4 | import re 5 | import json 6 | from trafilatura import extract 7 | from trafilatura import fetch_url 8 | import fasttext 9 | from huggingface_hub import hf_hub_download 10 | import urllib3 11 | import time 12 | from tqdm import tqdm 13 | 14 | #FUNCTIONS 15 | 16 | def remove_entries_with_domains(json_data, domain_file): 17 | with open(domain_file, 'r') as f: 18 | domains = [line.strip() for line in f.readlines()] 19 | 20 | for iso_key, entries in json_data.items(): 21 | json_data[iso_key] = [entry for entry in entries if 'link' in entry and not any(domain in entry['link'] for domain in domains)] 22 | 23 | return json_data 24 | 25 | 26 | def extract_language_code(input_str): 27 | pattern = r'__(label)__([a-zA-Z]+_[a-zA-Z]+)' 28 | match = re.search(pattern, input_str) 29 | if match: 30 | return match.group(2) 31 | else: 32 | return None 33 | 34 | def trafilatura_scrape(url): 35 | document = fetch_url(url) 36 | text = extract(document) 37 | 38 | return text 39 | 40 | def scraped_lang_filter(filtered_data, model, iso_list, output_directory): 41 | 42 | scraped_lang_list = [] 43 | for key in tqdm(filtered_data, desc = "Processing"): 44 | entries = filtered_data[key] 45 | for entry in entries: 46 | new_entry = [] 47 | for item in entry: 48 | link = item['link'] 49 | scraped_text = trafilatura_scrape(link) 50 | if scraped_text is not None: 51 | 52 | lid_label_script = model.predict(scraped_text.replace('\n', '')) 53 | lid_label = extract_language_code(lid_label_script[0][0]) 54 | lid_confidence = lid_label_script[1][0] 55 | item['predicted_lid'] = lid_label 56 | item['lid_confidence'] = lid_confidence 57 | 58 | if lid_label not in iso_list: 59 | output_file_path = os.path.join(output_directory, 
f'{lid_label}.json') 60 | if os.path.exists(output_file_path): 61 | # If it exists, read the existing content 62 | with open(output_file_path, 'r') as output_file: 63 | existing_data = json.load(output_file) 64 | else: 65 | # If it doesn't exist, create a new list 66 | existing_data = [] 67 | 68 | print(lid_confidence) # Add this line for debugging 69 | print("lid_label:", lid_label) # Add this line for debugging 70 | scraped_lang_list.append(lid_label) # Add this line for debugging 71 | existing_data.append(item) 72 | with open(output_file_path, 'w') as output_file: 73 | json.dump(existing_data, output_file, ensure_ascii = False, indent=4) 74 | 75 | print(scraped_lang_list) # Add this line for debugging 76 | print(len(scraped_lang_list)) # Add this line for debugging 77 | return None 78 | 79 | #Execution 80 | 81 | start_time = time.time() 82 | 83 | model_path = hf_hub_download(repo_id="cis-lmu/glotlid", filename="model.bin") 84 | model = fasttext.load_model(model_path) 85 | 86 | filename = '0-200.json' 87 | ### Load Json dump of web search. 88 | with open(filename, 'r') as json_file: 89 | data = json.load(json_file) 90 | ### ISO code list of top 200 languages 91 | with open('iso_list.json', 'r') as f: 92 | iso_list = json.load(f) 93 | 94 | ### Filter searches using filterlist.txt 95 | filtered_data = remove_entries_with_domains(data, 'filterlist.txt') 96 | #scraped_lang_filtered_data = scraped_lang_filter(filtered_data, model, iso_list) 97 | 98 | urllib3.disable_warnings() 99 | output_directory = '/content/drive/MyDrive/glotsparse/glosslm_dump/lang_dump_confidence' 100 | scraped_lang_filtered_data = scraped_lang_filter(filtered_data, model, iso_list, output_directory) 101 | filepath = f"{filename}_filtered.json" 102 | 103 | print("DONE ALL") 104 | print(filename) 105 | end_time = time.time() 106 | time_min = (end_time - start_time)/60 107 | print(time_min) 108 | -------------------------------------------------------------------------------- /misc-code/glosslm/glosslm_processing.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | 4 | def read_json_file(file_path): 5 | with open(file_path, 'r', encoding='utf-8') as file: 6 | data = json.load(file) 7 | return data 8 | 9 | def remove_null_entries(data): 10 | if isinstance(data, dict): 11 | return {key: value for key, value in data.items() if value is not None} 12 | elif isinstance(data, list): 13 | return [remove_null_entries(item) for item in data if item is not None] 14 | return data 15 | 16 | def write_json_file(data, file_path): 17 | with open(file_path, 'w', encoding='utf-8') as file: 18 | json.dump(data, file, indent=4) 19 | 20 | def add_iso_code(data_frame, glotto_iso_mapping): 21 | data_frame['glottocode'] = data_frame['glottocode'].astype(str) 22 | data_frame['iso_code'] = data_frame['glottocode'].map(glotto_iso_mapping) 23 | return data_frame 24 | 25 | def save_dataframe_to_csv(data_frame, file_path): 26 | try: 27 | data_frame.to_csv(file_path, index=False) 28 | print(f"DataFrame successfully saved to {file_path}") 29 | except Exception as e: 30 | print(f"An error occurred while saving the DataFrame: {e}") 31 | 32 | def remove_empty_iso_codes(data_frame): 33 | # Replace empty strings with NaN and then drop rows with NaN in 'iso_code' 34 | cleaned_data_frame = data_frame.replace({'iso_code': {'': None}}).dropna(subset=['iso_code']) 35 | 36 | return cleaned_data_frame 37 | 38 | ### Execution ### 39 | glotto_iso_mapping = read_json_file('glotto_iso.json') 40 
| dataset = pd.read_csv('glosslm-corpus-split.csv') 41 | result_df = add_iso_code(dataset, glotto_iso_mapping) 42 | result_df = remove_empty_iso_codes(result_df) 43 | save_dataframe_to_csv(result_df, 'glosslm_train_with_iso.csv') 44 | -------------------------------------------------------------------------------- /misc-code/glosslm/initial_text_seed_from_glosslm.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | def get_closest_to_median(group): 5 | # Ensure all values in 'transcription' are strings and handle NaN values 6 | group['transcription'] = group['transcription'].astype(str).fillna('') 7 | # Calculate the string lengths 8 | group['str_length'] = group['transcription'].apply(len) 9 | # Calculate the median string length 10 | median_length = group['str_length'].median() 11 | # Calculate the absolute difference from the median 12 | group['diff_from_median'] = (group['str_length'] - median_length).abs() 13 | # Sort by the difference and take the two rows closest to the median 14 | closest_rows = group.nsmallest(2, 'diff_from_median') 15 | return closest_rows 16 | 17 | df = pd.read_csv('glosslm_train_with_iso.csv') 18 | # Apply the function to each group 19 | result = df.groupby('iso_code').apply(get_closest_to_median).reset_index(drop=True) 20 | 21 | # Drop the temporary columns used for calculation 22 | result = result.drop(columns=['str_length', 'diff_from_median']) 23 | 24 | glosslm_samples = result[['iso_code','transcription']] 25 | 26 | glosslm_samples.to_csv('glosslm_samples.txt', sep='\t', index=False, header=False) 27 | -------------------------------------------------------------------------------- /misc-code/glosslm/searxng_search.py: -------------------------------------------------------------------------------- 1 | # Import packages 2 | import pprint 3 | import json 4 | from langchain_community.utilities import SearxSearchWrapper 5 | from tqdm import tqdm 6 | 7 | ### Search and save dump function 8 | import requests 9 | from requests.exceptions import RequestException 10 | from urllib3.exceptions import ProtocolError 11 | import json 12 | import time 13 | 14 | def search_and_save_results(file_path, start_index, end_index): 15 | search = SearxSearchWrapper(searx_host="http://127.0.0.1:8080") 16 | with open(file_path, 'r') as file: 17 | results_dict = {} 18 | lines = file.readlines() 19 | 20 | for i, line in enumerate(tqdm(lines[start_index:end_index], initial=start_index, total=end_index - start_index)): 21 | iso_code, sentence = line.strip().split('\t') 22 | try: 23 | results = retry_request( 24 | lambda: search.results( 25 | sentence, 26 | num_results=50, 27 | engines=["bing", "yahoo", "qwant", "duckduckgo"] 28 | ) 29 | ) 30 | if iso_code not in results_dict: 31 | results_dict[iso_code] = [] 32 | for result in results: 33 | result['query'] = sentence 34 | results_dict[iso_code].extend(results) 35 | except RequestException as e: 36 | print(f"Error occurred: {e}") 37 | continue 38 | 39 | with open(f'{start_index}-{end_index}.json', 'w') as json_file: 40 | json.dump(results_dict, json_file, indent=4) 41 | 42 | def retry_request(request_func, max_retries=3): 43 | retries = 0 44 | while retries < max_retries: 45 | try: 46 | return request_func() 47 | except (RequestException, ProtocolError) as e: 48 | print(f"Error occurred: {e}") 49 | retries += 1 50 | if retries < max_retries: 51 | print("Retrying...") 52 | time.sleep(2) # Wait for 2 seconds before retrying 53 | else: 54 | print("Max 
retries exceeded.") 55 | raise 56 | 57 | search_and_save_results('glosslm_samples.txt',1001,1200) 58 | -------------------------------------------------------------------------------- /misc-code/glot500_iso_code.json: -------------------------------------------------------------------------------- 1 | [ 2 | "hbs_Latn", 3 | "vec_Latn", 4 | "swh_Latn", 5 | "mal_Mlym", 6 | "jpn_Jpan", 7 | "alt_Cyrl", 8 | "aze_Latn", 9 | "lus_Latn", 10 | "rmn_Grek", 11 | "guj_Gujr", 12 | "crs_Latn", 13 | "miq_Latn", 14 | "ben_Beng", 15 | "kqn_Latn", 16 | "kaa_Cyrl", 17 | "kan_Knda", 18 | "ndo_Latn", 19 | "kos_Latn", 20 | "tel_Telu", 21 | "snd_Arab", 22 | "grn_Latn", 23 | "mlt_Latn", 24 | "yue_Hani", 25 | "lhu_Latn", 26 | "fra_Latn", 27 | "tiv_Latn", 28 | "lzh_Hani", 29 | "spa_Latn", 30 | "kua_Latn", 31 | "ajp_Arab", 32 | "eng_Latn", 33 | "kwy_Latn", 34 | "cmn_Hani", 35 | "fil_Latn", 36 | "hin_Latn", 37 | "gcf_Latn", 38 | "nob_Latn", 39 | "iku_Cans", 40 | "rmn_Cyrl", 41 | "rus_Cyrl", 42 | "kal_Latn", 43 | "kjh_Cyrl", 44 | "deu_Latn", 45 | "tdt_Latn", 46 | "rng_Latn", 47 | "tur_Latn", 48 | "gsw_Latn", 49 | "mgh_Latn", 50 | "pan_Guru", 51 | "mfe_Latn", 52 | "xmv_Latn", 53 | "mar_Deva", 54 | "swc_Latn", 55 | "ige_Latn", 56 | "por_Latn", 57 | "mon_Latn", 58 | "rmy_Latn", 59 | "nld_Latn", 60 | "mos_Latn", 61 | "srm_Latn", 62 | "ara_Arab", 63 | "kik_Latn", 64 | "bak_Latn", 65 | "zho_Hani", 66 | "cnh_Latn", 67 | "gur_Latn", 68 | "ita_Latn", 69 | "gil_Latn", 70 | "idu_Latn", 71 | "ind_Latn", 72 | "pon_Latn", 73 | "yom_Latn", 74 | "ell_Grek", 75 | "umb_Latn", 76 | "tdx_Latn", 77 | "bul_Cyrl", 78 | "lvs_Latn", 79 | "mzn_Arab", 80 | "swe_Latn", 81 | "sco_Latn", 82 | "cfm_Latn", 83 | "ces_Latn", 84 | "ori_Orya", 85 | "zpa_Latn", 86 | "isl_Latn", 87 | "arg_Latn", 88 | "kbd_Cyrl", 89 | "pol_Latn", 90 | "kur_Latn", 91 | "lao_Laoo", 92 | "ron_Latn", 93 | "dhv_Latn", 94 | "nap_Latn", 95 | "dan_Latn", 96 | "luo_Latn", 97 | "qub_Latn", 98 | "hun_Latn", 99 | "lun_Latn", 100 | "oke_Latn", 101 | "tgk_Cyrl", 102 | "nzi_Latn", 103 | "ote_Latn", 104 | "srp_Latn", 105 | "gug_Latn", 106 | "bsb_Latn", 107 | "fas_Arab", 108 | "bar_Latn", 109 | "ogo_Latn", 110 | "ceb_Latn", 111 | "bci_Latn", 112 | "abn_Latn", 113 | "heb_Hebr", 114 | "chk_Latn", 115 | "ldi_Latn", 116 | "hrv_Latn", 117 | "roh_Latn", 118 | "ayr_Latn", 119 | "glg_Latn", 120 | "aym_Latn", 121 | "gom_Deva", 122 | "fin_Latn", 123 | "yap_Latn", 124 | "bba_Latn", 125 | "slv_Latn", 126 | "ssw_Latn", 127 | "aln_Latn", 128 | "vie_Latn", 129 | "quz_Latn", 130 | "leh_Latn", 131 | "mkd_Cyrl", 132 | "sah_Cyrl", 133 | "ban_Latn", 134 | "slk_Latn", 135 | "tsn_Latn", 136 | "ace_Latn", 137 | "nor_Latn", 138 | "lmo_Latn", 139 | "pes_Arab", 140 | "est_Latn", 141 | "ido_Latn", 142 | "skg_Latn", 143 | "ltz_Latn", 144 | "abk_Cyrl", 145 | "ary_Arab", 146 | "eus_Latn", 147 | "zne_Latn", 148 | "hus_Latn", 149 | "lit_Latn", 150 | "quy_Latn", 151 | "glv_Latn", 152 | "kaz_Cyrl", 153 | "kam_Latn", 154 | "fat_Latn", 155 | "lav_Latn", 156 | "bbc_Latn", 157 | "frr_Latn", 158 | "bos_Latn", 159 | "vol_Latn", 160 | "mwn_Latn", 161 | "epo_Latn", 162 | "wal_Latn", 163 | "mai_Deva", 164 | "cat_Latn", 165 | "uig_Arab", 166 | "dua_Latn", 167 | "tha_Thai", 168 | "vmw_Latn", 169 | "dzo_Tibt", 170 | "ukr_Cyrl", 171 | "kwn_Latn", 172 | "ctd_Latn", 173 | "tgl_Latn", 174 | "pam_Latn", 175 | "nnb_Latn", 176 | "sin_Sinh", 177 | "seh_Latn", 178 | "sxn_Latn", 179 | "gle_Latn", 180 | "tsc_Latn", 181 | "mps_Latn", 182 | "hin_Deva", 183 | "nyk_Latn", 184 | "mny_Latn", 185 | "kor_Hang", 186 | "kmb_Latn", 187 | "gkp_Latn", 188 | 
"ory_Orya", 189 | "zai_Latn", 190 | "kat_Latn", 191 | "urd_Arab", 192 | "gym_Latn", 193 | "bjn_Latn", 194 | "swa_Latn", 195 | "bod_Tibt", 196 | "acr_Latn", 197 | "sqi_Latn", 198 | "nde_Latn", 199 | "dtp_Latn", 200 | "bel_Cyrl", 201 | "fon_Latn", 202 | "lam_Latn", 203 | "afr_Latn", 204 | "ber_Latn", 205 | "bik_Latn", 206 | "nno_Latn", 207 | "nbl_Latn", 208 | "poh_Latn", 209 | "tat_Cyrl", 210 | "kmr_Latn", 211 | "phm_Latn", 212 | "ast_Latn", 213 | "guc_Latn", 214 | "hrx_Latn", 215 | "mon_Cyrl", 216 | "mam_Latn", 217 | "quh_Latn", 218 | "hbs_Cyrl", 219 | "nia_Latn", 220 | "hyw_Cyrl", 221 | "hau_Latn", 222 | "nyn_Latn", 223 | "rue_Cyrl", 224 | "sna_Latn", 225 | "cab_Latn", 226 | "eml_Latn", 227 | "msa_Latn", 228 | "top_Latn", 229 | "acm_Arab", 230 | "som_Latn", 231 | "tog_Latn", 232 | "tob_Latn", 233 | "srp_Cyrl", 234 | "mco_Latn", 235 | "ach_Latn", 236 | "mlg_Latn", 237 | "tzh_Latn", 238 | "vep_Latn", 239 | "zul_Latn", 240 | "pms_Latn", 241 | "npi_Deva", 242 | "arz_Arab", 243 | "wuu_Hani", 244 | "tok_Latn", 245 | "nya_Latn", 246 | "plt_Latn", 247 | "sgs_Latn", 248 | "tam_Taml", 249 | "yid_Hebr", 250 | "lij_Latn", 251 | "hat_Latn", 252 | "ada_Latn", 253 | "myv_Cyrl", 254 | "uzb_Latn", 255 | "iba_Latn", 256 | "tih_Latn", 257 | "sot_Latn", 258 | "kek_Latn", 259 | "tat_Latn", 260 | "uzb_Cyrl", 261 | "koo_Latn", 262 | "lfn_Latn", 263 | "cos_Latn", 264 | "sop_Latn", 265 | "cgg_Latn", 266 | "als_Latn", 267 | "kac_Latn", 268 | "ful_Latn", 269 | "amh_Ethi", 270 | "qvi_Latn", 271 | "gor_Latn", 272 | "sun_Latn", 273 | "cak_Latn", 274 | "ile_Latn", 275 | "war_Latn", 276 | "kbp_Latn", 277 | "ium_Latn", 278 | "div_Thaa", 279 | "ctu_Latn", 280 | "teo_Latn", 281 | "yor_Latn", 282 | "kri_Latn", 283 | "kia_Latn", 284 | "fao_Latn", 285 | "mau_Latn", 286 | "crh_Cyrl", 287 | "uzn_Cyrl", 288 | "scn_Latn", 289 | "crh_Latn", 290 | "smo_Latn", 291 | "tyv_Cyrl", 292 | "enm_Latn", 293 | "bak_Cyrl", 294 | "ina_Latn", 295 | "sat_Olck", 296 | "ilo_Latn", 297 | "btx_Latn", 298 | "mad_Latn", 299 | "tso_Latn", 300 | "nch_Latn", 301 | "cac_Latn", 302 | "mri_Latn", 303 | "ncj_Latn", 304 | "hnj_Latn", 305 | "hmn_Latn", 306 | "pau_Latn", 307 | "ksh_Latn", 308 | "asm_Beng", 309 | "toj_Latn", 310 | "ikk_Latn", 311 | "hil_Latn", 312 | "pcm_Latn", 313 | "sba_Latn", 314 | "nso_Latn", 315 | "dyu_Latn", 316 | "zom_Latn", 317 | "ibo_Latn", 318 | "kss_Latn", 319 | "bqc_Latn", 320 | "kin_Latn", 321 | "afb_Arab", 322 | "bim_Latn", 323 | "hye_Armn", 324 | "urh_Latn", 325 | "mdy_Ethi", 326 | "oci_Latn", 327 | "quc_Latn", 328 | "bts_Latn", 329 | "lin_Latn", 330 | "new_Deva", 331 | "gya_Latn", 332 | "tpi_Latn", 333 | "yao_Latn", 334 | "ajg_Latn", 335 | "twi_Latn", 336 | "ngl_Latn", 337 | "agw_Latn", 338 | "kir_Cyrl", 339 | "nyu_Latn", 340 | "kom_Cyrl", 341 | "pap_Latn", 342 | "kab_Latn", 343 | "knv_Latn", 344 | "nep_Deva", 345 | "tuk_Cyrl", 346 | "giz_Latn", 347 | "azj_Latn", 348 | "xmf_Geor", 349 | "hui_Latn", 350 | "bcl_Latn", 351 | "ndc_Latn", 352 | "kpg_Latn", 353 | "xho_Latn", 354 | "san_Deva", 355 | "zea_Latn", 356 | "cym_Latn", 357 | "nba_Latn", 358 | "aoj_Latn", 359 | "gaa_Latn", 360 | "bpy_Beng", 361 | "csy_Latn", 362 | "ton_Latn", 363 | "ncx_Latn", 364 | "azb_Arab", 365 | "tah_Latn", 366 | "qug_Latn", 367 | "csb_Latn", 368 | "lat_Latn", 369 | "rmn_Latn", 370 | "tpm_Latn", 371 | "srn_Latn", 372 | "cjk_Latn", 373 | "quw_Latn", 374 | "ewe_Latn", 375 | "arb_Arab", 376 | "rmy_Cyrl", 377 | "bem_Latn", 378 | "kea_Latn", 379 | "ixl_Latn", 380 | "efi_Latn", 381 | "mck_Latn", 382 | "mbb_Latn", 383 | "bis_Latn", 384 | "arn_Latn", 385 | 
"pfl_Latn", 386 | "orm_Latn", 387 | "pdt_Latn", 388 | "pcd_Latn", 389 | "haw_Latn", 390 | "her_Latn", 391 | "tlh_Latn", 392 | "hmo_Latn", 393 | "gla_Latn", 394 | "suz_Deva", 395 | "kat_Geor", 396 | "kmr_Cyrl", 397 | "gcr_Latn", 398 | "pag_Latn", 399 | "mwl_Latn", 400 | "jbo_Latn", 401 | "loz_Latn", 402 | "nav_Latn", 403 | "tbz_Latn", 404 | "fry_Latn", 405 | "ksw_Mymr", 406 | "bam_Latn", 407 | "mya_Mymr", 408 | "mxv_Latn", 409 | "prk_Latn", 410 | "nds_Latn", 411 | "hif_Latn", 412 | "jam_Latn", 413 | "run_Latn", 414 | "wol_Latn", 415 | "twx_Latn", 416 | "pnb_Arab", 417 | "sme_Latn", 418 | "nmf_Latn", 419 | "rar_Latn", 420 | "gom_Latn", 421 | "caq_Latn", 422 | "fij_Latn", 423 | "bum_Latn", 424 | "rop_Latn", 425 | "wls_Latn", 426 | "mgr_Latn", 427 | "tca_Latn", 428 | "ckb_Arab", 429 | "ahk_Latn", 430 | "yan_Latn", 431 | "ven_Latn", 432 | "kur_Arab", 433 | "xav_Latn", 434 | "zsm_Latn", 435 | "bas_Latn", 436 | "bih_Deva", 437 | "chv_Cyrl", 438 | "bin_Latn", 439 | "cuk_Latn", 440 | "lua_Latn", 441 | "tsz_Latn", 442 | "kjb_Latn", 443 | "que_Latn", 444 | "sid_Latn", 445 | "hne_Deva", 446 | "sag_Latn", 447 | "diq_Latn", 448 | "wbm_Latn", 449 | "guw_Latn", 450 | "srd_Latn", 451 | "zlm_Latn", 452 | "bre_Latn", 453 | "tcf_Latn", 454 | "tui_Latn", 455 | "toi_Latn", 456 | "bzj_Latn", 457 | "ifb_Latn", 458 | "pus_Arab", 459 | "udm_Cyrl", 460 | "izz_Latn", 461 | "che_Cyrl", 462 | "cce_Latn", 463 | "rug_Latn", 464 | "pis_Latn", 465 | "meu_Latn", 466 | "aka_Latn", 467 | "kon_Latn", 468 | "chw_Latn", 469 | "pxm_Latn", 470 | "oss_Cyrl", 471 | "cbk_Latn", 472 | "kmm_Latn", 473 | "hyw_Armn", 474 | "ibg_Latn", 475 | "mcn_Latn", 476 | "iso_Latn", 477 | "bhw_Latn", 478 | "ifa_Latn", 479 | "nan_Latn", 480 | "ngu_Latn", 481 | "dln_Latn", 482 | "lub_Latn", 483 | "nyy_Latn", 484 | "ext_Latn", 485 | "lim_Latn", 486 | "szl_Latn", 487 | "ksd_Latn", 488 | "tuk_Latn", 489 | "ish_Latn", 490 | "mzh_Latn", 491 | "tir_Ethi", 492 | "naq_Latn", 493 | "llb_Latn", 494 | "tgk_Latn", 495 | "toh_Latn", 496 | "hra_Latn", 497 | "yua_Latn", 498 | "ttj_Latn", 499 | "mwm_Latn", 500 | "min_Latn", 501 | "nse_Latn", 502 | "krc_Cyrl", 503 | "lue_Latn", 504 | "hsb_Latn", 505 | "tuc_Latn", 506 | "khm_Khmr", 507 | "ami_Latn", 508 | "mrw_Latn", 509 | "tum_Latn", 510 | "alz_Latn", 511 | "pls_Latn", 512 | "tll_Latn", 513 | "apc_Arab", 514 | "rap_Latn", 515 | "ekk_Latn", 516 | "vls_Latn", 517 | "fur_Latn", 518 | "lug_Latn", 519 | "mhr_Cyrl", 520 | "kaa_Latn", 521 | "niu_Latn", 522 | "djk_Latn", 523 | "prs_Arab", 524 | "tzo_Latn", 525 | "wes_Latn", 526 | "san_Latn", 527 | "mah_Latn", 528 | "gkn_Latn", 529 | "som_Arab", 530 | "tvl_Latn", 531 | "grc_Grek", 532 | "uig_Latn", 533 | "jav_Latn", 534 | "hbo_Hebr", 535 | "hau_Arab" 536 | ] -------------------------------------------------------------------------------- /misc-code/glot500_raw.csv: -------------------------------------------------------------------------------- 1 | Language-Script-1,|Sent-1|,Family-1,Language-Script-2,|Sent-2|,Family-2,Language-Script-3,|Sent-3|,Family-3 2 | hbs_Latn,63411156,indo1319,vec_Latn,514240,indo1319,swh_Latn,95776,atla1278 3 | mal_Mlym,48098273,drav1251,jpn_Jpan,510722,japo1237,alt_Cyrl,95148,turk1311 4 | aze_Latn,46300705,lus_Latn,509250,sino1245,rmn_Grek,94533,indo1319 5 | guj_Gujr,45738685,indo1319,crs_Latn,508755,indo1319,miq_Latn,94343,misu1242 6 | ben_Beng,43514870,indo1319,kqn_Latn,507913,atla1278,kaa_Cyrl,88815,turk1311 7 | kan_Knda,41836495,drav1251,ndo_Latn,496613,atla1278,kos_Latn,88603,aust1307 8 | 
tel_Telu,41580525,drav1251,snd_Arab,488730,indo1319,grn_Latn,87568 9 | mlt_Latn,40654838,afro1255,yue_Hani,484700,sino1245,lhu_Latn,87255,sino1245 10 | fra_Latn,39197581,indo1319,tiv_Latn,483064,atla1278,lzh_Hani,86035,sino1245 11 | spa_Latn,37286756,indo1319,kua_Latn,473535,atla1278,ajp_Arab,83297,afro1255 12 | eng_Latn,36122761,indo1319,kwy_Latn,473274,atla1278,cmn_Hani,80745,sino1245 13 | fil_Latn,33493255,aust1307,hin_Latn,466175,indo1319,gcf_Latn,80737,indo1319 14 | nob_Latn,32869205,indo1319,iku_Cans,465011,rmn_Cyrl,79925,indo1319 15 | rus_Cyrl,31787973,indo1319,kal_Latn,462430,eski1264,kjh_Cyrl,79262,turk1311 16 | deu_Latn,31015993,indo1319,tdt_Latn,459818,aust1307,rng_Latn,78177,atla1278 17 | tur_Latn,29184662,turk1311,gsw_Latn,449240,indo1319,mgh_Latn,78117,atla1278 18 | pan_Guru,29052537,indo1319,mfe_Latn,447435,indo1319,xmv_Latn,77896,aust1307 19 | mar_Deva,28748897,indo1319,swc_Latn,446378,atla1278,ige_Latn,77114,atla1278 20 | por_Latn,27824391,indo1319,mon_Latn,437950,mong1349,rmy_Latn,76991,indo1319 21 | nld_Latn,25061426,indo1319,mos_Latn,437666,atla1278,srm_Latn,76884,indo1319 22 | ara_Arab,24524122,kik_Latn,437228,atla1278,bak_Latn,76809,turk1311 23 | zho_Hani,24143786,cnh_Latn,436667,sino1245,gur_Latn,76151,atla1278 24 | ita_Latn,23539857,indo1319,gil_Latn,434529,aust1307,idu_Latn,75106,atla1278 25 | ind_Latn,23018106,aust1307,pon_Latn,434522,aust1307,yom_Latn,74818,atla1278 26 | ell_Grek,22033282,indo1319,umb_Latn,431589,atla1278,tdx_Latn,74430,aust1307 27 | bul_Cyrl,21823004,indo1319,lvs_Latn,422952,indo1319,mzn_Arab,73719,indo1319 28 | swe_Latn,20725883,indo1319,sco_Latn,411591,indo1319,cfm_Latn,70227,sino1245 29 | ces_Latn,20376340,indo1319,ori_Orya,410827,zpa_Latn,69237,otom1299 30 | isl_Latn,19547941,indo1319,arg_Latn,410683,indo1319,kbd_Cyrl,67914,abkh1242 31 | pol_Latn,19339945,indo1319,kur_Latn,407169,indo1319,lao_Laoo,66966,taik1256 32 | ron_Latn,19190217,indo1319,dhv_Latn,405711,aust1307,nap_Latn,65826,indo1319 33 | dan_Latn,19174573,indo1319,luo_Latn,398974,nilo1247,qub_Latn,64973,quec1387 34 | hun_Latn,18800025,ural1272,lun_Latn,395764,atla1278,oke_Latn,64508,atla1278 35 | tgk_Cyrl,18659517,indo1319,nzi_Latn,394247,atla1278,ote_Latn,64224,otom1299 36 | srp_Latn,18371769,indo1319,gug_Latn,392227,tupi1275,bsb_Latn,63634,aust1307 37 | fas_Arab,18277593,bar_Latn,387070,indo1319,ogo_Latn,61901,atla1278 38 | ceb_Latn,18149215,aust1307,bci_Latn,384059,atla1278,abn_Latn,61830,atla1278 39 | heb_Hebr,18128962,afro1255,chk_Latn,380596,aust1307,ldi_Latn,61827,atla1278 40 | hrv_Latn,17882932,indo1319,roh_Latn,377067,indo1319,ayr_Latn,61570,ayma1253 41 | glg_Latn,17852274,indo1319,aym_Latn,373329,ayma1253,gom_Deva,61140,indo1319 42 | fin_Latn,16730388,ural1272,yap_Latn,358929,aust1307,bba_Latn,61123,atla1278 43 | slv_Latn,15719210,indo1319,ssw_Latn,356561,atla1278,aln_Latn,60989,indo1319 44 | vie_Latn,15697827,aust1305,quz_Latn,354781,quec1387,leh_Latn,59944,atla1278 45 | mkd_Cyrl,14717004,indo1319,sah_Cyrl,352697,turk1311,ban_Latn,59805,aust1307 46 | slk_Latn,14633631,indo1319,tsn_Latn,350954,atla1278,ace_Latn,59333,aust1307 47 | nor_Latn,14576191,indo1319,lmo_Latn,348135,indo1319,pes_Arab,57511,indo1319 48 | est_Latn,13600579,ido_Latn,331239,arti1236,skg_Latn,57228,aust1307 49 | ltz_Latn,12997242,indo1319,abk_Cyrl,321578,abkh1242,ary_Arab,56933,afro1255 50 | eus_Latn,12775959,zne_Latn,318871,atla1278,hus_Latn,56176,maya1287 51 | lit_Latn,12479626,indo1319,quy_Latn,311040,quec1387,glv_Latn,55641,indo1319 52 | 
kaz_Cyrl,12378727,turk1311,kam_Latn,310659,atla1278,fat_Latn,55609,atla1278 53 | lav_Latn,12143980,indo1319,bbc_Latn,310420,aust1307,frr_Latn,55254,indo1319 54 | bos_Latn,11014744,indo1319,vol_Latn,310399,arti1236,mwn_Latn,54805,atla1278 55 | epo_Latn,8737198,arti1236,wal_Latn,309873,gong1255,mai_Deva,54687,indo1319 56 | cat_Latn,8648271,indo1319,uig_Arab,307302,turk1311,dua_Latn,53392,atla1278 57 | tha_Thai,7735209,taik1256,vmw_Latn,306899,atla1278,dzo_Tibt,52732,sino1245 58 | ukr_Cyrl,7462046,indo1319,kwn_Latn,305362,atla1278,ctd_Latn,52135,sino1245 59 | tgl_Latn,7411064,aust1307,pam_Latn,303737,aust1307,nnb_Latn,52041,atla1278 60 | sin_Sinh,7293178,indo1319,seh_Latn,300243,atla1278,sxn_Latn,51749,aust1307 61 | gle_Latn,7225513,indo1319,tsc_Latn,298442,atla1278,mps_Latn,50645,tebe1251 62 | hin_Deva,7046700,indo1319,nyk_Latn,297976,atla1278,mny_Latn,50581,atla1278 63 | kor_Hang,6468444,kore1284,kmb_Latn,296269,atla1278,gkp_Latn,50549,mand1469 64 | ory_Orya,6266475,indo1319,zai_Latn,277632,otom1299,kat_Latn,50424,kart1248 65 | urd_Arab,6009594,indo1319,gym_Latn,274512,chib1249,bjn_Latn,49068,aust1307 66 | swa_Latn,5989369,bod_Tibt,273489,sino1245,acr_Latn,48886,maya1287 67 | sqi_Latn,5526836,indo1319,nde_Latn,269931,atla1278,dtp_Latn,48468,aust1307 68 | bel_Cyrl,5319675,indo1319,fon_Latn,268566,atla1278,lam_Latn,46853,atla1278 69 | afr_Latn,5157787,indo1319,ber_Latn,264426,bik_Latn,46561 70 | nno_Latn,4899103,indo1319,nbl_Latn,259158,atla1278,poh_Latn,46454,maya1287 71 | tat_Cyrl,4708088,turk1311,kmr_Latn,256677,indo1319,phm_Latn,45862,atla1278 72 | ast_Latn,4683554,indo1319,guc_Latn,249044,araw1281,hrx_Latn,45716,indo1319 73 | mon_Cyrl,4616960,mong1349,mam_Latn,248348,maya1287,quh_Latn,45566,quec1387 74 | hbs_Cyrl,4598073,indo1319,nia_Latn,247406,aust1307,hyw_Cyrl,45379,indo1319 75 | hau_Latn,4368483,afro1255,nyn_Latn,241992,atla1278,rue_Cyrl,45369,indo1319 76 | sna_Latn,4019596,atla1278,cab_Latn,240101,araw1281,eml_Latn,44630,indo1319 77 | msa_Latn,3929084,top_Latn,239232,toto1251,acm_Arab,44505,afro1255 78 | som_Latn,3916769,afro1255,tog_Latn,231969,atla1278,tob_Latn,44473,guai1249 79 | srp_Cyrl,3864091,indo1319,mco_Latn,231209,mixe1284,ach_Latn,43974,nilo1247 80 | mlg_Latn,3715802,tzh_Latn,230706,maya1287,vep_Latn,43076,ural1272 81 | zul_Latn,3580113,atla1278,pms_Latn,227748,indo1319,npi_Deva,43072,indo1319 82 | arz_Arab,3488224,afro1255,wuu_Hani,224088,sino1245,tok_Latn,42820,arti1236 83 | nya_Latn,3409030,atla1278,plt_Latn,220413,aust1307,sgs_Latn,42467,indo1319 84 | tam_Taml,3388255,drav1251,yid_Hebr,220214,indo1319,lij_Latn,42447,indo1319 85 | hat_Latn,3226932,indo1319,ada_Latn,219427,atla1278,myv_Cyrl,42147,ural1272 86 | uzb_Latn,3223485,turk1311,iba_Latn,213615,aust1307,tih_Latn,41873,aust1307 87 | sot_Latn,3205510,atla1278,kek_Latn,209932,maya1287,tat_Latn,41640,turk1311 88 | uzb_Cyrl,3029947,turk1311,koo_Latn,209375,atla1278,lfn_Latn,41632,arti1236 89 | cos_Latn,3015055,indo1319,sop_Latn,206501,atla1278,cgg_Latn,41196,atla1278 90 | als_Latn,2954874,indo1319,kac_Latn,205542,sino1245,ful_Latn,41188,atla1278 91 | amh_Ethi,2862985,afro1255,qvi_Latn,205447,quec1387,gor_Latn,41174,aust1307 92 | sun_Latn,2586011,aust1307,cak_Latn,204472,maya1287,ile_Latn,40984,arti1236 93 | war_Latn,2584810,aust1307,kbp_Latn,202877,atla1278,ium_Latn,40683,hmon1336 94 | div_Thaa,2418687,indo1319,ctu_Latn,201662,maya1287,teo_Latn,40203,nilo1247 95 | yor_Latn,2392359,atla1278,kri_Latn,201087,indo1319,kia_Latn,40035,atla1278 96 | fao_Latn,2365271,indo1319,mau_Latn,199134,otom1299,crh_Cyrl,39985,turk1311 
97 | uzn_Cyrl,2293672,turk1311,scn_Latn,199068,indo1319,crh_Latn,39896,turk1311 98 | smo_Latn,2290439,aust1307,tyv_Cyrl,198649,turk1311,enm_Latn,39809,indo1319 99 | bak_Cyrl,2264196,turk1311,ina_Latn,197315,arti1236,sat_Olck,39614,aust1305 100 | ilo_Latn,2106531,aust1307,btx_Latn,193701,aust1307,mad_Latn,38993,aust1307 101 | tso_Latn,2100708,atla1278,nch_Latn,193129,utoa1244,cac_Latn,38812,maya1287 102 | mri_Latn,2046850,aust1307,ncj_Latn,192962,utoa1244,hnj_Latn,38611,hmon1336 103 | hmn_Latn,1903898,pau_Latn,190529,aust1307,ksh_Latn,38130,indo1319 104 | asm_Beng,1882353,indo1319,toj_Latn,189651,maya1287,ikk_Latn,38071,atla1278 105 | hil_Latn,1798875,aust1307,pcm_Latn,187594,indo1319,sba_Latn,38040,cent2225 106 | nso_Latn,1619354,atla1278,dyu_Latn,186367,mand1469,zom_Latn,37013,sino1245 107 | ibo_Latn,1543820,atla1278,kss_Latn,185868,atla1278,bqc_Latn,36881,mand1469 108 | kin_Latn,1521612,atla1278,afb_Arab,183694,afro1255,bim_Latn,36835,atla1278 109 | hye_Armn,1463123,indo1319,urh_Latn,182214,atla1278,mdy_Ethi,36370,gong1255 110 | oci_Latn,1449128,indo1319,quc_Latn,181559,maya1287,bts_Latn,36216,aust1307 111 | lin_Latn,1408460,atla1278,new_Deva,181427,sino1245,gya_Latn,35902,atla1278 112 | tpi_Latn,1401844,indo1319,yao_Latn,179965,atla1278,ajg_Latn,35631,atla1278 113 | twi_Latn,1400979,atla1278,ngl_Latn,178498,atla1278,agw_Latn,35585,aust1307 114 | kir_Cyrl,1397566,turk1311,nyu_Latn,177483,atla1278,kom_Cyrl,35249,ural1272 115 | pap_Latn,1360138,indo1319,kab_Latn,176015,afro1255,knv_Latn,35196 116 | nep_Deva,1317291,indo1319,tuk_Cyrl,175769,turk1311,giz_Latn,35040,afro1255 117 | azj_Latn,1315834,turk1311,xmf_Geor,174994,kart1248,hui_Latn,34926,nucl1709 118 | bcl_Latn,1284493,aust1307,ndc_Latn,174305,atla1278,kpg_Latn,34900,aust1307 119 | xho_Latn,1262364,atla1278,san_Deva,165616,indo1319,zea_Latn,34426,indo1319 120 | cym_Latn,1244783,indo1319,nba_Latn,163485,atla1278,aoj_Latn,34349,nucl1708 121 | gaa_Latn,1222307,atla1278,bpy_Beng,162838,indo1319,csy_Latn,34126,sino1245 122 | ton_Latn,1216118,aust1307,ncx_Latn,162558,utoa1244,azb_Arab,33758,turk1311 123 | tah_Latn,1190747,aust1307,qug_Latn,162500,quec1387,csb_Latn,33743,indo1319 124 | lat_Latn,1179913,indo1319,rmn_Latn,162069,indo1319,tpm_Latn,33517,atla1278 125 | srn_Latn,1172349,indo1319,cjk_Latn,160645,atla1278,quw_Latn,33449,quec1387 126 | ewe_Latn,1161605,atla1278,arb_Arab,159884,afro1255,rmy_Cyrl,33351,indo1319 127 | bem_Latn,1111969,atla1278,kea_Latn,158047,indo1319,ixl_Latn,33289,maya1287 128 | efi_Latn,1082621,atla1278,mck_Latn,157521,atla1278,mbb_Latn,33240,aust1307 129 | bis_Latn,1070170,indo1319,arn_Latn,155882,arau1255,pfl_Latn,33148,indo1319 130 | orm_Latn,1067699,pdt_Latn,155485,indo1319,pcd_Latn,32867,indo1319 131 | haw_Latn,1062491,aust1307,her_Latn,154827,atla1278,tlh_Latn,32863,arti1236 132 | hmo_Latn,1033636,pidg1258,gla_Latn,152563,indo1319,suz_Deva,32811,sino1245 133 | kat_Geor,1004297,kart1248,kmr_Cyrl,151728,indo1319,gcr_Latn,32676,indo1319 134 | pag_Latn,983637,aust1307,mwl_Latn,150054,indo1319,jbo_Latn,32619,arti1236 135 | loz_Latn,964418,atla1278,nav_Latn,147702,atha1245,tbz_Latn,32264,atla1278 136 | fry_Latn,957422,indo1319,ksw_Mymr,147674,sino1245,bam_Latn,32150,mand1469 137 | mya_Mymr,945180,sino1245,mxv_Latn,147591,otom1299,prk_Latn,32085,aust1305 138 | nds_Latn,944715,indo1319,hif_Latn,147261,indo1319,jam_Latn,32048,indo1319 139 | run_Latn,943828,atla1278,wol_Latn,146992,atla1278,twx_Latn,32028,atla1278 140 | pnb_Arab,899895,indo1319,sme_Latn,146803,ural1272,nmf_Latn,31997,sino1245 141 | 
rar_Latn,894515,aust1307,gom_Latn,143937,indo1319,caq_Latn,31903,aust1305 142 | fij_Latn,887134,aust1307,bum_Latn,141673,atla1278,rop_Latn,31889,indo1319 143 | wls_Latn,882167,aust1307,mgr_Latn,138953,atla1278,tca_Latn,31852,ticu1244 144 | ckb_Arab,874441,indo1319,ahk_Latn,135068,sino1245,yan_Latn,31775,misu1242 145 | ven_Latn,860249,atla1278,kur_Arab,134160,indo1319,xav_Latn,31765,nucl1710 146 | zsm_Latn,859947,aust1307,bas_Latn,133436,atla1278,bih_Deva,31658 147 | chv_Cyrl,859863,turk1311,bin_Latn,133256,atla1278,cuk_Latn,31612,chib1249 148 | lua_Latn,854359,atla1278,tsz_Latn,133251,tara1323,kjb_Latn,31471,maya1287 149 | que_Latn,838486,sid_Latn,130406,afro1255,hne_Deva,31465,indo1319 150 | sag_Latn,771048,atla1278,diq_Latn,128908,indo1319,wbm_Latn,31394,aust1305 151 | guw_Latn,767918,atla1278,srd_Latn,127064,zlm_Latn,31345,aust1307 152 | bre_Latn,748954,indo1319,tcf_Latn,126050,otom1299,tui_Latn,31161,atla1278 153 | toi_Latn,745385,atla1278,bzj_Latn,124958,indo1319,ifb_Latn,30980,aust1307 154 | pus_Arab,731992,indo1319,udm_Cyrl,121705,ural1272,izz_Latn,30894,atla1278 155 | che_Cyrl,728201,nakh1245,cce_Latn,120636,atla1278,rug_Latn,30857,aust1307 156 | pis_Latn,714783,indo1319,meu_Latn,120273,aust1307,aka_Latn,30704,atla1278 157 | kon_Latn,685194,chw_Latn,119751,atla1278,pxm_Latn,30698,book1242 158 | oss_Cyrl,683517,indo1319,cbk_Latn,118789,indo1319,kmm_Latn,30671,sino1245 159 | hyw_Armn,679819,indo1319,ibg_Latn,118733,aust1307,mcn_Latn,30666,afro1255 160 | iso_Latn,658789,atla1278,bhw_Latn,117381,aust1307,ifa_Latn,30621,aust1307 161 | nan_Latn,656389,sino1245,ngu_Latn,116851,utoa1244,dln_Latn,30620,sino1245 162 | lub_Latn,654390,atla1278,nyy_Latn,115914,atla1278,ext_Latn,30605,indo1319 163 | lim_Latn,652078,indo1319,szl_Latn,112496,indo1319,ksd_Latn,30550,aust1307 164 | tuk_Latn,649411,turk1311,ish_Latn,111814,atla1278,mzh_Latn,30517,mata1289 165 | tir_Ethi,649117,afro1255,naq_Latn,109747,khoe1240,llb_Latn,30480,atla1278 166 | tgk_Latn,636541,indo1319,toh_Latn,107583,atla1278,hra_Latn,30472,sino1245 167 | yua_Latn,610052,maya1287,ttj_Latn,106925,atla1278,mwm_Latn,30432,cent2225 168 | min_Latn,609065,aust1307,nse_Latn,105189,atla1278,krc_Cyrl,30353,turk1311 169 | lue_Latn,599429,atla1278,hsb_Latn,104802,indo1319,tuc_Latn,30349,aust1307 170 | khm_Khmr,590429,aust1305,ami_Latn,104559,aust1307,mrw_Latn,30304,aust1307 171 | tum_Latn,589857,atla1278,alz_Latn,104392,nilo1247,pls_Latn,30136,otom1299 172 | tll_Latn,586530,atla1278,apc_Arab,102392,afro1255,rap_Latn,30102,aust1307 173 | ekk_Latn,582595,ural1272,vls_Latn,101900,indo1319,fur_Latn,30052,indo1319 174 | lug_Latn,566948,atla1278,mhr_Cyrl,100474,ural1272,kaa_Latn,30031,turk1311 175 | niu_Latn,566715,aust1307,djk_Latn,99234,indo1319,prs_Arab,26823,indo1319 176 | tzo_Latn,540262,maya1287,wes_Latn,98492,indo1319,san_Latn,25742,indo1319 177 | mah_Latn,534614,aust1307,gkn_Latn,97041,atla1278,som_Arab,14199,afro1255 178 | tvl_Latn,521556,aust1307,grc_Grek,96986,indo1319,uig_Latn,9637,turk1311 179 | jav_Latn,516833,aust1307,hbo_Hebr,96484,afro1255,hau_Arab,9593,afro1255 -------------------------------------------------------------------------------- /misc-code/iso_list.json: -------------------------------------------------------------------------------- 1 | [ 2 | "ace_Arab", "ace_Latn", "acm_Arab", "acq_Arab", "aeb_Arab", "afr_Latn", "ajp_Arab", "aka_Latn", "amh_Ethi", 3 | "apc_Arab", "arb_Arab", "arb_Latn", "ars_Arab", "ary_Arab", "arz_Arab", "asm_Beng", "ast_Latn", "awa_Deva", 4 | "ayr_Latn", "azb_Arab", "azj_Latn", "bak_Cyrl", "bam_Latn", 
"ban_Latn", "bel_Cyrl", "bem_Latn", "ben_Beng", 5 | "bho_Deva", "bjn_Arab", "bjn_Latn", "bod_Tibt", "bos_Latn", "bug_Latn", "bul_Cyrl", "cat_Latn", "ceb_Latn", 6 | "ces_Latn", "cjk_Latn", "ckb_Arab", "crh_Latn", "cym_Latn", "dan_Latn", "deu_Latn", "dik_Latn", "dyu_Latn", 7 | "dzo_Tibt", "ell_Grek", "eng_Latn", "epo_Latn", "est_Latn", "eus_Latn", "ewe_Latn", "fao_Latn", "fij_Latn", 8 | "fin_Latn", "fon_Latn", "fra_Latn", "fur_Latn", "fuv_Latn", "gla_Latn", "gle_Latn", "glg_Latn", "grn_Latn", 9 | "guj_Gujr", "hat_Latn", "hau_Latn", "heb_Hebr", "hin_Deva", "hne_Deva", "hrv_Latn", "hun_Latn", "hye_Armn", 10 | "ibo_Latn", "ilo_Latn", "ind_Latn", "isl_Latn", "ita_Latn", "jav_Latn", "jpn_Jpan", "kab_Latn", "kac_Latn", 11 | "kam_Latn", "kan_Knda", "kas_Arab", "kas_Deva", "kat_Geor", "knc_Arab", "knc_Latn", "kaz_Cyrl", "kbp_Latn", 12 | "kea_Latn", "khm_Khmr", "kik_Latn", "kin_Latn", "kir_Cyrl", "kmb_Latn", "kmr_Latn", "kon_Latn", "kor_Hang", 13 | "lao_Laoo", "lij_Latn", "lim_Latn", "lin_Latn", "lit_Latn", "lmo_Latn", "ltg_Latn", "ltz_Latn", "lua_Latn", 14 | "lug_Latn", "luo_Latn", "lus_Latn", "lvs_Latn", "mag_Deva", "mai_Deva", "mal_Mlym", "mar_Deva", "min_Arab", 15 | "min_Latn", "mkd_Cyrl", "plt_Latn", "mlt_Latn", "mni_Beng", "khk_Cyrl", "mos_Latn", "mri_Latn", "mya_Mymr", 16 | "nld_Latn", "nno_Latn", "nob_Latn", "npi_Deva", "nso_Latn", "nus_Latn", "nya_Latn", "oci_Latn", "gaz_Latn", 17 | "ory_Orya", "pag_Latn", "pan_Guru", "pap_Latn", "pes_Arab", "pol_Latn", "por_Latn", "prs_Arab", "pbt_Arab", 18 | "quy_Latn", "ron_Latn", "run_Latn", "rus_Cyrl", "sag_Latn", "san_Deva", "sat_Olck", "scn_Latn", "shn_Mymr", 19 | "sin_Sinh", "slk_Latn", "slv_Latn", "smo_Latn", "sna_Latn", "snd_Arab", "som_Latn", "sot_Latn", "spa_Latn", 20 | "als_Latn", "srd_Latn", "srp_Cyrl", "ssw_Latn", "sun_Latn", "swe_Latn", "swh_Latn", "szl_Latn", "tam_Taml", 21 | "tat_Cyrl", "tel_Telu", "tgk_Cyrl", "tgl_Latn", "tha_Thai", "tir_Ethi", "taq_Latn", "taq_Tfng", "tpi_Latn", 22 | "tsn_Latn", "tso_Latn", "tuk_Latn", "tum_Latn", "tur_Latn", "twi_Latn", "tzm_Tfng", "uig_Arab", "ukr_Cyrl", 23 | "umb_Latn", "urd_Arab", "uzn_Latn", "vec_Latn", "vie_Latn", "war_Latn", "wol_Latn", "xho_Latn", "ydd_Hebr", 24 | "yor_Latn", "yue_Hant", "zho_Hans", "zho_Hant", "zsm_Latn", "zul_Latn" 25 | ] 26 | -------------------------------------------------------------------------------- /misc-code/keeplist.txt: -------------------------------------------------------------------------------- 1 | facebook.com 2 | x.com 3 | twitter.com 4 | -------------------------------------------------------------------------------- /misc-code/madlad_alpha_3_code_extraction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 43, 6 | "id": "f329d77a-4778-4975-ab18-a8fa702d235d", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pycountry\n", 11 | "\n", 12 | "def iso_two_to_three_conversion(code):\n", 13 | " if len(code) == 2:\n", 14 | " lang = pycountry.languages.get(alpha_2=code)\n", 15 | " if lang:\n", 16 | " return lang.alpha_3\n", 17 | " elif len(code) == 3:\n", 18 | " lang = pycountry.languages.get(alpha_3=code)\n", 19 | " if lang:\n", 20 | " return lang.alpha_3\n", 21 | " return None\n", 22 | "\n", 23 | "def extract_iso_code_from_bcp(bcp_identifier):\n", 24 | " # Check if there's a hyphen in the identifier\n", 25 | " if '_' in bcp_identifier:\n", 26 | " language, _ = bcp_identifier.split('_', 1)\n", 27 | " if 2 <= len(language) <= 3:\n", 28 | " 
return iso_two_to_three_conversion(language)\n", 29 | " else:\n", 30 | " # Handle non-ISO codes\n", 31 | " raise ValueError(f\"Invalid ISO code: {bcp_identifier}\")\n", 32 | " else:\n", 33 | " # If no hyphen, assume the input is either a two-letter or three-letter code\n", 34 | " if 2 <= len(bcp_identifier) <= 3:\n", 35 | " return iso_two_to_three_conversion(bcp_identifier)\n", 36 | " else:\n", 37 | " return language\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 44, 43 | "id": "3606d044-3eac-4bb0-912c-b73243c8e398", 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "import csv\n", 48 | "import json\n", 49 | "import numpy as np\n", 50 | "\n", 51 | "def process_tsv_to_json(tsv_file, output_json):\n", 52 | " result_list = []\n", 53 | "\n", 54 | " # Open and process the TSV file\n", 55 | " with open(tsv_file, 'r', encoding='utf-8') as file:\n", 56 | " reader = csv.DictReader(file, delimiter='\\t')\n", 57 | " \n", 58 | " # Skip the header and iterate through the rows\n", 59 | " for row in reader:\n", 60 | " try:\n", 61 | " # Apply the conversion function on the 'BCP-47' column\n", 62 | " iso_code = extract_iso_code_from_bcp(row['BCP-47'])\n", 63 | " result_list.append(iso_code)\n", 64 | " except Exception as e:\n", 65 | " # If there's an issue with conversion, you can decide to log it or continue\n", 66 | " result_list.append(row['BCP-47'])\n", 67 | "\n", 68 | " result_list = result_list[1:]\n", 69 | " # Write the result list to a JSON file\n", 70 | " with open(output_json, 'w', encoding='utf-8') as json_file:\n", 71 | " json.dump(result_list, json_file, ensure_ascii=False, indent=4)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 45, 77 | "id": "832fa182-6340-49f9-8180-8d025f7fd8d7", 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "process_tsv_to_json('madlad.tsv','madlad_aplha_3.json')" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "id": "718e7d6e-9951-481c-8630-07a4b65dfe36", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "id": "920af4e0-a094-4166-8602-9661981b4b7b", 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [] 99 | } 100 | ], 101 | "metadata": { 102 | "kernelspec": { 103 | "display_name": "Python 3", 104 | "language": "python", 105 | "name": "python3" 106 | }, 107 | "language_info": { 108 | "codemirror_mode": { 109 | "name": "ipython", 110 | "version": 3 111 | }, 112 | "file_extension": ".py", 113 | "mimetype": "text/x-python", 114 | "name": "python", 115 | "nbconvert_exporter": "python", 116 | "pygments_lexer": "ipython3", 117 | "version": "3.8.8" 118 | } 119 | }, 120 | "nbformat": 4, 121 | "nbformat_minor": 5 122 | } 123 | -------------------------------------------------------------------------------- /misc-code/seedcrawler.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import requests 4 | from bs4 import BeautifulSoup 5 | from urllib.parse import urljoin, urlparse 6 | import time 7 | import json 8 | from typing import List, Dict, Any 9 | from tqdm import tqdm # Import tqdm for progress bar 10 | from trafilatura import extract, fetch_url 11 | import fasttext 12 | import urllib3 13 | 14 | class SeedReader: 15 | def __init__(self, json_file_path: str): 16 | """ 17 | Initializes the SeedReader with the path to a JSON file. 18 | 19 | :param json_file_path: The path to the JSON file containing the data. 
20 | """ 21 | self.json_file_path = json_file_path 22 | self.data = self.read_json_file() 23 | 24 | def read_json_file(self) -> List[Dict[str, Any]]: 25 | """ 26 | Reads the JSON file and returns a list of dictionaries containing the data. 27 | 28 | :return: A list of dictionaries with keys: "snippet", "title", "link", "engines", 29 | "category", "predicted_lid", "lid_confidence". 30 | """ 31 | try: 32 | with open(self.json_file_path, 'r', encoding='utf-8') as file: 33 | data = json.load(file) 34 | # Validate that each item in the list is a dictionary with required keys 35 | for item in data: 36 | if not all(key in item for key in ["snippet", "title", "link", "engines", 37 | "category", "predicted_lid", "lid_confidence"]): 38 | raise ValueError("Missing one or more required keys in the JSON data") 39 | return data 40 | except FileNotFoundError: 41 | print(f"File not found: {self.json_file_path}") 42 | return [] 43 | except json.JSONDecodeError: 44 | print(f"Error decoding JSON from file: {self.json_file_path}") 45 | return [] 46 | 47 | def get_data(self) -> List[Dict[str, Any]]: 48 | """ 49 | Returns the list of dictionaries containing the data. 50 | 51 | :return: List of data entries as dictionaries. 52 | """ 53 | return self.data 54 | 55 | def get_entry_by_index(self, index: int) -> Dict[str, Any]: 56 | """ 57 | Returns a specific entry by its index in the data list. 58 | 59 | :param index: The index of the entry to retrieve. 60 | :return: A dictionary representing the entry at the specified index. 61 | :raises IndexError: If the index is out of range. 62 | """ 63 | try: 64 | return self.data[index] 65 | except IndexError: 66 | print(f"Index out of range: {index}") 67 | return {} 68 | 69 | def filter_by_key_value(self, key: str, value: Any) -> List[Dict[str, Any]]: 70 | """ 71 | Filters the data based on a specified key-value pair. 72 | 73 | :param key: The key to filter by. 74 | :param value: The value to match. 75 | :return: A list of dictionaries where the specified key has the given value. 76 | """ 77 | if not self.data: 78 | print("No data available for filtering.") 79 | return [] 80 | 81 | filtered_data = [entry for entry in self.data if entry.get(key) == value] 82 | 83 | if not filtered_data: 84 | print(f"No entries found with {key} = {value}") 85 | 86 | return filtered_data 87 | 88 | 89 | class SeedCrawler: 90 | def __init__(self, seed_url, max_pages=100): 91 | """ 92 | Initializes the SeedCrawler with a seed URL and maximum pages to crawl. 93 | 94 | :param seed_url: The URL to start crawling from. 95 | :param max_pages: The maximum number of pages to crawl. 96 | """ 97 | self.seed_url = seed_url 98 | self.max_pages = max_pages 99 | self.domain = urlparse(seed_url).netloc # Extract domain from seed URL 100 | self.visited = set() 101 | self.to_visit = [seed_url] 102 | self.all_links = set() 103 | 104 | def get_links(self, url): 105 | """ 106 | Fetches all links from a webpage belonging to the specified domain. 107 | 108 | :param url: The URL of the webpage to fetch links from. 109 | :return: A set of links belonging to the specified domain. 
110 | """ 111 | try: 112 | response = requests.get(url, timeout=10) 113 | response.raise_for_status() # Raise an error for bad responses 114 | except (requests.RequestException, requests.HTTPError) as e: 115 | print(f"Error fetching {url}: {e}") 116 | return set() 117 | 118 | soup = BeautifulSoup(response.text, 'html.parser') 119 | links = set() 120 | 121 | # Find all anchor tags with href attributes 122 | for anchor in soup.find_all('a', href=True): 123 | # Resolve relative URLs 124 | link = urljoin(url, anchor['href']) 125 | # Parse the URL to check its domain 126 | parsed_link = urlparse(link) 127 | 128 | # Check if the link belongs to the specified domain 129 | if parsed_link.netloc.endswith(self.domain): 130 | links.add(link) 131 | 132 | return links 133 | 134 | def crawl_website(self): 135 | """ 136 | Crawls the entire website starting from the seed URL. 137 | 138 | :return: A set of all found links belonging to the specified domain. 139 | """ 140 | print(f"Crawling links from: {self.seed_url}") 141 | while self.to_visit and len(self.visited) < self.max_pages: 142 | current_url = self.to_visit.pop(0) 143 | 144 | if current_url in self.visited: 145 | continue 146 | 147 | #print(f"Crawling: {current_url}") 148 | print("...") 149 | links = self.get_links(current_url) 150 | self.all_links.update(links) 151 | 152 | # Add new links to the to_visit list 153 | for link in links: 154 | if link not in self.visited and link not in self.to_visit: 155 | self.to_visit.append(link) 156 | 157 | self.visited.add(current_url) 158 | time.sleep(1) # Be polite and wait a bit before next request 159 | 160 | print(f"Finished crawling links from: {self.seed_url}") 161 | return self.all_links 162 | 163 | 164 | class LanguageDetector: 165 | def __init__(self, model): 166 | """ 167 | Initialize the LanguageDetector class with a FastText model. 168 | 169 | Args: 170 | model: The loaded FastText model. 171 | """ 172 | self.model = model # Store the model reference instead of loading again. 173 | 174 | @staticmethod 175 | def extract_language_code(input_str): 176 | """ 177 | Extract the language code from a FastText prediction string. 178 | 179 | Args: 180 | input_str (str): The input prediction string from FastText. 181 | 182 | Returns: 183 | str or None: Extracted language code or None if not found. 184 | """ 185 | pattern = r'__(label)__([a-zA-Z]+_[a-zA-Z]+)' 186 | match = re.search(pattern, input_str) 187 | if match: 188 | return match.group(2) 189 | else: 190 | return None 191 | 192 | @staticmethod 193 | def trafilatura_scrape(url): 194 | """ 195 | Scrape text content from a given URL using Trafilatura. 196 | 197 | Args: 198 | url (str): The URL to scrape. 199 | 200 | Returns: 201 | str: Extracted text content from the URL. 202 | """ 203 | document = fetch_url(url) 204 | text = extract(document) 205 | return text 206 | 207 | def language_predict(self, scraped_text): 208 | """ 209 | Predict the language of the given text using the FastText model. 210 | 211 | Args: 212 | scraped_text (str): The text content to predict language for. 213 | 214 | Returns: 215 | tuple: (language label, confidence score) or (None, None) if text is empty. 
216 | """ 217 | if scraped_text is not None: 218 | lid_label_script = self.model.predict(scraped_text.replace('\n', '')) 219 | lid_label = self.extract_language_code(lid_label_script[0][0]) 220 | lid_confidence = lid_label_script[1][0] 221 | return lid_label, lid_confidence 222 | return None, None 223 | 224 | def filter_seeds(self, links, input_label, confidence): 225 | """ 226 | Filter URLs based on language prediction and confidence score. 227 | 228 | Args: 229 | links (list): List of URLs to filter. 230 | input_label (str): The desired language label. 231 | confidence (float): The minimum confidence score required. 232 | 233 | Returns: 234 | list: Filtered list of URLs matching the language criteria. 235 | """ 236 | new_list = [] 237 | # Use tqdm to add a progress bar to the loop 238 | for link in tqdm(links, desc="Filtering scraped links", unit="link"): 239 | scraped_text = self.trafilatura_scrape(link) 240 | lid_label, lid_confidence = self.language_predict(scraped_text) 241 | if (lid_label == input_label) and (lid_confidence >= confidence): 242 | new_list.append({"link": link, "lid_label": lid_label, "lid_confidence": lid_confidence}) 243 | return new_list 244 | 245 | 246 | # Example usage 247 | if __name__ == "__main__": 248 | input_label = "urd_Latn" # Replace with your JSON file name 249 | json_file_path = input_label + ".json" 250 | input_confidence = 0.8 251 | model_path = "model_v3.bin" 252 | 253 | # Load the model once 254 | model = fasttext.load_model(model_path) 255 | 256 | reader = SeedReader(json_file_path) 257 | 258 | all_data = reader.get_data() 259 | 260 | final_list = [] 261 | lang_detector = LanguageDetector(model) # Initialize once and reuse 262 | 263 | for entry in all_data: 264 | # if confidence level threshold condition met. 265 | # initialize a big list of websites and put the seed in it. 
266 | 267 | if entry['lid_confidence'] > input_confidence: 268 | 269 | seed_url = entry['link'] 270 | crawler = SeedCrawler(seed_url, max_pages=100) 271 | all_website_links = crawler.crawl_website() 272 | 273 | # Use the same LanguageDetector instance 274 | filtered_links = lang_detector.filter_seeds(all_website_links, input_label, input_confidence) 275 | final_list.extend(filtered_links) 276 | print(final_list) 277 | 278 | print(final_list) -------------------------------------------------------------------------------- /misc-code/trafilatura_filter: -------------------------------------------------------------------------------- 1 | #IMPORTS 2 | 3 | import os 4 | import re 5 | import json 6 | from trafilatura import extract 7 | from trafilatura import fetch_url 8 | import fasttext 9 | from huggingface_hub import hf_hub_download 10 | import urllib3 11 | 12 | #FUNCTIONS 13 | 14 | def remove_entries_with_domains(json_data, domain_file): 15 | with open(domain_file, 'r') as f: 16 | domains = [line.strip() for line in f.readlines()] 17 | 18 | for iso_key, entries in json_data.items(): 19 | json_data[iso_key] = [entry for entry in entries if 'link' in entry and not any(domain in entry['link'] for domain in domains)] 20 | 21 | return json_data 22 | 23 | 24 | def extract_language_code(input_str): 25 | pattern = r'__(label)__([a-zA-Z]+_[a-zA-Z]+)' 26 | match = re.search(pattern, input_str) 27 | if match: 28 | return match.group(2) 29 | else: 30 | return None 31 | 32 | def trafilatura_scrape(url): 33 | document = fetch_url(url) 34 | text = extract(document) 35 | 36 | return text 37 | 38 | def scraped_lang_filter(filtered_data, model, iso_list): 39 | lang_filtered_data = {} 40 | for key in filtered_data: 41 | entry = filtered_data[key] 42 | new_entry = [] 43 | for item in entry: 44 | link = item['link'] 45 | scraped_text = trafilatura_scrape(link) 46 | if scraped_text is not None: 47 | lid_label_script = model.predict(scraped_text.replace('\n', '')) 48 | lid_label = extract_language_code(lid_label_script[0][0]) 49 | #print("lid_label:", lid_label) # Add this line for debugging 50 | if lid_label not in iso_list: 51 | new_entry.append(item) 52 | if key not in lang_filtered_data: 53 | lang_filtered_data[key] = [] 54 | lang_filtered_data[key].append(item) 55 | #print("new_entry:", new_entry) # Add this line for debugging 56 | # Save new_entry as JSON file 57 | if(len(new_entry)>0): 58 | filename = f"{key}.json" 59 | directory = "/content/drive/MyDrive/glotsparse/search_dump/lang_dump_individual" 60 | os.makedirs(directory, exist_ok=True) # Create directory if it doesn't exist 61 | filepath = os.path.join(directory, filename) 62 | with open(filepath, "w", encoding='utf-8') as json_file: 63 | json.dump(new_entry, json_file, ensure_ascii = False) 64 | print("lang_filtered_data:", lang_filtered_data) # Add this line for debugging 65 | return lang_filtered_data 66 | 67 | #Execution 68 | 69 | model_path = hf_hub_download(repo_id="cis-lmu/glotlid", filename="model.bin") 70 | model = fasttext.load_model(model_path) 71 | 72 | filename = '100-200.json' 73 | ### Load Json dump of web search. 
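### The dump is assumed to be a dict keyed by ISO code, where each value is a list of
### search-result dicts that contain at least a 'link' field, e.g. (illustrative only):
### {"kik_Latn": [{"link": "https://example.org/page", "title": "...", "snippet": "..."}, ...]}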
74 | with open(filename, 'r') as json_file: 75 | data = json.load(json_file) 76 | ### ISO code list of top 200 languages 77 | with open('iso_list.json', 'r') as f: 78 | iso_list = json.load(f) 79 | 80 | ### Filter searches using filterlist.txt 81 | filtered_data = remove_entries_with_domains(data, 'filterlist.txt') 82 | #scraped_lang_filtered_data = scraped_lang_filter(filtered_data, model, iso_list) 83 | urllib3.disable_warnings() 84 | scraped_lang_filtered_data = scraped_lang_filter(filtered_data, model, iso_list) 85 | filepath = f"{filename}_filtered.json" 86 | 87 | # Save dictionary as JSON file 88 | with open(filepath, "w", encoding='utf-8') as json_file: 89 | json.dump(scraped_lang_filtered_data, json_file, ensure_ascii = False) 90 | -------------------------------------------------------------------------------- /modeling/create_language_modeling_files.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | import json 4 | from typing import List, Dict, Any 5 | from tqdm import tqdm # Add tqdm for progress bars 6 | 7 | 8 | def load_config(config_file: str) -> Dict[str, Any]: 9 | """Load configuration from a YAML file.""" 10 | with open(config_file, 'r') as file: 11 | return yaml.safe_load(file) 12 | 13 | 14 | def load_crawled_output(code: str, data_dir: str) -> List[Dict[str, Any]]: 15 | """Load crawled output JSON for a given code.""" 16 | file_path = os.path.join(data_dir, f"{code}_crawled_output.json") 17 | with open(file_path, 'r', encoding='utf-8') as file: 18 | return json.load(file) 19 | 20 | 21 | def load_robots_filtered_output(code: str, data_dir: str) -> Dict[str, Any]: 22 | """Load robots-filtered JSON for a given code.""" 23 | file_path = os.path.join(data_dir, f"{code}.json") 24 | with open(file_path, 'r', encoding='utf-8') as file: 25 | return json.load(file) 26 | 27 | 28 | def get_full_list(robots_filtered_data: Dict[str, Any]) -> List[str]: 29 | """Extract the full list of allowed links.""" 30 | full_list = [] 31 | for site in robots_filtered_data['Sites']: 32 | full_list.extend(site['Links']) 33 | return full_list 34 | 35 | 36 | def generate_text_file( 37 | code: str, 38 | crawled_output_dir: str, 39 | robots_filtered_output_dir: str, 40 | text_files_output_dir: str 41 | ): 42 | """Generate a text file containing filtered and concatenated data.""" 43 | print(f"Processing code: {code}") # Indicate which code is being processed 44 | 45 | # Load the required data 46 | crawled_data = load_crawled_output(code, crawled_output_dir) 47 | robots_filtered_data = load_robots_filtered_output(code, robots_filtered_output_dir) 48 | full_list = set(get_full_list(robots_filtered_data)) # Use a set for O(1) lookups 49 | 50 | # Collect matching text with tqdm for progress indication 51 | text_parts = [] 52 | for item in tqdm(crawled_data, desc=f"Filtering {code}", unit="item"): 53 | if item['link'] in full_list: 54 | text_parts.append(item['text']) 55 | 56 | # Ensure the output directory exists 57 | os.makedirs(text_files_output_dir, exist_ok=True) 58 | 59 | # Save the concatenated string to a file 60 | output_file_path = os.path.join(text_files_output_dir, f"{code}.txt") 61 | with open(output_file_path, 'w', encoding='utf-8') as file: 62 | file.write('\n'.join(text_parts)) 63 | 64 | 65 | def main(): 66 | """Main function to execute the script.""" 67 | # Load configuration 68 | config = load_config('config.yaml') 69 | robots_filtered_output_dir = config['output']['cleaned_directory'] 70 | crawled_output_dir = 
config['output']['directory'] 71 | text_files_output_dir = config['output']['text_files_directory'] 72 | code = config['language_detector']['desired_language'] 73 | 74 | # Determine input labels 75 | if config['batch_processing']['enabled']: 76 | input_labels = config['batch_processing']['input_labels'] 77 | else: 78 | input_labels = [code] 79 | 80 | # Process each label with tqdm for progress tracking 81 | for label in tqdm(input_labels, desc="Processing labels", unit="label"): 82 | generate_text_file(label, crawled_output_dir, robots_filtered_output_dir, text_files_output_dir) 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | -------------------------------------------------------------------------------- /output/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cisnlp/GlotWeb/921fbf2f5e307f1f3e0927d078fa5afc95b74673/output/.keep -------------------------------------------------------------------------------- /output/crawled/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cisnlp/GlotWeb/921fbf2f5e307f1f3e0927d078fa5afc95b74673/output/crawled/.keep -------------------------------------------------------------------------------- /pipeline/config.yaml: -------------------------------------------------------------------------------- 1 | # YAML configuration for SeedReader, SeedCrawler, and LanguageDetector 2 | 3 | # SeedReader configuration 4 | seed_reader: 5 | input_directory: "crawler_input" # Directory containing the input JSON files 6 | 7 | # SeedCrawler configuration 8 | seed_crawler: 9 | max_pages: 100 # Maximum number of pages to crawl 10 | max_workers: 10 # Maximum number of threads for crawling 11 | crawl_delay: 0.1 # Delay between each crawling request in seconds 12 | max_time: 600 # Maximum crawl time in seconds 13 | to_visit_growth_factor: 60 14 | 15 | # LanguageDetector configuration 16 | language_detector: 17 | model_path: "model_v3.bin" # Path to the FastText model file 18 | desired_language: "bpy_Beng" # Target language code 19 | minimum_confidence: 0.7 # Minimum confidence score for language detection 20 | save_text: True 21 | 22 | # Output configuration 23 | output: 24 | directory: "output/crawled/" # Directory to save output JSON files 25 | output_file_name: "{language}_crawled_output.json" # Output file pattern 26 | formated_directory: "output/formatted/" 27 | formated_file_name: "{language}.json" 28 | cleaned_directory: "output/robots_filtered" 29 | text_files_directory: "output/text_files" 30 | 31 | # Logging configuration 32 | logging: 33 | level: "INFO" # Logging level: DEBUG, INFO, WARNING, ERROR 34 | file_path: "logs/application.log" # Path to the log file 35 | 36 | # Progress bar configuration 37 | progress_bar: 38 | enabled: True # Enable or disable tqdm progress bar 39 | 40 | # ThreadPoolExecutor configuration 41 | executor: 42 | max_workers_reader: 5 # Maximum number of threads for reading data 43 | 44 | # URL settings 45 | url_settings: 46 | max_url_length: 65536 # Maximum URL length allowed 47 | request_timeout: 10 # Timeout for HTTP requests in seconds 48 | 49 | domain_file: "metadata/filterlist.txt" 50 | 51 | batch_processing: 52 | enabled: True # Set to true to enable batch processing 53 | # List of languages to process 54 | input_labels: [ 55 | "abk_Cyrl", 56 | "abq_Cyrl", 57 | "abs_Latn", 58 | "ace_Latn", 59 | "ady_Cyrl", 60 | "ahk_Latn", 61 | "aii_Syrc", 62 | "ain_Latn", 63 | "ajz_Latn", 64 | 
"akb_Latn", 65 | "alt_Cyrl", 66 | "ami_Latn", 67 | "amu_Latn", 68 | "ann_Latn", 69 | "anp_Deva", 70 | "ast_Latn", 71 | "atb_Latn", 72 | "atj_Latn", 73 | "ava_Cyrl", 74 | "ban_Latn", 75 | "bbc_Latn", 76 | "bcc_Arab", 77 | "bcl_Latn", 78 | "bdq_Latn", 79 | "bew_Latn", 80 | "bho_Deva", 81 | "bim_Latn", 82 | "bin_Latn", 83 | "bis_Latn", 84 | "bjn_Latn", 85 | "blk_Mymr", 86 | "blw_Latn", 87 | "bpy_Beng", 88 | "bre_Latn", 89 | "brh_Arab", 90 | "bru_Latn", 91 | "brx_Deva", 92 | "bts_Latn", 93 | "btx_Latn", 94 | "bxr_Cyrl", 95 | "bzj_Latn", 96 | "cak_Latn", 97 | "cbk_Latn", 98 | "ccp_Latn", 99 | "ceb_Latn", 100 | "cfm_Latn", 101 | "cho_Latn", 102 | "chu_Cyrl", 103 | "chv_Cyrl", 104 | "ckm_Latn", 105 | "ckt_Cyrl", 106 | "cmo_Latn", 107 | "cnh_Latn", 108 | "cop_Copt", 109 | "cor_Latn", 110 | "cos_Latn", 111 | "crk_Cans", 112 | "crk_Latn", 113 | "crl_Cans", 114 | "crs_Latn", 115 | "csb_Latn", 116 | "csw_Latn", 117 | "csy_Latn", 118 | "ctd_Latn", 119 | "ctu_Latn", 120 | "dag_Latn", 121 | "dak_Latn", 122 | "dar_Cyrl", 123 | "dik_Latn", 124 | "diq_Latn", 125 | "dje_Latn", 126 | "djk_Latn", 127 | "djr_Latn", 128 | "dng_Cyrl", 129 | "doi_Deva", 130 | "dsb_Latn", 131 | "dtp_Latn", 132 | "dts_Latn", 133 | "dua_Latn", 134 | "dyi_Latn", 135 | "dyo_Latn", 136 | "efi_Latn", 137 | "esu_Latn", 138 | "eve_Cyrl", 139 | "ewe_Latn", 140 | "ext_Latn", 141 | "fij_Latn", 142 | "fkv_Latn", 143 | "fon_Latn", 144 | "frr_Latn", 145 | "fub_Latn", 146 | "fuf_Latn", 147 | "fur_Latn", 148 | "fuv_Latn", 149 | "gag_Latn", 150 | "gcf_Latn", 151 | "gil_Latn", 152 | "glk_Arab", 153 | "glv_Latn", 154 | "gor_Latn", 155 | "gos_Latn", 156 | "gug_Latn", 157 | "guw_Latn", 158 | "guz_Latn", 159 | "hac_Arab", 160 | "hil_Latn", 161 | "hmr_Latn", 162 | "hne_Deva", 163 | "hnj_Latn", 164 | "hsb_Latn", 165 | "hus_Latn", 166 | "iba_Latn", 167 | "ibg_Latn", 168 | "ike_Cans", 169 | "ikt_Latn", 170 | "ikw_Latn", 171 | "inh_Cyrl", 172 | "ium_Latn", 173 | "jra_Latn", 174 | "jvn_Latn", 175 | "kaa_Cyrl", 176 | "kab_Latn", 177 | "kac_Latn", 178 | "kbd_Cyrl", 179 | "kby_Latn", 180 | "kcg_Latn", 181 | "kck_Latn", 182 | "kea_Latn", 183 | "kek_Latn", 184 | "kik_Latn", 185 | "kjh_Cyrl", 186 | "kmb_Latn", 187 | "knc_Latn", 188 | "kog_Latn", 189 | "koi_Cyrl", 190 | "kpe_Latn", 191 | "kpv_Cyrl", 192 | "kqn_Latn", 193 | "krc_Cyrl", 194 | "kri_Latn", 195 | "krl_Latn", 196 | "ksh_Latn", 197 | "ksw_Mymr", 198 | "ktu_Latn", 199 | "ktz_Latn", 200 | "kua_Latn", 201 | "kum_Cyrl", 202 | "kwn_Latn", 203 | "kzj_Latn", 204 | "lbe_Cyrl", 205 | "lew_Latn", 206 | "lez_Cyrl", 207 | "lfn_Cyrl", 208 | "lfn_Latn", 209 | "lgg_Latn", 210 | "lia_Latn", 211 | "lif_Limb", 212 | "lij_Latn", 213 | "lis_Lisu", 214 | "liv_Latn", 215 | "ljp_Latn", 216 | "lki_Arab", 217 | "lld_Latn", 218 | "lmk_Latn", 219 | "lmo_Latn", 220 | "lsi_Latn", 221 | "ltg_Latn", 222 | "lug_Latn", 223 | "lun_Latn", 224 | "luo_Latn", 225 | "lwg_Latn", 226 | "maa_Latn", 227 | "mag_Deva", 228 | "mak_Latn", 229 | "mal_Latn", 230 | "mam_Latn", 231 | "mar_Latn", 232 | "mas_Latn", 233 | "mau_Latn", 234 | "maz_Latn", 235 | "mdf_Cyrl", 236 | "men_Latn", 237 | "mfe_Latn", 238 | "mfq_Latn", 239 | "mfy_Latn", 240 | "mhi_Latn", 241 | "mhw_Latn", 242 | "mhx_Latn", 243 | "mic_Latn", 244 | "min_Latn", 245 | "mip_Latn", 246 | "miq_Latn", 247 | "mjc_Latn", 248 | "mjw_Latn", 249 | "mkn_Latn", 250 | "mnb_Latn", 251 | "mni_Beng", 252 | "mni_Latn", 253 | "mni_Mtei", 254 | "mnk_Latn", 255 | "mns_Cyrl", 256 | "mnw_Mymr", 257 | "moc_Latn", 258 | "moh_Latn", 259 | "mph_Latn", 260 | "mqj_Latn", 261 | "mrw_Latn", 262 | "mui_Latn", 263 | 
"mup_Deva", 264 | "mvp_Latn", 265 | "mwl_Latn", 266 | "mxt_Latn", 267 | "myv_Cyrl", 268 | "mzn_Arab", 269 | "nah_Latn", 270 | "nan_Latn", 271 | "naq_Latn", 272 | "nbl_Latn", 273 | "nbu_Latn", 274 | "nch_Latn", 275 | "ncj_Latn", 276 | "ndo_Latn", 277 | "new_Deva", 278 | "nhi_Latn", 279 | "nhw_Latn", 280 | "nhx_Latn", 281 | "nia_Latn", 282 | "niu_Latn", 283 | "njn_Latn", 284 | "nmz_Latn", 285 | "nnb_Latn", 286 | "nnw_Latn", 287 | "nog_Cyrl", 288 | "non_Latn", 289 | "not_Latn", 290 | "nov_Latn", 291 | "npi_Latn", 292 | "npy_Latn", 293 | "nqo_Nkoo", 294 | "nso_Latn", 295 | "nsu_Latn", 296 | "nuj_Latn", 297 | "nus_Latn", 298 | "nyk_Latn", 299 | "nyn_Latn", 300 | "nzi_Latn", 301 | "nzm_Latn", 302 | "obo_Latn", 303 | "ojb_Latn", 304 | "olo_Latn", 305 | "omw_Latn", 306 | "orv_Cyrl", 307 | "ory_Latn", 308 | "oss_Cyrl", 309 | "ota_Arab", 310 | "otd_Latn", 311 | "ote_Latn", 312 | "otw_Latn", 313 | "pag_Latn", 314 | "pam_Latn", 315 | "pan_Latn", 316 | "pbb_Latn", 317 | "pcd_Latn", 318 | "pck_Latn", 319 | "pdt_Latn", 320 | "pfl_Latn", 321 | "pis_Latn", 322 | "pjt_Latn", 323 | "pls_Latn", 324 | "pma_Latn", 325 | "pmf_Latn", 326 | "pms_Latn", 327 | "pmx_Latn", 328 | "pnt_Grek", 329 | "poe_Latn", 330 | "pot_Latn", 331 | "ppk_Latn", 332 | "prg_Latn", 333 | "ptu_Latn", 334 | "pui_Latn", 335 | "pwn_Latn", 336 | "qub_Latn", 337 | "qug_Latn", 338 | "qup_Latn", 339 | "quy_Latn", 340 | "quz_Latn", 341 | "qvc_Latn", 342 | "qvw_Latn", 343 | "qxl_Latn", 344 | "qxo_Latn", 345 | "rad_Latn", 346 | "rar_Latn", 347 | "raw_Latn", 348 | "rcf_Latn", 349 | "rjs_Deva", 350 | "rmc_Latn", 351 | "rml_Latn", 352 | "rmn_Cyrl", 353 | "rmn_Latn", 354 | "rmy_Cyrl", 355 | "rmy_Latn", 356 | "rnl_Latn", 357 | "ron_Cyrl", 358 | "rop_Latn", 359 | "rtm_Latn", 360 | "rue_Cyrl", 361 | "rup_Latn", 362 | "sat_Latn", 363 | "sbd_Latn", 364 | "sdc_Latn", 365 | "ses_Latn", 366 | "sgc_Latn", 367 | "sgh_Cyrl", 368 | "sgs_Latn", 369 | "shi_Latn", 370 | "shk_Latn", 371 | "shn_Mymr", 372 | "shp_Latn", 373 | "sid_Latn", 374 | "skr_Arab", 375 | "sma_Latn", 376 | "smj_Latn", 377 | "smk_Latn", 378 | "sml_Latn", 379 | "smn_Latn", 380 | "snd_Deva", 381 | "sop_Latn", 382 | "spp_Latn", 383 | "srn_Latn", 384 | "srr_Latn", 385 | "ssg_Latn", 386 | "stq_Latn", 387 | "suk_Latn", 388 | "swg_Latn", 389 | "swp_Latn", 390 | "sxb_Latn", 391 | "syc_Syrc", 392 | "syl_Beng", 393 | "szl_Latn", 394 | "szy_Latn", 395 | "tab_Cyrl", 396 | "tah_Latn", 397 | "tam_Latn", 398 | "tar_Latn", 399 | "tat_Latn", 400 | "tay_Latn", 401 | "tcy_Knda", 402 | "tcz_Latn", 403 | "tel_Latn", 404 | "teo_Latn", 405 | "tgp_Latn", 406 | "thl_Deva", 407 | "tig_Ethi", 408 | "tkl_Latn", 409 | "tkr_Cyrl", 410 | "tlh_Latn", 411 | "tll_Latn", 412 | "tly_Latn", 413 | "tnr_Latn", 414 | "tob_Latn", 415 | "toi_Latn", 416 | "toj_Latn", 417 | "tok_Latn", 418 | "top_Latn", 419 | "trn_Latn", 420 | "trp_Latn", 421 | "trv_Latn", 422 | "tsg_Latn", 423 | "tuk_Arab", 424 | "tuk_Cyrl", 425 | "tyv_Cyrl", 426 | "tzh_Latn", 427 | "tzm_Tfng", 428 | "tzo_Latn", 429 | "udm_Cyrl", 430 | "uig_Cyrl", 431 | "uig_Latn", 432 | "und_Dsrt", 433 | "und_Gran", 434 | "und_Hung", 435 | "und_Newa", 436 | "und_Shaw", 437 | "und_Sylo", 438 | "uzs_Arab", 439 | "vep_Latn", 440 | "vmw_Latn", 441 | "vol_Latn", 442 | "wal_Latn", 443 | "war_Latn", 444 | "way_Latn", 445 | "wbm_Latn", 446 | "wbp_Latn", 447 | "wes_Latn", 448 | "wls_Latn", 449 | "wlx_Latn", 450 | "wsg_Telu", 451 | "xal_Cyrl", 452 | "xmf_Geor", 453 | "xmm_Latn", 454 | "xmv_Latn", 455 | "xtm_Latn", 456 | "yap_Latn", 457 | "yka_Latn", 458 | "yli_Latn", 459 | "yom_Latn", 460 | 
"yrk_Cyrl", 461 | "yrl_Latn", 462 | "yua_Latn", 463 | "yue_Hani", 464 | "zai_Latn", 465 | "zdj_Latn", 466 | "zea_Latn", 467 | "zom_Latn", 468 | "zpm_Latn", 469 | "zpo_Latn", 470 | "zpu_Latn", 471 | "zyb_Latn", 472 | "zyp_Latn" 473 | ] 474 | cooldown_between_languages: 60 # Seconds to wait between processing each language 475 | -------------------------------------------------------------------------------- /pipeline/filter_config.yaml: -------------------------------------------------------------------------------- 1 | model_path: "model_v3.bin" 2 | json_filename: "search_dump/results_0-10.json" 3 | iso_list_file: "metadata/iso_list.json" 4 | domain_file: "metadata/filterlist.txt" 5 | output_directory: "lang_dump_confidence" 6 | -------------------------------------------------------------------------------- /pipeline/language_filter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import json 4 | import fasttext 5 | from trafilatura import extract, fetch_url 6 | import urllib3 7 | from tqdm import tqdm 8 | import yaml 9 | 10 | class LanguageFilter: 11 | def __init__(self, config_file): 12 | self.load_config(config_file) 13 | self.model = self.load_model() 14 | 15 | def load_config(self, config_file): 16 | with open(config_file, 'r') as file: 17 | self.config = yaml.safe_load(file) 18 | 19 | def load_model(self): 20 | model_path = self.config['model_path'] # Use the local model path from the config 21 | return fasttext.load_model(model_path) 22 | 23 | def remove_entries_with_domains(self, json_data): 24 | with open(self.config['domain_file'], 'r') as f: 25 | domains = [line.strip() for line in f.readlines()] 26 | 27 | for iso_key, entries in json_data.items(): 28 | json_data[iso_key] = [entry for entry in entries if 'link' in entry and not any(domain in entry['link'] for domain in domains)] 29 | 30 | return json_data 31 | 32 | def extract_language_code(self, input_str): 33 | pattern = r'__(label)__([a-zA-Z]+_[a-zA-Z]+)' 34 | match = re.search(pattern, input_str) 35 | if match: 36 | return match.group(2) 37 | else: 38 | return None 39 | 40 | def trafilatura_scrape(self, url): 41 | document = fetch_url(url) 42 | text = extract(document) 43 | return text 44 | 45 | def scraped_lang_filter(self, filtered_data, iso_list): 46 | scraped_lang_list = [] 47 | output_directory = self.config['output_directory'] 48 | 49 | for key in tqdm(filtered_data, desc="Processing"): 50 | entry = filtered_data[key] 51 | for item in entry: 52 | link = item['link'] 53 | scraped_text = self.trafilatura_scrape(link) 54 | 55 | if scraped_text is not None: 56 | lid_label_script = self.model.predict(scraped_text.replace('\n', '')) 57 | lid_label = self.extract_language_code(lid_label_script[0][0]) 58 | lid_confidence = lid_label_script[1][0] 59 | item['predicted_lid'] = lid_label 60 | item['lid_confidence'] = lid_confidence 61 | 62 | if lid_label not in iso_list: 63 | output_file_path = os.path.join(output_directory, f'{lid_label}.json') 64 | if os.path.exists(output_file_path): 65 | with open(output_file_path, 'r') as output_file: 66 | existing_data = json.load(output_file) 67 | else: 68 | existing_data = [] 69 | 70 | scraped_lang_list.append(lid_label) 71 | existing_data.append(item) 72 | 73 | with open(output_file_path, 'w') as output_file: 74 | json.dump(existing_data, output_file, ensure_ascii=False, indent=4) 75 | 76 | print(scraped_lang_list) 77 | print(len(scraped_lang_list)) 78 | 79 | def run(self): 80 | # Load the JSON dump of web search 81 | 
with open(self.config['json_filename'], 'r') as json_file: 82 | data = json.load(json_file) 83 | 84 | # Load ISO code list 85 | with open(self.config['iso_list_file'], 'r') as f: 86 | iso_list = json.load(f) 87 | 88 | # Filter the search results using the domain list 89 | filtered_data = self.remove_entries_with_domains(data) 90 | 91 | # Perform the language filter and scraping 92 | self.scraped_lang_filter(filtered_data, iso_list) 93 | 94 | print("DONE ALL") 95 | 96 | if __name__ == "__main__": 97 | config_file = "pipeline/filter_config.yaml" 98 | filter_instance = LanguageFilter(config_file) 99 | filter_instance.run() 100 | -------------------------------------------------------------------------------- /pipeline/search_config.yaml: -------------------------------------------------------------------------------- 1 | searx_host: "http://127.0.0.1:8080" 2 | engines: 3 | - "bing" 4 | - "yahoo" 5 | - "qwant" 6 | - "duckduckgo" 7 | num_results: 50 8 | max_retries: 3 9 | retry_wait_time: 2 10 | output_file_prefix: "results" 11 | input_file: "metadata/randomsample.txt" 12 | start_index: 0 13 | end_index: 10 14 | output_directory: "search_dump" 15 | -------------------------------------------------------------------------------- /pipeline/search_service.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | import json 4 | import time 5 | from langchain_community.utilities import SearxSearchWrapper 6 | from requests.exceptions import RequestException 7 | from urllib3.exceptions import ProtocolError 8 | 9 | class SearchService: 10 | def __init__(self, config_file): 11 | self.config = self.load_config(config_file) 12 | self.search = SearxSearchWrapper(searx_host=self.config['searx_host']) 13 | 14 | def load_config(self, config_file): 15 | with open(config_file, 'r') as file: 16 | return yaml.safe_load(file) 17 | 18 | def search_and_save_results(self): 19 | start_index = self.config['start_index'] 20 | end_index = self.config['end_index'] 21 | file_path = self.config['input_file'] 22 | output_directory = self.config['output_directory'] 23 | 24 | if not os.path.exists(output_directory): 25 | os.makedirs(output_directory) 26 | 27 | with open(file_path, 'r') as file: 28 | results_dict = {} 29 | for i, line in enumerate(file): 30 | if start_index <= i < end_index: 31 | iso_code, sentence = line.strip().split('\t') 32 | try: 33 | results = self.retry_request( 34 | lambda: self.search.results( 35 | sentence, 36 | num_results=self.config['num_results'], 37 | engines=self.config['engines'] 38 | ) 39 | ) 40 | results_dict[iso_code] = results 41 | except RequestException as e: 42 | print(f"Error occurred: {e}") 43 | continue 44 | if i >= end_index: 45 | break 46 | 47 | output_file = os.path.join(output_directory, f"{self.config['output_file_prefix']}_{start_index}-{end_index}.json") 48 | with open(output_file, 'w') as json_file: 49 | json.dump(results_dict, json_file, indent=4) 50 | 51 | def retry_request(self, request_func): 52 | retries = 0 53 | max_retries = self.config['max_retries'] 54 | while retries < max_retries: 55 | try: 56 | return request_func() 57 | except (RequestException, ProtocolError) as e: 58 | print(f"Error occurred: {e}") 59 | retries += 1 60 | if retries < max_retries: 61 | print("Retrying...") 62 | time.sleep(self.config['retry_wait_time']) 63 | else: 64 | print("Max retries exceeded.") 65 | raise 66 | 67 | if __name__ == "__main__": 68 | config_file = "pipeline/search_config.yaml" 69 | search_service = 
SearchService(config_file) 70 | search_service.search_and_save_results() 71 | -------------------------------------------------------------------------------- /pipeline/seedcrawler_alpha.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import requests 4 | from bs4 import BeautifulSoup 5 | from urllib.parse import urljoin, urlparse 6 | import time 7 | import json 8 | from typing import List, Dict, Any 9 | from tqdm import tqdm 10 | from trafilatura import extract, fetch_url 11 | import fasttext 12 | import urllib3 13 | from concurrent.futures import ThreadPoolExecutor 14 | import logging 15 | import yaml 16 | 17 | def load_config(config_file: str) -> Dict[str, Any]: 18 | with open(config_file, 'r') as file: 19 | return yaml.safe_load(file) 20 | 21 | config = load_config('pipeline/config.yaml') 22 | 23 | log_dir = os.path.dirname(config['logging']['file_path']) 24 | os.makedirs(log_dir, exist_ok=True) 25 | 26 | logging.basicConfig( 27 | level=getattr(logging, config['logging']['level']), 28 | format='%(asctime)s - %(levelname)s - %(message)s', 29 | filename=config['logging']['file_path'] 30 | ) 31 | 32 | class SeedReader: 33 | def __init__(self, json_file_path: str): 34 | self.json_file_path = json_file_path 35 | self.data = self.read_json_file() 36 | 37 | def read_json_file(self) -> List[Dict[str, Any]]: 38 | try: 39 | with open(self.json_file_path, 'r', encoding='utf-8') as file: 40 | data = json.load(file) 41 | for item in data: 42 | if not all(key in item for key in ["link", "lid_confidence", "predicted_lid"]): 43 | raise ValueError("Missing one or more required keys: 'link', 'lid_confidence', or 'predicted_lid' in the JSON data") 44 | return data 45 | except FileNotFoundError: 46 | logging.error(f"File not found: {self.json_file_path}") 47 | return [] 48 | except json.JSONDecodeError: 49 | logging.error(f"Error decoding JSON from file: {self.json_file_path}") 50 | return [] 51 | 52 | def get_data(self) -> List[Dict[str, Any]]: 53 | return self.data 54 | 55 | def get_entry_by_index(self, index: int) -> Dict[str, Any]: 56 | try: 57 | return self.data[index] 58 | except IndexError: 59 | logging.error(f"Index out of range: {index}") 60 | return {} 61 | 62 | def filter_by_key_value(self, key: str, value: Any) -> List[Dict[str, Any]]: 63 | if not self.data: 64 | logging.warning("No data available for filtering.") 65 | return [] 66 | 67 | filtered_data = [entry for entry in self.data if entry.get(key) == value] 68 | 69 | if not filtered_data: 70 | logging.warning(f"No entries found with {key} = {value}") 71 | 72 | return filtered_data 73 | 74 | 75 | class SeedCrawler: 76 | 77 | def __init__(self, seed_urls): 78 | self.seed_urls = seed_urls 79 | self.max_pages = config['seed_crawler']['max_pages'] 80 | self.max_time = config['seed_crawler']['max_time'] 81 | self.visited = set() 82 | self.to_visit = list(set(seed_urls)) # Remove duplicates 83 | self.to_visit_growth_factor = config['seed_crawler']['to_visit_growth_factor'] 84 | self.all_links = set(seed_urls) # Initialize with seed URLs 85 | self.session = requests.Session() 86 | 87 | def get_links(self, url): 88 | try: 89 | response = self.session.get(url, timeout=config['url_settings']['request_timeout']) 90 | response.raise_for_status() 91 | except (requests.RequestException, requests.HTTPError) as e: 92 | logging.error(f"Error fetching {url}: {e}") 93 | return set() 94 | 95 | links = set() 96 | 97 | try: 98 | soup = BeautifulSoup(response.text, 'html.parser') 99 | except 
Exception as e: 100 | logging.warning(f"Error parsing {url} with html.parser: {e}") 101 | try: 102 | soup = BeautifulSoup(response.text, 'lxml') 103 | except ImportError: 104 | logging.error("lxml parser not available. Unable to parse the page.") 105 | return set() 106 | except Exception as e: 107 | logging.error(f"Error parsing {url} with lxml: {e}") 108 | return set() 109 | 110 | for anchor in soup.find_all('a', href=True): 111 | link = urljoin(url, anchor['href']) 112 | parsed_link = urlparse(link) 113 | 114 | if any(parsed_link.netloc.endswith(urlparse(seed).netloc) for seed in self.seed_urls) and len(link) <= config['url_settings']['max_url_length']: 115 | links.add(link) 116 | 117 | return links 118 | 119 | def crawl_websites(self): 120 | logging.info(f"Crawling links from {len(self.seed_urls)} seed URLs") 121 | start_time = time.time() 122 | with tqdm(total=self.max_pages, disable=not config['progress_bar']['enabled']) as pbar: 123 | while self.to_visit and len(self.visited) < self.max_pages: 124 | if time.time() - start_time > self.max_time: 125 | logging.info("Maximum crawl time reached. Stopping.") 126 | break 127 | 128 | current_url = self.to_visit.pop(0) 129 | 130 | if current_url in self.visited: 131 | continue 132 | 133 | links = self.get_links(current_url) 134 | self.all_links.update(links) 135 | 136 | for link in links: 137 | if link not in self.visited and link not in self.to_visit: 138 | self.to_visit.append(link) 139 | 140 | self.visited.add(current_url) 141 | time.sleep(config['seed_crawler']['crawl_delay']) 142 | pbar.update(1) 143 | 144 | if len(self.visited) % 10 == 0: 145 | if len(self.to_visit) > len(self.visited) * self.to_visit_growth_factor: 146 | logging.warning("To-visit list growing too fast. Possible circular link structure.") 147 | break 148 | 149 | logging.info(f"Finished crawling links from {len(self.seed_urls)} seed URLs") 150 | logging.info(f"Visited {len(self.visited)} pages in {time.time() - start_time:.2f} seconds") 151 | return self.all_links 152 | 153 | class LanguageDetector: 154 | def __init__(self, model): 155 | self.model = model 156 | 157 | @staticmethod 158 | def extract_language_code(input_str): 159 | pattern = r'__(label)__([a-zA-Z]+_[a-zA-Z]+)' 160 | match = re.search(pattern, input_str) 161 | return match.group(2) if match else None 162 | 163 | @staticmethod 164 | def trafilatura_scrape(url): 165 | document = fetch_url(url) 166 | return extract(document) 167 | 168 | def language_predict(self, scraped_text): 169 | if scraped_text is not None: 170 | lid_label_script = self.model.predict(scraped_text.replace('\n', '')) 171 | lid_label = self.extract_language_code(lid_label_script[0][0]) 172 | lid_confidence = lid_label_script[1][0] 173 | return lid_label, lid_confidence 174 | return None, None 175 | 176 | def filter_seeds(self, links, input_label, confidence): 177 | new_list = [] 178 | logging.info(f"Starting to filter {len(links)} links") 179 | 180 | with ThreadPoolExecutor(max_workers=config['seed_crawler']['max_workers']) as executor: 181 | futures = [executor.submit(self.process_link, link, input_label, confidence) for link in links] 182 | for future in tqdm(futures, desc="Filtering scraped links", unit="link", disable=not config['progress_bar']['enabled']): 183 | try: 184 | result = future.result() 185 | if result: 186 | new_list.append(result) 187 | logging.debug(f"Appended result: {result}") 188 | except Exception as e: 189 | logging.error(f"Error processing link: {e}") 190 | 191 | logging.info(f"Finished filtering. 
Found {len(new_list)} matching links") 192 | return new_list 193 | 194 | def process_link(self, link, input_label, confidence): 195 | try: 196 | scraped_text = self.trafilatura_scrape(link) 197 | lid_label, lid_confidence = self.language_predict(scraped_text) 198 | 199 | logging.debug(f"Link: {link}, LID Label: {lid_label}, LID Confidence: {lid_confidence}") 200 | 201 | if lid_label == input_label and lid_confidence >= confidence: 202 | if config['language_detector']['save_text'] == True: 203 | return {"link": link, "lid_label": lid_label, "lid_confidence": lid_confidence, "text": scraped_text} 204 | else: 205 | return {"link": link, "lid_label": lid_label, "lid_confidence": lid_confidence} 206 | except Exception as e: 207 | logging.error(f"Error processing link {link}: {e}") 208 | return None 209 | 210 | def remove_entries_with_domains(final_list): 211 | with open(config['domain_file'], 'r') as f: 212 | domains = [line.strip() for line in f.readlines()] 213 | 214 | filtered_list = [ 215 | entry for entry in final_list 216 | if 'link' in entry and not any(domain in entry['link'] for domain in domains) 217 | ] 218 | 219 | return filtered_list 220 | 221 | def save_to_json(data: List[Dict[str, Any]], filename: str): 222 | try: 223 | with open(filename, 'w', encoding='utf-8') as file: 224 | json.dump(data, file, ensure_ascii=False, indent=4) 225 | logging.info(f"Successfully saved data to {filename}") 226 | print(f"Successfully saved data to {filename}") 227 | except Exception as e: 228 | logging.error(f"Error saving to {filename}: {e}") 229 | 230 | def process_language(input_label: str, model: fasttext.FastText._FastText) -> None: 231 | """Process a single language input.""" 232 | json_file_path = os.path.join(config['seed_reader']['input_directory'], f"{input_label}.json") 233 | input_confidence = config['language_detector']['minimum_confidence'] 234 | 235 | logging.info(f"Processing language: {input_label}") 236 | print(f"Processing language: {input_label}") 237 | 238 | reader = SeedReader(json_file_path) 239 | all_data = reader.get_data() 240 | 241 | all_data = remove_entries_with_domains(all_data) 242 | 243 | links_meta_data = {} 244 | 245 | seed_urls = [entry['link'] for entry in all_data if entry['lid_confidence'] > input_confidence] 246 | 247 | links_meta_data['seed_urls'] = seed_urls 248 | links_meta_data['seed_urls_len'] = len(seed_urls) 249 | 250 | if not seed_urls: 251 | logging.warning(f"No seed URLs found for language {input_label}") 252 | return 253 | 254 | crawler = SeedCrawler(seed_urls) 255 | all_website_links = crawler.crawl_websites() 256 | 257 | links_meta_data['all_website_links'] = list(all_website_links) # Convert set to list for JSON serialization 258 | links_meta_data['all_website_links_len'] = len(all_website_links) 259 | 260 | lang_detector = LanguageDetector(model) 261 | filtered_links = lang_detector.filter_seeds(all_website_links, input_label, input_confidence) 262 | 263 | links_meta_data['filtered_links'] = [link['link'] for link in filtered_links] 264 | links_meta_data['filtered_links_len'] = len(filtered_links) 265 | 266 | unique_links = set_minus([link['link'] for link in filtered_links], seed_urls) # Extract links from filtered_links 267 | links_meta_data['unique_links'] = unique_links 268 | links_meta_data['unique_links_len'] = len(unique_links) 269 | 270 | rejected_links = set_minus(seed_urls, [link['link'] for link in filtered_links]) 271 | links_meta_data['rejected_links'] = rejected_links 272 | links_meta_data['rejected_links_len'] = len(rejected_links) 
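    # At this point links_meta_data summarizes the run and is written below as
    # meta_data/<input_label>_meta_data.json. Illustrative shape (URLs are placeholders):
    #   {
    #     "seed_urls": ["https://example.org/"], "seed_urls_len": 1,
    #     "all_website_links": ["https://example.org/", "https://example.org/a"], "all_website_links_len": 2,
    #     "filtered_links": ["https://example.org/a"], "filtered_links_len": 1,
    #     "unique_links": ["https://example.org/a"], "unique_links_len": 1,
    #     "rejected_links": ["https://example.org/"], "rejected_links_len": 1
    #   }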
273 | 274 | # Create metadata directory if it doesn't exist 275 | meta_data_dir = os.path.join(config['output']['directory'], "meta_data") 276 | os.makedirs(meta_data_dir, exist_ok=True) 277 | 278 | # Construct the filename correctly 279 | meta_file_name = os.path.join(meta_data_dir, f"{input_label}_meta_data.json") 280 | 281 | try: 282 | # Open and save data to JSON 283 | with open(meta_file_name, 'w', encoding='utf-8') as file: 284 | json.dump(links_meta_data, file, ensure_ascii=False, indent=4) 285 | logging.info(f"Successfully saved metadata to {meta_file_name}") 286 | except Exception as e: 287 | logging.error(f"Error saving metadata to {meta_file_name}: {e}") 288 | 289 | logging.info(f"Number of filtered links for {input_label}: {len(filtered_links)}") 290 | 291 | output_file = os.path.join(config['output']['directory'], 292 | config['output']['output_file_name'].format(language=input_label)) 293 | save_to_json(filtered_links, output_file) 294 | 295 | if os.path.exists(output_file): 296 | file_size = os.path.getsize(output_file) 297 | logging.info(f"Output file size for {input_label}: {file_size} bytes") 298 | else: 299 | logging.error(f"Output file was not created for {input_label}") 300 | 301 | def set_minus(list1, list2): 302 | 303 | set1 = set(list1) 304 | set2 = set(list2) 305 | 306 | uncommon_elements = set1 - set2 307 | 308 | return list(uncommon_elements) 309 | 310 | def batch_process(input_labels: List[str]) -> None: 311 | """Process multiple languages in batch.""" 312 | model_path = config['language_detector']['model_path'] 313 | model = fasttext.load_model(model_path) 314 | 315 | total_languages = len(input_labels) 316 | logging.info(f"Starting batch processing for {total_languages} languages") 317 | print(f"Starting batch processing for {total_languages} languages") 318 | 319 | for idx, input_label in enumerate(input_labels, 1): 320 | logging.info(f"Processing language {idx}/{total_languages}: {input_label}") 321 | print(f"Processing language {idx}/{total_languages}: {input_label}") 322 | try: 323 | process_language(input_label, model) 324 | except Exception as e: 325 | logging.error(f"Error processing language {input_label}: {e}") 326 | continue 327 | 328 | if idx < total_languages: 329 | cooldown = config.get('batch_processing', {}).get('cooldown_between_languages', 60) 330 | logging.info(f"Cooling down for {cooldown} seconds before processing next language") 331 | print(f"Cooling down for {cooldown} seconds before processing next language") 332 | time.sleep(cooldown) 333 | 334 | logging.info("Batch processing completed") 335 | print("Batch processing completed") 336 | 337 | if __name__ == "__main__": 338 | # Check if batch processing is enabled in config 339 | if config.get('batch_processing', {}).get('enabled', False): 340 | input_labels = config['batch_processing']['input_labels'] 341 | if not input_labels: 342 | logging.error("Batch processing enabled but no input labels provided in config") 343 | else: 344 | batch_process(input_labels) 345 | else: 346 | # Original single language processing 347 | input_label = config['language_detector']['desired_language'] 348 | model_path = config['language_detector']['model_path'] 349 | model = fasttext.load_model(model_path) 350 | process_language(input_label, model) 351 | -------------------------------------------------------------------------------- /pipeline/seedcrawler_beta.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import requests 4 | from bs4 
import BeautifulSoup 5 | from urllib.parse import urljoin, urlparse 6 | import time 7 | import json 8 | from typing import List, Dict, Any 9 | from tqdm import tqdm 10 | from trafilatura import extract, fetch_url 11 | import fasttext 12 | import urllib3 13 | from concurrent.futures import ThreadPoolExecutor 14 | import logging 15 | import yaml 16 | 17 | def load_config(config_file: str) -> Dict[str, Any]: 18 | with open(config_file, 'r') as file: 19 | return yaml.safe_load(file) 20 | 21 | config = load_config('config.yaml') 22 | 23 | log_dir = os.path.dirname(config['logging']['file_path']) 24 | os.makedirs(log_dir, exist_ok=True) 25 | 26 | logging.basicConfig( 27 | level=getattr(logging, config['logging']['level']), 28 | format='%(asctime)s - %(levelname)s - %(message)s', 29 | filename=config['logging']['file_path'] 30 | ) 31 | 32 | class SeedReader: 33 | def __init__(self, json_file_path: str): 34 | self.json_file_path = json_file_path 35 | self.data = self.read_json_file() 36 | 37 | def read_json_file(self) -> List[Dict[str, Any]]: 38 | try: 39 | with open(self.json_file_path, 'r', encoding='utf-8') as file: 40 | data = json.load(file) 41 | for item in data: 42 | if not all(key in item for key in ["link", "lid_confidence", "predicted_lid"]): 43 | raise ValueError("Missing one or more required keys: 'link', 'lid_confidence', or 'predicted_lid' in the JSON data") 44 | return data 45 | except FileNotFoundError: 46 | logging.error(f"File not found: {self.json_file_path}") 47 | return [] 48 | except json.JSONDecodeError: 49 | logging.error(f"Error decoding JSON from file: {self.json_file_path}") 50 | return [] 51 | 52 | def get_data(self) -> List[Dict[str, Any]]: 53 | return self.data 54 | 55 | def get_entry_by_index(self, index: int) -> Dict[str, Any]: 56 | try: 57 | return self.data[index] 58 | except IndexError: 59 | logging.error(f"Index out of range: {index}") 60 | return {} 61 | 62 | def filter_by_key_value(self, key: str, value: Any) -> List[Dict[str, Any]]: 63 | if not self.data: 64 | logging.warning("No data available for filtering.") 65 | return [] 66 | 67 | filtered_data = [entry for entry in self.data if entry.get(key) == value] 68 | 69 | if not filtered_data: 70 | logging.warning(f"No entries found with {key} = {value}") 71 | 72 | return filtered_data 73 | 74 | 75 | class SeedCrawler: 76 | 77 | def __init__(self, seed_urls): 78 | self.seed_urls = seed_urls 79 | self.max_pages = config['seed_crawler']['max_pages'] 80 | self.max_time = config['seed_crawler']['max_time'] 81 | self.visited = set() 82 | self.to_visit = list(set(seed_urls)) # Remove duplicates 83 | self.to_visit_growth_factor = config['seed_crawler']['to_visit_growth_factor'] 84 | self.all_links = set(seed_urls) # Initialize with seed URLs 85 | self.session = requests.Session() 86 | 87 | def get_links(self, url): 88 | try: 89 | response = self.session.get(url, timeout=config['url_settings']['request_timeout']) 90 | response.raise_for_status() 91 | except (requests.RequestException, requests.HTTPError) as e: 92 | logging.error(f"Error fetching {url}: {e}") 93 | return set() 94 | 95 | links = set() 96 | 97 | try: 98 | soup = BeautifulSoup(response.text, 'html.parser') 99 | except Exception as e: 100 | logging.warning(f"Error parsing {url} with html.parser: {e}") 101 | try: 102 | soup = BeautifulSoup(response.text, 'lxml') 103 | except ImportError: 104 | logging.error("lxml parser not available. 
Unable to parse the page.") 105 | return set() 106 | except Exception as e: 107 | logging.error(f"Error parsing {url} with lxml: {e}") 108 | return set() 109 | 110 | for anchor in soup.find_all('a', href=True): 111 | link = urljoin(url, anchor['href']) 112 | parsed_link = urlparse(link) 113 | 114 | if any(parsed_link.netloc.endswith(urlparse(seed).netloc) for seed in self.seed_urls) and len(link) <= config['url_settings']['max_url_length']: 115 | links.add(link) 116 | 117 | return links 118 | 119 | def crawl_websites(self): 120 | logging.info(f"Crawling links from {len(self.seed_urls)} seed URLs") 121 | start_time = time.time() 122 | with tqdm(total=self.max_pages, disable=not config['progress_bar']['enabled']) as pbar: 123 | while self.to_visit and len(self.visited) < self.max_pages: 124 | if time.time() - start_time > self.max_time: 125 | logging.info("Maximum crawl time reached. Stopping.") 126 | break 127 | 128 | current_url = self.to_visit.pop(0) 129 | 130 | if current_url in self.visited: 131 | continue 132 | 133 | links = self.get_links(current_url) 134 | self.all_links.update(links) 135 | 136 | for link in links: 137 | if link not in self.visited and link not in self.to_visit: 138 | self.to_visit.append(link) 139 | 140 | self.visited.add(current_url) 141 | time.sleep(config['seed_crawler']['crawl_delay']) 142 | pbar.update(1) 143 | 144 | if len(self.visited) % 10 == 0: 145 | if len(self.to_visit) > len(self.visited) * self.to_visit_growth_factor: 146 | logging.warning("To-visit list growing too fast. Possible circular link structure.") 147 | break 148 | 149 | logging.info(f"Finished crawling links from {len(self.seed_urls)} seed URLs") 150 | logging.info(f"Visited {len(self.visited)} pages in {time.time() - start_time:.2f} seconds") 151 | return self.all_links 152 | 153 | class LanguageDetector: 154 | def __init__(self, model): 155 | self.model = model 156 | 157 | @staticmethod 158 | def extract_language_code(input_str): 159 | pattern = r'__(label)__([a-zA-Z]+_[a-zA-Z]+)' 160 | match = re.search(pattern, input_str) 161 | return match.group(2) if match else None 162 | 163 | @staticmethod 164 | def trafilatura_scrape(url): 165 | document = fetch_url(url) 166 | return extract(document) 167 | 168 | def language_predict(self, scraped_text): 169 | if scraped_text is not None: 170 | lid_label_script = self.model.predict(scraped_text.replace('\n', '')) 171 | lid_label = self.extract_language_code(lid_label_script[0][0]) 172 | lid_confidence = lid_label_script[1][0] 173 | return lid_label, lid_confidence 174 | return None, None 175 | 176 | def filter_seeds(self, links, input_label, confidence): 177 | new_list = [] 178 | logging.info(f"Starting to filter {len(links)} links") 179 | 180 | with ThreadPoolExecutor(max_workers=config['seed_crawler']['max_workers']) as executor: 181 | futures = [executor.submit(self.process_link, link, input_label, confidence) for link in links] 182 | for future in tqdm(futures, desc="Filtering scraped links", unit="link", disable=not config['progress_bar']['enabled']): 183 | try: 184 | result = future.result() 185 | if result: 186 | new_list.append(result) 187 | logging.debug(f"Appended result: {result}") 188 | except Exception as e: 189 | logging.error(f"Error processing link: {e}") 190 | 191 | logging.info(f"Finished filtering. 
Found {len(new_list)} matching links") 192 | return new_list 193 | 194 | def process_link(self, link, input_label, confidence): 195 | try: 196 | scraped_text = self.trafilatura_scrape(link) 197 | lid_label, lid_confidence = self.language_predict(scraped_text) 198 | 199 | logging.debug(f"Link: {link}, LID Label: {lid_label}, LID Confidence: {lid_confidence}") 200 | 201 | if lid_label == input_label and lid_confidence >= confidence: 202 | if config['language_detector']['save_text'] == True: 203 | return {"link": link, "lid_label": lid_label, "lid_confidence": lid_confidence, "text": scraped_text} 204 | else: 205 | return {"link": link, "lid_label": lid_label, "lid_confidence": lid_confidence} 206 | except Exception as e: 207 | logging.error(f"Error processing link {link}: {e}") 208 | return None 209 | 210 | def remove_entries_with_domains(final_list): 211 | with open(config['domain_file'], 'r') as f: 212 | domains = [line.strip() for line in f.readlines()] 213 | 214 | filtered_list = [ 215 | entry for entry in final_list 216 | if 'link' in entry and not any(domain in entry['link'] for domain in domains) 217 | ] 218 | 219 | return filtered_list 220 | 221 | def save_to_json(data: List[Dict[str, Any]], filename: str): 222 | try: 223 | with open(filename, 'w', encoding='utf-8') as file: 224 | json.dump(data, file, ensure_ascii=False, indent=4) 225 | logging.info(f"Successfully saved data to {filename}") 226 | print(f"Successfully saved data to {filename}") 227 | except Exception as e: 228 | logging.error(f"Error saving to {filename}: {e}") 229 | 230 | def process_language(input_label: str, model: fasttext.FastText._FastText) -> None: 231 | """Process a single language input.""" 232 | json_file_path = os.path.join(config['seed_reader']['input_directory'], f"{input_label}.json") 233 | input_confidence = config['language_detector']['minimum_confidence'] 234 | 235 | logging.info(f"Processing language: {input_label}") 236 | print(f"Processing language: {input_label}") 237 | 238 | reader = SeedReader(json_file_path) 239 | all_data = reader.get_data() 240 | 241 | all_data = remove_entries_with_domains(all_data) 242 | 243 | links_meta_data = {} 244 | 245 | seed_urls = [entry['link'] for entry in all_data if entry['lid_confidence'] > input_confidence] 246 | 247 | links_meta_data['seed_urls'] = seed_urls 248 | links_meta_data['seed_urls_len'] = len(seed_urls) 249 | 250 | if not seed_urls: 251 | logging.warning(f"No seed URLs found for language {input_label}") 252 | return 253 | 254 | crawler = SeedCrawler(seed_urls) 255 | all_website_links = crawler.crawl_websites() 256 | 257 | links_meta_data['all_website_links'] = list(all_website_links) # Convert set to list for JSON serialization 258 | links_meta_data['all_website_links_len'] = len(all_website_links) 259 | 260 | lang_detector = LanguageDetector(model) 261 | filtered_links = lang_detector.filter_seeds(all_website_links, input_label, input_confidence) 262 | 263 | links_meta_data['filtered_links'] = [link['link'] for link in filtered_links] 264 | links_meta_data['filtered_links_len'] = len(filtered_links) 265 | 266 | unique_links = set_minus([link['link'] for link in filtered_links], seed_urls) # Extract links from filtered_links 267 | links_meta_data['unique_links'] = unique_links 268 | links_meta_data['unique_links_len'] = len(unique_links) 269 | 270 | rejected_links = set_minus(seed_urls, [link['link'] for link in filtered_links]) 271 | links_meta_data['rejected_links'] = rejected_links 272 | links_meta_data['rejected_links_len'] = len(rejected_links) 
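# At this point links_meta_data records, for this language: the seed URLs, every link discovered during crawling, the links confirmed by the language filter, the newly discovered (unique) links, and the rejected seeds, each with its count.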
273 | 274 | # Create metadata directory if it doesn't exist 275 | meta_data_dir = os.path.join(config['output']['directory'], "meta_data") 276 | os.makedirs(meta_data_dir, exist_ok=True) 277 | 278 | # Construct the filename correctly 279 | meta_file_name = os.path.join(meta_data_dir, f"{input_label}_meta_data.json") 280 | 281 | try: 282 | # Open and save data to JSON 283 | with open(meta_file_name, 'w', encoding='utf-8') as file: 284 | json.dump(links_meta_data, file, ensure_ascii=False, indent=4) 285 | logging.info(f"Successfully saved metadata to {meta_file_name}") 286 | except Exception as e: 287 | logging.error(f"Error saving metadata to {meta_file_name}: {e}") 288 | 289 | logging.info(f"Number of filtered links for {input_label}: {len(filtered_links)}") 290 | 291 | output_file = os.path.join(config['output']['directory'], 292 | config['output']['output_file_name'].format(language=input_label)) 293 | save_to_json(filtered_links, output_file) 294 | 295 | if os.path.exists(output_file): 296 | file_size = os.path.getsize(output_file) 297 | logging.info(f"Output file size for {input_label}: {file_size} bytes") 298 | else: 299 | logging.error(f"Output file was not created for {input_label}") 300 | 301 | def set_minus(list1, list2): 302 | 303 | set1 = set(list1) 304 | set2 = set(list2) 305 | 306 | uncommon_elements = set1 - set2 307 | 308 | return list(uncommon_elements) 309 | 310 | def batch_process(input_labels: List[str]) -> None: 311 | """Process multiple languages in batch.""" 312 | model_path = config['language_detector']['model_path'] 313 | model = fasttext.load_model(model_path) 314 | 315 | total_languages = len(input_labels) 316 | logging.info(f"Starting batch processing for {total_languages} languages") 317 | print(f"Starting batch processing for {total_languages} languages") 318 | 319 | for idx, input_label in enumerate(input_labels, 1): 320 | logging.info(f"Processing language {idx}/{total_languages}: {input_label}") 321 | print(f"Processing language {idx}/{total_languages}: {input_label}") 322 | try: 323 | process_language(input_label, model) 324 | except Exception as e: 325 | logging.error(f"Error processing language {input_label}: {e}") 326 | continue 327 | 328 | if idx < total_languages: 329 | cooldown = config.get('batch_processing', {}).get('cooldown_between_languages', 60) 330 | logging.info(f"Cooling down for {cooldown} seconds before processing next language") 331 | print(f"Cooling down for {cooldown} seconds before processing next language") 332 | time.sleep(cooldown) 333 | 334 | logging.info("Batch processing completed") 335 | print("Batch processing completed") 336 | 337 | if __name__ == "__main__": 338 | # Check if batch processing is enabled in config 339 | if config.get('batch_processing', {}).get('enabled', False): 340 | input_labels = config['batch_processing']['input_labels'] 341 | if not input_labels: 342 | logging.error("Batch processing enabled but no input labels provided in config") 343 | else: 344 | batch_process(input_labels) 345 | else: 346 | # Original single language processing 347 | input_label = config['language_detector']['desired_language'] 348 | model_path = config['language_detector']['model_path'] 349 | model = fasttext.load_model(model_path) 350 | process_language(input_label, model) 351 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | langchain-community 2 | langchain-core 3 | fasttext 4 | huggingface-hub 5 
| beautifulsoup4 6 | urllib3 7 | trafilatura 8 | lxml 9 | requests 10 | tqdm 11 | PyYAML 12 | pandas 13 | pycountry -------------------------------------------------------------------------------- /result_filtering/final_filtering.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | import re 4 | import requests 5 | from urllib.parse import urljoin, urlparse 6 | import json 7 | from typing import List, Dict, Any 8 | from tqdm import tqdm 9 | 10 | def load_config(config_file: str) -> Dict[str, Any]: 11 | with open(config_file, 'r') as file: 12 | return yaml.safe_load(file) 13 | 14 | config = load_config('pipeline/config.yaml') 15 | 16 | def remove_entries_with_domains(final_list): 17 | with open(config['domain_file'], 'r') as f: 18 | domains = [line.strip() for line in f.readlines()] 19 | 20 | filtered_list = [ 21 | entry for entry in final_list 22 | if 'link' in entry and not any(domain in entry['link'] for domain in domains) 23 | ] 24 | 25 | return filtered_list 26 | 27 | def load_crawled_output(code: str, data_dir: str) -> List[Dict[str, Any]]: 28 | """Load crawled output JSON for a given code.""" 29 | file_path = os.path.join(data_dir, f"{code}_crawled_output.json") 30 | with open(file_path, 'r', encoding='utf-8') as file: 31 | return json.load(file) 32 | 33 | def save_to_json(data: List[Dict[str, Any]], filename: str): 34 | try: 35 | with open(filename, 'w', encoding='utf-8') as file: 36 | json.dump(data, file, ensure_ascii=False, indent=4) 37 | print(f"Successfully saved data to {filename}") 38 | except Exception as e: 39 | print(f"Error saving to {filename}: {e}") 40 | 41 | def load_meta_data(code,data_dir): 42 | file_path = os.path.join(data_dir, f"{code}_meta_data.json") 43 | with open(file_path, 'r', encoding='utf-8') as file: 44 | return json.load(file) 45 | 46 | def filter_list_urls(urls, domain_filter): 47 | """ 48 | Filter out URLs that match domains in the filter list.
49 | 50 | Args: 51 | urls (list): List of URLs to filter 52 | domain_filter (list): List of domains to filter out 53 | 54 | Returns: 55 | list: Filtered list of URLs 56 | """ 57 | def should_filter(url): 58 | parsed_url = urlparse(f'http://{url}') 59 | 60 | # Check if any filter domain matches the URL 61 | return any( 62 | filter_domain in parsed_url.netloc or 63 | filter_domain in url 64 | for filter_domain in domain_filter 65 | ) 66 | 67 | return [url for url in urls if not should_filter(url)] 68 | 69 | def filter_meta_data(meta_data): 70 | with open(config['domain_file'], 'r') as f: 71 | domains = [line.strip() for line in f.readlines()] 72 | meta_data['seed_urls'] = filter_list_urls(meta_data['seed_urls'],domains) 73 | meta_data['seed_urls_len'] = len(meta_data['seed_urls']) 74 | meta_data['all_website_links'] = filter_list_urls(meta_data['all_website_links'],domains) 75 | meta_data['all_website_links_len'] = len(meta_data['all_website_links']) 76 | meta_data['filtered_links'] = filter_list_urls(meta_data['filtered_links'],domains) 77 | meta_data['filtered_links_len'] = len(meta_data['filtered_links']) 78 | meta_data['unique_links'] = filter_list_urls(meta_data['unique_links'],domains) 79 | meta_data['unique_links_len'] = len(meta_data['unique_links']) 80 | meta_data['rejected_links'] = filter_list_urls(meta_data['rejected_links'],domains) 81 | meta_data['rejected_links_len'] = len(meta_data['rejected_links']) 82 | meta_data['active_seed_urls'] = list(set(meta_data['seed_urls'])-set(meta_data['rejected_links'])) 83 | meta_data['active_seed_urls_len'] = len(meta_data['active_seed_urls']) 84 | 85 | return meta_data 86 | 87 | 88 | 89 | 90 | 91 | 92 | def main(): 93 | config = load_config('pipeline/config.yaml') 94 | crawled_output_dir = config['output']['directory'] 95 | code = config['language_detector']['desired_language'] 96 | meta_data_dir = os.path.join(config['output']['directory'], "meta_data") 97 | 98 | # Determine input labels 99 | if config['batch_processing']['enabled']: 100 | input_labels = config['batch_processing']['input_labels'] 101 | else: 102 | input_labels = [code] 103 | 104 | for input_label in tqdm(input_labels, desc="Processing input labels"): 105 | print(f"Processing {input_label}") 106 | data = load_crawled_output(input_label,crawled_output_dir) 107 | data = remove_entries_with_domains(data) 108 | output_file = os.path.join(config['output']['directory'], 109 | config['output']['output_file_name'].format(language=input_label)) 110 | save_to_json(data, output_file) 111 | 112 | meta_data = load_meta_data(input_label,meta_data_dir) 113 | meta_data = filter_meta_data(meta_data) 114 | meta_file_name = os.path.join(meta_data_dir, f"{input_label}_meta_data.json") 115 | with open(meta_file_name, 'w', encoding='utf-8') as file: 116 | json.dump(meta_data, file, ensure_ascii=False, indent=4) 117 | 118 | 119 | 120 | 121 | if __name__ == "__main__": 122 | main() 123 | -------------------------------------------------------------------------------- /result_filtering/formatted_output_robots_filtering.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | import json 4 | from typing import List, Dict, Any 5 | import requests 6 | from urllib.parse import urlparse 7 | from tqdm import tqdm 8 | 9 | # Load configuration from a YAML file 10 | def load_config(config_file: str) -> Dict[str, Any]: 11 | with open(config_file, 'r') as file: 12 | return yaml.safe_load(file) 13 | 14 | # Load data from JSON files based on a 
language code 15 | def load_data(code: str, meta_data_dir: str) -> Dict[str, Any]: 16 | meta_file_name = os.path.join(meta_data_dir, f"{code}.json") 17 | with open(meta_file_name, 'r', encoding='utf-8') as file: 18 | return json.load(file) 19 | 20 | # Check if a website's robots.txt file blocks CCBot 21 | def is_ccbot_blocked(url: str) -> bool: 22 | """ 23 | Check if a website's robots.txt file blocks CCBot. 24 | 25 | Parameters: 26 | url (str): The URL or domain to check. 27 | 28 | Returns: 29 | bool: True if CCBot is blocked, False otherwise. 30 | """ 31 | parsed_url = urlparse(url) 32 | domain = f"{parsed_url.scheme}://{parsed_url.netloc}" 33 | robots_url = f"{domain}/robots.txt" 34 | 35 | try: 36 | response = requests.get(robots_url, timeout=5) 37 | response.raise_for_status() 38 | 39 | # Parse robots.txt for CCBot rules 40 | lines = response.text.splitlines() 41 | user_agent = None 42 | 43 | for line in lines: 44 | line = line.strip() 45 | if line.startswith("User-agent:"): 46 | user_agent = line.split(":")[1].strip() 47 | elif user_agent == "CCBot" and line.startswith("Disallow:"): 48 | path = line.split(":")[1].strip() 49 | if path == "/": 50 | return True 51 | elif user_agent == "CCBot" and line.startswith("Allow:"): 52 | path = line.split(":")[1].strip() 53 | if path == "/": 54 | return False 55 | return False 56 | except requests.RequestException: 57 | return False # Assume not blocked if robots.txt is inaccessible 58 | 59 | # Remove sites blocked by CCBot from the dataset 60 | def remove_cc_blocked_site(lang_code: str, meta_data_dir: str) -> Dict[str, Any]: 61 | data = load_data(lang_code, meta_data_dir) 62 | sites = data.get('Sites', []) 63 | data['Sites'] = [site for site in sites if not is_ccbot_blocked(site['Site URL'])] 64 | return data 65 | 66 | # Process files and save cleaned data 67 | 68 | 69 | # Process files and save cleaned data 70 | def process_files(input_label: str, meta_data_dir: str, output_dir: str) -> None: 71 | os.makedirs(output_dir, exist_ok=True) 72 | 73 | print(f"Processing input label: {input_label}") 74 | cleaned_data = remove_cc_blocked_site(input_label, meta_data_dir) 75 | output_path = os.path.join(output_dir, f"{input_label}.json") 76 | 77 | with open(output_path, 'w', encoding='utf-8') as file: 78 | json.dump(cleaned_data, file, indent=4, ensure_ascii=False) 79 | print(f"Processed and saved: {output_path}") 80 | 81 | # Main processing 82 | if __name__ == "__main__": 83 | config = load_config('pipeline/config.yaml') 84 | meta_data_dir = config['output']['formated_directory'] 85 | output_dir = config['output']['cleaned_directory'] 86 | 87 | # Determine input labels 88 | if config['batch_processing']['enabled']: 89 | input_labels = config['batch_processing']['input_labels'] 90 | else: 91 | input_labels = [] 92 | input_labels.append(config['language_detector']['desired_language']) 93 | 94 | # Process and save cleaned data with tqdm 95 | for input_label in tqdm(input_labels, desc="Processing input labels"): 96 | process_files(input_label, meta_data_dir, output_dir) 97 | 98 | -------------------------------------------------------------------------------- /result_filtering/http_merge_2.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from urllib.parse import urlparse 4 | import shutil 5 | 6 | def normalize_url(url): 7 | """ 8 | Normalize URL by removing protocol, trailing slashes, and standardizing www. 9 | """ 10 | parsed = urlparse(url) 11 | # Remove 'www.' 
if present 12 | netloc = parsed.netloc.replace('www.', '') 13 | # Combine with path and remove trailing slashes 14 | return netloc + parsed.path.rstrip('/') 15 | 16 | def normalize_link(link): 17 | """ 18 | Normalize a link URL in the same way as the main site URL. 19 | """ 20 | parsed = urlparse(link) 21 | netloc = parsed.netloc.replace('www.', '') 22 | # Preserve query parameters and fragments in links 23 | normalized = netloc + parsed.path.rstrip('/') 24 | if parsed.query: 25 | normalized += '?' + parsed.query 26 | if parsed.fragment: 27 | normalized += '#' + parsed.fragment 28 | return normalized 29 | 30 | def merge_sites(sites): 31 | """Merge sites that have the same URL but different protocols or www prefix.""" 32 | # Group sites by site name and normalized URL 33 | site_groups = {} 34 | for site in sites: 35 | norm_url = normalize_url(site['Site URL']) 36 | site_name = site['Site Name'] 37 | key = (site_name, norm_url) 38 | if key not in site_groups: 39 | site_groups[key] = [] 40 | site_groups[key].append(site) 41 | 42 | # Merge sites that need to be merged 43 | merged_sites = [] 44 | for sites_group in site_groups.values(): 45 | if len(sites_group) == 1: 46 | merged_sites.append(sites_group[0]) 47 | else: 48 | # Prefer https over http for the main site URL 49 | https_site = next((site for site in sites_group if site['Site URL'].startswith('https')), None) 50 | base_site = https_site if https_site else sites_group[0] 51 | 52 | # Merge all links from all versions and normalize them 53 | all_links = set() 54 | for site in sites_group: 55 | # Normalize each link to handle www consistently 56 | normalized_links = [link for link in site['Links']] 57 | all_links.update(normalized_links) 58 | 59 | # Create merged site entry 60 | merged_site = base_site.copy() 61 | merged_site['Links'] = sorted(list(all_links)) 62 | merged_sites.append(merged_site) 63 | 64 | return merged_sites 65 | 66 | def process_file(input_path, output_path): 67 | """Process a single JSON file.""" 68 | with open(input_path, 'r', encoding='utf-8') as f: 69 | data = json.load(f) 70 | 71 | # Merge sites 72 | data['Sites'] = merge_sites(data['Sites']) 73 | 74 | # Write the processed data 75 | with open(output_path, 'w', encoding='utf-8') as f: 76 | json.dump(data, f, ensure_ascii=False, indent=4) 77 | 78 | def main(input_dir, output_dir): 79 | """Process all JSON files in the input directory.""" 80 | # Create output directory if it doesn't exist 81 | os.makedirs(output_dir, exist_ok=True) 82 | 83 | # Process each JSON file 84 | for filename in os.listdir(input_dir): 85 | if filename.endswith('.json'): 86 | input_path = os.path.join(input_dir, filename) 87 | output_path = os.path.join(output_dir, filename) 88 | process_file(input_path, output_path) 89 | print(f"Processed {filename}") 90 | 91 | if __name__ == "__main__": 92 | import sys 93 | 94 | 95 | 96 | input_dir = "output/robots_filtered" # Replace with your input directory path 97 | output_dir = "output/http_merged" 98 | 99 | if not os.path.exists(input_dir): 100 | print(f"Error: Input directory '{input_dir}' does not exist") 101 | sys.exit(1) 102 | 103 | main(input_dir, output_dir) -------------------------------------------------------------------------------- /result_filtering/output_formatter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import yaml 4 | import pycountry 5 | from typing import Dict, Any, Optional, List 6 | import json 7 | import json 8 | from urllib.parse import urlparse 
9 | from collections import defaultdict 10 | 11 | 12 | flores_list = [ 13 | "ace_Arab", "ace_Latn", "acm_Arab", "acq_Arab", "aeb_Arab", "afr_Latn", "ajp_Arab", 14 | "aka_Latn", "amh_Ethi", "apc_Arab", "arb_Arab", "ars_Arab", "ary_Arab", "arz_Arab", 15 | "asm_Beng", "ast_Latn", "awa_Deva", "ayr_Latn", "azb_Arab", "azj_Latn", "bak_Cyrl", 16 | "bam_Latn", "ban_Latn", "bel_Cyrl", "bem_Latn", "ben_Beng", "bho_Deva", "bjn_Arab", 17 | "bjn_Latn", "bod_Tibt", "bos_Latn", "bug_Latn", "bul_Cyrl", "cat_Latn", "ceb_Latn", 18 | "ces_Latn", "cjk_Latn", "ckb_Arab", "crh_Latn", "cym_Latn", "dan_Latn", "deu_Latn", 19 | "dik_Latn", "dyu_Latn", "dzo_Tibt", "ell_Grek", "eng_Latn", "epo_Latn", "est_Latn", 20 | "eus_Latn", "ewe_Latn", "fao_Latn", "pes_Arab", "fij_Latn", "fin_Latn", "fon_Latn", 21 | "fra_Latn", "fur_Latn", "fuv_Latn", "gla_Latn", "gle_Latn", "glg_Latn", "grn_Latn", 22 | "guj_Gujr", "hat_Latn", "hau_Latn", "heb_Hebr", "hin_Deva", "hne_Deva", "hrv_Latn", 23 | "hun_Latn", "hye_Armn", "ibo_Latn", "ilo_Latn", "ind_Latn", "isl_Latn", "ita_Latn", 24 | "jav_Latn", "jpn_Jpan", "kab_Latn", "kac_Latn", "kam_Latn", "kan_Knda", "kas_Arab", 25 | "kas_Deva", "kat_Geor", "knc_Arab", "knc_Latn", "kaz_Cyrl", "kbp_Latn", "kea_Latn", 26 | "khm_Khmr", "kik_Latn", "kin_Latn", "kir_Cyrl", "kmb_Latn", "kon_Latn", "kor_Hang", 27 | "kmr_Latn", "lao_Laoo", "lvs_Latn", "lij_Latn", "lim_Latn", "lin_Latn", "lit_Latn", 28 | "lmo_Latn", "ltg_Latn", "ltz_Latn", "lua_Latn", "lug_Latn", "luo_Latn", "lus_Latn", 29 | "mag_Deva", "mai_Deva", "mal_Mlym", "mar_Deva", "min_Latn", "mkd_Cyrl", "plt_Latn", 30 | "mlt_Latn", "mni_Beng", "khk_Cyrl", "mos_Latn", "mri_Latn", "zsm_Latn", "mya_Mymr", 31 | "nld_Latn", "nno_Latn", "nob_Latn", "npi_Deva", "nso_Latn", "nus_Latn", "nya_Latn", 32 | "oci_Latn", "gaz_Latn", "ory_Orya", "pag_Latn", "pan_Guru", "pap_Latn", "pol_Latn", 33 | "por_Latn", "prs_Arab", "pbt_Arab", "quy_Latn", "ron_Latn", "run_Latn", "rus_Cyrl", 34 | "sag_Latn", "san_Deva", "sat_Beng", "scn_Latn", "shn_Mymr", "sin_Sinh", "slk_Latn", 35 | "slv_Latn", "smo_Latn", "sna_Latn", "snd_Arab", "som_Latn", "sot_Latn", "spa_Latn", 36 | "als_Latn", "srd_Latn", "srp_Cyrl", "ssw_Latn", "sun_Latn", "swe_Latn", "swh_Latn", 37 | "szl_Latn", "tam_Taml", "tat_Cyrl", "tel_Telu", "tgk_Cyrl", "tgl_Latn", "tha_Thai", 38 | "tir_Ethi", "taq_Latn", "taq_Tfng", "tpi_Latn", "tsn_Latn", "tso_Latn", "tuk_Latn", 39 | "tum_Latn", "tur_Latn", "twi_Latn", "tzm_Tfng", "uig_Arab", "ukr_Cyrl", "umb_Latn", 40 | "urd_Arab", "uzn_Latn", "vec_Latn", "vie_Latn", "war_Latn", "wol_Latn", "xho_Latn", 41 | "ydd_Hebr", "yor_Latn", "yue_Hant", "zho_Hans", "zho_Hant", "zul_Latn" 42 | ] 43 | 44 | IGNORED_SUBDOMAINS = [ 45 | "www", "en", "de", "fr", "us", "uk", "ca", 46 | "mail", "webmail", "email", "ftp", "blog", 47 | "shop", "help", "support", "docs", "kb", 48 | "api", "cdn", "assets", "static", "analytics", 49 | "track", "metrics", "m", "beta", "staging", 50 | "dev", "portal", "dashboard", "media", "http", "https", "www1", "www2", "www3" 51 | ] 52 | 53 | 54 | def load_config(config_file: str) -> Dict[str, Any]: 55 | with open(config_file, 'r') as file: 56 | return yaml.safe_load(file) 57 | 58 | def get_language_name(code: str) -> str: 59 | """ 60 | Get the language name using pycountry. 61 | If not found, return the original code. 
62 | """ 63 | try: 64 | language = pycountry.languages.get(alpha_3=code) 65 | return language.name if language else code 66 | except (AttributeError, KeyError): 67 | return code 68 | 69 | def get_speakers(code: str, df: pd.DataFrame) -> Optional[str]: 70 | """ 71 | Get the number of speakers from the CSV data. 72 | If not found by code, try to find by language name. 73 | Return None if no data is found. 74 | """ 75 | if code in df.index: 76 | return df.loc[code, 'Speakers worldwide'] 77 | 78 | lang_name = get_language_name(code) 79 | matching_rows = df[df['Name'].str.lower() == lang_name.lower()] 80 | if not matching_rows.empty: 81 | return matching_rows['Speakers worldwide'].iloc[0] 82 | 83 | return None 84 | 85 | 86 | def get_number_of_speakers(file_path, iso_639_3_code): 87 | """ 88 | Reads a TSV file and retrieves the 'estimated_number_of_speakers' for a given 'iso_639_3_code'. 89 | 90 | Args: 91 | file_path (str): Path to the 'linguameta.tsv' file. 92 | iso_639_3_code (str): The ISO 639-3 code to look up. 93 | 94 | Returns: 95 | int or None: The estimated number of speakers for the given 'iso_639_3_code', 96 | or None if the code is not found. 97 | """ 98 | try: 99 | # Load the TSV file into a DataFrame 100 | df = pd.read_csv(file_path, sep='\t') 101 | except Exception as e: 102 | print(f"Error reading the file: {e}") 103 | return None 104 | 105 | # Filter the DataFrame to find the row matching the 'iso_639_3_code' 106 | row = df[df['iso_639_3_code'] == iso_639_3_code] 107 | if row.empty: 108 | print(f"No matching row found for iso_639_3_code: {iso_639_3_code}") 109 | return None 110 | 111 | # Retrieve the value of 'estimated_number_of_speakers' 112 | try: 113 | estimated_speakers = row['estimated_number_of_speakers'].values[0] 114 | return estimated_speakers 115 | except KeyError as e: 116 | print(f"Error: Column not found - {e}") 117 | return None 118 | 119 | 120 | def get_family_name_from_iso(file_path, iso639P3code): 121 | """ 122 | Reads a CSV file and retrieves the family name corresponding to a given 'iso639P3code'. 123 | 124 | Args: 125 | file_path (str): Path to the 'languoid.csv' file. 126 | iso639P3code (str): The ISO 639-3 code to look up. 127 | 128 | Returns: 129 | str: The family name corresponding to the given 'iso639P3code', or None if not found. 
130 | """ 131 | # Load the CSV file into a DataFrame 132 | try: 133 | df = pd.read_csv(file_path) 134 | except Exception as e: 135 | print(f"Error reading the file: {e}") 136 | return None 137 | 138 | # Step 1: Retrieve the 'family_id' for the given 'iso639P3code' 139 | family_id_row = df[df['iso639P3code'] == iso639P3code] 140 | if family_id_row.empty: 141 | print(f"No matching row found for iso639P3code: {iso639P3code}") 142 | return None 143 | 144 | family_id = family_id_row['family_id'].values[0] 145 | 146 | # Step 2: Retrieve the 'name' where 'id' equals the retrieved 'family_id' 147 | family_name_row = df[df['id'] == family_id] 148 | if family_name_row.empty: 149 | print(f"No matching row found for family_id: {family_id}") 150 | return None 151 | 152 | family_name = family_name_row['name'].values[0] 153 | 154 | return family_name 155 | 156 | def is_in_madlad(code: str) -> int: 157 | with open('metadata/madlad_aplha_3.json', 'r') as file: 158 | madlad_aplha_3 = json.load(file) 159 | return 1 if code in madlad_aplha_3 else 0 160 | 161 | def is_in_flores(langisocode693_Script: str) -> int: 162 | return 1 if langisocode693_Script in flores_list else 0 163 | 164 | def is_in_glot500(langisocode693_Script: str) -> int: 165 | with open('metadata/glot500_iso_code.json', 'r') as file: 166 | glot500_aplha_3 = json.load(file) 167 | return 1 if langisocode693_Script in glot500_aplha_3 else 0 168 | 169 | def categorize_urls(json_file: str) -> List[Dict[str, Any]]: 170 | try: 171 | with open(json_file, 'r') as file: 172 | data = json.load(file) 173 | except FileNotFoundError: 174 | print(f"Warning: File not found: {json_file}") 175 | return [] 176 | except json.JSONDecodeError: 177 | print(f"Warning: Invalid JSON in file: {json_file}") 178 | return [] 179 | 180 | url_categories = defaultdict(list) 181 | 182 | for item in data: 183 | url = item['link'] 184 | parsed_url = urlparse(url) 185 | site_url = f"{parsed_url.scheme}://{parsed_url.netloc}" 186 | url_categories[site_url].append(url) 187 | 188 | result = [] 189 | for site_url, links in url_categories.items(): 190 | netloc_parts = urlparse(site_url).netloc.split('.') 191 | 192 | primary_domain_parts = [ 193 | part for part in netloc_parts 194 | if part.lower() not in IGNORED_SUBDOMAINS 195 | ] 196 | 197 | if primary_domain_parts: 198 | if len(primary_domain_parts[0]) <= 3 and len(primary_domain_parts) > 1: 199 | site_name = primary_domain_parts[1] 200 | else: 201 | site_name = primary_domain_parts[0] 202 | else: 203 | site_name = netloc_parts[0] 204 | 205 | result.append({ 206 | "Site Name": site_name, 207 | "Site URL": site_url, 208 | "Info": 'confirmed by glotlid', 209 | "confidence": "🟩", 210 | "Links": links 211 | }) 212 | 213 | return result 214 | 215 | def save_language_info(language_info: Dict[str, Any], language_name: str, config: Dict[str, Any]) -> None: 216 | directory = config['output']['formated_directory'] 217 | file_name = config['output']['formated_file_name'].format(language=language_name) 218 | 219 | os.makedirs(directory, exist_ok=True) 220 | file_path = os.path.join(directory, file_name) 221 | 222 | with open(file_path, 'w', encoding='utf-8') as json_file: 223 | json.dump(language_info, json_file, ensure_ascii=False, indent=4) 224 | 225 | print(f"Saved formatted output for {language_name} to {file_path}") 226 | 227 | def process_single_language(langisocode693_Script: str, df: pd.DataFrame, config: Dict[str, Any]) -> None: 228 | """Process a single language and generate its formatted output.""" 229 | code, script = 
langisocode693_Script.split('_') 230 | 231 | # Get language information 232 | lang_name = get_language_name(code) 233 | speakers = get_number_of_speakers('metadata/linguameta.tsv', code) 234 | family = get_family_name_from_iso('metadata/languoid.csv', code) 235 | 236 | # Create language info dictionary 237 | language_info = { 238 | "Language Name": lang_name, 239 | "Number of Speakers": speakers if speakers is not None else "No data", 240 | "Family": family if family is not None else "No data", 241 | "Subgrouping": '', 242 | "Supported by allenai/MADLAD-400": is_in_madlad(code), 243 | "Supported by facebook/flores": is_in_flores(langisocode693_Script), 244 | "Supported by cis-lmu/Glot500": is_in_glot500(langisocode693_Script) 245 | } 246 | 247 | # Process URLs 248 | json_file_name = os.path.join( 249 | config['output']['directory'], 250 | f"{langisocode693_Script}_crawled_output.json" 251 | ) 252 | categorized_urls = categorize_urls(json_file_name) 253 | language_info['Sites'] = categorized_urls 254 | 255 | # Save formatted output 256 | save_language_info(language_info, langisocode693_Script, config) 257 | 258 | def batch_process_languages(config: Dict[str, Any]) -> None: 259 | """Process multiple languages in batch.""" 260 | print("Starting batch processing...") 261 | 262 | # Read the speakers data once for all languages 263 | df = pd.read_csv('metadata/language_speakers_data.csv', usecols=['ISO Alpha-3/5', 'Name', 'Speakers worldwide']) 264 | df.set_index('ISO Alpha-3/5', inplace=True) 265 | 266 | # Get list of languages to process 267 | if config.get('batch_processing', {}).get('enabled', False): 268 | languages = config['batch_processing']['input_labels'] 269 | else: 270 | languages = [config['language_detector']['desired_language']] 271 | 272 | total_languages = len(languages) 273 | print(f"Processing {total_languages} languages...") 274 | 275 | for idx, language in enumerate(languages, 1): 276 | print(f"\nProcessing language {idx}/{total_languages}: {language}") 277 | try: 278 | process_single_language(language, df, config) 279 | except Exception as e: 280 | print(f"Error processing language {language}: {str(e)}") 281 | continue 282 | 283 | print("\nBatch processing completed!") 284 | 285 | def main(): 286 | # Load configuration 287 | config = load_config('pipeline/config.yaml') 288 | 289 | # Start batch processing 290 | batch_process_languages(config) 291 | 292 | if __name__ == "__main__": 293 | main() 294 | -------------------------------------------------------------------------------- /result_filtering/remove_all_hash.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from urllib.parse import urlparse, urlunparse 4 | 5 | def normalize_url(url): 6 | """ 7 | Normalize URL by removing the fragment (hashtag) portion. 8 | Returns the URL without the fragment. 9 | """ 10 | parsed = urlparse(url) 11 | # Create new parsed URL without fragment 12 | clean_parsed = parsed._replace(fragment='') 13 | return urlunparse(clean_parsed) 14 | 15 | def deduplicate_links(links, strip_all_fragments=True): 16 | """ 17 | Deduplicate links by removing URLs that are the same except for hashtags. 18 | 19 | Args: 20 | links: List of URLs to process 21 | strip_all_fragments: If True, strips fragments from all URLs. 22 | If False, only deduplicates based on normalized URLs. 
23 | """ 24 | # Create a dictionary to store unique normalized URLs 25 | unique_urls = {} 26 | 27 | # For each link, store only the first occurrence of its normalized version 28 | for link in links: 29 | normalized = normalize_url(link) 30 | if normalized not in unique_urls: 31 | # If we're stripping all fragments, store the normalized URL 32 | # Otherwise, store the original URL with fragment 33 | unique_urls[normalized] = normalized if strip_all_fragments else link 34 | 35 | # Return the deduplicated links in the same order they first appeared 36 | return list(unique_urls.values()) 37 | 38 | def process_file(input_path, output_path, strip_all_fragments=True): 39 | """Process a single JSON file to deduplicate links.""" 40 | with open(input_path, 'r', encoding='utf-8') as f: 41 | data = json.load(f) 42 | 43 | # Process each site's links 44 | for site in data.get('Sites', []): 45 | if 'Links' in site: 46 | site['Links'] = deduplicate_links(site['Links'], strip_all_fragments) 47 | 48 | # Write the processed data 49 | with open(output_path, 'w', encoding='utf-8') as f: 50 | json.dump(data, f, ensure_ascii=False, indent=4) 51 | 52 | def main(): 53 | # Specify your input and output directories here 54 | input_dir = "output/http_merged" # Replace with your input directory path 55 | output_dir = "output/deduplication" # Replace with your output directory path 56 | 57 | # Set this to True to completely strip all fragments 58 | strip_all_fragments = True 59 | 60 | # Create output directory if it doesn't exist 61 | os.makedirs(output_dir, exist_ok=True) 62 | 63 | # Process each JSON file 64 | for filename in os.listdir(input_dir): 65 | if filename.endswith('.json'): 66 | input_path = os.path.join(input_dir, filename) 67 | output_path = os.path.join(output_dir, filename) 68 | 69 | try: 70 | process_file(input_path, output_path, strip_all_fragments) 71 | print(f"Processed {filename}") 72 | except Exception as e: 73 | print(f"Error processing {filename}: {str(e)}") 74 | 75 | if __name__ == "__main__": 76 | main() -------------------------------------------------------------------------------- /result_filtering/remove_hash.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from urllib.parse import urlparse, urlunparse 4 | 5 | def normalize_url(url): 6 | """ 7 | Normalize URL by removing the fragment (hashtag) portion. 8 | Returns the URL without the fragment. 9 | """ 10 | parsed = urlparse(url) 11 | # Create new parsed URL without fragment 12 | clean_parsed = parsed._replace(fragment='') 13 | return urlunparse(clean_parsed) 14 | 15 | def deduplicate_links(links): 16 | """ 17 | Deduplicate links by removing URLs that are the same except for hashtags. 
18 | """ 19 | # Create a dictionary to store unique normalized URLs 20 | unique_urls = {} 21 | 22 | # For each link, store only the first occurrence of its normalized version 23 | for link in links: 24 | normalized = normalize_url(link) 25 | if normalized not in unique_urls: 26 | unique_urls[normalized] = link 27 | 28 | # Return the deduplicated links in the same order they first appeared 29 | return list(unique_urls.values()) 30 | 31 | def process_file(input_path, output_path): 32 | """Process a single JSON file to deduplicate links.""" 33 | with open(input_path, 'r', encoding='utf-8') as f: 34 | data = json.load(f) 35 | 36 | # Process each site's links 37 | for site in data.get('Sites', []): 38 | if 'Links' in site: 39 | site['Links'] = deduplicate_links(site['Links']) 40 | 41 | # Write the processed data 42 | with open(output_path, 'w', encoding='utf-8') as f: 43 | json.dump(data, f, ensure_ascii=False, indent=4) 44 | 45 | def main(): 46 | # Specify your input and output directories here 47 | input_dir = "output/http_merged" # Replace with your input directory path 48 | output_dir = "output/deduplication" # Replace with your output directory path 49 | 50 | # Create output directory if it doesn't exist 51 | os.makedirs(output_dir, exist_ok=True) 52 | 53 | # Process each JSON file 54 | for filename in os.listdir(input_dir): 55 | if filename.endswith('.json'): 56 | input_path = os.path.join(input_dir, filename) 57 | output_path = os.path.join(output_dir, filename) 58 | 59 | try: 60 | process_file(input_path, output_path) 61 | print(f"Processed {filename}") 62 | except Exception as e: 63 | print(f"Error processing {filename}: {str(e)}") 64 | 65 | if __name__ == "__main__": 66 | main() -------------------------------------------------------------------------------- /search_dump/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cisnlp/GlotWeb/921fbf2f5e307f1f3e0927d078fa5afc95b74673/search_dump/.keep -------------------------------------------------------------------------------- /searxng/uwsgi.ini: -------------------------------------------------------------------------------- 1 | [uwsgi] 2 | # Who will run the code 3 | uid = searxng 4 | gid = searxng 5 | 6 | # Number of workers (usually CPU count) 7 | # default value: %k (= number of CPU core, see Dockerfile) 8 | workers = %k 9 | 10 | # Number of threads per worker 11 | # default value: 4 (see Dockerfile) 12 | threads = 4 13 | 14 | # The right granted on the created socket 15 | chmod-socket = 666 16 | 17 | # Plugin to use and interpreter config 18 | single-interpreter = true 19 | master = true 20 | plugin = python3 21 | lazy-apps = true 22 | enable-threads = 4 23 | 24 | # Module to import 25 | module = searx.webapp 26 | 27 | # Virtualenv and python path 28 | pythonpath = /usr/local/searxng/ 29 | chdir = /usr/local/searxng/searx/ 30 | 31 | # automatically set processes name to something meaningful 32 | auto-procname = true 33 | 34 | # Disable request logging for privacy 35 | disable-logging = true 36 | log-5xx = true 37 | 38 | # Set the max size of a request (request-body excluded) 39 | buffer-size = 9216 40 | 41 | # No keep alive 42 | # See https://github.com/searx/searx-docker/issues/24 43 | add-header = Connection: close 44 | 45 | # Follow SIGTERM convention 46 | # See https://github.com/searxng/searxng/issues/3427 47 | die-on-term 48 | 49 | # uwsgi serves the static files 50 | static-map = /static=/usr/local/searxng/searx/static 51 | # expires set to one day 52 | 
static-expires = /* 86400 53 | static-gzip-all = True 54 | offload-threads = 4 55 | --------------------------------------------------------------------------------