├── requirements.txt
├── run-me.sh
├── README.md
├── json2jsonl.py
└── ScrapeFandom.py

/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4>=4.9.0
requests>=2.24.0
webdriver-manager>=3.8.5
selenium>=4.8.2
tqdm>=4.64.0
lxml>=4.9.0
--------------------------------------------------------------------------------
/run-me.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Check that at least one fandom name was provided
if [ $# -eq 0 ]; then
    echo "Please provide one or more fandom names as arguments."
    exit 1
fi

# Loop through the fandom names and run the pipeline for each one
for value in "$@"; do
    echo "Running commands for value: $value"

    # Scrape the fandom and produce $value.xml
    python3 ScrapeFandom.py "$value"

    # Extract articles from the XML dump as JSON files under $value/
    wikiextractor "$value.xml" --no-templates -l --json -o "$value"

    # Convert the extracted JSON files into a single JSONL corpus
    python3 json2jsonl.py "$value/" "$value.jsonl"
done
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Scrape Fandom
Fandom.com provides Wiki dumps at https://*.fandom.com/wiki/Special:Statistics, but most of the dumps are outdated, and getting a fresh one requires contacting an admin.

This script scrapes Fandom.com for an up-to-date Wiki dump. It walks Special:AllPages to collect the list of article names and then requests a wiki dump of those articles from Special:Export. Instructions for turning the dump into a corpus for natural language processing and training are provided below.

It works only for English Fandom sites; other languages need some slight modifications.

# Notes
The Chrome browser must be installed on the machine. An up-to-date ChromeDriver is downloaded automatically by `webdriver-manager`.
The requirements.txt file lists all Python libraries that the scripts depend on; install them with:

`pip install -r requirements.txt`

# Instructions
1. Clone the extractor locally (https://github.com/JOHW85/wikiextractor) with
```git clone https://github.com/JOHW85/wikiextractor```
2. Open a terminal and change into the cloned directory: `cd wikiextractor`
3. Run
```python3 setup.py install```
4. Finally, run `bash run-me.sh FANDOM1 FANDOM2` in the terminal to get FANDOM1.jsonl and FANDOM2.jsonl in the directory; a sketch of the resulting record format is shown below.
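
Each line of the resulting `.jsonl` file is one article, stored as a JSON object with a `meta` field (the article's URL) and a `text` field (the article title prefixed with `#`, followed by the extracted text). The snippet below is a minimal sketch of reading a record back, assuming `FANDOM1.jsonl` has already been generated by the steps above:

```python
import json

# Read the first record of a generated corpus; json2jsonl.py writes one
# JSON object per line with "meta" (article URL) and "text" fields.
with open("FANDOM1.jsonl", "r") as f:
    record = json.loads(f.readline())

print(record["meta"])        # URL of the source article
print(record["text"][:200])  # "#<Article title>\n<article text>..."
```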
# Example
```bash run-me.sh harrypotter finalfantasy```
--------------------------------------------------------------------------------
/json2jsonl.py:
--------------------------------------------------------------------------------
# Modified from thaalesalves#8854 from NovelAI Discord
import json
import os
import re
import argparse
from tqdm import tqdm

parser = argparse.ArgumentParser()
parser.add_argument('input_dir', help='Directory with wikiextractor JSON output')
parser.add_argument('output', help='JSONL file output')
args = parser.parse_args()

# Matches <a href="...">label</a> links so they can be replaced by their label text
url_regex = r'<a href="(.*?)">(.*?)</a>'

# Get all subdirectories produced by wikiextractor in input_dir
directories = os.listdir(args.input_dir)
counter = 0
with open(args.output, 'w') as fout:
    for directory in tqdm(directories):
        dir_path = os.path.join(args.input_dir, directory)
        if not os.path.isdir(dir_path):
            continue
        for filename in tqdm(os.listdir(dir_path), desc="Processing " + directory):
            # wikiextractor names its output files wiki_00, wiki_01, ...
            if not filename.startswith('wiki'):
                continue

            path = os.path.join(dir_path, filename)
            with open(path, 'r') as fin:
                for line in fin:
                    data = json.loads(line)

                    # Skip articles with no extracted text (e.g. unprocessed templates)
                    if data['text'] == "":
                        continue

                    # Build one JSON record: the title as a '#' heading plus the cleaned text
                    title = "#" + data['title'] + "\n"
                    text = re.sub(url_regex, r'\2', data['text'])
                    text = re.sub(r'\(\s+', '(', text)
                    output_json = {
                        "meta": data["url"],
                        "text": title + text.replace('()', '').replace("\u00a0", " ").replace(" , ", ", ")
                    }
                    counter += 1
                    fout.write(json.dumps(output_json) + '\n')
print(counter)
--------------------------------------------------------------------------------
/ScrapeFandom.py:
--------------------------------------------------------------------------------
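# ScrapeFandom.py, in outline:
#   1. Walk https://<fandom>.fandom.com/wiki/Special:AllPages to collect article
#      titles, falling back to a headless Chrome session (via webdriver-manager)
#      when the plain HTTP request returns an empty body.
#   2. POST each batch of titles to Special:Export and save the returned
#      MediaWiki XML under <fandom>_raw/<counter>.xml.
#   3. Concatenate the raw chunks into a single <fandom>.xml for wikiextractor.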
import requests
from bs4 import BeautifulSoup
import argparse
from tqdm import tqdm
import errno
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


def mkdir_p(path):
    try:
        os.makedirs(path)
    except OSError as exc:
        # Ignore the error if the directory already exists
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise


def safe_open_w(path):
    '''Open "path" for writing, creating any parent directories as needed.'''
    mkdir_p(os.path.dirname(path))
    return open(path, 'wb')


# Headless Chrome options used as a fallback when plain requests return nothing
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("disable-infobars")
chrome_options.add_argument('--disable-extensions')
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument('--no-sandbox')

parser = argparse.ArgumentParser()
parser.add_argument('input_fandom', help="Fandom's name (the subdomain, e.g. 'harrypotter' for harrypotter.fandom.com)")
args = parser.parse_args()
fandom_site = args.input_fandom

# Walk Special:AllPages to collect article titles
nextpage_url = "/wiki/Special:AllPages"
AllPage = "https://" + fandom_site + ".fandom.com" + nextpage_url
counter = 0
while nextpage_url != "":
    listofpages = ""
    try:
        req = requests.get(AllPage, allow_redirects=False)
        if req.content != b'':
            soup = BeautifulSoup(req.content, "lxml")
            content = soup.find("div", {"class": "mw-allpages-body"})
            nextpage = soup.find("div", {"class": "mw-allpages-nav"})
        else:
            # Empty response: retry the same page with headless Chrome
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
            driver.get(AllPage)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            content = soup.find("div", {"class": "mw-allpages-body"})
            nextpage = soup.find("div", {"class": "mw-allpages-nav"})
            driver.quit()
        if content:
            listofentries = content.find_all("li")
            for i in tqdm(listofentries, desc="Scraping " + AllPage):
                listofpages += i.text.replace("(redirect", "") + "\n"
            # Request an XML export of the scraped page titles from Special:Export
            payload = {'catname': '', 'pages': listofpages, 'curonly': '1', 'wpDownload': 1, 'wpEditToken': '+\\', 'title': 'Special:Export'}
            response = requests.post("https://" + fandom_site + ".fandom.com/wiki/Special:Export", data=payload)
            data = response.content
            # safe_open_w creates the '<fandom_site>_raw' directory if it doesn't exist
            with safe_open_w(f"{fandom_site}_raw/{counter}.xml") as f:
                f.write(data)
            counter += 1
        else:
            print("No content found")
        # Follow the "Next page" link on Special:AllPages, if any
        if nextpage:
            nav = nextpage.find_all("a")
            if len(nav) > 0:
                if "Next page" in nav[-1].text:
                    nextpage_url = nav[-1]["href"]
                    AllPage = "https://" + fandom_site + ".fandom.com" + nextpage_url
                else:
                    nextpage_url = ""
                    break
            else:
                # Break if there's only one index page on AllPages
                break
        else:
            # No navigation block found: stop instead of re-fetching the same page
            break
    except Exception as e:
        print("Error", e)
        continue

# Combine all the exported MediaWiki XML chunks into one file
files = os.listdir(f"{fandom_site}_raw")
with open(f"{fandom_site}.xml", "w") as outfile:
    for fname in files:
        with open(f"{fandom_site}_raw/{fname}", "r") as infile:
            # Append the contents of each chunk to the combined file
            outfile.write(infile.read())
            outfile.write("\n")
--------------------------------------------------------------------------------