├── requirements.txt
├── run-me.sh
├── README.md
├── json2jsonl.py
└── ScrapeFandom.py

/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4>=4.9.0
requests>=2.24.0
webdriver-manager>=3.8.5
selenium>=4.8.2
tqdm>=4.64.0
lxml>=4.9.0
--------------------------------------------------------------------------------
/run-me.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Check that at least one fandom name was provided
if [ $# -eq 0 ]; then
    echo "Please provide one or more fandom names as arguments."
    exit 1
fi

# Loop through the fandom names and run the pipeline for each one
for value in "$@"; do
    echo "Running commands for value: $value"

    # Scrape the fandom and produce $value.xml
    python3 ScrapeFandom.py "$value"

    # Extract articles from the XML dump as JSON files under $value/
    wikiextractor "$value.xml" --no-templates -l --json -o "$value"

    # Convert the extracted JSON files into a single JSONL corpus
    python3 json2jsonl.py "$value/" "$value.jsonl"
done
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Scrape Fandom
Fandom.com provides Wiki dumps at https://*.fandom.com/wiki/Special:Statistics, but most of the dumps are outdated, and getting a fresh one requires contacting an admin.

This script scrapes Fandom.com for an up-to-date Wiki dump. It walks Special:AllPages to collect the list of article names and then requests a wiki dump of those articles from Special:Export. Instructions for turning the dump into a corpus for natural language processing and training are provided below.

It works only for English Fandom sites; other languages need some slight modifications.

# Notes
The Chrome browser must be installed on the machine. An up-to-date ChromeDriver is downloaded automatically by `webdriver-manager`.
The requirements.txt file lists all Python libraries that the scripts depend on; install them with:

`pip install -r requirements.txt`

# Instructions
1. Clone the extractor locally (https://github.com/JOHW85/wikiextractor) with
```git clone https://github.com/JOHW85/wikiextractor```
2. Open a terminal and change into the cloned directory: `cd wikiextractor`
3. Run
```python3 setup.py install```
4. Finally, run `bash run-me.sh FANDOM1 FANDOM2` in the terminal to get FANDOM1.jsonl and FANDOM2.jsonl in the directory; a sketch of the resulting record format is shown below.
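
Each line of the resulting `.jsonl` file is one article, stored as a JSON object with a `meta` field (the article's URL) and a `text` field (the article title prefixed with `#`, followed by the extracted text). The snippet below is a minimal sketch of reading a record back, assuming `FANDOM1.jsonl` has already been generated by the steps above:

```python
import json

# Read the first record of a generated corpus; json2jsonl.py writes one
# JSON object per line with "meta" (article URL) and "text" fields.
with open("FANDOM1.jsonl", "r") as f:
    record = json.loads(f.readline())

print(record["meta"])        # URL of the source article
print(record["text"][:200])  # "#<Article title>\n<article text>..."
```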
# Example
```bash run-me.sh harrypotter finalfantasy```
--------------------------------------------------------------------------------
/json2jsonl.py:
--------------------------------------------------------------------------------
# Modified from thaalesalves#8854 from NovelAI Discord
import json
import os
import re
import argparse
from tqdm import tqdm

parser = argparse.ArgumentParser()
parser.add_argument('input_dir', help='Directory with wikiextractor JSON output')
parser.add_argument('output', help='JSONL file output')
args = parser.parse_args()

# Matches <a href="...">label</a> links so they can be replaced by their label text
url_regex = r'<a href="(.*?)">(.*?)</a>'

# Get all subdirectories produced by wikiextractor in input_dir
directories = os.listdir(args.input_dir)
counter = 0
with open(args.output, 'w') as fout:
    for directory in tqdm(directories):
        dir_path = os.path.join(args.input_dir, directory)
        if not os.path.isdir(dir_path):
            continue
        for filename in tqdm(os.listdir(dir_path), desc="Processing " + directory):
            # wikiextractor names its output files wiki_00, wiki_01, ...
            if not filename.startswith('wiki'):
                continue

            path = os.path.join(dir_path, filename)
            with open(path, 'r') as fin:
                for line in fin:
                    data = json.loads(line)

                    # Skip articles with no extracted text (e.g. unprocessed templates)
                    if data['text'] == "":
                        continue

                    # Build one JSON record: the title as a '#' heading plus the cleaned text
                    title = "#" + data['title'] + "\n"
                    text = re.sub(url_regex, r'\2', data['text'])
                    text = re.sub(r'\(\s+', '(', text)
                    output_json = {
                        "meta": data["url"],
                        "text": title + text.replace('()', '').replace("\u00a0", " ").replace(" , ", ", ")
                    }
                    counter += 1
                    fout.write(json.dumps(output_json) + '\n')
print(counter)
--------------------------------------------------------------------------------
/ScrapeFandom.py:
--------------------------------------------------------------------------------
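# ScrapeFandom.py, in outline:
#   1. Walk https://<fandom>.fandom.com/wiki/Special:AllPages to collect article
#      titles, falling back to a headless Chrome session (via webdriver-manager)
#      when the plain HTTP request returns an empty body.
#   2. POST each batch of titles to Special:Export and save the returned
#      MediaWiki XML under <fandom>_raw/<counter>.xml.
#   3. Concatenate the raw chunks into a single <fandom>.xml for wikiextractor.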
import requests
from bs4 import BeautifulSoup
import argparse
from tqdm import tqdm
import errno
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


def mkdir_p(path):
    try:
        os.makedirs(path)
    except OSError as exc:
        # Ignore the error if the directory already exists
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise


def safe_open_w(path):
    '''Open "path" for writing, creating any parent directories as needed.'''
    mkdir_p(os.path.dirname(path))
    return open(path, 'wb')


# Headless Chrome options used as a fallback when plain requests return nothing
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("disable-infobars")
chrome_options.add_argument('--disable-extensions')
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument('--no-sandbox')

parser = argparse.ArgumentParser()
parser.add_argument('input_fandom', help="Fandom's name (the subdomain, e.g. 'harrypotter' for harrypotter.fandom.com)")
args = parser.parse_args()
fandom_site = args.input_fandom

# Walk Special:AllPages to collect article titles
nextpage_url = "/wiki/Special:AllPages"
AllPage = "https://" + fandom_site + ".fandom.com" + nextpage_url
counter = 0
while nextpage_url != "":
    listofpages = ""
    try:
        req = requests.get(AllPage, allow_redirects=False)
        if req.content != b'':
            soup = BeautifulSoup(req.content, "lxml")
            content = soup.find("div", {"class": "mw-allpages-body"})
            nextpage = soup.find("div", {"class": "mw-allpages-nav"})
        else:
            # Empty response: retry the same page with headless Chrome
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
            driver.get(AllPage)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            content = soup.find("div", {"class": "mw-allpages-body"})
            nextpage = soup.find("div", {"class": "mw-allpages-nav"})
            driver.quit()
        if content:
            listofentries = content.find_all("li")
            for i in tqdm(listofentries, desc="Scraping " + AllPage):
                listofpages += i.text.replace("(redirect", "") + "\n"
            # Request an XML export of the scraped page titles from Special:Export
            payload = {'catname': '', 'pages': listofpages, 'curonly': '1', 'wpDownload': 1, 'wpEditToken': '+\\', 'title': 'Special:Export'}
            response = requests.post("https://" + fandom_site + ".fandom.com/wiki/Special:Export", data=payload)
            data = response.content
            # safe_open_w creates the '<fandom_site>_raw' directory if it doesn't exist
            with safe_open_w(f"{fandom_site}_raw/{counter}.xml") as f:
                f.write(data)
            counter += 1
        else:
            print("No content found")
        # Follow the "Next page" link on Special:AllPages, if any
        if nextpage:
            nav = nextpage.find_all("a")
            if len(nav) > 0:
                if "Next page" in nav[-1].text:
                    nextpage_url = nav[-1]["href"]
                    AllPage = "https://" + fandom_site + ".fandom.com" + nextpage_url
                else:
                    nextpage_url = ""
                    break
            else:
                # Break if there's only one index page on AllPages
                break
        else:
            # No navigation block found: stop instead of re-fetching the same page
            break
    except Exception as e:
        print("Error", e)
        continue

# Combine all the exported MediaWiki XML chunks into one file
files = os.listdir(f"{fandom_site}_raw")
with open(f"{fandom_site}.xml", "w") as outfile:
    for fname in files:
        with open(f"{fandom_site}_raw/{fname}", "r") as infile:
            # Append the contents of each chunk to the combined file
            outfile.write(infile.read())
            outfile.write("\n")
--------------------------------------------------------------------------------