├── README.md ├── minecraftversionlister.py ├── .gitignore ├── mcarchivescraper.py └── cursescraper.py /README.md: -------------------------------------------------------------------------------- 1 | Archiving Utilities 2 | 3 | These are made for https://files.pymcl.net/archive and https://pymcl.net 4 | -------------------------------------------------------------------------------- /minecraftversionlister.py: -------------------------------------------------------------------------------- 1 | import bs4 2 | import json 3 | import simplejson 4 | 5 | file = open("page.html", "r") 6 | 7 | page = bs4.BeautifulSoup(file.read(), "html.parser") 8 | 9 | lis = [] 10 | files = {} 11 | 12 | for li in page.find_all("li"): 13 | 14 | links = li.find_all("a") 15 | 16 | for link in links: 17 | url = link.get("href") 18 | 19 | if url.__contains__("launcher.mojang.com") and url.__contains__("client.jar"): 20 | files[li.get("id")] = url.split("objects/")[1].split("/client.jar")[0] 21 | 22 | file.close() 23 | file = open("out.json", "w") 24 | 25 | file.write(simplejson.dumps(files, indent=4)) 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | venv/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | 47 | # Translations 48 | *.mo 49 | *.pot 50 | 51 | # Django stuff: 52 | *.log 53 | 54 | # Sphinx documentation 55 | docs/_build/ 56 | 57 | # PyBuilder 58 | target/ 59 | 60 | \.idea/ 61 | 62 | cursescraper_data/ 63 | 64 | mcarchivescraper_data/ 65 | -------------------------------------------------------------------------------- /mcarchivescraper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | import bs4 4 | import io 5 | import urllib3 6 | 7 | def match_class(target): 8 | def do_match(tag): 9 | classes = tag.get('class', []) 10 | return all(c in classes for c in target) 11 | return do_match 12 | 13 | site = "https://mcarchive.net" 14 | mods = [] 15 | parentDir = "./mcarchivescraper_data" 16 | 17 | try: 18 | os.mkdir(parentDir) 19 | except: 20 | pass 21 | 22 | os.chdir(parentDir) 23 | first = True 24 | 25 | with requests.get(site + "/mods") as response: 26 | siteData = response.text 27 | 28 | siteDict = bs4.BeautifulSoup(siteData, "html.parser") 29 | 30 | for mod in siteDict.findAll(match_class(["block"])): 31 | if first: 32 | first = False 33 | else: 34 | mod = mod.find("a").get("href") 35 | mods.append(mod) 36 | 37 | downloaded = 0 38 | 39 | for mod in mods: 40 | with requests.get(site + mod) as response: 41 | page = bs4.BeautifulSoup(response.text, "html.parser") 42 | print("Parsing " + site + mod) 43 | for link in page.findAll(match_class(["btn"])): 44 | if link.contents[0].strip() == "IPFS Download": 45 | try: 46 | name = os.path.basename(link.get("href")) 47 | if os.path.exists(name): 48 | print("\"" + name + "\" exists. Skipping.") 49 | else: 50 | print("Trying to download \"" + name + "\"") 51 | with requests.get(link.get("href"), stream=True, timeout=20) as response: 52 | with io.open(name, 'wb') as fd: 53 | oldDone = 0 54 | for chunk in response.iter_content(chunk_size=4096): 55 | fd.write(chunk) 56 | downloaded += 1 57 | except Exception as e: 58 | print(e) 59 | try: 60 | os.unlink(name) 61 | except: 62 | pass 63 | 64 | print("Downloaded " + str(downloaded) + (" mod." if downloaded == 1 else "mods.")) -------------------------------------------------------------------------------- /cursescraper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Exit codes: 3 | -1: User termination. 4 | 0: Normal exit. 5 | 1: Generic exception. 6 | 2: Invalid arguments. 7 | 3: Missing write access. 8 | """ 9 | 10 | utilversion = "v0.3.4" 11 | print("Cursescraper " + utilversion + " starting...") 12 | 13 | import sys 14 | import os 15 | import requests 16 | import getopt 17 | import traceback 18 | import bs4 19 | import io 20 | import shutil 21 | import time 22 | import contextlib 23 | import json 24 | 25 | helpText = """Usage: cursescraper (params) 26 | 27 | Accepted params: 28 | -h: Shows this. 29 | -p, --parentdir : Use the directory specified.""" 30 | 31 | parentDir = "cursescraper_data" 32 | 33 | keepcharacters = (" ", ".", "_", "(", ")", "[", "]", "{", "}", "-") 34 | 35 | 36 | def getURLAsFile(url, filepath): 37 | print("Getting \"" + url + "\" and storing it in \"" + filepath + "\".") 38 | with requests.get(url) as response: 39 | if response.status_code == 200: 40 | with open(filepath, "w") as file: 41 | file.write(response.content) 42 | else: 43 | print("URL responded with error code " + str(response.status_code) + " for URL \"" + url + "\".") 44 | 45 | 46 | def match_class(target): 47 | def do_match(tag): 48 | classes = tag.get('class', []) 49 | return all(c in classes for c in target) 50 | return do_match 51 | 52 | 53 | def download(url, name, newDir=None): 54 | if not os.path.exists(newDir + "/" + name): 55 | try: 56 | with requests.get(url, stream=True) as response: 57 | total_length = int(response.headers.get("content-length")) 58 | dl = 0 59 | if response.content is None: 60 | raise ConnectionError 61 | try: 62 | os.mkdir("tmp") 63 | except: 64 | pass 65 | 66 | with io.open("tmp/" + name, 'wb') as fd: 67 | oldDone = 0 68 | for chunk in response.iter_content(chunk_size=4096): 69 | dl += len(chunk) 70 | fd.write(chunk) 71 | done = int(50 * dl / total_length) 72 | if done != oldDone: 73 | print("\r[%s%s]" % ("=" * done, " " * (50 - done)), end="") 74 | oldDone = done 75 | 76 | print("") 77 | 78 | if newDir: 79 | shutil.move("tmp/" + name, newDir) 80 | except Exception as e: 81 | print("An exception ocurred:") 82 | traceback.print_exc() 83 | 84 | else: 85 | print("File \"" + newDir + "/" + name + "\" already exists! Skipping.") 86 | 87 | 88 | def start(): 89 | # Insert startup thingy here. 90 | print("Cursescraper " + utilversion + " started!\n\n") 91 | 92 | print("What minecraft curse site do you want to scrape?\n1: Bukkit\n2: Mods\n3: Modpacks\n4: Custom URL\n5: Exit\n\nFormat: