├── README.md
├── minecraftversionlister.py
├── .gitignore
├── mcarchivescraper.py
└── cursescraper.py


/README.md:
--------------------------------------------------------------------------------
1 | Archiving Utilities
2 | 
3 | These are made for https://files.pymcl.net/archive and https://pymcl.net
4 | 


--------------------------------------------------------------------------------
/minecraftversionlister.py:
--------------------------------------------------------------------------------
 1 | import bs4
 2 | import json
 3 | import simplejson
 4 | 
 5 | file = open("page.html", "r")
 6 | 
 7 | page = bs4.BeautifulSoup(file.read(), "html.parser")
 8 | 
 9 | lis = []
10 | files = {}
11 | 
12 | for li in page.find_all("li"):
13 | 
14 |     links = li.find_all("a")
15 | 
16 |     for link in links:
17 |         url = link.get("href")
18 | 
19 |         if url.__contains__("launcher.mojang.com") and url.__contains__("client.jar"):
20 |             files[li.get("id")] = url.split("objects/")[1].split("/client.jar")[0]
21 | 
22 | file.close()
23 | file = open("out.json", "w")
24 | 
25 | file.write(simplejson.dumps(files, indent=4))
26 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | venv/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | 
47 | # Translations
48 | *.mo
49 | *.pot
50 | 
51 | # Django stuff:
52 | *.log
53 | 
54 | # Sphinx documentation
55 | docs/_build/
56 | 
57 | # PyBuilder
58 | target/
59 | 
60 | \.idea/
61 | 
62 | cursescraper_data/
63 | 
64 | mcarchivescraper_data/
65 | 


--------------------------------------------------------------------------------
/mcarchivescraper.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import requests
 3 | import bs4
 4 | import io
 5 | import urllib3
 6 | 
 7 | def match_class(target):
 8 |     def do_match(tag):
 9 |         classes = tag.get('class', [])
10 |         return all(c in classes for c in target)
11 |     return do_match
12 | 
# Base URL of the archive; the mod links collected below are appended to it.
site = "https://mcarchive.net"
# hrefs of the individual mod pages, filled from the /mods listing.
mods = []
# Directory all downloads are written to (the script chdir()s into it).
parentDir = "./mcarchivescraper_data"

# exist_ok tolerates re-runs while still surfacing real failures
# (e.g. missing write permission) instead of hiding them behind a bare except.
os.makedirs(parentDir, exist_ok=True)
os.chdir(parentDir)

with requests.get(site + "/mods") as response:
    siteDict = bs4.BeautifulSoup(response.text, "html.parser")

    # The first element with class "block" is deliberately skipped
    # (presumably it is not a mod entry -- TODO confirm against the page).
    for block in siteDict.find_all(match_class(["block"]))[1:]:
        anchor = block.find("a")
        href = anchor.get("href") if anchor is not None else None
        # Blocks without a usable link cannot be visited later; ignore them.
        if href:
            mods.append(href)
36 | 
# Number of files actually fetched during this run (skipped files excluded).
downloaded = 0

for mod in mods:
    with requests.get(site + mod) as response:
        page = bs4.BeautifulSoup(response.text, "html.parser")
    print("Parsing " + site + mod)

    for link in page.find_all(match_class(["btn"])):
        # Only buttons whose first child is the text "IPFS Download" qualify.
        # Buttons that are empty or start with a nested tag are ignored
        # instead of raising.
        label = link.contents[0] if link.contents else None
        if not isinstance(label, str) or label.strip() != "IPFS Download":
            continue

        href = link.get("href")
        if not href:
            continue

        # The local file name is resolved before the try block so the cleanup
        # below can only ever remove the file belonging to this download.
        name = os.path.basename(href)
        if os.path.exists(name):
            print("\"" + name + "\" exists. Skipping.")
            continue

        print("Trying to download \"" + name + "\"")
        try:
            with requests.get(href, stream=True, timeout=20) as download:
                # Do not store an HTTP error page under the mod's file name.
                download.raise_for_status()
                with io.open(name, 'wb') as fd:
                    for chunk in download.iter_content(chunk_size=4096):
                        fd.write(chunk)
        except Exception as e:
            # Best effort: report the problem, drop the partial file, move on.
            print(e)
            try:
                os.unlink(name)
            except OSError:
                pass
        else:
            downloaded += 1

print("Downloaded " + str(downloaded) + (" mod." if downloaded == 1 else " mods."))


--------------------------------------------------------------------------------
/cursescraper.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Exit codes:
  3 | -1: User termination.
  4 | 0: Normal exit.
  5 | 1: Generic exception.
  6 | 2: Invalid arguments.
  7 | 3: Missing write access.
  8 | """
  9 | 
# Version string shown in the start-up banners.
utilversion = "v0.3.4"
# Printed at import time, before the remaining imports are executed.
print("Cursescraper " + utilversion + " starting...")
 12 | 
 13 | import sys
 14 | import os
 15 | import requests
 16 | import getopt
 17 | import traceback
 18 | import bs4
 19 | import io
 20 | import shutil
 21 | import time
 22 | import contextlib
 23 | import json
 24 | 
# Usage text printed for -h and when command-line parsing fails.
helpText = """Usage: cursescraper (params)

Accepted params:
    -h: Shows this.
    -p, --parentdir <path>: Use the directory specified."""

# Default data directory; replaced by the -p / --parentdir argument when given.
parentDir = "cursescraper_data"

# Non-alphanumeric characters that survive file-name sanitising in start().
keepcharacters = (" ", ".", "_", "(", ")", "[", "]", "{", "}", "-")
 34 | 
 35 | 
 36 | def getURLAsFile(url, filepath):
 37 |     print("Getting \"" + url + "\" and storing it in \"" + filepath + "\".")
 38 |     with requests.get(url) as response:
 39 |         if response.status_code == 200:
 40 |             with open(filepath, "w") as file:
 41 |                 file.write(response.content)
 42 |         else:
 43 |             print("URL responded with error code " + str(response.status_code) + " for URL \"" + url + "\".")
 44 | 
 45 | 
 46 | def match_class(target):
 47 |     def do_match(tag):
 48 |         classes = tag.get('class', [])
 49 |         return all(c in classes for c in target)
 50 |     return do_match
 51 | 
 52 | 
 53 | def download(url, name, newDir=None):
 54 |     if not os.path.exists(newDir + "/" + name):
 55 |         try:
 56 |             with requests.get(url, stream=True) as response:
 57 |                 total_length = int(response.headers.get("content-length"))
 58 |                 dl = 0
 59 |                 if response.content is None:
 60 |                     raise ConnectionError
 61 |                 try:
 62 |                     os.mkdir("tmp")
 63 |                 except:
 64 |                     pass
 65 | 
 66 |                 with io.open("tmp/" + name, 'wb') as fd:
 67 |                     oldDone = 0
 68 |                     for chunk in response.iter_content(chunk_size=4096):
 69 |                         dl += len(chunk)
 70 |                         fd.write(chunk)
 71 |                         done = int(50 * dl / total_length)
 72 |                         if done != oldDone:
 73 |                             print("\r[%s%s]" % ("=" * done, " " * (50 - done)), end="")
 74 |                             oldDone = done
 75 | 
 76 |                 print("")
 77 | 
 78 |             if newDir:
 79 |                 shutil.move("tmp/" + name, newDir)
 80 |         except Exception as e:
 81 |             print("An exception ocurred:")
 82 |             traceback.print_exc()
 83 | 
 84 |     else:
 85 |         print("File \"" + newDir + "/" + name + "\" already exists! Skipping.")
 86 | 
 87 | 
 88 | def start():
 89 |     # Insert startup thingy here.
 90 |     print("Cursescraper " + utilversion + " started!\n\n")
 91 | 
 92 |     print("What minecraft curse site do you want to scrape?\n1: Bukkit\n2: Mods\n3: Modpacks\n4: Custom URL\n5: Exit\n\nFormat: <option>:<version> e.g. 1:CB 1060")
 93 |     selection = " "
 94 | 
 95 |     while not any(selection.split(":")[0] in s for s in ["1", "2", "3", "4", "5"]):
 96 |         selection = str(input("> ")).strip()
 97 |         #print("> 1:CB 1060")
 98 |         #selection = "1:CB 1060"
 99 | 
100 |     site = None
101 |     siteSub = None
102 |     cfSite = None
103 |     mcVersion = None
104 | 
105 |     if selection.split(":")[0] == "1":
106 |         site = "https://dev.bukkit.org/"
107 |         siteSub = "projects"
108 |         siteRepo = "bukkit-plugins"
109 |         cfSite = "https://api.cfwidget.com/minecraft/bukkit-plugins/"
110 | 
111 |     try:
112 |         mcVersion = selection.split(":")[1]
113 |         siteName = site.split("/")[2]
114 |     except:
115 |         print("Version not given. Exiting...")
116 |         sys.exit(2)
117 | 
118 |     if not site or not cfSite or not mcVersion or not siteSub:
119 |         print("Invalid arguments. Exiting...")
120 |         sys.exit(2)
121 | 
122 |     print("Getting versions")
123 |     with requests.get(site + siteRepo) as response:
124 |         versionElements = bs4.BeautifulSoup(response.text, "html.parser").find(id="filter-game-version").find_all("option")[1:]
125 |         versions = {}
126 |         for element in versionElements:
127 |             versionID = element.get("value")
128 |             element = str(element)
129 |             versions[element.split("\xa0\xa0")[1].split("<")[0].strip()] = versionID
130 | 
131 | 
132 |     with requests.get(site + siteRepo + "?filter-game-version=" + versions[mcVersion]) as response:
133 |         try:
134 |             pages = bs4.BeautifulSoup(response.text, "html.parser").find_all(match_class(["b-pagination-list", "paging-list", "j-tablesorter-pager", "j-listing-pagination"]))[0]
135 |             pages = bs4.BeautifulSoup(str(pages), "html.parser").find_all(match_class(["b-pagination-item"]))[-1]
136 |             pages = int(str(pages).split("page=")[1].split("\"")[0])
137 |         except IndexError:
138 |             pages = 1
139 | 
140 |     if pages == 1:
141 |         pagesOrPage = "page"
142 |     else:
143 |         pagesOrPage = "pages"
144 |     print(str(pages) + " {0} found.".format(pagesOrPage))
145 |     projects = {}
146 | 
147 |     for index in range(pages):
148 |         print("Getting page " + str(index+1) + "'s info.")
149 |         with requests.get(site + siteRepo + "?filter-game-version=" + versions[mcVersion] + "&page=" + str(index+1)) as response:  # Add +1 cause page 0 is same as page 1.
150 |             if response.status_code != 200:
151 |                 break
152 |             else:
153 |                 page = bs4.BeautifulSoup(response.text, "html.parser")
154 |                 page = page.findAll(match_class(["project-list-item"]))
155 |                 count = 0
156 |                 for project in page:
157 |                     count += 1
158 |                     projectInfo = project.find(match_class(["name-wrapper", "overflow-tip"]))
159 |                     projects[projectInfo.find("a").get_text().strip()] = {}
160 |                     projects[projectInfo.find("a").get_text().strip()]["url"] = projectInfo.find("a").get("href").split("/")[2]
161 |                 print("Parsed " + str(count) + " projects.\n")
162 | 
163 |     try:
164 |         os.mkdir(siteName)
165 |     except:
166 |         pass
167 |     try:
168 |         os.mkdir(siteName + "/" + mcVersion)
169 |     except:
170 |         pass
171 | 
172 |     for project in projects:
173 |         project = projects[project]
174 |         print("Downloading all versions for " + project["url"] + " on version " + mcVersion)
175 |         with requests.get(cfSite + project["url"]) as response:
176 |             if response.status_code == 202:
177 |                 print("API retrieving info. Waiting 5 seconds.")
178 |                 time.sleep(5)
179 |                 response = requests.get(cfSite + project["url"])
180 | 
181 |             if response.status_code != 200:
182 |                 print("Error " + str(response.status_code) + " occurred. Skipping file.")
183 |             else:
184 |                 for file in json.loads(response.content)["files"]:
185 |                     if mcVersion in [p.strip() for p in file["versions"]]:
186 |                         with contextlib.closing(requests.get(site + siteSub + "/" + project["url"] + "/files/" + str(file["id"]), stream=True)) as res:
187 |                             fileName = None
188 |                             buffer = ""
189 |                             for chunk in res.iter_content(chunk_size=2048, decode_unicode=True):
190 |                                 buffer = "".join([buffer, chunk])
191 |                                 siteHeader = bs4.BeautifulSoup(buffer, "html.parser")
192 |                                 try:
193 |                                     match = siteHeader.find("title")
194 |                                 except:
195 |                                     match = None
196 | 
197 |                                 if match:
198 |                                     match = str(match)
199 |                                     fileName = match.split(" - ")
200 |                                     print(fileName[0].lower().replace("<title>", ""))
201 |                                     if fileName[0].lower().replace("<title>", "") == "archive":
202 |                                         fileName = fileName[3] + "-" + fileName[1].lower().replace("<title>", "").replace(fileName[3].lower(), "").strip() + ".jar"
203 |                                     else:
204 |                                         fileName = fileName[2] + "-" + fileName[0].lower().replace("<title>", "").replace(fileName[2].lower(), "").strip() + ".jar"
205 |                                     fileName = "".join(c for c in fileName if c.isalnum() or c in keepcharacters).rstrip()
206 |                                     break
207 | 
208 |                         try:
209 |                             os.mkdir(siteName + "/" + mcVersion + "/" + project["url"])
210 |                         except:
211 |                             pass
212 |                         print("Downloading " + fileName)
213 |                         download(site + siteSub + "/" + project["url"] + "/files/" + str(file["id"]) + "/download", fileName, siteName + "/" + mcVersion + "/" + project["url"])
214 | 
215 | 
if __name__ == '__main__':
    # Command-line entry point: parse options, prepare the data directory,
    # switch into it and start the interactive scraper.
    argv = sys.argv[1:]
    try:
        opts, args = getopt.getopt(argv, "hp:", ["parentdir="])
    except getopt.GetoptError:
        print(helpText)
        sys.exit(2)

    for opt, arg in opts:
        if opt == "-h":
            print(helpText)
            sys.exit()
        elif opt in ("-p", "--parentdir"):
            parentDir = arg

    # exist_ok replaces matching on the error text, which differs between
    # platforms ("File exists" on POSIX), so re-runs no longer exit with 3.
    try:
        os.makedirs(parentDir, exist_ok=True)
    except OSError:
        traceback.print_exc()
        print("Could not make data folder.\nDo you have write access?")
        sys.exit(3)

    os.chdir(parentDir)

    # download() stages files in "tmp" relative to the working directory, so
    # leftovers must be cleared after the chdir. Doing it before would delete
    # an unrelated ./tmp in the directory the script was launched from.
    shutil.rmtree("tmp", ignore_errors=True)

    start()
247 | 


--------------------------------------------------------------------------------