├── requirements.txt
├── .github
└── workflows
│ └── pylint.yml
├── thmq2md.py
└── README.md
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.11.1
2 | selenium==4.7.2
3 | webdriver_manager
4 | html5lib
5 |
--------------------------------------------------------------------------------
/.github/workflows/pylint.yml:
--------------------------------------------------------------------------------
1 | name: Pylint
2 |
3 | on: [push]
4 |
5 | jobs:
6 | build:
7 |
8 | runs-on: ubuntu-latest
9 |
10 | steps:
11 | - uses: actions/checkout@v3
12 | - name: Set up Python 3.10
13 | uses: actions/setup-python@v3
14 | with:
15 | python-version: "3.10"
16 | - name: Install dependencies
17 | run: |
18 | python -m pip install --upgrade pip
19 | pip install flake8 pytest
20 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
21 | - name: Lint with flake8
22 | run: |
23 | # stop the build if there are Python syntax errors or undefined names
24 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
25 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
26 | flake8 ./*.py --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
27 |
--------------------------------------------------------------------------------
/thmq2md.py:
--------------------------------------------------------------------------------
1 | #!/bin/python3
2 | from selenium import webdriver
3 | from bs4 import BeautifulSoup
4 | import re, sys
5 | from selenium.webdriver.firefox.service import Service
6 | from webdriver_manager.firefox import GeckoDriverManager
7 |
8 |
9 |
# Text written once at the very top of the generated file (main() skips it
# when "-raw" is given).
style = "\n"

# Two-element [opening, closing] wrappers placed around pieces of the output.
# main() reads index 0 and index 1 of the ones it uses, so every list needs
# exactly two entries.
T = [" ", " "]  # around each task title
A = [" ", " "]  # around the "answer here" placeholder
Q = [" ", " "]  # not read by main() in this file
H = [" ", " "]  # around the room heading; main() indexes H[1], so two entries are required
N = [" ", " "]  # not read by main() in this file
16 |
17 |
18 |
19 |
20 |
def getTotalQ(data):
    """Return an iterator of regex matches, one per ``task-<digits>`` occurrence in *data*."""
    task_pattern = re.compile(r"task-\d+")
    return task_pattern.finditer(data)
23 |
24 |
def getData(url):
    """Load *url* in a headless Firefox instance and return the page source.

    The geckodriver binary is obtained through ``GeckoDriverManager().install()``.

    Args:
        url: Address handed to the browser's ``get``.

    Returns:
        The browser's ``page_source`` after the page has been loaded.
    """
    fireFoxOptions = webdriver.FirefoxOptions()
    # Pass the flag as a command-line argument; this works across Selenium 4
    # releases, whereas the ``headless`` attribute setter was later deprecated.
    fireFoxOptions.add_argument("-headless")
    browser = webdriver.Firefox(
        options=fireFoxOptions,
        service=Service(executable_path=GeckoDriverManager().install()),
    )
    try:
        browser.get(url)
        return browser.page_source
    finally:
        # Always shut the browser down, even when loading the page fails,
        # so no Firefox/geckodriver process is left behind.
        browser.quit()
33 |
def extractData(tag, data="", clas="", url="", id1=""):
    """Parse HTML with html5lib and return every *tag* element matching the filters.

    Args:
        tag: Element name to search for.
        data: HTML text to parse. When empty, the page at *url* is fetched
            with ``getData`` and parsed instead.
        clas: If non-empty, only elements with this ``class`` are returned.
        url: Page to fetch when *data* is empty.
        id1: If non-empty, only elements with this ``id`` are returned.

    Returns:
        The result of ``find_all`` on the parsed document.
    """
    html = data if data != "" else getData(url)
    contents = BeautifulSoup(html, 'html5lib')

    # Build ONE attribute filter. Passing the class filter as a separate third
    # positional argument would bind it to find_all's ``recursive`` parameter
    # and the class restriction would be ignored.
    attrs = {}
    if id1 != "":
        attrs["id"] = id1
    if clas != "":
        attrs["class"] = clas
    return contents.find_all(tag, attrs)
56 |
57 |
def filtering(data, tag):
    """Return an iterator of matches for ``<tag>...</tag>`` spans in *data*.

    Group 1 of each match is the text between the opening and closing tag.
    ``.`` does not cross line breaks, so only spans that open and close on the
    same line are found; the capture is greedy.

    Args:
        data: Text to scan.
        tag: Element name whose contents should be captured.

    Returns:
        The iterator produced by ``re.finditer``.
    """
    # NOTE(review): the pattern literal here was split across lines with its
    # markup apparently lost; it is rebuilt from the previously unused ``tag``
    # argument -- confirm against the intended pattern.
    name = re.escape(tag)
    return re.finditer(rf"<{name}>(.*)</{name}>", data)
61 |
def cleanH(raw):
    """Return *raw* with every ``<...>`` span (shortest match) removed."""
    return re.sub(r'<.*?>', '', raw)
66 |
def main():
    """Fetch a room page and write its tasks and questions to an output file.

    With no command-line arguments the URL and output path are prompted for.
    Otherwise each argument is classified: one containing "http" is the URL,
    one containing "-raw" switches to raw mode, anything else is the output
    path. Raw mode resets the wrapper lists and omits the leading style text.

    Exits with a message when the URL or output path is missing, or when the
    page has no ``<h1 id="title">`` element.
    """
    url = ""
    file = ""
    raw = False
    if len(sys.argv) == 1:
        url = input("enter URL for the room :")
        file = input("enter output file name with path :")
    else:
        for arg in sys.argv[1:]:
            if "http" in arg:
                url = arg
            elif "-raw" in arg:
                raw = True
            else:
                file = arg

    if not url or not file:
        sys.exit("usage: thmq2md.py <room URL> <output file> [-raw]")

    if raw:
        T[:] = [" ", " "]
        A[:] = [" ", " "]
        Q[:] = [" ", " "]
        H[:] = [" ", " "]
        N[:] = [" ", " "]
        header = ""
    else:
        # Read the module-level value through a differently named local:
        # assigning to ``style`` anywhere in this function would make it a
        # local and break this read with UnboundLocalError.
        header = style

    source = getData(url)
    room = extractData(data=source, id1="title", tag='h1')
    if not room:
        sys.exit("could not find the room title (h1 with id 'title') on the page")

    # The same task id can occur several times in the page source; keep the
    # first occurrence of each so a task is not written more than once.
    task_ids = list(dict.fromkeys(m.group(0) for m in getTotalQ(source)))

    # The context manager closes the file even if extraction fails midway.
    with open(file, 'w', encoding="utf-8") as f:
        f.write(header)
        f.write(f"#{H[0]} {room[0].get_text()}{H[1]}\n\n")
        for task_id in task_ids:
            questions = str(extractData(data=source, id1=task_id, tag='div'))
            question = str(extractData(data=questions, tag='div', clas='room-task-question-details'))
            title = extractData(data=questions, clas="card-link", tag='a')
            for ti in title:
                f.write(f"##{T[0]}" + str(ti.get_text()).strip() + f"{T[1]}\n")
            # Question numbering restarts at 1 for every task.
            for num, found in enumerate(filtering(question, "p"), start=1):
                f.write(f"\t{num}. **" + cleanH(found.group(1).strip()) + f'**\n\t\t*{A[0]} answer here {A[1]}\n')
            # NOTE(review): the separator literal was split across two lines
            # here; reproduced as the three line breaks that are visible --
            # confirm whether markup between them was lost.
            f.write("\n\n\n")
    print("done")
111 |
# Run the converter only when this file is executed as a script.
if __name__ == '__main__':
    main()
114 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
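1 | [![Pylint](https://github.com/cabbagec2hlbGwK/Try-hack-me-questions-crawler/actions/workflows/pylint.yml/badge.svg)](https://github.com/cabbagec2hlbGwK/Try-hack-me-questions-crawler/actions/workflows/pylint.yml)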
2 | ---
3 | # Try-hack-me-questions-crawler
4 | This is a small script that extracts the questions from a TryHackMe room and compiles them into a Markdown file.
5 |
6 | ### Requirements
7 | * selenium
8 | * bs4
9 | * Firefox
10 | ---
11 |
12 | ### setup
13 | * make "geckodriver" and "thmq2md.py" executable (e.g. `chmod +x thmq2md.py`)
14 | ```bash
15 | pip install -r requirements.txt
16 | apt install firefox # If you do not have firefox
17 | ```
18 |
19 | ### Usage
20 | ```bash
21 | thmq2md.py