├── parse_hh_data
│   ├── __init__.py
│   ├── download.py
│   └── parse.py
├── .gitignore
├── requirements.txt
├── parse
├── setup.py
├── LICENSE
├── README.md
└── download

/parse_hh_data/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.idea
/build
/dist
/parse_hh_data.egg-info

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.8.2
random-user-agent==1.0.1
requests==2.23.0
tqdm==4.44.1
--------------------------------------------------------------------------------
/parse:
--------------------------------------------------------------------------------
#! python
import os
import sys
import json
import argparse

from tqdm import tqdm
from bs4 import BeautifulSoup
from parse_hh_data.parse import resume

if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    parser.add_argument("path_html")
    parser.add_argument("path_json")
    parser.add_argument("--view_progress_bar", action='store_true')

    args = parser.parse_args()

    resume_names = os.listdir(args.path_html)
    if args.view_progress_bar:
        resume_names = tqdm(resume_names, file=sys.stdout)

    for resume_name in resume_names:
        with open(os.path.join(args.path_html, resume_name)) as fl:
            page = BeautifulSoup(fl.read(), 'html.parser')

        resume_name = os.path.splitext(resume_name)[0]

        with open(os.path.join(args.path_json, resume_name + ".json"), "w") as fl:
            json.dump(resume(page), fl)
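
# Example invocation (assuming the HTML resumes were saved by the download script):
#   parse ~/data/resumes ~/data/resumes_json --view_progress_bar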
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import setuptools

with open("README.md", "r") as fh:
    long_description = fh.read()

setuptools.setup(
    name="parse-hh-data",
    version="0.1.14",
    author="Arina Ageeva",
    author_email="arina.a.ageeva@gmail.com",
    description="Package for parsing data (vacancies and resumes) from site hh.ru",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/arinaaageeva/parse_hh_data",
    packages=setuptools.find_packages(),
    install_requires=["beautifulsoup4==4.8.2",
                      "random-user-agent==1.0.1",
                      "requests==2.23.0",
                      "tqdm==4.44.1",
                      "urllib3==1.25.8"],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    scripts=["download", "parse"],
    python_requires='>=3.6',
)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2020 The Python Parsing HH Data

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Parse HH Data Project

This module provides a convenient way to download vacancies and resumes from `hh.ru`.

```python
from parse_hh_data import download, parse

vacancy = download.vacancy("36070814")

resume = download.resume("d40ce6f80001a8c8380039ed1f5874726f5a6e")
resume = parse.resume(resume)
```

**Vacancies** are downloaded through the [HH API](https://dev.hh.ru/) and returned in the format described
[here](https://github.com/hhru/api/blob/master/docs/vacancies.md#%D0%BF%D1%80%D0%BE%D1%81%D0%BC%D0%BE%D1%82%D1%80-%D0%B2%D0%B0%D0%BA%D0%B0%D0%BD%D1%81%D0%B8%D0%B8).

Anonymized **resumes** are downloaded directly from the [site](https://hh.ru/search/resume) in HTML format
and can then be converted to JSON with `parse_hh_data.parse.resume`:

    birth_date : str - date of birth
    gender : str - gender
    area : str - city of residence
    title : str - desired position
    specialization : list - applicant's specializations
        name : str - specialization name
        profarea_name : str - name of the professional area the specialization belongs to
    salary : dict - desired salary
        amount : int - amount
        currency : str - currency
    education_level : str - education level
    education : list - education
        year : int - year of graduation
        name : str - name of the educational institution
        organization : str - organization, specialty / specialization
    language : list - languages the applicant speaks
        name : str - language name
        level : str - proficiency level
    experience : list - work experience
        start : str - employment start (date in dd-MM-yyyy format)
        end : str - employment end (date in dd-MM-yyyy format)
        position : str - position
        description : str - duties, functions, achievements (may contain HTML)
    skills : str - additional information, a free-form description of skills (may contain HTML)
    skill_set : list - key skills

Lists of vacancy or resume identifiers can be downloaded with
`parse_hh_data.download.vacancy_ids` or `parse_hh_data.download.resume_ids`, respectively.
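
For bulk downloads, collect the identifiers first and then fetch each document. A minimal
sketch, reusing the area id `113` and specialization id `1` from the CLI example below:

```python
from parse_hh_data import download

ids = download.vacancy_ids(area_id="113", specialization_id="1",
                           search_period=30, num_pages=1)
vacancies = [download.vacancy(identifier) for identifier in ids]
```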

### Command line interface

`download ~/resumes resume --area_ids 113 --specialization_ids 1 --search_period 30`

`parse ~/data/resumes ~/data/resumes_json`
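
### Reference data

Area and specialization identifiers can be looked up through the HH reference endpoints,
which the package wraps as well (a sketch, assuming network access):

```python
from parse_hh_data import download

areas = download.areas()                      # region tree from https://api.hh.ru/areas
specializations = download.specializations()  # from https://api.hh.ru/specializations
```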
--------------------------------------------------------------------------------
/download:
--------------------------------------------------------------------------------
#! python
import os
import sys
import json
import argparse

from tqdm import tqdm
from parse_hh_data import download
from requests.exceptions import HTTPError

if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    parser.add_argument("path")
    parser.add_argument("data", choices=["vacancy", "resume"])
    parser.add_argument("--area_ids", nargs='+', default=[""])
    parser.add_argument("--specialization_ids", nargs='+', default=[""])
    parser.add_argument("--search_period", type=int, default=1)
    parser.add_argument("--num_pages", type=int, default=None)
    parser.add_argument("--timeout", type=int, default=10)
    parser.add_argument("--requests_interval", type=int, default=10)
    parser.add_argument("--max_requests_number", type=int, default=100)
    parser.add_argument("--break_reasons", nargs='+', default=["Forbidden", "Not Found"])
    parser.add_argument("--view_progress_area", action='store_true')
    parser.add_argument("--view_progress_specialization", action='store_true')
    parser.add_argument("--log", choices=["BASE", "DEBUG"], default="BASE")

    args = parser.parse_args()

    download_params = {"timeout": args.timeout,
                       "requests_interval": args.requests_interval,
                       "max_requests_number": args.max_requests_number,
                       "break_reasons": args.break_reasons}

    area_ids = args.area_ids
    if args.view_progress_area:
        area_ids = tqdm(area_ids, file=sys.stdout)

    specialization_ids = args.specialization_ids
    if args.view_progress_specialization:
        specialization_ids = tqdm(specialization_ids, file=sys.stdout)

    for area_id in area_ids:
        for specialization_id in specialization_ids:
            if args.data == "vacancy":
                document_ids = download.vacancy_ids(area_id, specialization_id, args.search_period, args.num_pages, **download_params)
            if args.data == "resume":
                document_ids = download.resume_ids(area_id, specialization_id, args.search_period, args.num_pages, **download_params)

            for document_id in document_ids:
                try:
                    if args.data == "vacancy":
                        document = download.vacancy(document_id, **download_params)
                    if args.data == "resume":
                        document = download.resume(document_id, **download_params)

                except HTTPError:
                    # skip documents that could not be downloaded
                    pass

                else:
                    if args.data == "vacancy":
                        document_id = f"{document_id}.json"
                        document = json.dumps(document)
                    if args.data == "resume":
                        document_id = f"{document_id}.html"
                        document = str(document)

                    with open(os.path.join(args.path, document_id), "w") as fl:
                        fl.write(document)
--------------------------------------------------------------------------------
/parse_hh_data/download.py:
--------------------------------------------------------------------------------
import sys
import time
import json
import requests

from functools import wraps
from bs4 import BeautifulSoup
from requests.exceptions import HTTPError, ConnectionError, Timeout
from random_user_agent.user_agent import UserAgent
from random_user_agent.params import SoftwareName, OperatingSystem

from .parse import num_pages as parse_num_pages
from .parse import resume_hashes as parse_resume_hashes

SOFTWARE_NAMES = [SoftwareName.CHROME.value]
OPERATING_SYSTEMS = [OperatingSystem.WINDOWS.value, OperatingSystem.LINUX.value]
USER_AGENT = UserAgent(software_names=SOFTWARE_NAMES, operating_systems=OPERATING_SYSTEMS, limit=100)

RESUME_URL = "https://hh.ru/resume/{}"
VACANCY_URL = "https://api.hh.ru/vacancies/{}"
AREAS_URL = "https://api.hh.ru/areas"
SPECIALIZATIONS_URL = "https://api.hh.ru/specializations"
RESUME_PAGE_URL = "https://hh.ru/search/resume?area={}&specialization={}&search_period={}&page={}"
VACANCY_PAGE_URL = "https://api.hh.ru/vacancies?area={}&specialization={}&period={}&page={}&per_page=100"


def download(get_url):
    @wraps(get_url)
    def wrapper(*args, timeout=10, requests_interval=10, max_requests_number=100, break_reasons=None):
        """
        :param int timeout: request timeout (sec.)
        :param int requests_interval: time interval between requests (sec.)
        :param int max_requests_number: maximum number of requests
        :param list break_reasons: HTTP reasons (e.g. "Not Found") that stop further retries
        """
        url = get_url(*args)
        break_reasons = set() if break_reasons is None else set(break_reasons)

        for _ in range(max_requests_number):
            try:
                request = requests.get(url, headers={'User-Agent': USER_AGENT.get_random_user_agent()}, timeout=timeout)
                request.raise_for_status()
            except ConnectionError as connection_error:
                print(f"Connection error occurred: {connection_error}", file=sys.stderr)
            except Timeout as time_out:
                print(f"Timeout error occurred: {time_out}", file=sys.stderr)
            except HTTPError as http_error:
                print(f"HTTP error occurred: {http_error}", file=sys.stderr)
                if request.reason in break_reasons:
                    break
            else:
                return request.content

            print(f"Another request to {url} will be sent in {requests_interval} seconds")
            time.sleep(requests_interval)

        raise HTTPError(f"Page at {url} has not been downloaded")
    return wrapper
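
# Functions decorated with `download` keep the wrapper's keyword interface, so retry
# behaviour can be tuned per call, e.g. (illustrative values):
#   vacancy("36070814", timeout=5, requests_interval=30, break_reasons=["Not Found"])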

def load_json(get_content):
    @wraps(get_content)
    def wrapper(*args, **kwargs):
        return json.loads(get_content(*args, **kwargs))
    return wrapper


def parse_html(get_content):
    @wraps(get_content)
    def wrapper(*args, **kwargs):
        return BeautifulSoup(get_content(*args, **kwargs), "html.parser")
    return wrapper


@load_json
@download
def areas():
    """
    :return: list
    """
    return AREAS_URL


@load_json
@download
def specializations():
    """
    :return: list
    """
    return SPECIALIZATIONS_URL


@load_json
@download
def vacancy_search_page(area_id, specialization_id, search_period, num_page):
    """
    :param str area_id: area identifier from https://api.hh.ru/areas
    :param str specialization_id: specialization identifier from https://api.hh.ru/specializations
    :param int search_period: the number of days for search, max value 30
    :param int num_page: page number
    :return: dict
    """
    return VACANCY_PAGE_URL.format(area_id, specialization_id, search_period, num_page)


@load_json
@download
def vacancy(identifier):
    """
    :param str identifier: vacancy identifier
    :return: dict
    """
    return VACANCY_URL.format(identifier)


@parse_html
@download
def resume_search_page(area_id, specialization_id, search_period, num_page):
    """
    :param str area_id: area identifier from https://api.hh.ru/areas
    :param str specialization_id: specialization identifier from https://api.hh.ru/specializations
    :param int search_period: the number of days for search,
                              available values: 0 - all period, 1 - day,
                              3 - three days, 7 - week, 30 - month, 365 - year,
                              all other values are equivalent to 0
    :param int num_page: page number
    :return: bs4.BeautifulSoup
    """
    return RESUME_PAGE_URL.format(area_id, specialization_id, search_period, num_page)


@parse_html
@download
def resume(identifier):
    """
    :param str identifier: resume identifier
    :return: bs4.BeautifulSoup
    """
    return RESUME_URL.format(identifier)


def vacancy_ids(area_id, specialization_id, search_period, num_pages, **kwargs):
    """
    :param str area_id: area identifier from https://api.hh.ru/areas
    :param str specialization_id: specialization identifier from https://api.hh.ru/specializations
    :param int search_period: the number of days for search
    :param int num_pages: number of pages to download
    :return: list
    """
    if num_pages is None:
        # default chosen to stay within the number of result pages the search API exposes
        num_pages = 19

    ids = []
    for num_page in range(num_pages):
        page = vacancy_search_page(area_id, specialization_id, search_period, num_page, **kwargs)

        if not page["items"]:
            break

        ids.extend([item["id"] for item in page["items"]])

    return list(set(ids))


def resume_ids(area_id, specialization_id, search_period, num_pages, **kwargs):
    """
    :param str area_id: area identifier from https://api.hh.ru/areas
    :param str specialization_id: specialization identifier from https://api.hh.ru/specializations
    :param int search_period: the number of days for search,
                              available values: 0 - all period, 1 - day,
                              3 - three days, 7 - week, 30 - month, 365 - year,
                              all other values are equivalent to 0
    :param int num_pages: number of pages to download
    :return: list
    """
    page = resume_search_page(area_id, specialization_id, search_period, 0, **kwargs)
    ids = parse_resume_hashes(page)

    num_pages = parse_num_pages(page) if num_pages is None else min(num_pages, parse_num_pages(page))

    # page 0 has already been fetched above, so start from page 1
    for num_page in range(1, num_pages):
        page = resume_search_page(area_id, specialization_id, search_period, num_page, **kwargs)
        ids.extend(parse_resume_hashes(page))

    return list(set(ids))
--------------------------------------------------------------------------------
/parse_hh_data/parse.py:
--------------------------------------------------------------------------------
from functools import wraps
from datetime import datetime

MONTHS = {"Январь": "January",
          "Февраль": "February",
          "Март": "March",
          "Апрель": "April",
          "Май": "May",
          "Июнь": "June",
          "Июль": "July",
          "Август": "August",
          "Сентябрь": "September",
          "Октябрь": "October",
          "Ноябрь": "November",
          "Декабрь": "December"}


def num_pages(page):
    """
    :param bs4.BeautifulSoup page: resumes search page
    :return: int
    """
    num = page.find("div", {"data-qa": "pager-block"})

    if not num:
        return 1

    num = num.findAll("a", {"class": "bloko-button"})

    if not num:
        return 1

    # the second-to-last pager button is expected to hold the last page number
    return int(num[-2].getText())


def resume_hashes(page):
    """
    :param bs4.BeautifulSoup page: resumes search page
    :return: list
    """
    hashes = []
    page = page.find("div", {"data-qa": "resume-serp__results-search"})

    if page is not None:
        hashes = page.findAll("div", {"data-qa": "resume-serp__resume"})
        hashes = [item.find("a")["href"][8:46] for item in hashes]

    return hashes
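
# Search-result links look like "/resume/<38-character hash>?query=...", so the slice
# [8:46] above keeps just the hash that download.resume expects.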


def header(page):
    """
    :param bs4.BeautifulSoup page: resume page
    :return: bs4.Tag
    """
    return page.find("div", {"class": "resume-header-block"})


def get_optional_text(find_optional_element):
    @wraps(find_optional_element)
    def wrapper(page):
        """
        :param bs4.BeautifulSoup page: resume page
        :return: str or None
        """
        optional_element = find_optional_element(page)
        return None if optional_element is None else optional_element.getText()
    return wrapper


@get_optional_text
def birth_date(page):
    """
    :param bs4.BeautifulSoup page: resume page
    :return: str or None
    """
    return page.find("span", {"data-qa": "resume-personal-birthday"})


@get_optional_text
def gender(page):
    """
    :param bs4.BeautifulSoup page: resume page
    :return: str or None
    """
    return page.find("span", {"data-qa": "resume-personal-gender"})


@get_optional_text
def area(page):
    """
    :param bs4.BeautifulSoup page: resume page
    :return: str or None
    """
    return page.find("span", {"data-qa": "resume-personal-address"})


def position(page):
    """
    :param bs4.BeautifulSoup page: resume page
    :return: bs4.Tag
    """
    return page.find("div", {"class": "resume-block", "data-qa": "resume-block-position"})


def position_title(position_block):
    """
    :param bs4.Tag position_block: position block
    :return: str
    """
    title = position_block.find("span", {"class": "resume-block__title-text",
                                         "data-qa": "resume-block-title-position"})

    return title.getText()


def position_specializations(position_block):
    """
    :param bs4.Tag position_block: position block
    :return: list
    """
    position_block = position_block.find("div", {"class": "bloko-gap bloko-gap_bottom"})

    profarea_name = position_block.find("span", {"data-qa": "resume-block-specialization-category"})
    profarea_name = profarea_name.getText()

    profarea_specializations = position_block.find("ul")
    profarea_specializations = profarea_specializations.findAll("li", {"class": "resume-block__specialization",
                                                                       "data-qa": "resume-block-position-specialization"})

    profarea_specializations = [item.getText() for item in profarea_specializations]
    profarea_specializations = [{"name": specialization_name, "profarea_name": profarea_name}
                                for specialization_name in profarea_specializations]

    return profarea_specializations


def position_salary(position_block):
    """
    :param bs4.Tag position_block: position block
    :return: dict
    """
    salary = position_block.find("span", {"class": "resume-block__salary resume-block__title-text_salary",
                                          "data-qa": "resume-block-salary"})
    amount = None
    currency = None
    if salary is not None:
        salary = salary.getText().replace('\u2009', '').replace('\xa0', ' ').strip().split()
        amount = int(salary[0])
        currency = ' '.join(salary[1:])

    salary = {"amount": amount,
              "currency": currency}

    return salary
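
# For example, a salary block whose text is "100\u2009000 руб." is parsed into
# {"amount": 100000, "currency": "руб."}.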


def education(page):
    """
    :param bs4.BeautifulSoup page: resume page
    :return: bs4.Tag
    """
    return page.find("div", {"class": "resume-block", "data-qa": "resume-block-education"})


def education_level(education_block):
    """
    :param bs4.Tag education_block: education block
    :return: str
    """
    if education_block is not None:
        return education_block.find("span", {"class": "resume-block__title-text resume-block__title-text_sub"}) \
                              .getText()

    return "Образования нет"


def educations(education_block):
    """
    :param bs4.Tag education_block: education block
    :return: list
    """
    page_educations = []
    if education_block is not None:
        education_block = education_block.find("div", {"class": "resume-block-item-gap"}) \
                                         .find("div", {"class": "bloko-columns-row"})

        for education_item in education_block.findAll("div", {"class": "resume-block-item-gap"}):
            year = education_item.find("div", {"class": "bloko-column bloko-column_xs-4 bloko-column_s-2 bloko-column_m-2 bloko-column_l-2"}) \
                                 .getText()

            item_name = education_item.find("div", {"data-qa": "resume-block-education-name"}) \
                                      .getText()

            item_organization = education_item.find("div", {"data-qa": "resume-block-education-organization"})
            if item_organization is not None:
                item_organization = item_organization.getText()

            page_educations.append(
                {"year": int(year),
                 "name": item_name,
                 "organization": item_organization}
            )

    return page_educations


def languages(page):
    """
    :param bs4.BeautifulSoup page: resume page
    :return: list
    """
    page_languages = []
    page = page.find("div", {"class": "resume-block", "data-qa": "resume-block-languages"})

    if page is not None:
        for language in page.findAll("p", {"data-qa": "resume-block-language-item"}):
            language = language.getText().split(" — ")

            level = ' - '.join(language[1:])
            language = language[0]

            page_languages.append({"name": language,
                                   "level": level})

    return page_languages


def date(date, format="%d-%m-%Y"):
    """
    :param str date: date in the format "Month (in Russian) Year"
    :param str format: desired date format
    :return: str or None
    """
    if date in ["по настоящее время", "currently"]:
        return None

    month, year = date.split()

    if month in MONTHS:
        month = MONTHS[month]

    date = f"{month} {year}"
    date = datetime.strptime(date, "%B %Y").strftime(format)

    return date
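
# For example, date("Март 2020") returns "01-03-2020", and
# date("по настоящее время") returns None.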


def experiences(page, format="%d-%m-%Y"):
    """
    :param bs4.BeautifulSoup page: resume page
    :param str format: desired date format
    :return: list
    """
    page_experiences = []
    page = page.find("div", {"class": "resume-block", "data-qa": "resume-block-experience"})

    if page is not None:
        page = page.find("div", {"class": "resume-block-item-gap"})
        for experience_item in page.findAll("div", {"class": "resume-block-item-gap"}):
            time_interval = experience_item.find("div", {"class": "bloko-column bloko-column_xs-4 bloko-column_s-2 bloko-column_m-2 bloko-column_l-2"})
            # remove the nested duration block so only the "start — end" text remains
            time_interval.div.extract()

            start, end = time_interval.getText().replace("\xa0", " ").split(' — ')

            item_position = experience_item.find("div", {"class": "resume-block__sub-title", "data-qa": "resume-block-experience-position"})
            item_position = "" if item_position is None else item_position.getText()

            item_description = experience_item.find("div", {"data-qa": "resume-block-experience-description"})
            description_child = item_description.findChild()
            item_description = item_description.getText() if description_child is None else str(description_child)

            page_experiences.append(
                {"start": date(start, format=format),
                 "end": date(end, format=format),
                 "position": item_position,
                 "description": item_description}
            )

    return page_experiences


def skill_set(page):
    """
    :param bs4.BeautifulSoup page: resume page
    :return: list
    """
    page = page.find("div", {"data-qa": "skills-table", "class": "resume-block"})

    page_skill_set = []
    if page is not None:
        page_skill_set = page.findAll("div", {"class": "bloko-tag bloko-tag_inline bloko-tag_countable",
                                              "data-qa": "bloko-tag bloko-tag_inline"})
        page_skill_set = [skill.getText() for skill in page_skill_set]

    return page_skill_set


def skills(page):
    """
    :param bs4.BeautifulSoup page: resume page
    :return: str
    """
    page = page.find("div", {"data-qa": "resume-block-skills-content"})

    page_skills = ""
    if page is not None:
        skills_child = page.findChild()
        page_skills = page.getText() if skills_child is None else str(skills_child)

    return page_skills


def resume(page):
    """
    :param bs4.BeautifulSoup page: resume page
    :return: dict
    """
    page = page.find("div", {"id": "HH-React-Root"})

    resume_position = position(page)
    resume_education = education(page)

    return {
        "birth_date": birth_date(page),
        "gender": gender(page),
        "area": area(page),
        "title": position_title(resume_position),
        "specialization": position_specializations(resume_position),
        "salary": position_salary(resume_position),
        "education_level": education_level(resume_education),
        "education": educations(resume_education),
        "language": languages(page),
        "experience": experiences(page),
        "skill_set": skill_set(page),
        "skills": skills(page)
    }
--------------------------------------------------------------------------------