├── parse_hh_data
│   ├── __init__.py
│   ├── download.py
│   └── parse.py
├── .gitignore
├── requirements.txt
├── parse
├── setup.py
├── LICENSE
├── README.md
└── download

/parse_hh_data/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.idea
/build
/dist
/parse_hh_data.egg-info

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.8.2
random-user-agent==1.0.1
requests==2.23.0
tqdm==4.44.1
--------------------------------------------------------------------------------
/parse:
--------------------------------------------------------------------------------
#! python
import os
import sys
import json
import argparse

from tqdm import tqdm
from bs4 import BeautifulSoup
from parse_hh_data.parse import resume

if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    parser.add_argument("path_html")
    parser.add_argument("path_json")
    parser.add_argument("--view_progress_bar", action='store_true')

    args = parser.parse_args()

    resume_names = os.listdir(args.path_html)
    if args.view_progress_bar:
        resume_names = tqdm(resume_names, file=sys.stdout)

    for resume_name in resume_names:
        with open(os.path.join(args.path_html, resume_name)) as fl:
            page = BeautifulSoup(fl.read(), 'html.parser')

        resume_name = os.path.splitext(resume_name)[0]

        with open(os.path.join(args.path_json, resume_name + ".json"), "w") as fl:
            json.dump(resume(page), fl)
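
# Example invocation (assuming the HTML resumes were saved by the download script):
#   parse ~/data/resumes ~/data/resumes_json --view_progress_bar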
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import setuptools

with open("README.md", "r") as fh:
    long_description = fh.read()

setuptools.setup(
    name="parse-hh-data",
    version="0.1.14",
    author="Arina Ageeva",
    author_email="arina.a.ageeva@gmail.com",
    description="Package for parsing data (vacancies and resumes) from site hh.ru",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/arinaaageeva/parse_hh_data",
    packages=setuptools.find_packages(),
    install_requires=["beautifulsoup4==4.8.2",
                      "random-user-agent==1.0.1",
                      "requests==2.23.0",
                      "tqdm==4.44.1",
                      "urllib3==1.25.8"],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    scripts=["download", "parse"],
    python_requires='>=3.6',
)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2020 The Python Parsing HH Data

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Parse HH Data Project

This module provides a convenient way to download vacancies and resumes from `hh.ru`.

```python
from parse_hh_data import download, parse

vacancy = download.vacancy("36070814")

resume = download.resume("d40ce6f80001a8c8380039ed1f5874726f5a6e")
resume = parse.resume(resume)
```

**Vacancies** are downloaded through the [HH API](https://dev.hh.ru/) and returned in the format described
[here](https://github.com/hhru/api/blob/master/docs/vacancies.md#%D0%BF%D1%80%D0%BE%D1%81%D0%BC%D0%BE%D1%82%D1%80-%D0%B2%D0%B0%D0%BA%D0%B0%D0%BD%D1%81%D0%B8%D0%B8).

Anonymized **resumes** are downloaded directly from the [site](https://hh.ru/search/resume) in HTML format
and can then be converted to JSON with `parse_hh_data.parse.resume`:

    birth_date : str - date of birth
    gender : str - gender
    area : str - city of residence
    title : str - desired position
    specialization : list - applicant's specializations
        name : str - specialization name
        profarea_name : str - name of the professional area the specialization belongs to
    salary : dict - desired salary
        amount : int - amount
        currency : str - currency
    education_level : str - education level
    education : list - education
        year : int - year of graduation
        name : str - name of the educational institution
        organization : str - organization, specialty / specialization
    language : list - languages the applicant speaks
        name : str - language name
        level : str - proficiency level
    experience : list - work experience
        start : str - employment start (date in dd-MM-yyyy format)
        end : str - employment end (date in dd-MM-yyyy format)
        position : str - position
        description : str - duties, functions, achievements (may contain HTML)
    skills : str - additional information, a free-form description of skills (may contain HTML)
    skill_set : list - key skills

Lists of vacancy or resume identifiers can be downloaded with
`parse_hh_data.download.vacancy_ids` or `parse_hh_data.download.resume_ids`, respectively.
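
For bulk downloads, collect the identifiers first and then fetch each document. A minimal
sketch, reusing the area id `113` and specialization id `1` from the CLI example below:

```python
from parse_hh_data import download

ids = download.vacancy_ids(area_id="113", specialization_id="1",
                           search_period=30, num_pages=1)
vacancies = [download.vacancy(identifier) for identifier in ids]
```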

### Command line interface

`download ~/resumes resume --area_ids 113 --specialization_ids 1 --search_period 30`

`parse ~/data/resumes ~/data/resumes_json`
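
### Reference data

Area and specialization identifiers can be looked up through the HH reference endpoints,
which the package wraps as well (a sketch, assuming network access):

```python
from parse_hh_data import download

areas = download.areas()                      # region tree from https://api.hh.ru/areas
specializations = download.specializations()  # from https://api.hh.ru/specializations
```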
--------------------------------------------------------------------------------
/download:
--------------------------------------------------------------------------------
#! python
import os
import sys
import json
import argparse

from tqdm import tqdm
from parse_hh_data import download
from requests.exceptions import HTTPError

if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    parser.add_argument("path")
    parser.add_argument("data", choices=["vacancy", "resume"])
    parser.add_argument("--area_ids", nargs='+', default=[""])
    parser.add_argument("--specialization_ids", nargs='+', default=[""])
    parser.add_argument("--search_period", type=int, default=1)
    parser.add_argument("--num_pages", type=int, default=None)
    parser.add_argument("--timeout", type=int, default=10)
    parser.add_argument("--requests_interval", type=int, default=10)
    parser.add_argument("--max_requests_number", type=int, default=100)
    parser.add_argument("--break_reasons", nargs='+', default=["Forbidden", "Not Found"])
    parser.add_argument("--view_progress_area", action='store_true')
    parser.add_argument("--view_progress_specialization", action='store_true')
    parser.add_argument("--log", choices=["BASE", "DEBUG"], default="BASE")

    args = parser.parse_args()

    download_params = {"timeout": args.timeout,
                       "requests_interval": args.requests_interval,
                       "max_requests_number": args.max_requests_number,
                       "break_reasons": args.break_reasons}

    area_ids = args.area_ids
    if args.view_progress_area:
        area_ids = tqdm(area_ids, file=sys.stdout)

    specialization_ids = args.specialization_ids
    if args.view_progress_specialization:
        specialization_ids = tqdm(specialization_ids, file=sys.stdout)

    for area_id in area_ids:
        for specialization_id in specialization_ids:
            if args.data == "vacancy":
                document_ids = download.vacancy_ids(area_id, specialization_id, args.search_period, args.num_pages, **download_params)
            if args.data == "resume":
                document_ids = download.resume_ids(area_id, specialization_id, args.search_period, args.num_pages, **download_params)

            for document_id in document_ids:
                try:
                    if args.data == "vacancy":
                        document = download.vacancy(document_id, **download_params)
                    if args.data == "resume":
                        document = download.resume(document_id, **download_params)

                except HTTPError:
                    # skip documents that could not be downloaded
                    pass

                else:
                    if args.data == "vacancy":
                        document_id = f"{document_id}.json"
                        document = json.dumps(document)
                    if args.data == "resume":
                        document_id = f"{document_id}.html"
                        document = str(document)

                    with open(os.path.join(args.path, document_id), "w") as fl:
                        fl.write(document)
--------------------------------------------------------------------------------
/parse_hh_data/download.py:
--------------------------------------------------------------------------------
import sys
import time
import json
import requests

from functools import wraps
from bs4 import BeautifulSoup
from requests.exceptions import HTTPError, ConnectionError, Timeout
from random_user_agent.user_agent import UserAgent
from random_user_agent.params import SoftwareName, OperatingSystem

from .parse import num_pages as parse_num_pages
from .parse import resume_hashes as parse_resume_hashes

SOFTWARE_NAMES = [SoftwareName.CHROME.value]
OPERATING_SYSTEMS = [OperatingSystem.WINDOWS.value, OperatingSystem.LINUX.value]
USER_AGENT = UserAgent(software_names=SOFTWARE_NAMES, operating_systems=OPERATING_SYSTEMS, limit=100)

RESUME_URL = "https://hh.ru/resume/{}"
VACANCY_URL = "https://api.hh.ru/vacancies/{}"
AREAS_URL = "https://api.hh.ru/areas"
SPECIALIZATIONS_URL = "https://api.hh.ru/specializations"
RESUME_PAGE_URL = "https://hh.ru/search/resume?area={}&specialization={}&search_period={}&page={}"
VACANCY_PAGE_URL = "https://api.hh.ru/vacancies?area={}&specialization={}&period={}&page={}&per_page=100"


def download(get_url):
    @wraps(get_url)
    def wrapper(*args, timeout=10, requests_interval=10, max_requests_number=100, break_reasons=None):
        """
        :param int timeout: request timeout (sec.)
        :param int requests_interval: time interval between requests (sec.)
        :param int max_requests_number: maximum number of requests
        :param list break_reasons: HTTP reasons (e.g. "Not Found") that stop further retries
        """
        url = get_url(*args)
        break_reasons = set() if break_reasons is None else set(break_reasons)

        for _ in range(max_requests_number):
            try:
                request = requests.get(url, headers={'User-Agent': USER_AGENT.get_random_user_agent()}, timeout=timeout)
                request.raise_for_status()
            except ConnectionError as connection_error:
                print(f"Connection error occurred: {connection_error}", file=sys.stderr)
            except Timeout as time_out:
                print(f"Timeout error occurred: {time_out}", file=sys.stderr)
            except HTTPError as http_error:
                print(f"HTTP error occurred: {http_error}", file=sys.stderr)
                if request.reason in break_reasons:
                    break
            else:
                return request.content

            print(f"Another request to {url} will be sent in {requests_interval} seconds")
            time.sleep(requests_interval)

        raise HTTPError(f"Page at {url} has not been downloaded")
    return wrapper
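
# Functions decorated with `download` keep the wrapper's keyword interface, so retry
# behaviour can be tuned per call, e.g. (illustrative values):
#   vacancy("36070814", timeout=5, requests_interval=30, break_reasons=["Not Found"])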

def load_json(get_content):
    @wraps(get_content)
    def wrapper(*args, **kwargs):
        return json.loads(get_content(*args, **kwargs))
    return wrapper


def parse_html(get_content):
    @wraps(get_content)
    def wrapper(*args, **kwargs):
        return BeautifulSoup(get_content(*args, **kwargs), "html.parser")
    return wrapper


@load_json
@download
def areas():
    """
    :return: list
    """
    return AREAS_URL


@load_json
@download
def specializations():
    """
    :return: list
    """
    return SPECIALIZATIONS_URL


@load_json
@download
def vacancy_search_page(area_id, specialization_id, search_period, num_page):
    """
    :param str area_id: area identifier from https://api.hh.ru/areas
    :param str specialization_id: specialization identifier from https://api.hh.ru/specializations
    :param int search_period: the number of days for search, max value 30
    :param int num_page: page number
    :return: dict
    """
    return VACANCY_PAGE_URL.format(area_id, specialization_id, search_period, num_page)


@load_json
@download
def vacancy(identifier):
    """
    :param str identifier: vacancy identifier
    :return: dict
    """
    return VACANCY_URL.format(identifier)


@parse_html
@download
def resume_search_page(area_id, specialization_id, search_period, num_page):
    """
    :param str area_id: area identifier from https://api.hh.ru/areas
    :param str specialization_id: specialization identifier from https://api.hh.ru/specializations
    :param int search_period: the number of days for search,
                              available values: 0 - all period, 1 - day,
                              3 - three days, 7 - week, 30 - month, 365 - year,
                              all other values are equivalent to 0
    :param int num_page: page number
    :return: bs4.BeautifulSoup
    """
    return RESUME_PAGE_URL.format(area_id, specialization_id, search_period, num_page)


@parse_html
@download
def resume(identifier):
    """
    :param str identifier: resume identifier
    :return: bs4.BeautifulSoup
    """
    return RESUME_URL.format(identifier)


def vacancy_ids(area_id, specialization_id, search_period, num_pages, **kwargs):
    """
    :param str area_id: area identifier from https://api.hh.ru/areas
    :param str specialization_id: specialization identifier from https://api.hh.ru/specializations
    :param int search_period: the number of days for search
    :param int num_pages: number of pages to download
    :return: list
    """
    if num_pages is None:
        # default chosen to stay within the number of result pages the search API exposes
        num_pages = 19

    ids = []
    for num_page in range(num_pages):
        page = vacancy_search_page(area_id, specialization_id, search_period, num_page, **kwargs)

        if not page["items"]:
            break

        ids.extend([item["id"] for item in page["items"]])

    return list(set(ids))


def resume_ids(area_id, specialization_id, search_period, num_pages, **kwargs):
    """
    :param str area_id: area identifier from https://api.hh.ru/areas
    :param str specialization_id: specialization identifier from https://api.hh.ru/specializations
    :param int search_period: the number of days for search,
                              available values: 0 - all period, 1 - day,
                              3 - three days, 7 - week, 30 - month, 365 - year,
                              all other values are equivalent to 0
    :param int num_pages: number of pages to download
    :return: list
    """
    page = resume_search_page(area_id, specialization_id, search_period, 0, **kwargs)
    ids = parse_resume_hashes(page)

    num_pages = parse_num_pages(page) if num_pages is None else min(num_pages, parse_num_pages(page))

    # page 0 has already been fetched above, so start from page 1
    for num_page in range(1, num_pages):
        page = resume_search_page(area_id, specialization_id, search_period, num_page, **kwargs)
        ids.extend(parse_resume_hashes(page))

    return list(set(ids))
--------------------------------------------------------------------------------
/parse_hh_data/parse.py:
--------------------------------------------------------------------------------
from functools import wraps
from datetime import datetime

MONTHS = {"Январь": "January",
          "Февраль": "February",
          "Март": "March",
          "Апрель": "April",
          "Май": "May",
          "Июнь": "June",
          "Июль": "July",
          "Август": "August",
          "Сентябрь": "September",
          "Октябрь": "October",
          "Ноябрь": "November",
          "Декабрь": "December"}


def num_pages(page):
    """
    :param bs4.BeautifulSoup page: resumes search page
    :return: int
    """
    num = page.find("div", {"data-qa": "pager-block"})

    if not num:
        return 1

    num = num.findAll("a", {"class": "bloko-button"})

    if not num:
        return 1

    # the second-to-last pager button is expected to hold the last page number
    return int(num[-2].getText())


def resume_hashes(page):
    """
    :param bs4.BeautifulSoup page: resumes search page
    :return: list
    """
    hashes = []
    page = page.find("div", {"data-qa": "resume-serp__results-search"})

    if page is not None:
        hashes = page.findAll("div", {"data-qa": "resume-serp__resume"})
        hashes = [item.find("a")["href"][8:46] for item in hashes]

    return hashes
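
# Search-result links look like "/resume/<38-character hash>?query=...", so the slice
# [8:46] above keeps just the hash that download.resume expects.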


def header(page):
    """
    :param bs4.BeautifulSoup page: resume page
    :return: bs4.Tag
    """
    return page.find("div", {"class": "resume-header-block"})


def get_optional_text(find_optional_element):
    @wraps(find_optional_element)
    def wrapper(page):
        """
        :param bs4.BeautifulSoup page: resume page
        :return: str or None
        """
        optional_element = find_optional_element(page)
        return None if optional_element is None else optional_element.getText()
    return wrapper


@get_optional_text
def birth_date(page):
    """
    :param bs4.BeautifulSoup page: resume page
    :return: str or None
    """
    return page.find("span", {"data-qa": "resume-personal-birthday"})


@get_optional_text
def gender(page):
    """
    :param bs4.BeautifulSoup page: resume page
    :return: str or None
    """
    return page.find("span", {"data-qa": "resume-personal-gender"})


@get_optional_text
def area(page):
    """
    :param bs4.BeautifulSoup page: resume page
    :return: str or None
    """
    return page.find("span", {"data-qa": "resume-personal-address"})


def position(page):
    """
    :param bs4.BeautifulSoup page: resume page
    :return: bs4.Tag
    """
    return page.find("div", {"class": "resume-block", "data-qa": "resume-block-position"})


def position_title(position_block):
    """
    :param bs4.Tag position_block: position block
    :return: str
    """
    title = position_block.find("span", {"class": "resume-block__title-text",
                                         "data-qa": "resume-block-title-position"})

    return title.getText()


def position_specializations(position_block):
    """
    :param bs4.Tag position_block: position block
    :return: list
    """
    position_block = position_block.find("div", {"class": "bloko-gap bloko-gap_bottom"})

    profarea_name = position_block.find("span", {"data-qa": "resume-block-specialization-category"})
    profarea_name = profarea_name.getText()

    profarea_specializations = position_block.find("ul")
    profarea_specializations = profarea_specializations.findAll("li", {"class": "resume-block__specialization",
                                                                       "data-qa": "resume-block-position-specialization"})

    profarea_specializations = [item.getText() for item in profarea_specializations]
    profarea_specializations = [{"name": specialization_name, "profarea_name": profarea_name}
                                for specialization_name in profarea_specializations]

    return profarea_specializations


def position_salary(position_block):
    """
    :param bs4.Tag position_block: position block
    :return: dict
    """
    salary = position_block.find("span", {"class": "resume-block__salary resume-block__title-text_salary",
                                          "data-qa": "resume-block-salary"})
    amount = None
    currency = None
    if salary is not None:
        salary = salary.getText().replace('\u2009', '').replace('\xa0', ' ').strip().split()
        amount = int(salary[0])
        currency = ' '.join(salary[1:])

    salary = {"amount": amount,
              "currency": currency}

    return salary
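
# For example, a salary block whose text is "100\u2009000 руб." is parsed into
# {"amount": 100000, "currency": "руб."}.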


def education(page):
    """
    :param bs4.BeautifulSoup page: resume page
    :return: bs4.Tag
    """
    return page.find("div", {"class": "resume-block", "data-qa": "resume-block-education"})


def education_level(education_block):
    """
    :param bs4.Tag education_block: education block
    :return: str
    """
    if education_block is not None:
        return education_block.find("span", {"class": "resume-block__title-text resume-block__title-text_sub"}) \
                              .getText()

    return "Образования нет"


def educations(education_block):
    """
    :param bs4.Tag education_block: education block
    :return: list
    """
    page_educations = []
    if education_block is not None:
        education_block = education_block.find("div", {"class": "resume-block-item-gap"}) \
                                         .find("div", {"class": "bloko-columns-row"})

        for education_item in education_block.findAll("div", {"class": "resume-block-item-gap"}):
            year = education_item.find("div", {"class": "bloko-column bloko-column_xs-4 bloko-column_s-2 bloko-column_m-2 bloko-column_l-2"}) \
                                 .getText()

            item_name = education_item.find("div", {"data-qa": "resume-block-education-name"}) \
                                      .getText()

            item_organization = education_item.find("div", {"data-qa": "resume-block-education-organization"})
            if item_organization is not None:
                item_organization = item_organization.getText()

            page_educations.append(
                {"year": int(year),
                 "name": item_name,
                 "organization": item_organization}
            )

    return page_educations


def languages(page):
    """
    :param bs4.BeautifulSoup page: resume page
    :return: list
    """
    page_languages = []
    page = page.find("div", {"class": "resume-block", "data-qa": "resume-block-languages"})

    if page is not None:
        for language in page.findAll("p", {"data-qa": "resume-block-language-item"}):
            language = language.getText().split(" — ")

            level = ' - '.join(language[1:])
            language = language[0]

            page_languages.append({"name": language,
                                   "level": level})

    return page_languages


def date(date, format="%d-%m-%Y"):
    """
    :param str date: date in the format "Month (in Russian) Year"
    :param str format: desired date format
    :return: str or None
    """
    if date in ["по настоящее время", "currently"]:
        return None

    month, year = date.split()

    if month in MONTHS:
        month = MONTHS[month]

    date = f"{month} {year}"
    date = datetime.strptime(date, "%B %Y").strftime(format)

    return date
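
# For example, date("Март 2020") returns "01-03-2020", and
# date("по настоящее время") returns None.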


def experiences(page, format="%d-%m-%Y"):
    """
    :param bs4.BeautifulSoup page: resume page
    :param str format: desired date format
    :return: list
    """
    page_experiences = []
    page = page.find("div", {"class": "resume-block", "data-qa": "resume-block-experience"})

    if page is not None:
        page = page.find("div", {"class": "resume-block-item-gap"})
        for experience_item in page.findAll("div", {"class": "resume-block-item-gap"}):
            time_interval = experience_item.find("div", {"class": "bloko-column bloko-column_xs-4 bloko-column_s-2 bloko-column_m-2 bloko-column_l-2"})
            # remove the nested duration block so only the "start — end" text remains
            time_interval.div.extract()

            start, end = time_interval.getText().replace("\xa0", " ").split(' — ')

            item_position = experience_item.find("div", {"class": "resume-block__sub-title", "data-qa": "resume-block-experience-position"})
            item_position = "" if item_position is None else item_position.getText()

            item_description = experience_item.find("div", {"data-qa": "resume-block-experience-description"})
            description_child = item_description.findChild()
            item_description = item_description.getText() if description_child is None else str(description_child)

            page_experiences.append(
                {"start": date(start, format=format),
                 "end": date(end, format=format),
                 "position": item_position,
                 "description": item_description}
            )

    return page_experiences


def skill_set(page):
    """
    :param bs4.BeautifulSoup page: resume page
    :return: list
    """
    page = page.find("div", {"data-qa": "skills-table", "class": "resume-block"})

    page_skill_set = []
    if page is not None:
        page_skill_set = page.findAll("div", {"class": "bloko-tag bloko-tag_inline bloko-tag_countable",
                                              "data-qa": "bloko-tag bloko-tag_inline"})
        page_skill_set = [skill.getText() for skill in page_skill_set]

    return page_skill_set


def skills(page):
    """
    :param bs4.BeautifulSoup page: resume page
    :return: str
    """
    page = page.find("div", {"data-qa": "resume-block-skills-content"})

    page_skills = ""
    if page is not None:
        skills_child = page.findChild()
        page_skills = page.getText() if skills_child is None else str(skills_child)

    return page_skills


def resume(page):
    """
    :param bs4.BeautifulSoup page: resume page
    :return: dict
    """
    page = page.find("div", {"id": "HH-React-Root"})

    resume_position = position(page)
    resume_education = education(page)

    return {
        "birth_date": birth_date(page),
        "gender": gender(page),
        "area": area(page),
        "title": position_title(resume_position),
        "specialization": position_specializations(resume_position),
        "salary": position_salary(resume_position),
        "education_level": education_level(resume_education),
        "education": educations(resume_education),
        "language": languages(page),
        "experience": experiences(page),
        "skill_set": skill_set(page),
        "skills": skills(page)
    }
--------------------------------------------------------------------------------