├── requirements.txt ├── README.md ├── cv_parse.py ├── cv_extract.py └── .gitignore /requirements.txt: -------------------------------------------------------------------------------- 1 | pdfminer3k==1.3.1 2 | splitty==0.0.7 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CV Extractor 2 | 3 | Extract data for CV with Python. 4 | 5 | ## How to install 6 | 7 | ``` 8 | git clone https://github.com/rg3915/cv-extractor.git 9 | cd cv-extractor 10 | python3 -m venv .venv 11 | source .venv/bin/activate 12 | pip install -r requirements.txt 13 | python cv_extract.py 14 | ``` 15 | 16 | ## How to sync repo 17 | 18 | ``` 19 | git remote add upstream https://github.com/rg3915/cv-extractor.git 20 | git remote -v 21 | git fetch upstream 22 | git checkout master 23 | git merge upstream/master 24 | git push 25 | ``` 26 | 27 | https://help.github.com/articles/configuring-a-remote-for-a-fork/ 28 | 29 | https://help.github.com/articles/syncing-a-fork/ 30 | 31 | 32 | -------------------------------------------------------------------------------- /cv_parse.py: -------------------------------------------------------------------------------- 1 | from splitty import (clear_list_strings, find_elements, 2 | make_intervals, apply_intervals) 3 | 4 | 5 | def cv_parse(cv: list) -> dict: 6 | topics = ['WORK EXPERIENCE', 'EDUCATION'] 7 | 8 | find_topics = find_elements(cv, topics) 9 | 10 | intervals = make_intervals(find_topics, start=True) 11 | 12 | res = apply_intervals(cv, intervals) 13 | 14 | # Insere uma chave 'meta' no começo da lista 15 | res[0].insert(0, 'meta') 16 | 17 | dic = {x[0]: x[1:] for x in res} 18 | 19 | return dic 20 | 21 | 22 | cvs = ['cv.txt', 'cv2.txt', 'cv3.txt'] 23 | 24 | for cv in cvs: 25 | with open(cv) as text: 26 | print(cv_parse(clear_list_strings(text.read().split('\n')))) 27 | -------------------------------------------------------------------------------- /cv_extract.py: -------------------------------------------------------------------------------- 1 | from pdfminer.pdfparser import PDFParser, PDFDocument 2 | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter 3 | from pdfminer.converter import PDFPageAggregator 4 | from pdfminer.layout import LAParams, LTTextBox, LTTextLine 5 | 6 | 7 | def pdf_to_string(pdf_file): 8 | fp = open(pdf_file, 'rb') 9 | 10 | parser = PDFParser(fp) 11 | doc = PDFDocument() 12 | parser.set_document(doc) 13 | doc.set_parser(parser) 14 | doc.initialize('') 15 | rsrcmgr = PDFResourceManager() 16 | laparams = LAParams() 17 | laparams.line_margin = 0.3 18 | laparams.word_margin = 0.3 19 | device = PDFPageAggregator(rsrcmgr, laparams=laparams) 20 | interpreter = PDFPageInterpreter(rsrcmgr, device) 21 | extracted_text = '' 22 | 23 | for page in doc.get_pages(): 24 | interpreter.process_page(page) 25 | layout = device.get_result() 26 | for lt_obj in layout: 27 | if isinstance(lt_obj, (LTTextBox, LTTextLine)): 28 | extracted_text += lt_obj.get_text() 29 | 30 | return extracted_text 31 | 32 | 33 | with open('cv.txt', 'w') as f: 34 | f.write(pdf_to_string('cv.pdf')) 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | *.pdf 104 | *.txt --------------------------------------------------------------------------------