├── requirements.txt
├── README.md
├── cv_parse.py
├── cv_extract.py
└── .gitignore


/requirements.txt:
--------------------------------------------------------------------------------
1 | pdfminer3k==1.3.1
2 | splitty==0.0.7
3 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # CV Extractor
 2 | 
 3 | Extract data from a CV with Python.
 4 | 
 5 | ## How to install
 6 | 
 7 | ```
 8 | git clone https://github.com/rg3915/cv-extractor.git
 9 | cd cv-extractor
10 | python3 -m venv .venv
11 | source .venv/bin/activate
12 | pip install -r requirements.txt
13 | python cv_extract.py
14 | ```
15 | 
16 | ## How to sync your fork
17 | 
18 | ```
19 | git remote add upstream https://github.com/rg3915/cv-extractor.git
20 | git remote -v
21 | git fetch upstream
22 | git checkout master
23 | git merge upstream/master
24 | git push
25 | ```
26 | 
27 | https://help.github.com/articles/configuring-a-remote-for-a-fork/
28 | 
29 | https://help.github.com/articles/syncing-a-fork/
30 | 
31 | 
32 | 


--------------------------------------------------------------------------------
/cv_parse.py:
--------------------------------------------------------------------------------
 1 | from splitty import (clear_list_strings, find_elements,
 2 |                      make_intervals, apply_intervals)
 3 | 
 4 | 
 5 | def cv_parse(cv: list) -> dict:
 6 |     topics = ['WORK EXPERIENCE', 'EDUCATION']
 7 | 
 8 |     find_topics = find_elements(cv, topics)
 9 | 
10 |     intervals = make_intervals(find_topics, start=True)
11 | 
12 |     res = apply_intervals(cv, intervals)
13 | 
14 |     # Insere uma chave 'meta' no começo da lista
15 |     res[0].insert(0, 'meta')
16 | 
17 |     dic = {x[0]: x[1:] for x in res}
18 | 
19 |     return dic
20 | 
21 | 
# Text files to parse; each one is read whole, split on newlines, cleaned
# with clear_list_strings and the resulting dict is printed.
cvs = ['cv.txt', 'cv2.txt', 'cv3.txt']

for cv in cvs:
    with open(cv) as text:
        raw_lines = text.read().split('\n')
    print(cv_parse(clear_list_strings(raw_lines)))
27 | 


--------------------------------------------------------------------------------
/cv_extract.py:
--------------------------------------------------------------------------------
 1 | from pdfminer.pdfparser import PDFParser, PDFDocument
 2 | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
 3 | from pdfminer.converter import PDFPageAggregator
 4 | from pdfminer.layout import LAParams, LTTextBox, LTTextLine
 5 | 
 6 | 
 7 | def pdf_to_string(pdf_file):
 8 |     fp = open(pdf_file, 'rb')
 9 | 
10 |     parser = PDFParser(fp)
11 |     doc = PDFDocument()
12 |     parser.set_document(doc)
13 |     doc.set_parser(parser)
14 |     doc.initialize('')
15 |     rsrcmgr = PDFResourceManager()
16 |     laparams = LAParams()
17 |     laparams.line_margin = 0.3
18 |     laparams.word_margin = 0.3
19 |     device = PDFPageAggregator(rsrcmgr, laparams=laparams)
20 |     interpreter = PDFPageInterpreter(rsrcmgr, device)
21 |     extracted_text = ''
22 | 
23 |     for page in doc.get_pages():
24 |         interpreter.process_page(page)
25 |         layout = device.get_result()
26 |         for lt_obj in layout:
27 |             if isinstance(lt_obj, (LTTextBox, LTTextLine)):
28 |                 extracted_text += lt_obj.get_text()
29 | 
30 |     return extracted_text
31 | 
32 | 
# Extract first: opening 'cv.txt' with 'w' truncates it immediately, so doing
# the extraction inside the ``with`` would wipe an existing cv.txt whenever
# reading 'cv.pdf' fails.
extracted = pdf_to_string('cv.pdf')

with open('cv.txt', 'w') as f:
    f.write(extracted)
35 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | 
 49 | # Translations
 50 | *.mo
 51 | *.pot
 52 | 
 53 | # Django stuff:
 54 | *.log
 55 | local_settings.py
 56 | 
 57 | # Flask stuff:
 58 | instance/
 59 | .webassets-cache
 60 | 
 61 | # Scrapy stuff:
 62 | .scrapy
 63 | 
 64 | # Sphinx documentation
 65 | docs/_build/
 66 | 
 67 | # PyBuilder
 68 | target/
 69 | 
 70 | # Jupyter Notebook
 71 | .ipynb_checkpoints
 72 | 
 73 | # pyenv
 74 | .python-version
 75 | 
 76 | # celery beat schedule file
 77 | celerybeat-schedule
 78 | 
 79 | # SageMath parsed files
 80 | *.sage.py
 81 | 
 82 | # dotenv
 83 | .env
 84 | 
 85 | # virtualenv
 86 | .venv
 87 | venv/
 88 | ENV/
 89 | 
 90 | # Spyder project settings
 91 | .spyderproject
 92 | .spyproject
 93 | 
 94 | # Rope project settings
 95 | .ropeproject
 96 | 
 97 | # mkdocs documentation
 98 | /site
 99 | 
100 | # mypy
101 | .mypy_cache/
102 | 
103 | *.pdf
104 | *.txt


--------------------------------------------------------------------------------