├── __init__.py
├── extrator
    ├── __init__.py
    ├── base
    │   ├── __init__.py
    │   └── utils.py
    ├── crawler
    │   ├── __init__.py
    │   ├── utils.py
    │   ├── pipeliner.py
    │   └── parser.py
    ├── test
    │   ├── __init__.py
    │   ├── test_env
    │   ├── test_pipeline.py
    │   ├── test_utils.py
    │   ├── test_models.py
    │   └── test_parser.py
    ├── datasources
    │   ├── __init__.py
    │   ├── ddl
    │   │   ├── SEQUENCES.sql
    │   │   ├── TJRJ_MOVIMENTO_ITEM_TJ.sql
    │   │   ├── TJRJ_PROCESSO_MOVIMENTO_TJ.SQL
    │   │   └── TJRJ_PROCESSO_TJ.sql
    │   ├── broker.py
    │   ├── mcpr_models.py
    │   ├── tjrj_models.py
    │   └── models.py
    └── settings.py
├── nosetests.py
├── .coveragerc
├── .travis.yml
├── tox.ini
├── app.sh
├── setup.py
├── .vscode
    ├── settings.json
    └── launch.json
├── requirements.txt
├── README.md
├── LICENSE
├── .gitignore
├── main.py
└── newrelic.ini


/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/extrator/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/extrator/base/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/extrator/crawler/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/extrator/test/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/extrator/datasources/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/nosetests.py:
--------------------------------------------------------------------------------
"""Run the test suite through nose and report the outcome to the shell."""
import sys

import nose

if __name__ == '__main__':
    # nose.run() returns a truthy value only when the whole run succeeded;
    # translate that into the process exit status so callers (shell, CI)
    # can detect failures instead of always seeing exit code 0.
    sys.exit(0 if nose.run() else 1)
3 | 


--------------------------------------------------------------------------------
/extrator/test/test_env:
--------------------------------------------------------------------------------
1 | export DB_HOST=""
2 | export DB_USER=""
3 | export DB_PASSWORD=""
4 | export DB_SID=""
5 | export DB_PORT=""
6 | 
7 | 


--------------------------------------------------------------------------------
/extrator/datasources/ddl/SEQUENCES.sql:
--------------------------------------------------------------------------------
1 | CREATE SEQUENCE SEQ_TJRJ_PROCESSO_TJ;
2 | CREATE SEQUENCE SEQ_TJRJ_PROCESSO_MOVIMENTO_TJ;
3 | CREATE SEQUENCE SEQ_TJRJ_MOVIMENTO_ITEM_TJ;


--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | omit =
3 |     # omit anything in a .local directory anywhere
4 |     */.local/*
5 |     # omit everything in /usr
6 |     /usr/*
7 |     */site-packages/*
8 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python:
 3 |   - "3.5"
 4 | install:
 5 |   pip install -r requirements.txt;
 6 | script:
 7 |   - source extrator/test/test_env
 8 |   - flake8 .
 9 |   - nosetests
10 | notifications:
11 |   email:
12 |     - felipe.gomes.ferreira@gmail.com


--------------------------------------------------------------------------------
/extrator/datasources/broker.py:
--------------------------------------------------------------------------------
 1 | """Descritivo das interfaces para postar nas
 2 | filas de processamento de inteiro teores"""
 3 | 
 4 | from ..settings import celeryapp
 5 | 
 6 | 
 7 | @celeryapp.task(name='zuleika.classificar')
 8 | def classificar(id, texto):
 9 |     pass
10 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [flake8]
 2 | exclude =
 3 |     # No need to traverse our git directory
 4 |     .git,
 5 |     # There's no value in checking cache directories
 6 |     __pycache__,
 7 |     static,
 8 |     migrations,
 9 |     settings.py,
10 |     wsgi.py,
11 |     venv,
12 |     fixtures
13 | 


--------------------------------------------------------------------------------
/app.sh:
--------------------------------------------------------------------------------
 1 | while :
 2 | do
 3 |     data=$(date +%H)
 4 |     if [[ $data -ge "06"  && $data -le "18" ]]
 5 |     then
 6 |         export INSTANCIAS=2
 7 |     else
 8 |         export INSTANCIAS=10
 9 |     fi
10 | 
11 |     echo Iniciando $INSTANCIAS instancias paralelas
12 |     python main.py  
13 | done;


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | 
 4 | __version__ = '0.0.0dev0'
 5 | 
 6 | 
 7 | setup(
 8 |     name='robotj',
 9 |     descripition='Robo de raspagem de processos do TJRJ',
10 |     url='https://github.com/MinisterioPublicoRJ/robotj',
11 |     packages=find_packages(
12 |         exclude=["*.tests", "*.tests.*", "tests.*", "tests"]
13 |     ),
14 |     author='Felipe Ferreira & Rhenan Bartels',
15 |     license='MIT',
16 |     zip_safe=False
17 | )
18 | 


--------------------------------------------------------------------------------
/extrator/datasources/ddl/TJRJ_MOVIMENTO_ITEM_TJ.sql:
--------------------------------------------------------------------------------
 1 | CREATE TABLE TJRJ_MOVIMENTO_ITEM_TJ (
 2 |     MVIT_DK NUMBER, -- SEQUENCE
 3 |     MVIT_PRMV_DK NUMBER,
 4 |     MVIT_TP_CHAVE nvarchar2(2000),
 5 |     MVIT_TP_VALOR nvarchar2(2000),
 6 |     CONSTRAINT fk_MVIT_PRMV_DK
 7 |     FOREIGN KEY (MVIT_PRMV_DK)
 8 |     REFERENCES TJRJ_PROCESSO_MOVIMENTO_TJ (PRMV_DK),
 9 |     CONSTRAINT PK_MVIT_DK PRIMARY KEY (MVIT_DK)
10 | );
11 | CREATE INDEX MVIT_PRMV_DK_I ON TJRJ_MOVIMENTO_ITEM_TJ (MVIT_PRMV_DK);


--------------------------------------------------------------------------------
/extrator/datasources/mcpr_models.py:
--------------------------------------------------------------------------------
 1 | from sqlalchemy import (
 2 |     MetaData,
 3 |     Table,
 4 |     Column,
 5 |     NUMERIC,
 6 |     String)
 7 | 
 8 | meta = MetaData(schema='mcpr')
 9 | 
10 | TB_DOCUMENTO = Table(
11 |     'mcpr_documento',
12 |     meta,
13 |     Column(
14 |         'docu_dk',
15 |         NUMERIC(
16 |             precision=12,
17 |             scale=0,
18 |             asdecimal=False),
19 |         primary_key=True,
20 |         nullable=False),
21 |     Column('docu_nr_externo', String(length=20)),
22 |     Column('docu_mate_dk', NUMERIC(precision=4, scale=0, asdecimal=False))
23 | )
24 | 


--------------------------------------------------------------------------------
/extrator/datasources/ddl/TJRJ_PROCESSO_MOVIMENTO_TJ.SQL:
--------------------------------------------------------------------------------
 1 | CREATE TABLE TJRJ_PROCESSO_MOVIMENTO_TJ (
 2 |     PRMV_DK NUMBER, -- SEQUENCE
 3 |     PRMV_PRTJ_DK NUMBER,
 4 |     PRMV_TP_MOVIMENTO nvarchar2(400),
 5 |     PRMV_DT_ULTIMA_ATUALIZACAO DATE,
 6 |     PRMV_TX_INTEIRO_TEOR BLOB,
 7 |     PRMV_HASH CHAR(32),
 8 |     CONSTRAINT fk_prmv_prtj_dk 
 9 |     FOREIGN KEY (PRMV_PRTJ_DK)
10 |     REFERENCES TJRJ_PROCESSO_TJ (PRTJ_DK),
11 |     CONSTRAINT PK_PRMV_DK PRIMARY KEY (PRMV_DK)
12 | );
13 | CREATE INDEX PRMV_PRTJ_DK_I ON TJRJ_PROCESSO_MOVIMENTO_TJ (PRMV_PRTJ_DK);
14 | CREATE INDEX PRMV_HASH_I ON TJRJ_PROCESSO_MOVIMENTO_TJ (PRMV_HASH);


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "python.formatting.provider": "yapf",
 3 |     "python.linting.flake8Enabled": true,
 4 |     "python.linting.ignorePatterns": [
 5 |     ".vscode/*.py",
 6 |     "**/site-packages/**/*.py",
 7 |     "**/migrations/**/*.py"
 8 |     ],
 9 |     "python.venvFolders": [
10 |         "envs",
11 |         ".pyenv",
12 |         ".direnv",
13 |         "venv"
14 |     ],
15 |     "files.exclude": {
16 |         "**/.git": true,
17 |         "**/.DS_Store": true,
18 |         "venv": true
19 |     },
20 |     "python.pythonPath": "${workspaceFolder}/venv/bin/python"
21 | }


--------------------------------------------------------------------------------
/extrator/base/utils.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | from ..settings import LOGGER_FORMAT, LOGGER_LEVEL
 3 | from sqlalchemy.orm import sessionmaker
 4 | 
 5 | engine = {'connection': None}
 6 | engine_cx = {'connection': None}
 7 | 
 8 | 
 9 | def set_log():
10 |     logging.basicConfig(
11 |         format=LOGGER_FORMAT,
12 |         level=LOGGER_LEVEL)
13 | 
14 | 
def logger():
    """Return the shared ``robotj.logger`` logger instance."""
    return logging.getLogger('robotj.logger')
19 | 
20 | 
def conn():
    """Return whatever is currently stored in ``engine['connection']``.

    The value is ``None`` until another module assigns it.
    """
    conexao = engine['connection']
    return conexao
23 | 
24 | 
def session():
    """Return a new session produced by the cached ``sessionmaker``.

    The factory is created on first use, bound to ``conn()``, and kept under
    ``engine['session']``; every call instantiates a fresh session from it.
    """
    fabrica = engine.get('session')
    if not fabrica:
        fabrica = sessionmaker(bind=conn())
        engine['session'] = fabrica

    return fabrica()
30 | 
31 | 
def cxoracle():
    """Return whatever is currently stored in ``engine_cx['connection']``.

    The value is ``None`` until another module assigns it.
    """
    conexao = engine_cx['connection']
    return conexao
34 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | amqp==2.3.2
 2 | astroid==1.6.1
 3 | attrs==17.4.0
 4 | beautifulsoup4==4.6.0
 5 | billiard==3.5.0.3
 6 | celery==4.2.0
 7 | certifi==2018.1.18
 8 | chardet==3.0.4
 9 | coverage==4.5.1
10 | cx-Oracle==6.2.1
11 | decorator==4.2.1
12 | flake8==3.5.0
13 | idna==2.6
14 | ipdb==0.11
15 | ipython==6.2.1
16 | ipython-genutils==0.2.0
17 | isort==4.3.4
18 | jedi==0.11.1
19 | kombu==4.2.1
20 | lazy-object-proxy==1.3.1
21 | logger==1.4
22 | lxml==4.1.1
23 | mccabe==0.6.1
24 | newrelic==3.2.0.91
25 | nose==1.3.7
26 | parso==0.1.1
27 | pexpect==4.4.0
28 | pickleshare==0.7.4
29 | pluggy==0.6.0
30 | prompt-toolkit==1.0.15
31 | ptyprocess==0.5.2
32 | py==1.5.2
33 | pycodestyle==2.3.1
34 | pyflakes==1.6.0
35 | Pygments==2.2.0
36 | pylint==1.8.2
37 | pytest==3.4.2
38 | python-slugify==1.2.4
39 | pytz==2018.4
40 | redis==2.10.6
41 | requests==2.18.4
42 | simplegeneric==0.8.1
43 | six==1.11.0
44 | SQLAlchemy==1.2.5
45 | timeout-decorator==0.4.0
46 | traitlets==4.3.2
47 | unicodecsv==0.14.1
48 | Unidecode==1.0.22
49 | urllib3==1.22
50 | vine==1.1.4
51 | wcwidth==0.1.7
52 | wrapt==1.10.11
53 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # robotj
 2 | Robo de Extração de Processos do Tribunal de Justiça do Rio de Janeiro
 3 | 
 4 | 
 5 | ## Configurações e variáveis de ambiente
 6 | 
 7 | ```
 8 | export DB_HOST="(Host do Banco de Dados)"
 9 | export DB_USER="(Usuário do Banco de Dados)"
10 | export DB_PASSWORD="(Senha do Usuário do Banco de Dados)"
11 | export DB_SID="(Instância do Banco de Dados)"
12 | export DB_PORT='(Porta do Banco de Dados)'
13 | 
14 | export NEW_RELIC_PROXY_SCHEME="http"
15 | export NEW_RELIC_PROXY_HOST="(Endereço do Proxy New Relic, caso necessário)"
16 | export NEW_RELIC_PROXY_PORT="(Porta do Proxy New Relic, caso necessário)"
17 | export NEW_RELIC_PROXY_USER="(Usuário do Proxy, caso necessário)"
18 | export NEW_RELIC_PROXY_PASS="(Senha do Usuário no Proxy)"
19 | export NEW_RELIC_LICENSE_KEY="(Chave de Licença New Relic)"
20 | export NEW_RELIC_ENVIRONMENT="(Ambiente New Relic: development|em branco)"
21 | export NEW_RELIC_CONFIG_FILE=newrelic.ini 
22 | export NEW_RELIC_LOG=newrelic.log
23 | 
24 | export QUEUE="(Nome da Fila REDIS para submissão de classificação de Inteiro Teores)"
25 | export BROKER="redis://:[senha]@[host do broker REDIS]:[porta]"
26 | 
27 | ```


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Felipe Gomes Vieira Ferreira
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/extrator/crawler/utils.py:
--------------------------------------------------------------------------------
 1 | import hashlib
 2 | import json
 3 | import re
 4 | 
 5 | from hashlib import md5
 6 | 
 7 | 
 8 | def formata_numero_processo(numero_processo):
 9 |     mascara = "{0}-{1}.{2}.{3}.{4}.{5}"
10 | 
11 |     primeira_parte = slice(0, 7)
12 |     segunda_parte = slice(7, 9)
13 |     terceira_parte = slice(9, 13)
14 |     quarta_parte = slice(13, 14)
15 |     quinta_parte = slice(14, 16)
16 |     sexta_parte = slice(16, 20)
17 | 
18 |     return mascara.format(
19 |         numero_processo[primeira_parte],
20 |         numero_processo[segunda_parte],
21 |         numero_processo[terceira_parte],
22 |         numero_processo[quarta_parte],
23 |         numero_processo[quinta_parte],
24 |         numero_processo[sexta_parte]
25 |     )
26 | 
27 | 
def limpa_conteudo(conteudo_sujo):
    """Collapse each run of whitespace into one space and trim both ends."""
    compactado = re.sub(r'\s+', ' ', conteudo_sujo)
    return compactado.strip()
30 | 
31 | 
def remove_data_consulta(html):
    """Remove the ``TJ/RJ - <date> <time>`` query timestamp from a page.

    :param html: page content as bytes; it is decoded as latin-1.
    :return: the page without the timestamp, re-encoded with the default
        ``str.encode()`` codec (UTF-8), so non-ASCII bytes change encoding.
    """
    padrao = (
        r'TJ/RJ -\r\n                      '
        r'\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2}'
    )
    texto = html.decode('latin-1')
    sem_data = re.sub(padrao, '', texto)
    return sem_data.encode()
39 | 
40 | 
def cria_hash_do_processo(html):
    """Return the hexadecimal MD5 digest of *html*.

    :param html: text to fingerprint; encoded with ``str.encode()`` first.
    """
    resumo = md5()
    resumo.update(html.encode())
    return resumo.hexdigest()
43 | 
44 | 
def cria_hash_do_movimento(item):
    """Return an MD5 hex digest of *item* that ignores key order.

    The dict is turned into a list of ``(key, value)`` pairs sorted by key,
    serialized with ``json.dumps`` and then hashed.

    :param item: dict with sortable keys and JSON-serializable values.
    """
    pares = [(chave, item[chave]) for chave in sorted(item.keys())]
    serializado = json.dumps(pares)
    return hashlib.md5(serializado.encode()).hexdigest()
51 | 


--------------------------------------------------------------------------------
/extrator/crawler/pipeliner.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import json
 3 | import timeout_decorator
 4 | from bs4 import BeautifulSoup
 5 | from .utils import formata_numero_processo, cria_hash_do_processo
 6 | from .parser import parse_metadados, area_dos_metadados, parse_itens
 7 | from ..base.utils import logger
 8 | from ..settings import URL_PROCESSO
 9 | 
10 | 
@timeout_decorator.timeout(30)
def pipeline(processo):
    """Download one process page and turn it into a dict.

    The page is requested with the formatted process number, its ``tr`` rows
    are handed to the metadata parser, a hash of that metadata is stored
    under ``'hash'``, and finally the parsed items are merged in.

    :param processo: raw (unformatted) process number.
    :return: dict with the parsed metadata, the ``'hash'`` entry and the
        parsed items.
    :raises Exception: any error raised while requesting or parsing is
        logged and re-raised.
    """
    log = logger()
    log.info(processo)
    dados_processo = {}
    numero_processo = formata_numero_processo(processo)
    try:
        url = URL_PROCESSO.format(doc_number=numero_processo)
        resp = requests.get(
            url,
            headers={'X-Forwarded-For': '10.0.250.15'},
            timeout=10
        )
        soup = BeautifulSoup(resp.content, 'lxml')
        linhas = soup.find_all('tr')
        inicio, fim = area_dos_metadados(linhas)

        metadados = parse_metadados(linhas, numero_processo, inicio, fim)
        dados_processo.update(metadados)

        # The hash covers the metadata only: it is computed before the
        # items are merged into the dict.
        serializado = json.dumps(dados_processo)
        dados_processo['hash'] = cria_hash_do_processo(serializado)

        itens = parse_itens(soup, processo, inicio + 1)
        dados_processo.update(itens)
    except Exception as erro:
        mensagem = "Erro de parsing do processo - {0}, com mensagem: {1}"
        log.error(mensagem.format(numero_processo, erro))
        raise erro
    return dados_processo
41 | 


--------------------------------------------------------------------------------
/extrator/settings.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import cx_Oracle
 3 | import logging
 4 | import newrelic.agent
 5 | from celery import Celery
 6 | 
 7 | POOLCOUNT=50
 8 | 
 9 | NEW_RELIC_ENVIRONMENT = os.environ.get("NEW_RELIC_ENVIRONMENT")
10 | 
11 | INSTANCIAS = int(os.environ.get("INSTANCIAS", 2))
12 | 
13 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
14 | 
15 | DS_EXADATA_HOST = os.environ['DB_HOST']
16 | DS_EXADATA_PORT = os.environ['DB_PORT']
17 | DS_EXADATA_SID = os.environ.get('DB_SID', None)
18 | DS_EXADATA_SERVICE_NAME = os.environ.get('DB_SERVICE_NAME', None)
19 | DS_EXADATA_user = os.environ['DB_USER']
20 | DS_EXADATA_password = os.environ['DB_PASSWORD']
21 | 
22 | DS_EXADATA_CONN_SID = cx_Oracle.makedsn(
23 |     DS_EXADATA_HOST,
24 |     DS_EXADATA_PORT,
25 |     sid=DS_EXADATA_SID)
26 | 
27 | if not DS_EXADATA_SID:
28 |     DS_EXADATA_CONN_SID = DS_EXADATA_CONN_SID.replace(
29 |         'SID=None',
30 |         'SERVICE_NAME=%s' % DS_EXADATA_SERVICE_NAME
31 |     )
32 | 
33 | DS_EXADATA_CONN_CSTR = 'oracle://{user}:{password}@{sid}'.format(
34 |     user=DS_EXADATA_user,
35 |     password=DS_EXADATA_password,
36 |     sid=DS_EXADATA_CONN_SID
37 | )
38 | 
39 | LOGGER_FORMAT = '%(asctime)-15s %(message)s'
40 | LOGGER_LEVEL = logging.INFO
41 | 
42 | URL_PROCESSO = ("http://www4.tjrj.jus.br/consultaProcessoWebV2/"
43 |                 "consultaMov.do?v=2&numProcesso={doc_number}&"
44 |                 "acessoIP=internet&tipoUsuario")
45 | 
46 | QUEUE = os.environ.get('QUEUE', None)
47 | BROKER = os.environ.get('BROKER', None)
48 | 
49 | celeryapp = Celery(QUEUE, broker=BROKER)
50 | 


--------------------------------------------------------------------------------
/extrator/datasources/ddl/TJRJ_PROCESSO_TJ.sql:
--------------------------------------------------------------------------------
 1 | CREATE TABLE TJRJ_PROCESSO_TJ (
 2 |     PRTJ_DK NUMBER, -- SEQUENCE
 3 |     PRTJ_DOCU_DK NUMBER,
 4 |     PRTJ_CD_NUMERO_PROCESSO  NCHAR(20),
 5 |     PRTJ_TX_EXECUTADO  nvarchar2(400),
 6 |     PRTJ_TX_ADVOGADO_S  nvarchar2(400),
 7 |     PRTJ_TX_NUMERO_DO_TOMBO  nvarchar2(400),
 8 |     PRTJ_TX_OFICIO_DE_REGISTRO  nvarchar2(400),
 9 |     PRTJ_TX_FOLHA  nvarchar2(400),
10 |     PRTJ_TX_REQUERIDO  nvarchar2(400),
11 |     PRTJ_TX_EXEQUENTE  nvarchar2(400),
12 |     PRTJ_TX_REPRESENTANTE_LEGAL  nvarchar2(400),
13 |     PRTJ_TX_ACAO  nvarchar2(400),
14 |     PRTJ_TX_COMUNICANTE  nvarchar2(400),
15 |     PRTJ_TX_REQUERENTE  nvarchar2(400),
16 |     PRTJ_TX_BAIRRO  nvarchar2(400),
17 |     PRTJ_TX_LIVRO  nvarchar2(400),
18 |     PRTJ_TX_PAI  nvarchar2(400),
19 |     PRTJ_TX_MAE  nvarchar2(400),
20 |     PRTJ_TX_AVISO_AO_ADVOGADO  nvarchar2(400),
21 |     PRTJ_TX_STATUS  nvarchar2(400),
22 |     PRTJ_TX_COMARCA  nvarchar2(400),
23 |     PRTJ_TX_ASSISTENTE  nvarchar2(400),
24 |     PRTJ_TX_CIDADE  nvarchar2(400),
25 |     PRTJ_TX_AUTOR_DO_FATO  nvarchar2(400),
26 |     PRTJ_TX_ACUSADO  nvarchar2(400),
27 |     PRTJ_TX_IMPETRADO  nvarchar2(400),
28 |     PRTJ_TX_IMPETRANTE  nvarchar2(400),
29 |     PRTJ_TX_NOTIFICADO  nvarchar2(400),
30 |     PRTJ_TX_AUTOR  nvarchar2(400),
31 |     PRTJ_TX_INTIMADO  nvarchar2(400),
32 |     PRTJ_TX_IDOSO  nvarchar2(400),
33 |     PRTJ_TX_AVO_AVO  nvarchar2(400),
34 |     PRTJ_TX_REU  nvarchar2(400),
35 |     PRTJ_TX_RECLAMADO  nvarchar2(400),
36 |     PRTJ_TX_ENDERECO  nvarchar2(400),
37 |     PRTJ_TX_PRAZO  nvarchar2(400),
38 |     PRTJ_TX_CLASSE  nvarchar2(400),
39 |     PRTJ_TX_ASSUNTO  nvarchar2(400),
40 |     PRTJ_DT_ULTIMA_ATUALIZACAO DATE,
41 |     PRTJ_DT_ULTIMA_VISTA DATE,
42 |     PRTJ_HASH CHAR(32),
43 |     CONSTRAINT PK_PRTJ_DK PRIMARY KEY (PRTJ_DK)
44 | );
45 | CREATE INDEX PRTJ_CD_NUMERO_PROCESSO_I ON TJRJ_PROCESSO_TJ (PRTJ_CD_NUMERO_PROCESSO);
46 | CREATE INDEX PRTJ_HASH_I ON TJRJ_PROCESSO_TJ (PRTJ_HASH);
47 | CREATE INDEX PRTJ_DT_DATA_ULTIMA_VISTA_I ON TJRJ_PROCESSO_TJ (PRTJ_DT_ULTIMA_VISTA);


--------------------------------------------------------------------------------
/extrator/test/test_pipeline.py:
--------------------------------------------------------------------------------
 1 | from unittest.mock import patch, MagicMock
 2 | from unittest import TestCase
 3 | from ..crawler.pipeliner import pipeline
 4 | from ..settings import URL_PROCESSO
 5 | 
 6 | 
 7 | class Pipeline(TestCase):
 8 |     @patch('robotj.extrator.crawler.pipeliner.parse_itens',
 9 |            return_value={'d': 4})
10 |     @patch('robotj.extrator.crawler.pipeliner.parse_metadados',
11 |            return_value={'a': 1})
12 |     @patch('robotj.extrator.crawler.pipeliner.area_dos_metadados',
13 |            return_value=(0, 1))
14 |     @patch('robotj.extrator.crawler.pipeliner.BeautifulSoup')
15 |     @patch('robotj.extrator.crawler.pipeliner.cria_hash_do_processo')
16 |     @patch('robotj.extrator.crawler.pipeliner.requests')
17 |     @patch('robotj.extrator.crawler.pipeliner.formata_numero_processo')
18 |     def test_pipeline_do_parsing_dos_processos(self, _fnp, _req, _chdp, _bs,
19 |                                                _am, _pm, _pi):
20 |         processo = '1234'
21 |         numero_formatado = '1.2.3.4'
22 |         html = '{"a": 1}'
23 |         _resp_mock = MagicMock()
24 |         _resp_mock.content = html
25 | 
26 |         _soup_mock = MagicMock()
27 | 
28 |         _soup_mock.find_all.return_value = 'rows_mock'
29 | 
30 |         _fnp.return_value = numero_formatado
31 |         _req.get.return_value = _resp_mock
32 |         _chdp.return_value = 'ab12'
33 |         _bs.return_value = _soup_mock
34 | 
35 |         processos = pipeline(processo)
36 | 
37 |         _fnp.assert_called_once_with(processo)
38 |         _req.get.assert_called_once_with(URL_PROCESSO.format(
39 |             doc_number=numero_formatado),
40 |             headers={'X-Forwarded-For': '10.0.250.15'},
41 |             timeout=10)
42 |         _chdp.assert_called_once_with(html)
43 |         _bs.assert_called_once_with(html, 'lxml')
44 |         _soup_mock.find_all.assert_called_once_with('tr')
45 |         _am.assert_called_once_with('rows_mock')
46 |         _pm.assert_called_once_with('rows_mock', '1.2.3.4', 0, 1)
47 |         _pi.assert_called_once_with(_soup_mock, '1234', 1)
48 | 
49 |         self.assertEqual(processos, {'a': 1, 'd': 4, 'hash': 'ab12'})
50 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | 
  2 | # Created by https://www.gitignore.io/api/code,django,python
  3 | 
  4 | ### Code ###
  5 | # Visual Studio Code - https://code.visualstudio.com/
  6 | .settings/
  7 | #.vscode/
  8 | tsconfig.json
  9 | jsconfig.json
 10 | 
 11 | ### Django ###
 12 | *.log
 13 | *.pot
 14 | *.pyc
 15 | __pycache__/
 16 | local_settings.py
 17 | db.sqlite3
 18 | media
 19 | 
 20 | # If your build process includes running collectstatic, then you probably don't need or want to include staticfiles/
 21 | # in your Git repository. Update and uncomment the following line accordingly.
 22 | # <django-project-name>/staticfiles/
 23 | 
 24 | /static/
 25 | 
 26 | ### Python ###
 27 | # Byte-compiled / optimized / DLL files
 28 | *.py[cod]
 29 | *$py.class
 30 | 
 31 | # C extensions
 32 | *.so
 33 | 
 34 | # Distribution / packaging
 35 | .Python
 36 | build/
 37 | develop-eggs/
 38 | downloads/
 39 | eggs/
 40 | .eggs/
 41 | lib/
 42 | lib64/
 43 | parts/
 44 | sdist/
 45 | var/
 46 | wheels/
 47 | *.egg-info/
 48 | .installed.cfg
 49 | *.egg
 50 | 
 51 | # PyInstaller
 52 | #  Usually these files are written by a python script from a template
 53 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 54 | *.manifest
 55 | *.spec
 56 | 
 57 | # Installer logs
 58 | pip-log.txt
 59 | pip-delete-this-directory.txt
 60 | 
 61 | # Unit test / coverage reports
 62 | htmlcov/
 63 | .tox/
 64 | .coverage
 65 | .coverage.*
 66 | .cache
 67 | .pytest_cache/
 68 | nosetests.xml
 69 | coverage.xml
 70 | *.cover
 71 | .hypothesis/
 72 | 
 73 | # Translations
 74 | *.mo
 75 | 
 76 | # Flask stuff:
 77 | instance/
 78 | .webassets-cache
 79 | 
 80 | # Scrapy stuff:
 81 | .scrapy
 82 | 
 83 | # Sphinx documentation
 84 | docs/_build/
 85 | 
 86 | # PyBuilder
 87 | target/
 88 | 
 89 | # Jupyter Notebook
 90 | .ipynb_checkpoints
 91 | 
 92 | # pyenv
 93 | .python-version
 94 | 
 95 | # celery beat schedule file
 96 | celerybeat-schedule.*
 97 | 
 98 | # SageMath parsed files
 99 | *.sage.py
100 | 
101 | # Environments
102 | .env*
103 | .venv
104 | env/
105 | venv/
106 | ENV/
107 | env.bak/
108 | venv.bak/
109 | 
110 | # Spyder project settings
111 | .spyderproject
112 | .spyproject
113 | 
114 | # Rope project settings
115 | .ropeproject
116 | 
117 | # mkdocs documentation
118 | /site
119 | 
120 | # mypy
121 | .mypy_cache/
122 | 
123 | 
124 | # End of https://www.gitignore.io/api/code,django,python
125 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import cx_Oracle
 4 | from sqlalchemy import create_engine
 5 | from extrator.settings import DS_EXADATA_CONN_CSTR
 6 | from extrator.base.utils import engine, set_log, engine_cx
 7 | from extrator.datasources.models import (
 8 |     obter_documentos_externos,
 9 |     atualizar_documento,
10 |     atualizar_vista)
11 | from extrator.crawler.pipeliner import pipeline
12 | from extrator.settings import (
13 |     DS_EXADATA_user,
14 |     DS_EXADATA_password,
15 |     INSTANCIAS,
16 |     DS_EXADATA_CONN_SID)
17 | from multiprocessing.dummy import Pool
18 | 
# When True the documents are processed through a thread pool; otherwise
# they are processed one after another.
PARALELO = True


def main():
    """Open the database connections and process the pending documents.

    Stores the SQLAlchemy engine and the cx_Oracle connection in the shared
    holders from ``extrator.base.utils``, configures logging, fetches the
    documents and runs ``processar_armazenar`` over the first 10000 of them.

    :return: list with the result of ``processar_armazenar`` per document.
    """
    os.environ['NLS_LANG'] = 'American_America.UTF8'
    engine['connection'] = create_engine(
        DS_EXADATA_CONN_CSTR,
        convert_unicode=False,
        pool_recycle=10,
        pool_size=50,
        # echo=True,
        encoding="utf-8"
    )

    engine_cx['connection'] = cx_Oracle.connect(
        DS_EXADATA_user,
        DS_EXADATA_password,
        DS_EXADATA_CONN_SID,
        encoding="UTF-8",
        nencoding="UTF-8",
        threaded=True)
    engine_cx['connection'].autocommit = True

    set_log()

    # Each run handles at most the first 10000 documents.
    docs = obter_documentos_externos()[:10000]

    if PARALELO:
        # The context manager releases the worker threads once map() has
        # returned; previously the pool was never closed.
        with Pool(INSTANCIAS) as pool:
            return pool.map(processar_armazenar, docs)

    return [processar_armazenar(doc) for doc in docs]
55 | 
56 | 
def processar_armazenar(doc):
    """Scrape one document and store the outcome, without ever raising.

    The previous implementation declared ``global retorno`` inside a nested
    function, so the status message never reached the enclosing local and
    the function always returned ``None``. It also called
    ``atualizar_vista`` twice when the document was not found.

    :param doc: sequence whose first item is the process number given to
        ``pipeline`` and whose second item is forwarded unchanged to
        ``atualizar_documento`` / ``atualizar_vista``.
    :return: ``"Atualizado: <number>"`` on success, otherwise a
        ``"Problema: doc ... - ..."`` message describing the failure.
    """
    try:
        # Progress marker, flushed so it shows up while workers are running.
        print('.', end='')
        sys.stdout.flush()
        documento = pipeline(doc[0])
        if documento == {}:
            raise Exception("Documento %s não encontrado no TJRJ" % doc[0])
        atualizar_documento(documento, doc[1])
    except Exception as error:
        retorno = "Problema: doc %s - %s" % (str(doc), str(error))
        # Record the visit exactly once for any failure. This stays
        # best-effort, but a failure here is reported instead of hidden.
        try:
            atualizar_vista(doc[0], doc[1])
        except Exception as erro_vista:
            retorno += " (falha ao atualizar vista: %s)" % str(erro_vista)
        return retorno

    return "Atualizado: %s" % str(doc[0])
83 | 
84 | 
85 | if __name__ == '__main__':
86 |     main()
87 | 


--------------------------------------------------------------------------------
/extrator/test/test_utils.py:
--------------------------------------------------------------------------------
 1 | from unittest import TestCase
 2 | 
 3 | from ..crawler.utils import (formata_numero_processo,
 4 |                              limpa_conteudo,
 5 |                              remove_data_consulta,
 6 |                              cria_hash_do_movimento,
 7 |                              cria_hash_do_processo)
 8 | from .fixtures.processos import processo_judicial_1
 9 | 
10 | 
class Utils(TestCase):
    """Checks for the text-formatting helpers of ``crawler.utils``."""

    def test_format_document_numner(self):
        # Twenty raw digits in, masked process number out.
        expected = "0987897-65.4345.1.23.8976"

        obtido = formata_numero_processo("09878976543451238976")

        self.assertEqual(obtido, expected)

    def test_limpa_conteudo(self):
        # Line breaks, indentation and non-breaking spaces all collapse
        # into single separators.
        conteudo_sujo = ('\r\n                        Av. Presidente Lincol'
                         'n\r\n                        \xa0\r\n            '
                         '            857\r\n                        \xa0\r'
                         '\n                        \r\n                   '
                         '\xa0\r\n                      ')
        esperado = 'Av. Presidente Lincoln 857'

        self.assertEqual(limpa_conteudo(conteudo_sujo), esperado)
31 | 
32 | 
class Hash(TestCase):
    """Checks for the hashing helpers of ``crawler.utils``."""

    def test_cria_hash_do_conteudo_html_do_processo(self):
        esperado = '30a5e6dc4717981102f2dfc2598eac27'

        self.assertEqual(cria_hash_do_processo(processo_judicial_1), esperado)

    def test_remove_data_de_consulta_do_html(self):
        # The timestamp cell must come back empty, still as bytes.
        trecho_processo = (
            '<tr valign="top"><td colspan="2" class="info">'
            'TJ/RJ -\r\n                      23/03/2018 12:48:23</td>'
            '</tr>'
        ).encode()
        esperado = (
            '<tr valign="top"><td colspan="2" class="info"></td>'
            '</tr>'
        ).encode()

        self.assertEqual(remove_data_consulta(trecho_processo), esperado)

    def test_cria_hash_para_um_movimento(self):
        item = {
            'tipo-do-movimento': 'Conclusão ao Juiz',
            'data-da-conclusao': ['21/10/2015'],
            'juiz': ['VIVIANE TOVAR DE MATTOS ABRAHAO']
        }
        esperado = '03b979f3d68a8b526746c94370039ddb'

        self.assertEqual(cria_hash_do_movimento(item), esperado)
62 | 


--------------------------------------------------------------------------------
/extrator/datasources/tjrj_models.py:
--------------------------------------------------------------------------------
 1 | from sqlalchemy import (
 2 |     MetaData,
 3 |     Table,
 4 |     Sequence,
 5 |     Column,
 6 |     Integer,
 7 |     String,
 8 |     DateTime)
 9 | 
# Metadata container shared by every table declared in this module; its
# default schema is 'tjrj'.
meta = MetaData(schema='tjrj')

# Named database sequences. Their names carry the same prefixes (prtj, prmv,
# mvit) as the *_dk primary-key columns of the three tables below.
# NOTE(review): the Sequence objects are created without `schema=` or
# `metadata=`, so they are not attached to `meta` -- confirm how they are
# schema-qualified where they are used.
SQ_PROCESSO = Sequence('tjrj_sq_prtj_dk')
SQ_MOVIMENTO = Sequence('tjrj_sq_prmv_dk')
SQ_ITEM_MOVIMENTO = Sequence('tjrj_sq_mvit_dk')
15 | 
# Table 'tjrj_processo_tj' (schema taken from `meta`).
# Layout: integer primary key `prtj_dk`, an integer `prtj_docu_dk`, the
# process number (up to 25 chars), a series of free-text `prtj_tx_*` columns
# (up to 400 chars each), two timestamps and a 32-character hash.
# NOTE(review): the `prtj_tx_*` names look like the slugified labels emitted
# by the crawler parser (e.g. 'advogado-s' -> prtj_tx_advogado_s) -- confirm
# the mapping in the code that writes to this table.
TB_PROCESSO = Table(
    'tjrj_processo_tj',
    meta,
    Column('prtj_dk', Integer(), primary_key=True),
    Column('prtj_docu_dk', Integer()),
    Column('prtj_cd_numero_processo', String(25)),
    Column('prtj_tx_executado', String(400)),
    Column('prtj_tx_advogado_s', String(400)),
    Column('prtj_tx_numero_do_tombo', String(400)),
    Column('prtj_tx_oficio_de_registro', String(400)),
    Column('prtj_tx_folha', String(400)),
    Column('prtj_tx_requerido', String(400)),
    Column('prtj_tx_exequente', String(400)),
    Column('prtj_tx_representante_legal', String(400)),
    Column('prtj_tx_acao', String(400)),
    Column('prtj_tx_comunicante', String(400)),
    Column('prtj_tx_requerente', String(400)),
    Column('prtj_tx_bairro', String(400)),
    Column('prtj_tx_livro', String(400)),
    Column('prtj_tx_pai', String(400)),
    Column('prtj_tx_mae', String(400)),
    Column('prtj_tx_aviso_ao_advogado', String(400)),
    Column('prtj_tx_status', String(400)),
    Column('prtj_tx_comarca', String(400)),
    Column('prtj_tx_assistente', String(400)),
    Column('prtj_tx_cidade', String(400)),
    Column('prtj_tx_autor_do_fato', String(400)),
    Column('prtj_tx_acusado', String(400)),
    Column('prtj_tx_impetrado', String(400)),
    Column('prtj_tx_impetrante', String(400)),
    Column('prtj_tx_notificado', String(400)),
    Column('prtj_tx_autor', String(400)),
    Column('prtj_tx_intimado', String(400)),
    Column('prtj_tx_idoso', String(400)),
    Column('prtj_tx_avo_avo', String(400)),
    Column('prtj_tx_reu', String(400)),
    Column('prtj_tx_reclamado', String(400)),
    Column('prtj_tx_endereco', String(400)),
    Column('prtj_tx_prazo', String(400)),
    Column('prtj_tx_classe', String(400)),
    Column('prtj_tx_assunto', String(400)),
    # Timestamps; the names suggest "last update" and "last seen" --
    # TODO confirm which code paths set each one.
    Column('prtj_dt_ultima_atualizacao', DateTime()),
    Column('prtj_dt_ultima_vista', DateTime()),
    # 32-character hash column.
    Column('prtj_hash', String(32))
)
61 | 
62 | 
# Table 'tjrj_processo_movimento_tj' (schema taken from `meta`).
# Layout: integer primary key `prmv_dk`, an integer `prmv_prtj_dk`, the
# movement type text, a timestamp and a 32-character hash.
# NOTE(review): `prmv_prtj_dk` presumably points at TB_PROCESSO.prtj_dk, but
# no ForeignKey is declared here -- confirm against the DDL.
TB_MOVIMENTO_PROCESSO = Table(
    'tjrj_processo_movimento_tj',
    meta,
    Column('prmv_dk', Integer, primary_key=True),
    Column('prmv_prtj_dk', Integer()),
    Column('prmv_tp_movimento', String(400)),
    Column('prmv_dt_ultima_atualizacao', DateTime()),
    Column('prmv_hash', String(32))
)
72 | 
73 | 
# Table 'tjrj_movimento_item_tj' (schema taken from `meta`).
# Layout: integer primary key `mvit_dk`, an integer `mvit_prmv_dk` and a
# key/value pair of text columns (key up to 256 chars, value up to 4000).
# NOTE(review): `mvit_prmv_dk` presumably points at
# TB_MOVIMENTO_PROCESSO.prmv_dk, but no ForeignKey is declared here --
# confirm against the DDL.
TB_ITEM_MOVIMENTO = Table(
    'tjrj_movimento_item_tj',
    meta,
    Column('mvit_dk', Integer, primary_key=True),
    Column('mvit_prmv_dk', Integer()),
    Column('mvit_tp_chave', String(256)),
    Column('mvit_tp_valor', String(4000)),
)
82 | 


--------------------------------------------------------------------------------
/extrator/crawler/parser.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import collections
  3 | from slugify import slugify
  4 | from .utils import limpa_conteudo, cria_hash_do_movimento
  5 | 
  6 | 
# Captures the run of digits that follows 'numMov='; parse_descricao applies
# it to the `onclick` attribute of links found in a description cell.
PADRAO_MOV = re.compile(r'numMov=(\d+)')
  8 | 
  9 | 
 10 | def parse_metadados(linhas_de_dados, numero_processo, inicio_metadados,
 11 |                     fim_metadados):
 12 |     metadados = {
 13 |         'status': [''],
 14 |         'comarca': [''],
 15 |         'endereco': [''],
 16 |         'bairro': [''],
 17 |         'cidade': [''],
 18 |         'acao': [''],
 19 |         'assunto': [''],
 20 |         'classe': [''],
 21 |         'livro': [''],
 22 |         'folha': [''],
 23 |         'numero-do-tombo': [''],
 24 |         'aviso-ao-advogado': [''],
 25 |         'autor': [''],
 26 |         'requerido': [''],
 27 |         'requerente': [''],
 28 |         'advogado-s': ['']
 29 |     }
 30 | 
 31 |     # Delimita o processo na regiao dos metadados
 32 |     linhas_com_metadados = linhas_de_dados[inicio_metadados:fim_metadados]
 33 | 
 34 |     metadados['numero-processo'] = numero_processo
 35 |     metadados['status'] = limpa_conteudo(
 36 |         linhas_com_metadados[0].find_all('td')[0].get_text()
 37 |     )
 38 | 
 39 |     # Apaga linhas utilizadas
 40 |     del linhas_com_metadados[:2]
 41 | 
 42 |     comarcas = []
 43 |     comecou_comarca = False
 44 |     for tr in list(linhas_com_metadados):
 45 |         linhas_com_metadados.pop(0)
 46 |         colunas = tr.find_all('td')
 47 |         dados = ''.join([c.get_text() for c in colunas])
 48 |         if 'Comarca' in dados or \
 49 |            'Regional' in dados:
 50 |             comecou_comarca = True
 51 | 
 52 |         if comecou_comarca:
 53 |             comarcas += extrai_dados_colunas(colunas)
 54 | 
 55 |         if len(colunas) == 1 and comecou_comarca:
 56 |             break
 57 | 
 58 |     metadados['comarca'] = comarcas
 59 | 
 60 |     for tr in list(linhas_com_metadados):
 61 |         linhas_com_metadados.pop(0)
 62 |         linha = []
 63 |         colunas = tr.find_all('td')
 64 |         linha = extrai_dados_colunas(colunas)
 65 |         if linha:
 66 |             metadados[slugify(linha[0])] = linha[1:]
 67 | 
 68 |     return metadados
 69 | 
 70 | 
 71 | def estripa(texto):
 72 |     return ' '.join(limpa_conteudo(texto).split("\n")).strip()
 73 | 
 74 | 
 75 | def atribui(chave, item, valor):
 76 |     valor = estripa(valor)
 77 |     if valor:
 78 |         item[chave].append(valor)
 79 | 
 80 | 
 81 | def parse_processo_apensado(cols, item, campo):
 82 |     dados = cols[1].find_all('a')
 83 |     if dados:
 84 |         item[campo] = [estripa(link.get_text()) for link in dados]
 85 | 
 86 | 
 87 | def parse_descricao(cols, item, campo):
 88 |     for link in cols[1].find_all('a'):
 89 |         if 'onclick' in link.attrs:
 90 |             conteudo_escondido = link.attrs['onclick']
 91 |             inteiro_teor = PADRAO_MOV.findall(
 92 |                 conteudo_escondido)
 93 |             if inteiro_teor:
 94 |                 item['inteiro-teor'] = inteiro_teor
 95 | 
 96 |     atribui(campo, item, next(cols[1].descendants))
 97 | 
 98 | 
# Dispatch table used by parse_itens: maps a slugified row label to the
# function that parses that row. Each function is called as
# f(cols, item, campo); labels not listed here are handled with atribui.
METODOS_PARSING = {
    'processo-s-apensado-s': parse_processo_apensado,
    'processo-s-no-tribunal-de-justica': parse_processo_apensado,
    'descricao': parse_descricao,
}
104 | 
105 | 
def parse_itens(soup, numero_processo, inicio_itens):
    """Extract the movement items of a process page.

    Rows are taken from the first element named ``formResultado``, starting
    at *inicio_itens*. A row whose attributes are exactly
    ``{'class': ['tipoMovimento']}`` opens a new item; the rows that follow
    it, up to the next such row, contribute fields to that item. Rows seen
    before the first header are ignored.

    Args:
        soup: parsed page; must provide ``find_all`` and ``find``.
        numero_processo: stored unchanged under ``'numero-processo'``.
        inicio_itens: index of the first row of the items region.

    Returns:
        ``{'numero-processo': ..., 'itens': [item, ...]}``. Each item is a
        ``defaultdict(list)`` with a ``'hash'`` entry computed by
        ``cria_hash_do_movimento``.

    Raises:
        IndexError: if no ``formResultado`` element exists, or a header row
            has no ``td`` cell.
        AttributeError: if the hidden ``descMov<N>`` input referenced by an
            item is missing from the page.
    """
    atributos_cabecalho = {'class': ['tipoMovimento']}
    linhas_de_dados = soup.find_all(attrs={'name': 'formResultado'})[0]\
        .find_all('tr')

    lista_de_itens = []
    item = None
    # Single pass: every non-header row belongs to the most recent header,
    # so there is no need to re-slice the remaining rows per header.
    for linha in linhas_de_dados[inicio_itens:]:
        colunas = linha.find_all('td')

        if linha.attrs == atributos_cabecalho:
            item = collections.defaultdict(list)
            # Some item headers have no text (e.g. "Mandado de Pagamento");
            # in that case the block title is recorded as the movement type
            if len(colunas) == 1:
                texto = colunas[0].get_text().strip()
                chave = 'tipo-do-movimento'
            else:
                texto = limpa_conteudo(colunas[1].get_text())
                chave = slugify(colunas[0].get_text())

            item[chave] = texto
            lista_de_itens.append(item)
            continue

        # Skip rows before the first header and rows without label + value
        if item is None or len(colunas) <= 1:
            continue

        campo = slugify(colunas[0].get_text())
        if campo == 'tipo-do-movimento':
            # Keep the header's own 'tipo-do-movimento' from being clobbered
            campo = 'sub-tipo-do-movimento'

        if campo in METODOS_PARSING:
            METODOS_PARSING[campo](colunas, item, campo)
        else:
            atribui(campo, item, colunas[1].get_text())

    for item in lista_de_itens:
        if 'inteiro-teor' in item:
            # Swap the captured movement number for the full text stored in
            # the matching hidden input
            campo_oculto = soup.find(
                'input', {
                    'type': 'HIDDEN',
                    'name': 'descMov{0}'.format(item['inteiro-teor'][0])
                })
            item['inteiro-teor'] = campo_oculto.attrs['value']

        item['hash'] = cria_hash_do_movimento(item)

    return {
        'numero-processo': numero_processo,
        'itens': lista_de_itens,
    }
169 | 
170 | 
def area_dos_metadados(linhas_de_dados):
    """Locate the metadata region inside the rows of a process page.

    Args:
        linhas_de_dados: iterable of row elements providing ``find('td')``
            and ``get_text()``.

    Returns:
        ``(inicio, fim)``. ``inicio`` is the index of the row whose first
        ``td`` has exactly the attributes ``align="center"``,
        ``class="negrito"``, ``colspan="2"`` (0 when none is found).
        ``fim`` is the index of the first row containing
        ``'Tipo do Movimento:'`` minus one. When no such row exists,
        ``fim`` is the number of rows, so ``rows[inicio:fim]`` runs to the
        end instead of the function failing with ``UnboundLocalError``.
    """
    # Apparently this value is fixed
    inicio = 0
    atributos_inicio_metadados = {'align': 'center',
                                  'class': ['negrito'],
                                  'colspan': '2'}
    indice = -1
    for indice, linha in enumerate(linhas_de_dados):
        coluna = linha.find('td')
        # Rows without any td (find returns None) cannot be the start marker
        if not inicio and coluna is not None \
                and coluna.attrs == atributos_inicio_metadados:
            inicio = indice

        if 'Tipo do Movimento:' in linha.get_text():
            fim = indice - 1
            break
    else:
        # No movement header found: the region extends to the last row
        fim = indice + 1

    return inicio, fim
187 | 
188 | 
def extrai_dados_colunas(colunas):
    """Return the non-empty cleaned texts of the given cells, in order.

    Falsy cells and cells whose cleaned text is empty are left out.
    """
    textos = (
        limpa_conteudo(td.get_text()) if td else ''
        for td in colunas
    )
    return [texto for texto in textos if texto]
197 | 


--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     // Use IntelliSense to learn about possible attributes.
  3 |     // Hover to view descriptions of existing attributes.
  4 |     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
  5 |     "version": "0.2.0",
  6 |     "configurations": [
  7 | 
  8 |         {
  9 |             "name": "Python: Current File",
 10 |             "type": "python",
 11 |             "request": "launch",
 12 |             "stopOnEntry": true,
 13 |             "pythonPath": "${config:python.pythonPath}",
 14 |             "program": "${file}",
 15 |             "cwd": "${workspaceFolder}",
 16 |             "env": {},
 17 |             "envFile": "${workspaceFolder}/.env",
 18 |             "debugOptions": [
 19 |                 "RedirectOutput"
 20 |             ]
 21 |         },
 22 |         {
 23 |             "name": "Python: Attach",
 24 |             "type": "python",
 25 |             "request": "attach",
 26 |             "localRoot": "${workspaceFolder}",
 27 |             "remoteRoot": "${workspaceFolder}",
 28 |             "port": 3000,
 29 |             "secret": "my_secret",
 30 |             "host": "localhost"
 31 |         },
 32 |         {
 33 |             "name": "Python: Terminal (integrated)",
 34 |             "type": "python",
 35 |             "request": "launch",
 36 |             "stopOnEntry": true,
 37 |             "pythonPath": "${config:python.pythonPath}",
 38 |             "program": "${file}",
 39 |             "cwd": "nosetests",
 40 |             "console": "integratedTerminal",
 41 |             "env": {},
 42 |             "envFile": "${workspaceFolder}/.env",
 43 |             "debugOptions": [],
 44 |             "internalConsoleOptions": "neverOpen"
 45 |         },
 46 |         {
 47 |             "name": "Python: Terminal (external)",
 48 |             "type": "python",
 49 |             "request": "launch",
 50 |             "stopOnEntry": true,
 51 |             "pythonPath": "${config:python.pythonPath}",
 52 |             "program": "${file}",
 53 |             "cwd": "",
 54 |             "console": "externalTerminal",
 55 |             "env": {},
 56 |             "envFile": "${workspaceFolder}/.env",
 57 |             "debugOptions": [],
 58 |             "internalConsoleOptions": "neverOpen"
 59 |         },
 60 |         {
 61 |             "name": "Python: Nosetests",
 62 |             "type": "python",
 63 |             "request": "launch",
 64 |             "stopOnEntry": true,
 65 |             "pythonPath": "${config:python.pythonPath}",
 66 |             "program": "${workspaceFolder}/nosetests.py",
 67 |             "cwd": "${workspaceFolder}",
 68 |             "console": "integratedTerminal",
 69 |             "env": {},
 70 |             "envFile": "${workspaceFolder}/.env",
 71 |             "debugOptions": [],
 72 |             "internalConsoleOptions": "neverOpen"
 73 |         },
 74 |         {
 75 |             "name": "Python: Django",
 76 |             "type": "python",
 77 |             "request": "launch",
 78 |             "stopOnEntry": true,
 79 |             "pythonPath": "${config:python.pythonPath}",
 80 |             "program": "${workspaceFolder}/manage.py",
 81 |             "cwd": "${workspaceFolder}",
 82 |             "args": [
 83 |                 "runserver",
 84 |                 "--noreload",
 85 |                 "--nothreading"
 86 |             ],
 87 |             "env": {},
 88 |             "envFile": "${workspaceFolder}/.env",
 89 |             "debugOptions": [
 90 |                 "RedirectOutput",
 91 |                 "DjangoDebugging"
 92 |             ]
 93 |         },
 94 |         {
 95 |             "name": "Python: Flask (0.11.x or later)",
 96 |             "type": "python",
 97 |             "request": "launch",
 98 |             "stopOnEntry": false,
 99 |             "pythonPath": "${config:python.pythonPath}",
100 |             "module": "flask",
101 |             "cwd": "${workspaceFolder}",
102 |             "env": {
103 |                 "FLASK_APP": "${workspaceFolder}/app.py"
104 |             },
105 |             "args": [
106 |                 "run",
107 |                 "--no-debugger",
108 |                 "--no-reload"
109 |             ],
110 |             "envFile": "${workspaceFolder}/.env",
111 |             "debugOptions": [
112 |                 "RedirectOutput"
113 |             ]
114 |         },
115 |         {
116 |             "name": "Python: Flask (0.10.x or earlier)",
117 |             "type": "python",
118 |             "request": "launch",
119 |             "stopOnEntry": false,
120 |             "pythonPath": "${config:python.pythonPath}",
121 |             "program": "${workspaceFolder}/run.py",
122 |             "cwd": "${workspaceFolder}",
123 |             "args": [],
124 |             "env": {},
125 |             "envFile": "${workspaceFolder}/.env",
126 |             "debugOptions": [
127 |                 "RedirectOutput"
128 |             ]
129 |         },
130 |         {
131 |             "name": "Python: PySpark",
132 |             "type": "python",
133 |             "request": "launch",
134 |             "stopOnEntry": true,
135 |             "osx": {
136 |                 "pythonPath": "${env:SPARK_HOME}/bin/spark-submit"
137 |             },
138 |             "windows": {
139 |                 "pythonPath": "${env:SPARK_HOME}/bin/spark-submit.cmd"
140 |             },
141 |             "linux": {
142 |                 "pythonPath": "${env:SPARK_HOME}/bin/spark-submit"
143 |             },
144 |             "program": "${file}",
145 |             "cwd": "${workspaceFolder}",
146 |             "env": {},
147 |             "envFile": "${workspaceFolder}/.env",
148 |             "debugOptions": [
149 |                 "RedirectOutput"
150 |             ]
151 |         },
152 |         {
153 |             "name": "Python: Module",
154 |             "type": "python",
155 |             "request": "launch",
156 |             "stopOnEntry": true,
157 |             "pythonPath": "${config:python.pythonPath}",
158 |             "module": "module.name",
159 |             "cwd": "${workspaceFolder}",
160 |             "env": {},
161 |             "envFile": "${workspaceFolder}/.env",
162 |             "debugOptions": [
163 |                 "RedirectOutput"
164 |             ]
165 |         },
166 |         {
167 |             "name": "Python: Pyramid",
168 |             "type": "python",
169 |             "request": "launch",
170 |             "stopOnEntry": true,
171 |             "pythonPath": "${config:python.pythonPath}",
172 |             "cwd": "${workspaceFolder}",
173 |             "env": {},
174 |             "envFile": "${workspaceFolder}/.env",
175 |             "args": [
176 |                 "${workspaceFolder}/development.ini"
177 |             ],
178 |             "debugOptions": [
179 |                 "RedirectOutput",
180 |                 "Pyramid"
181 |             ]
182 |         },
183 |         {
184 |             "name": "Python: Watson",
185 |             "type": "python",
186 |             "request": "launch",
187 |             "stopOnEntry": true,
188 |             "pythonPath": "${config:python.pythonPath}",
189 |             "program": "${workspaceFolder}/console.py",
190 |             "cwd": "${workspaceFolder}",
191 |             "args": [
192 |                 "dev",
193 |                 "runserver",
194 |                 "--noreload=True"
195 |             ],
196 |             "env": {},
197 |             "envFile": "${workspaceFolder}/.env",
198 |             "debugOptions": [
199 |                 "RedirectOutput"
200 |             ]
201 |         }
202 |     ]
203 | }


--------------------------------------------------------------------------------
/extrator/test/test_models.py:
--------------------------------------------------------------------------------
  1 | from unittest import TestCase
  2 | from unittest.mock import patch
  3 | from ..datasources.models import (
  4 |     _itens_não_presentes,
  5 |     insere_movimento,
  6 |     atualizar_documento
  7 | )
  8 | 
  9 | 
 10 | class ItensMovimento(TestCase):
 11 |     def setUp(self):
 12 |         self.itens = [
 13 |             {'tipo-do-movimento': 'Declínio de Competência',
 14 |              'data': ['11/01/2016'],
 15 |              'descricao':
 16 |              ['VIJI DA COMARCA DE SÃO MATHEUS - ESPIRITO SANTOS'],
 17 |              'hash': '123'},
 18 |             {'tipo-do-movimento': 'Recebimento',
 19 |              'data-de-recebimento': ['19/11/2015'],
 20 |              'hash': '456'},
 21 |             {'tipo-do-movimento': 'Decisão - Declínio de Competência',
 22 |              'data-decisao': ['21/10/2015'],
 23 |              'descricao': ['Ante o teor de fls. 104, DECLINO DE MINHA'
 24 |                            ' COMPETÊNCIA para o Juízo da Infância e'
 25 |                            ' Juventude da Comarca de São Mateus, no'
 26 |                            ' Espírito Santo. Dê-se baixa e encaminhem-se'
 27 |                            ' imediatamente, com as nossas homenagens.'],
 28 |              'hash': '789'},
 29 |             {'tipo-do-movimento': 'Conclusão ao Juiz',
 30 |              'data-da-conclusao': ['21/10/2015'],
 31 |              'juiz': ['VIVIANE TOVAR DE MATTOS ABRAHAO'],
 32 |              'hash': '012'}
 33 |         ]
 34 | 
 35 |         self.documento = {
 36 |             'numero-processo':
 37 |             '00049995820158190036',
 38 |             'hash': '1234',
 39 |             'itens': [{
 40 |                 'tipo-do-movimento': 'Distribuição Dirigida',
 41 |                 'hash': '1234',
 42 |                 'data-da-distribuicao': ['14/03/2011'],
 43 |                 'serventia':
 44 |                 ['Cartório da 2ª Vara de Família, da Inf., da Juv. '
 45 |                  'e do Idoso -'
 46 |                  ' 2ª Vara de Família Infância e Juventude e do Idoso'],
 47 |                 'processo-s-apensado-s': ['0000159-51.2010.8.19.0045'],
 48 |                 'processo-s-no-tribunal-de-justica':
 49 |                 ['0002346-95.2011.8.19.0045'],
 50 |                 'protocolo-s-no-tribunal-de-justica':
 51 |                 ['201500617620 - Data: 26/10/2015'],
 52 |                 'localizacao-na-serventia':
 53 |                 ['Aguardando Arquivamento']
 54 |             }]
 55 |         }
 56 | 
 57 |     def test_itens_nao_presentes(self):
 58 |         itens_no_banco = ['123', '456']
 59 | 
 60 |         assert len(_itens_não_presentes(self.itens, itens_no_banco)) == 2
 61 | 
 62 |     def test_todos_itens_presentes(self):
 63 |         itens_no_banco = ['123', '456', '789', '012']
 64 | 
 65 |         assert not _itens_não_presentes(self.itens, itens_no_banco)
 66 | 
 67 |     def test_item_extra_presente(self):
 68 |         itens_no_banco = ['123', '456', '789', '012', '444']
 69 | 
 70 |         assert not _itens_não_presentes(self.itens, itens_no_banco)
 71 | 
 72 |     @patch('robotj.extrator.datasources.models.conn')
 73 |     @patch('robotj.extrator.datasources.models._insere_movimento_db')
 74 |     @patch('robotj.extrator.datasources.models._insere_item_movimento_db')
 75 |     def test_inserir_movimento(
 76 |             self,
 77 |             _insere_item_movimento,
 78 |             _insere_movimento,
 79 |             conn):
 80 |         _insere_movimento.return_value = 1
 81 |         _insere_item_movimento.return_value = 1
 82 | 
 83 |         movimento = {
 84 |             'tipo-do-movimento': 'Movimento de Teste',
 85 |             'hash': '1234567890',
 86 |             'chave': 'valor'
 87 |         }
 88 | 
 89 |         resultado = insere_movimento(1, movimento)
 90 | 
 91 |         assert resultado == 1
 92 | 
 93 |         _insere_movimento.assert_called_once_with(1, movimento)
 94 |         _insere_item_movimento.assert_called_once_with(
 95 |             1,
 96 |             'chave',
 97 |             'valor')
 98 | 
 99 |     @patch('robotj.extrator.datasources.models.conn')
100 |     @patch('robotj.extrator.datasources.models._itens_não_presentes')
101 |     @patch('robotj.extrator.datasources.models.atualizar_vista')
102 |     @patch('robotj.extrator.datasources.models._obter_por_numero_processo')
103 |     @patch('robotj.extrator.datasources.models._atualizar_documento_db')
104 |     @patch('robotj.extrator.datasources.models._insere_documento_db')
105 |     @patch('robotj.extrator.datasources.models._obtem_hashs_movimentos')
106 |     @patch('robotj.extrator.datasources.models.insere_movimento')
107 |     def test_atualizar_documento_novo(
108 |             self,
109 |             insere_movimento,
110 |             _obtem_hashs_movimentos,
111 |             _insere_documento_db,
112 |             _atualizar_documento_db,
113 |             _obter_por_numero_processo,
114 |             atualizar_vista,
115 |             _itens_não_presentes,
116 |             conn):
117 | 
118 |         docu_dk = 3
119 | 
120 |         _obter_por_numero_processo.return_value = None
121 |         _insere_documento_db.return_value = 1
122 |         _obtem_hashs_movimentos.return_value = []
123 |         _itens_não_presentes.return_value = self.documento['itens']
124 |         insere_movimento.return_value = None
125 | 
126 |         atualizar_documento(self.documento, docu_dk)
127 | 
128 |         assert not atualizar_vista.called
129 |         assert not _atualizar_documento_db.called
130 | 
131 |         assert _insere_documento_db.called
132 |         assert _obtem_hashs_movimentos.called
133 |         assert _itens_não_presentes.called
134 |         assert insere_movimento.called
135 | 
136 |     @patch('robotj.extrator.datasources.models.conn')
137 |     @patch('robotj.extrator.datasources.models._itens_não_presentes')
138 |     @patch('robotj.extrator.datasources.models.atualizar_vista')
139 |     @patch('robotj.extrator.datasources.models._obter_por_numero_processo')
140 |     @patch('robotj.extrator.datasources.models._atualizar_documento_db')
141 |     @patch('robotj.extrator.datasources.models._insere_documento_db')
142 |     @patch('robotj.extrator.datasources.models._obtem_hashs_movimentos')
143 |     @patch('robotj.extrator.datasources.models.insere_movimento')
144 |     def test_atualizar_documento_existente_igual(
145 |             self,
146 |             insere_movimento,
147 |             _obtem_hashs_movimentos,
148 |             _insere_documento_db,
149 |             _atualizar_documento_db,
150 |             _obter_por_numero_processo,
151 |             atualizar_vista,
152 |             _itens_não_presentes,
153 |             conn):
154 | 
155 |         docu_dk = 3
156 | 
157 |         _obter_por_numero_processo.return_value = (1, '1234')
158 |         atualizar_documento(self.documento, docu_dk)
159 | 
160 |         assert atualizar_vista.called
161 |         assert not _atualizar_documento_db.called
162 |         assert not _insere_documento_db.called
163 |         assert not _obtem_hashs_movimentos.called
164 |         assert not _itens_não_presentes.called
165 |         assert not insere_movimento.called
166 | 
167 |     @patch('robotj.extrator.datasources.models.conn')
168 |     @patch('robotj.extrator.datasources.models._itens_não_presentes')
169 |     @patch('robotj.extrator.datasources.models.atualizar_vista')
170 |     @patch('robotj.extrator.datasources.models._obter_por_numero_processo')
171 |     @patch('robotj.extrator.datasources.models._atualizar_documento_db')
172 |     @patch('robotj.extrator.datasources.models._insere_documento_db')
173 |     @patch('robotj.extrator.datasources.models._obtem_hashs_movimentos')
174 |     @patch('robotj.extrator.datasources.models.insere_movimento')
175 |     def test_atualizar_documento_existente_diferente(
176 |             self,
177 |             insere_movimento,
178 |             _obtem_hashs_movimentos,
179 |             _insere_documento_db,
180 |             _atualizar_documento_db,
181 |             _obter_por_numero_processo,
182 |             atualizar_vista,
183 |             _itens_não_presentes,
184 |             conn):
185 | 
186 |         docu_dk = 3
187 | 
188 |         _obter_por_numero_processo.return_value = ('1134', 1)
189 |         _obtem_hashs_movimentos.return_value = []
190 |         _itens_não_presentes.return_value = self.documento['itens']
191 |         insere_movimento.return_value = None
192 | 
193 |         atualizar_documento(self.documento, docu_dk)
194 | 
195 |         self.assertFalse(atualizar_vista.called)
196 |         self.assertTrue(_atualizar_documento_db.called)
197 | 
198 |         self.assertFalse(_insere_documento_db.called)
199 |         self.assertTrue(_obtem_hashs_movimentos.called)
200 |         self.assertTrue(_itens_não_presentes.called)
201 |         self.assertTrue(insere_movimento.called)
202 | 


--------------------------------------------------------------------------------
/newrelic.ini:
--------------------------------------------------------------------------------
  1 | # ---------------------------------------------------------------------------
  2 | 
  3 | #
  4 | # This file configures the New Relic Python Agent.
  5 | #
  6 | # The path to the configuration file should be supplied to the function
  7 | # newrelic.agent.initialize() when the agent is being initialized.
  8 | #
  9 | # The configuration file follows a structure similar to what you would
 10 | # find for Microsoft Windows INI files. For further information on the
 11 | # configuration file format see the Python ConfigParser documentation at:
 12 | #
 13 | #    http://docs.python.org/library/configparser.html
 14 | #
 15 | # For further discussion on the behaviour of the Python agent that can
 16 | # be configured via this configuration file see:
 17 | #
 18 | #    http://newrelic.com/docs/python/python-agent-configuration
 19 | #
 20 | 
 21 | # ---------------------------------------------------------------------------
 22 | 
 23 | # Here are the settings that are common to all environments.
 24 | 
 25 | [newrelic]
 26 | 
 27 | # You must specify the license key associated with your New
 28 | # Relic account. This key binds the Python Agent's data to your
 29 | # account in the New Relic service.
 30 | 
 31 | # The application name. Set this to be the name of your
 32 | # application as you would like it to show up in New Relic UI.
 33 | # The UI will then auto-map instances of your application into an
 34 | # entry on your home dashboard page.
 35 | app_name = Zuleika
 36 | 
 37 | # When "true", the agent collects performance data about your
 38 | # application and reports this data to the New Relic UI at
 39 | # newrelic.com. This global switch is normally overridden for
 40 | # each environment below.
 41 | monitor_mode = true
 42 | 
 43 | # Sets the name of a file to log agent messages to. Useful for
 44 | # debugging any issues with the agent. This is not set by
 45 | # default as it is not known in advance what user your web
 46 | # application processes will run as and where they have
 47 | # permission to write to. Whatever you set this to you must
 48 | # ensure that the permissions for the containing directory and
 49 | # the file itself are correct, and that the user that your web
 50 | # application runs as can write to the file. If not able to
 51 | # write out a log file, it is also possible to say "stderr" and
 52 | # output to standard error output. This would normally result in
 53 | # output appearing in your web server log.
 54 | #log_file = /tmp/newrelic-python-agent.log
 55 | 
 56 | # Sets the level of detail of messages sent to the log file, if
 57 | # a log file location has been provided. Possible values, in
 58 | # increasing order of detail, are: "critical", "error", "warning",
 59 | # "info" and "debug". When reporting any agent issues to New
 60 | # Relic technical support, the most useful setting for the
 61 | # support engineers is "debug". However, this can generate a lot
 62 | # of information very quickly, so it is best not to keep the
 63 | # agent at this level for longer than it takes to reproduce the
 64 | # problem you are experiencing.
 65 | log_level = error
 66 | 
 67 | # High Security Mode enforces certain security settings, and prevents
 68 | # them from being overridden, so that no sensitive data is sent to New
 69 | # Relic. Enabling High Security Mode means that request parameters are
 70 | # not collected and SQL can not be sent to New Relic in its raw form.
 71 | # To activate High Security Mode, it must be set to 'true' in this
 72 | # local .ini configuration file AND be set to 'true' in the
 73 | # server-side configuration in the New Relic user interface. For
 74 | # details, see
 75 | # https://docs.newrelic.com/docs/subscriptions/high-security
 76 | high_security = false
 77 | 
 78 | # The Python Agent will attempt to connect directly to the New
 79 | # Relic service. If there is an intermediate firewall between
 80 | # your host and the New Relic service that requires you to use a
 81 | # HTTP proxy, then you should set both the "proxy_host" and
 82 | # "proxy_port" settings to the required values for the HTTP
 83 | # proxy. The "proxy_user" and "proxy_pass" settings should
 84 | # additionally be set if proxy authentication is implemented by
 85 | # the HTTP proxy. The "proxy_scheme" setting dictates what
 86 | # protocol scheme is used in talking to the HTTP proxy. This
 87 | # would normally always be set as "http" which will result in the
 88 | # agent then using a SSL tunnel through the HTTP proxy for end to
 89 | # end encryption.
 90 | # proxy_scheme = http
 91 | # proxy_host = hostname
 92 | # proxy_port = 8080
 93 | # proxy_user =
 94 | # proxy_pass =
 95 | 
 96 | # Capturing request parameters is off by default. To enable the
 97 | # capturing of request parameters, first ensure that the setting
 98 | # "attributes.enabled" is set to "true" (the default value), and
 99 | # then add "request.parameters.*" to the "attributes.include"
100 | # setting. For details about attributes configuration, please
101 | # consult the documentation.
102 | attributes.include = *
103 | 
104 | # The transaction tracer captures deep information about slow
105 | # transactions and sends this to the UI on a periodic basis. The
106 | # transaction tracer is enabled by default. Set this to "false"
107 | # to turn it off.
108 | transaction_tracer.enabled = true
109 | 
110 | # Threshold in seconds for when to collect a transaction trace.
111 | # When the response time of a controller action exceeds this
112 | # threshold, a transaction trace will be recorded and sent to
113 | # the UI. Valid values are any positive float value, or (default)
114 | # "apdex_f", which will use the threshold for a dissatisfying
115 | # Apdex controller action - four times the Apdex T value.
116 | transaction_tracer.transaction_threshold = apdex_f
117 | 
118 | # When the transaction tracer is on, SQL statements can
119 | # optionally be recorded. The recorder has three modes, "off"
120 | # which sends no SQL, "raw" which sends the SQL statement in its
121 | # original form, and "obfuscated", which strips out numeric and
122 | # string literals.
123 | transaction_tracer.record_sql = raw
124 | 
125 | # Threshold in seconds for when to collect stack trace for a SQL
126 | # call. In other words, when SQL statements exceed this
127 | # threshold, then capture and send to the UI the current stack
128 | # trace. This is helpful for pinpointing where long SQL calls
129 | # originate from in an application.
130 | transaction_tracer.stack_trace_threshold = 0.5
131 | 
132 | # Determines whether the agent will capture query plans for slow
133 | # SQL queries. Only supported in MySQL and PostgreSQL. Set this
134 | # to "false" to turn it off.
135 | transaction_tracer.explain_enabled = true
136 | 
137 | # Threshold for query execution time below which query plans
138 | # will not be captured. Relevant only when "explain_enabled"
139 | # is true.
140 | transaction_tracer.explain_threshold = 0.5
141 | 
142 | # Space separated list of function or method names in form
143 | # 'module:function' or 'module:class.function' for which
144 | # additional function timing instrumentation will be added.
145 | transaction_tracer.function_trace =
146 | 
147 | # The error collector captures information about uncaught
148 | # exceptions or logged exceptions and sends them to UI for
149 | # viewing. The error collector is enabled by default. Set this
150 | # to "false" to turn it off.
151 | error_collector.enabled = true
152 | 
153 | # To stop specific errors from reporting to the UI, set this to
154 | # a space separated list of the Python exception type names to
155 | # ignore. The exception name should be of the form 'module:class'.
156 | error_collector.ignore_errors =
157 | 
158 | # Browser monitoring is the Real User Monitoring feature of the UI.
159 | # For those Python web frameworks that are supported, this
160 | # setting enables the auto-insertion of the browser monitoring
161 | # JavaScript fragments.
162 | browser_monitoring.auto_instrument = true
163 | 
164 | # A thread profiling session can be scheduled via the UI when
165 | # this option is enabled. The thread profiler will periodically
166 | # capture a snapshot of the call stack for each active thread in
167 | # the application to construct a statistically representative
168 | # call tree.
169 | thread_profiler.enabled = true
170 | 
171 | # Your application deployments can be recorded through the
172 | # New Relic REST API. To use this feature provide your API key
173 | # below then use the `newrelic-admin record-deploy` command.
174 | # api_key =
175 | 
176 | # ---------------------------------------------------------------------------
177 | 
178 | #
179 | # The application environments. These are specific settings which
180 | # override the common environment settings. The settings related to a
181 | # specific environment will be used when the environment argument to the
182 | # newrelic.agent.initialize() function has been defined to be either
183 | # "development", "test", "staging" or "production".
184 | #
185 | 
186 | 
187 | [newrelic:development]
188 | app_name = Zuleika (dev)
189 | monitor_mode = true
190 | 
191 | [newrelic:production]
192 | monitor_mode = true
193 | 
194 | # ---------------------------------------------------------------------------
195 | 


--------------------------------------------------------------------------------
/extrator/datasources/models.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import cx_Oracle
  3 | from ..base.utils import conn, session, cxoracle
  4 | from .tjrj_models import (
  5 |     TB_PROCESSO,
  6 |     TB_MOVIMENTO_PROCESSO,
  7 |     TB_ITEM_MOVIMENTO,
  8 |     SQ_ITEM_MOVIMENTO,
  9 |     SQ_MOVIMENTO,
 10 |     SQ_PROCESSO
 11 | )
 12 | # from .broker import classificar
 13 | from .mcpr_models import TB_DOCUMENTO
 14 | from sqlalchemy.sql.expression import func
 15 | from sqlalchemy.sql.functions import sysdate
 16 | from sqlalchemy.sql.expression import nullsfirst
 17 | 
 18 | 
 19 | def obtem(documento, chave):
 20 |     return json.dumps(documento.get(chave))
 21 | 
 22 | 
 23 | def _preenche_valores(documento, tabela):
 24 |     tabela = tabela.values(
 25 |         prtj_cd_numero_processo=documento.get('numero-processo'),
 26 |         prtj_tx_executado=obtem(documento, 'executado'),
 27 |         prtj_tx_advogado_s=obtem(documento, 'advovado-s'),
 28 |         prtj_tx_numero_do_tombo=obtem(documento, 'numero-do-tombo'),
 29 |         prtj_tx_oficio_de_registro=obtem(documento, 'oficio-de-registro'),
 30 |         prtj_tx_folha=obtem(documento, 'folha'),
 31 |         prtj_tx_requerido=obtem(documento, 'requerido'),
 32 |         prtj_tx_exequente=obtem(documento, 'exequente'),
 33 |         prtj_tx_representante_legal=obtem(documento, 'representante-legal'),
 34 |         prtj_tx_acao=obtem(documento, 'acao'),
 35 |         prtj_tx_comunicante=obtem(documento, 'comunicante'),
 36 |         prtj_tx_requerente=obtem(documento, 'requerente'),
 37 |         prtj_tx_bairro=obtem(documento, 'bairro'),
 38 |         prtj_tx_livro=obtem(documento, 'livro'),
 39 |         prtj_tx_pai=obtem(documento, 'pai'),
 40 |         prtj_tx_mae=obtem(documento, 'mae'),
 41 |         prtj_tx_aviso_ao_advogado=obtem(documento, 'aviso-ao-advogado'),
 42 |         prtj_tx_status=obtem(documento, 'status'),
 43 |         prtj_tx_comarca=obtem(documento, 'comarca'),
 44 |         prtj_tx_assistente=obtem(documento, 'assistente'),
 45 |         prtj_tx_cidade=obtem(documento, 'cidade'),
 46 |         prtj_tx_autor_do_fato=obtem(documento, 'autor-do-fato'),
 47 |         prtj_tx_acusado=obtem(documento, 'acusado'),
 48 |         prtj_tx_impetrado=obtem(documento, 'impetrado'),
 49 |         prtj_tx_impetrante=obtem(documento, 'impetrante'),
 50 |         prtj_tx_notificado=obtem(documento, 'notificado'),
 51 |         prtj_tx_autor=obtem(documento, 'autor'),
 52 |         prtj_tx_intimado=obtem(documento, 'intimado'),
 53 |         prtj_tx_idoso=obtem(documento, 'idoso'),
 54 |         prtj_tx_avo_avo=obtem(documento, 'avo-avo'),
 55 |         prtj_tx_reu=obtem(documento, 'reu'),
 56 |         prtj_tx_reclamado=obtem(documento, 'reclamado'),
 57 |         prtj_tx_endereco=obtem(documento, 'endereco'),
 58 |         prtj_tx_prazo=obtem(documento, 'prazo'),
 59 |         prtj_tx_classe=obtem(documento, 'classe'),
 60 |         prtj_tx_assunto=obtem(documento, 'assunto'),
 61 |         prtj_dt_ultima_atualizacao=sysdate(),
 62 |         prtj_dt_ultima_vista=sysdate(),
 63 |         prtj_hash=documento.get('hash'),
 64 |     )
 65 |     return tabela
 66 | 
 67 | 
 68 | def obter_documentos_externos():
 69 |     query = session().query(
 70 |         TB_DOCUMENTO.c.docu_nr_externo,
 71 |         TB_DOCUMENTO.c.docu_dk,
 72 |         TB_PROCESSO.c.prtj_dt_ultima_vista).outerjoin(
 73 |         TB_PROCESSO,
 74 |         TB_DOCUMENTO.c.docu_nr_externo == TB_PROCESSO.c.prtj_cd_numero_processo
 75 |     ).filter(
 76 |         TB_DOCUMENTO.c.docu_mate_dk == 4
 77 |     ).filter(
 78 |             func.length(TB_DOCUMENTO.c.docu_nr_externo) == 20).order_by(
 79 |         nullsfirst(TB_PROCESSO.c.prtj_dt_ultima_vista),
 80 |     )
 81 |     return [(doc[0], doc[1]) for doc in query]
 82 | 
 83 | 
 84 | def _obter_por_numero_processo(numero_documento):
 85 |     retorno = session().query(
 86 |         TB_PROCESSO.c.prtj_dk,
 87 |         TB_PROCESSO.c.prtj_hash
 88 |         ).filter(
 89 |             TB_PROCESSO.c.prtj_cd_numero_processo == numero_documento
 90 |         ).first()
 91 | 
 92 |     return retorno
 93 | 
 94 | 
 95 | # ------------------------------------------------------------------------
 96 | # Atualizacao de Movimento
 97 | # ------------------------------------------------------------------------
 98 | 
 99 | 
def _insere_item_movimento_db(dk_movimento, chave, valor):
    """Insert one key/value item belonging to movement ``dk_movimento``.

    The primary key comes from the item sequence and ``valor`` is stored as
    JSON text.
    """
    valores = {
        'mvit_dk': SQ_ITEM_MOVIMENTO.next_value(),
        'mvit_prmv_dk': dk_movimento,
        'mvit_tp_chave': chave,
        'mvit_tp_valor': json.dumps(valor),
    }
    comando = TB_ITEM_MOVIMENTO.insert().values(**valores)
    conn().execute(comando)
108 | 
109 | 
def _insere_movimento_blob_db(dk_processo, movimento):
    """Insert a movement that carries a full text body ('inteiro-teor').

    The row is written through a raw cx_Oracle cursor so the text can be
    bound as an NCLOB. The primary key comes from ``tjrj_sq_prmv_dk`` and is
    read back through the ``RETURNING ... INTO`` bind variable.

    ``movimento`` must contain the keys ``'tipo-do-movimento'``,
    ``'inteiro-teor'`` and ``'hash'``.

    Returns the value of the bind variable holding the new ``PRMV_DK``.
    """
    sql = ("INSERT INTO TJRJ.TJRJ_PROCESSO_MOVIMENTO_TJ "
           "(PRMV_DK, PRMV_PRTJ_DK, PRMV_TP_MOVIMENTO, "
           "PRMV_DT_ULTIMA_ATUALIZACAO, PRMV_TX_INTEIRO_TEOR, PRMV_HASH) "
           "VALUES(tjrj_sq_prmv_dk.NEXTVAL, :DK_PROCESSO, "
           ":TP_MOVIMENTO, SYSDATE, :PRMV_TX_INTEIRO_TEOR,:HASH) "
           "returning PRMV_DK into :x")

    cursor = cxoracle().cursor()
    try:
        seq = cursor.var(cx_Oracle.NUMBER)
        cursor.setinputsizes(PRMV_TX_INTEIRO_TEOR=cx_Oracle.NCLOB)
        cursor.prepare(sql)

        # Passing None as the statement re-uses the one prepared above.
        cursor.execute(
            None,
            DK_PROCESSO=dk_processo,
            TP_MOVIMENTO=movimento['tipo-do-movimento'],
            PRMV_TX_INTEIRO_TEOR=movimento['inteiro-teor'].encode('utf-8'),
            HASH=movimento['hash'],
            x=seq)
    finally:
        # Always release the cursor, even when the statement or one of the
        # movement lookups above fails.
        cursor.close()

    return seq.getvalue()
136 | 
137 | 
def _insere_movimento_db(dk_processo, movimento):
    """Insert a movement row for process ``dk_processo`` and return its key.

    Movements that have an ``'inteiro-teor'`` entry go through the raw
    cx_Oracle path (needed for the large text column); the others are
    inserted with a regular SQLAlchemy statement.
    """
    if 'inteiro-teor' in movimento:
        # NOTE: asynchronous classification of the full text
        # (classificar.delay) is currently disabled.
        return _insere_movimento_blob_db(dk_processo, movimento)

    comando = TB_MOVIMENTO_PROCESSO.insert().values(
        prmv_dk=SQ_MOVIMENTO.next_value(),
        prmv_prtj_dk=dk_processo,
        prmv_tp_movimento=movimento['tipo-do-movimento'],
        prmv_dt_ultima_atualizacao=sysdate(),
        prmv_hash=movimento['hash']
    )
    chave_primaria = conn().execute(comando).inserted_primary_key
    return chave_primaria[0]
155 | 
156 | 
def insere_movimento(dk_processo, movimento):
    """Persist a movement and all of its extra attributes.

    The movement row itself is inserted first; every other key of
    ``movimento`` except the ones stored on the movement row ('hash',
    'tipo-do-movimento', 'inteiro-teor') becomes an item row linked to it.

    Returns the key of the inserted movement.
    """
    id_inserido = _insere_movimento_db(dk_processo, movimento)

    chaves_do_movimento = ['hash', 'tipo-do-movimento', 'inteiro-teor']
    for chave in movimento:
        if chave not in chaves_do_movimento:
            _insere_item_movimento_db(id_inserido, chave, movimento[chave])

    return id_inserido
166 | 
167 | # ------------------------------------------------------------------------
168 | # Atualizacao de Documento
169 | # ------------------------------------------------------------------------
170 | 
171 | 
def atualizar_documento(documento, docu_dk):
    """Synchronize a crawled process (``documento``) with the database.

    * Unknown process: a new process row is inserted.
    * Known process with an unchanged hash: only the last-seen date is
      refreshed and the function returns early.
    * Known process with a different hash: the process row is updated.

    In the first and last cases, every movement in ``documento['itens']``
    whose hash is not yet stored for the process is inserted.
    """
    numero_processo = documento['numero-processo']
    processo = _obter_por_numero_processo(numero_processo)

    if not processo:
        id_processo = _insere_documento_db(documento, docu_dk)
    elif processo[1] == documento['hash']:
        # Nothing changed since the last crawl: just mark it as seen.
        atualizar_vista(numero_processo, docu_dk, processo)
        return
    else:
        id_processo = processo[0]
        _atualizar_documento_db(documento, id_processo)

    hashs_existentes = _obtem_hashs_movimentos(id_processo)
    novos_movimentos = _itens_não_presentes(
        documento['itens'], hashs_existentes)

    for novo in novos_movimentos:
        insere_movimento(id_processo, novo)
192 | 
193 | 
def atualizar_vista(numero_documento, docu_dk, processo=None):
    """Record that the process ``numero_documento`` has just been seen.

    ``processo`` may be a row already fetched by the caller; when it is
    falsy the process is looked up by number. An existing process gets its
    last-seen date refreshed; otherwise a minimal process row is inserted.
    """
    processo = processo or _obter_por_numero_processo(numero_documento)

    if not processo:
        _insere_vista_db(numero_documento, docu_dk)
        return

    _atualiza_vista_db(processo[0])
202 | 
203 | 
def _insere_vista_db(numero_documento, docu_dk):
    """Insert a minimal process row holding only number, document and dates.

    Both the last-update and last-seen dates are set to the database
    ``SYSDATE``; the primary key comes from the process sequence.
    """
    valores = {
        'prtj_dk': SQ_PROCESSO.next_value(),
        'prtj_docu_dk': docu_dk,
        'prtj_cd_numero_processo': numero_documento,
        'prtj_dt_ultima_atualizacao': sysdate(),
        'prtj_dt_ultima_vista': sysdate(),
    }
    comando = TB_PROCESSO.insert().values(**valores)
    conn().execute(comando)
214 | 
215 | 
def _atualiza_vista_db(id_processo):
    """Set the last-seen date of process ``id_processo`` to ``SYSDATE``."""
    alvo = TB_PROCESSO.c.prtj_dk == id_processo
    comando = TB_PROCESSO.update().where(alvo).values(
        prtj_dt_ultima_vista=sysdate())
    conn().execute(comando)
224 | 
225 | 
def _insere_documento_db(documento, docu_dk):
    """Insert a full process row for ``documento`` and return its key.

    Column values come from ``_preenche_valores``; the document foreign key
    and the sequence-generated primary key are added on top of them.
    """
    comando = _preenche_valores(documento, TB_PROCESSO.insert()).values(
        prtj_docu_dk=docu_dk,
        prtj_dk=SQ_PROCESSO.next_value(),
    )
    chave_primaria = conn().execute(comando).inserted_primary_key
    return chave_primaria[0]
239 | 
240 | 
def _atualizar_documento_db(documento, prtj_dk):
    """Overwrite the columns of process ``prtj_dk`` with ``documento`` data."""
    comando = _preenche_valores(documento, TB_PROCESSO.update())
    comando = comando.where(TB_PROCESSO.c.prtj_dk == prtj_dk)
    conn().execute(comando)
251 | 
252 | 
def _itens_não_presentes(movimentos, lista_hashs):
    """Return the movements whose ``'hash'`` is not in ``lista_hashs``.

    The original order of ``movimentos`` is kept and a new list is returned.
    """
    return [
        movimento
        for movimento in movimentos
        if movimento['hash'] not in lista_hashs
    ]
260 | 
261 | 
def _obtem_hashs_movimentos(prtj_dk):
    """Return the hashes of every movement stored for process ``prtj_dk``."""
    consulta = session().query(TB_MOVIMENTO_PROCESSO.c.prmv_hash)
    consulta = consulta.filter(
        TB_MOVIMENTO_PROCESSO.c.prmv_prtj_dk == prtj_dk)
    return [linha[0] for linha in consulta]
267 | 


--------------------------------------------------------------------------------
/extrator/test/test_parser.py:
--------------------------------------------------------------------------------
  1 | from unittest import TestCase
  2 | from unittest.mock import patch
  3 | 
  4 | from bs4 import BeautifulSoup
  5 | 
  6 | from ..crawler.parser import (parse_metadados,
  7 |                               area_dos_metadados,
  8 |                               extrai_dados_colunas,
  9 |                               parse_itens,
 10 |                               parse_processo_apensado)
 11 | from .fixtures.processos import (processo_judicial_1,
 12 |                                  processo_judicial_2,
 13 |                                  processo_judicial_3,
 14 |                                  processo_judicial_4,
 15 |                                  processo_judicial_5,
 16 |                                  processo_judicial_6,
 17 |                                  processo_judicial_7,
 18 |                                  trecho_processo_judicial_1,
 19 |                                  process_com_mandado_pagamento)
 20 | 
 21 | 
 22 | def _prepara_html(html, tag='tr'):
 23 |     soup_obj = BeautifulSoup(html, 'lxml')
 24 |     return soup_obj.find_all(tag)
 25 | 
 26 | 
 27 | class ParserMetadados(TestCase):
 28 |     def test_parse_processos_no_tribunal(self):
 29 |         esperado = {
 30 |             'processo-s-no-tribunal-de-justica': [
 31 |                 '0021913-53.2011.8.19.0000',
 32 |                 '0000159-51.2010.8.19.0045'
 33 |             ]
 34 |         }
 35 | 
 36 |         item = {}
 37 |         parse_processo_apensado(
 38 |             _prepara_html(trecho_processo_judicial_1, 'td'),
 39 |             item,
 40 |             'processo-s-no-tribunal-de-justica')
 41 |         assert item == esperado
 42 | 
 43 |     def test_parse_metadados_processo_judicial(self):
 44 |         metadados = parse_metadados(
 45 |             _prepara_html(processo_judicial_1),
 46 |             '0004999-58.2015.8.19.0036',
 47 |             inicio_metadados=6,
 48 |             fim_metadados=26
 49 |         )
 50 | 
 51 |         esperado = {
 52 |             'numero-processo': '0004999-58.2015.8.19.0036',
 53 |             'status': 'PROCESSO COM BAIXA',
 54 |             'comarca': [
 55 |                 'Comarca de Nilópolis',
 56 |                 '2ª Vara de Família e da Infância e da Juventude e do Idoso',
 57 |                 'Cartório da 2ª Vara de Família, Inf. e da Juv. e do Idoso'],
 58 |             'endereco': ['Getúlio Vargas 571 - 6º andar'],
 59 |             'bairro': ['Olinda'],
 60 |             'aviso-ao-advogado': [''],
 61 |             'cidade': ['Nilópolis'],
 62 |             'acao': [('Medidas Pertinentes Aos Pais Ou '
 63 |                      'Responsável / Seção Cível')],
 64 |             'assunto': [('Medidas Pertinentes Aos Pais Ou Responsável'
 65 |                          ' / Seção Cível')],
 66 |             'classe': [('Perda ou Suspensão ou Restabelecimento do Poder '
 67 |                         'Familiar')],
 68 |             'autor': ['MINISTÉRIO PÚBLICO DO ESTADO DO RIO DE JANEIRO'],
 69 |             'requerido': ['DANIELLE MARIA GOMES BARBOSA'],
 70 |             'requerente': [''],
 71 |             'advogado-s': ['TJ000002 - DEFENSOR PÚBLICO']}
 72 | 
 73 |         for chave, valor in esperado.items():
 74 |             with self.subTest():
 75 |                 self.assertEqual(metadados[chave], valor)
 76 | 
 77 |     def test_parse_metadados_de_outro_processo_com_outras_informacoes(self):
 78 |         metadados = parse_metadados(
 79 |             _prepara_html(processo_judicial_2),
 80 |             '0025375-16.2012.8.19.0054',
 81 |             inicio_metadados=6,
 82 |             fim_metadados=27
 83 |         )
 84 | 
 85 |         esperado = {
 86 |             'numero-processo': '0025375-16.2012.8.19.0054',
 87 |             'status': 'ARQUIVADO EM DEFINITIVO - MAÇO Nº 722, em 20/05/2013',
 88 |             'comarca': [
 89 |                 'Comarca de São João de Meriti',
 90 |                 'Juizado da Infância e Juventude e do Idoso',
 91 |                 'Cartório do Juizado da Infância e Juventude e do Idoso'],
 92 |             'endereco': ['Av. Presidente Lincoln 857'],
 93 |             'bairro': ['Vilar dos Teles'],
 94 |             'cidade': ['São João de Meriti'],
 95 |             'acao': ['Entrada e Permanência de Menores / Seção Cível'],
 96 |             'assunto': ['Entrada e Permanência de Menores / Seção Cível'],
 97 |             'classe': ['Autorização judicial - ECA'],
 98 |             'aviso-ao-advogado': ['tem peça na pasta.'],
 99 |             'autor': [''],
100 |             'livro': [''],
101 |             'folha': [''],
102 |             'numero-do-tombo': [''],
103 |             'requerido': [''],
104 |             'requerente': ['IGREJA EVANGÉLICA NOVA ASSEMBLÉIA DE DEUS'],
105 |             'advogado-s': ['RJ081634 - IRANY SPERANDIO DE MEDEIROS']}
106 | 
107 |         for chave, valor in esperado.items():
108 |             with self.subTest():
109 |                 self.assertEqual(metadados[chave], valor)
110 | 
111 |     def test_parsea_processo_com_informacoes_de_comarca_diferentes(self):
112 |         metadados = parse_metadados(
113 |             _prepara_html(processo_judicial_3),
114 |             '0001762-56.2009.8.19.0026',
115 |             inicio_metadados=7,
116 |             fim_metadados=23
117 |         )
118 | 
119 |         esperado = {
120 |             'numero-processo': '0001762-56.2009.8.19.0026',
121 |             'status': 'ARQUIVADO EM DEFINITIVO - MAÇO Nº 1903, em 22/11/2012',
122 |             'comarca': [
123 |                 'Comarca de Itaperuna',
124 |                 'Vara de Família e da Infância e da Juventude e do Idoso',
125 |                 'Cartório da Vara de Família, Inf. e da Juv. e do Idoso'],
126 |             'endereco': ['Rodovia Br-356 Km 01'],
127 |             'bairro': [''],
128 |             'cidade': ['Itaperuna'],
129 |             'acao': ['Adoção de Criança / Seção Cível'],
130 |             'assunto': ['Adoção de Criança / Seção Cível'],
131 |             'classe': ['Adoção c/c Destituição do Poder Familiar - ECA'],
132 |             'aviso-ao-advogado': [''],
133 |             'autor': [''],
134 |             'requerido': [''],
135 |             'requerente': [''],
136 |             'advogado-s': ['RJ146889 - VIRGINIA MARIA RAMOS DA FONSECA']}
137 | 
138 |         for chave, valor in esperado.items():
139 |             with self.subTest():
140 |                 self.assertEqual(metadados[chave], valor)
141 | 
142 |     def test_parsea_processo_com_link_nos_metadados(self):
143 |         metadados = parse_metadados(
144 |             _prepara_html(processo_judicial_4),
145 |             '0441870-74.2008.8.19.0001',
146 |             inicio_metadados=7,
147 |             fim_metadados=27
148 |         )
149 | 
150 |         esperado = {
151 |             'numero-processo': '0441870-74.2008.8.19.0001',
152 |             'status': 'ARQUIVADO EM DEFINITIVO - MAÇO Nº 9819, em 24/02/2013',
153 |             'comarca': [
154 |                 'Comarca da Capital',
155 |                 '1ª Vara da Infância da Juventude e do Idoso',
156 |                 'Cartório da 1ª Vara da Infância, da Juventude e do Idoso'],
157 |             'endereco': ['Praça Onze de Junho 403 Praça Onze'],
158 |             'bairro': ['Centro'],
159 |             'cidade': ['Rio de Janeiro'],
160 |             'acao': [''],
161 |             'assunto': ['Adoção Nacional / Seção Cível'],
162 |             'classe': ['Adoção c/c Destituição do Poder Familiar - ECA'],
163 |             'aviso-ao-advogado': [''],
164 |             'autor': [''],
165 |             'requerido': ['MARIA GISLEUDA RODRIGUES DA SILVA'],
166 |             'requerente': ['FRANCISCO CAMILO RIBEIRO e outro(s)...'],
167 |             'advogado-s': ['TJ000002 - DEFENSOR PÚBLICO']}
168 | 
169 |         for chave, valor in esperado.items():
170 |             with self.subTest():
171 |                 self.assertEqual(metadados[chave], valor)
172 | 
173 |     def test_parsea_processo_com_link_antes_dos_metadados(self):
174 |         metadados = parse_metadados(
175 |             _prepara_html(processo_judicial_5),
176 |             '0001394-96.2011.8.19.0084',
177 |             inicio_metadados=0,
178 |             fim_metadados=23
179 |         )
180 | 
181 |         esperado = {
182 |             'numero-processo': '0001394-96.2011.8.19.0084',
183 |             'status': '',
184 |             'comarca': [
185 |                 'Comarca de Carapebus / Quissamã',
186 |                 'Vara Única',
187 |                 'Cartório da Vara Única'],
188 |             'endereco': ['Estrada do Correio Imperial 1003'],
189 |             'bairro': ['Piteiras'],
190 |             'cidade': ['Quissamã'],
191 |             'acao': ['Medidas Pertinentes Aos Pais Ou Responsável /'
192 |                      ' Seção Cível'],
193 |             'assunto': ['Medidas Pertinentes Aos Pais Ou Responsável /'
194 |                         ' Seção Cível'],
195 |             'classe': ['Apuração de Infração Administrativa às Normas de'
196 |                        ' Proteção'],
197 |             'aviso-ao-advogado': [''],
198 |             'autor': [''],
199 |             'requerido': [''],
200 |             'requerente': [''],
201 |             'advogado-s': ['RJ125011 - ALBECIR RIBEIRO RJ143662 -'
202 |                            ' PAULO ROMERO AQUINO BARBOSA']}
203 | 
204 |         for chave, valor in esperado.items():
205 |             with self.subTest():
206 |                 self.assertEqual(metadados[chave], valor)
207 | 
208 |     def test_parsea_processo_com_nome_regional_ao_inves_de_comarca(self):
209 |         metadados = parse_metadados(
210 |             _prepara_html(processo_judicial_6),
211 |             '0021491-54.2011.8.19.0202',
212 |             inicio_metadados=6,
213 |             fim_metadados=21
214 |         )
215 | 
216 |         esperado = {
217 |             'numero-processo': '0021491-54.2011.8.19.0202',
218 |             'status': 'ARQUIVADO EM DEFINITIVO - MAÇO Nº 442, em 27/02/2012',
219 |             'comarca': [
220 |                 'Regional de Madureira',
221 |                 '3ª Vara da Infância, da Juventude e do Idoso',
222 |                 'Cartório da 3ª Vara da Infância, da Juventude e do Idoso'],
223 |             'endereco': ['Avenida Ernani Cardoso 152 2º andar'],
224 |             'bairro': ['Cascadura'],
225 |             'cidade': ['Rio de Janeiro'],
226 |             'acao': ['Acolhimento Institucional de Crianças e'
227 |                      ' Adolescentes/seção Cível'],
228 |             'assunto': ['Acolhimento Institucional de Crianças e'
229 |                         ' Adolescentes/seção Cível'],
230 |             'classe': ['Providência - ECA'],
231 |             'aviso-ao-advogado': [''],
232 |             'autor': [''],
233 |             'requerido': [''],
234 |             'requerente': [''],
235 |             'advogado-s': ['']}
236 | 
237 |         for chave, valor in esperado.items():
238 |             with self.subTest():
239 |                 self.assertEqual(metadados[chave], valor)
240 | 
241 |     def test_delimita_linhas_dos_metadados_processo_judicial_1(self):
242 |         inicio, fim = area_dos_metadados(
243 |             _prepara_html(processo_judicial_1)
244 |         )
245 | 
246 |         inicio_esperado = 6
247 |         fim_esperado = 26
248 | 
249 |         self.assertEqual(inicio, inicio_esperado)
250 |         self.assertEqual(fim, fim_esperado)
251 | 
252 |     def test_delimita_linhas_dos_metadados_processo_judicial_3(self):
253 |         """
254 |             O Processo judicial numero 3, diferente dos outros 2 presentes
255 |             nas fixtures, inicia os metadados em uma linha diferente.
256 |         """
257 |         inicio, fim = area_dos_metadados(
258 |             _prepara_html(processo_judicial_3)
259 |         )
260 | 
261 |         inicio_esperado = 7
262 |         fim_esperado = 23
263 | 
264 |         self.assertEqual(inicio, inicio_esperado)
265 |         self.assertEqual(fim, fim_esperado)
266 | 
267 |     def test_extrai_dados_das_colunas(self):
268 |         html = """
269 |                 <tr>
270 |                  <td class="negrito" nowrap="" valign="top">Tipo:</td>
271 |                  <td align="justify" class="normal" valign="top">Conclusão</td>
272 |                  </tr>
273 |                 """
274 |         soup = _prepara_html(html)[0].find_all('td')
275 |         dados_das_colunas = extrai_dados_colunas(soup)
276 |         esperado = ['Tipo:', 'Conclusão']
277 | 
278 |         self.assertEqual(dados_das_colunas, esperado)
279 | 
280 | 
class ComparaItensProcessoMixin:
    """Mixin for ``TestCase`` classes that compare parsed process items."""

    def assert_items_equal(self, first, second):
        """Assert that ``first`` matches the expectations in ``second``.

        Both arguments are dicts with a ``'numero-processo'`` entry and an
        ``'itens'`` list. The process numbers and the number of items must
        be equal. Items are then compared pairwise, but only on the keys
        present in the item from ``second``; extra keys in ``first`` are
        ignored.

        Each key comparison runs in a sub-test labelled with the item index
        and the key, so a failure report identifies the mismatching field.
        """
        self.assertEqual(first['numero-processo'], second['numero-processo'])
        items_first = first['itens']
        items_second = second['itens']

        self.assertEqual(len(items_first), len(items_second))

        pairs = zip(items_first, items_second)
        for index, (item_first, item_second) in enumerate(pairs):
            for key, value in item_second.items():
                with self.subTest(index=index, key=key):
                    self.assertEqual(item_first[key], value)
293 | 
294 | 
295 | class ParserItems(ComparaItensProcessoMixin, TestCase):
296 |     @patch('robotj.extrator.crawler.parser.cria_hash_do_movimento',
297 |            return_value='1234')
298 |     def test_extrai_itens_do_processo_judicial_1(self, _chdm):
299 |         soup = BeautifulSoup(processo_judicial_1, 'lxml')
300 |         itens = parse_itens(
301 |             soup,
302 |             '0004999-58.2015.8.19.0036',
303 |             inicio_itens=26
304 |         )
305 |         esperado = {
306 |             'numero-processo': '0004999-58.2015.8.19.0036',
307 |             'itens': [{
308 |                 'tipo-do-movimento': 'Declínio de Competência',
309 |                 'hash': '1234',
310 |                 'data': ['11/01/2016'],
311 |                 'descricao':
312 |                 ['VIJI DA COMARCA DE SÃO MATHEUS - ESPIRITO SANTOS']
313 |             }, {
314 |                 'tipo-do-movimento': 'Recebimento',
315 |                 'data-de-recebimento': ['19/11/2015']
316 |             }, {
317 |                 'tipo-do-movimento': 'Decisão - Declínio de Competência',
318 |                 'hash': '1234',
319 |                 'data-decisao':
320 |                 ['21/10/2015'],
321 |                 'descricao': ['Ante o teor de fls. 104, DECLINO DE MINHA'
322 |                               ' COMPETÊNCIA para o Juízo da Infância e'
323 |                               ' Juventude da Comarca de São Mateus, no'
324 |                               ' Espírito Santo. Dê-se baixa e encaminhem-se'
325 |                               ' imediatamente, com as nossas homenagens.']
326 |             }, {
327 |                 'tipo-do-movimento': 'Conclusão ao Juiz',
328 |                 'hash': '1234',
329 |                 'data-da-conclusao': ['21/10/2015'],
330 |                 'juiz': ['VIVIANE TOVAR DE MATTOS ABRAHAO']
331 |             }, {
332 |                 'tipo-do-movimento': 'Decurso de Prazo',
333 |                 'hash': '1234',
334 |                 'data-do-movimento': ['20/10/2015']
335 |             }, {
336 |                 'tipo-do-movimento': 'Recebidos os autos',
337 |                 'hash': '1234',
338 |                 'data-do-recebimento': ['20/10/2015']
339 |             }, {
340 |                 'tipo-do-movimento': 'Remessa',
341 |                 'hash': '1234',
342 |                 'destinatario': ['Ministério Público'],
343 |                 'data-da-remessa': ['06/08/2015'],
344 |                 'prazo': ['15 dia(s)']
345 |             }, {
346 |                 'tipo-do-movimento': 'Recebimento',
347 |                 'hash': '1234',
348 |                 'data-de-recebimento': ['30/07/2015']
349 |             }, {
350 |                 'tipo-do-movimento':
351 |                 'Despacho - Proferido despacho de mero expediente',
352 |                 'hash': '1234',
353 |                 'data-despacho':
354 |                 ['28/07/2015'],
355 |                 'descricao':
356 |                 ['Dê-se vista ao Ministério Público.']
357 |             }, {
358 |                 'tipo-do-movimento': 'Conclusão ao Juiz',
359 |                 'hash': '1234',
360 |                 'data-da-conclusao': ['28/07/2015'],
361 |                 'juiz': ['VIVIANE TOVAR DE MATTOS ABRAHAO']
362 |             }, {
363 |                 'tipo-do-movimento': 'Decurso de Prazo',
364 |                 'hash': '1234',
365 |                 'data-do-movimento': ['27/07/2015']
366 |             }, {
367 |                 'tipo-do-movimento': 'Recebidos os autos',
368 |                 'hash': '1234',
369 |                 'data-do-recebimento': ['21/07/2015']
370 |             }, {
371 |                 'tipo-do-movimento': 'Remessa',
372 |                 'hash': '1234',
373 |                 'destinatario': ['Psicologia'],
374 |                 'data-da-remessa': ['17/07/2015'],
375 |                 'prazo': ['15 dia(s)']
376 |             }, {
377 |                 'tipo-do-movimento': 'Recebidos os autos',
378 |                 'hash': '1234',
379 |                 'data-do-recebimento': ['17/07/2015']
380 |             }, {
381 |                 'tipo-do-movimento': 'Remessa',
382 |                 'hash': '1234',
383 |                 'destinatario': ['Assistente Social'],
384 |                 'data-da-remessa': ['15/06/2015'],
385 |                 'prazo': ['15 dia(s)']
386 |             }, {
387 |                 'tipo-do-movimento': 'Recebimento',
388 |                 'hash': '1234',
389 |                 'data-de-recebimento': ['22/05/2015']
390 |             }, {
391 |                 'tipo-do-movimento':
392 |                 'Despacho - Proferido despacho de mero expediente',
393 |                 'hash': '1234',
394 |                 'data-despacho':
395 |                 ['11/05/2015'],
396 |                 'descricao': ['Atenda-se ao Ministério Público. Promovam-se os'
397 |                               ' estudos social e psicológico com a demandada'
398 |                               ' e os adolescentes.'],
399 |                 'inteiro-teor': ('Atenda-se ao Ministério Público. Promovam-se'
400 |                                  '  os estudos social e psicológico com a'
401 |                                  ' demandada e os adolescentes.'),
402 |             }, {
403 |                 'tipo-do-movimento': 'Conclusão ao Juiz',
404 |                 'hash': '1234',
405 |                 'data-da-conclusao': ['11/05/2015'],
406 |                 'juiz': ['VIVIANE TOVAR DE MATTOS ABRAHAO']
407 |             }, {
408 |                 'tipo-do-movimento': 'Recebidos os autos',
409 |                 'hash': '1234',
410 |                 'data-do-recebimento': ['30/04/2015']
411 |             }, {
412 |                 'tipo-do-movimento': 'Remessa',
413 |                 'hash': '1234',
414 |                 'destinatario': ['Ministério Público'],
415 |                 'data-da-remessa': ['08/04/2015'],
416 |                 'prazo': ['15 dia(s)']
417 |             }, {
418 |                 'tipo-do-movimento': 'Recebimento',
419 |                 'hash': '1234',
420 |                 'data-de-recebimento': ['27/03/2015']
421 |             }, {
422 |                 'tipo-do-movimento':
423 |                 'Despacho - Proferido despacho de mero expediente',
424 |                 'hash': '1234',
425 |                 'data-despacho': ['19/03/2015'],
426 |                 'descricao': ['Dê-se vista ao Ministério Público.'],
427 |                 'inteiro-teor': 'Dê-se vista ao Ministério Público.'
428 |             }, {
429 |                 'tipo-do-movimento': 'Conclusão ao Juiz',
430 |                 'hash': '1234',
431 |                 'data-da-conclusao': ['19/03/2015'],
432 |                 'juiz': ['VIVIANE TOVAR DE MATTOS ABRAHAO']
433 |             }, {
434 |                 'tipo-do-movimento': 'Distribuição Dirigida',
435 |                 'hash': '1234',
436 |                 'data-da-distribuicao': ['19/03/2015'],
437 |                 'serventia': ['Cartório da 2ª Vara de Família, Inf. e da'
438 |                               ' Juv. e do Idoso - 2ª Vara de Família e da'
439 |                               ' Infância e da Juventude e do Idoso'],
440 |                 'localizacao-na-serventia': ['Saída de Acervo']
441 |             }]
442 |         }
443 | 
444 |         self.assert_items_equal(itens, esperado)
445 | 
446 |     @patch('robotj.extrator.crawler.parser.cria_hash_do_movimento',
447 |            return_value='1234')
448 |     def test_extrai_itens_de_processo_com_links_sem_atributo_onclick(self,
449 |                                                                      _chdm):
450 |         soup = BeautifulSoup(processo_judicial_7, 'lxml')
451 |         itens = parse_itens(
452 |             soup,
453 |             '0002346-95.2011.8.19.0045',
454 |             inicio_itens=26
455 |         )
456 |         esperado = {
457 |             'numero-processo':
458 |             '0004999-58.2015.8.19.0036',
459 |             'itens': [{
460 |                 'tipo-do-movimento': 'Distribuição Dirigida',
461 |                 'hash': '1234',
462 |                 'data-da-distribuicao':
463 |                 ['14/03/2011'],
464 |                 'serventia':
465 |                 ['Cartório da 2ª Vara de Família, da Inf., da Juv. '
466 |                  'e do Idoso -'
467 |                  ' 2ª Vara de Família Infância e Juventude e do Idoso'],
468 |                 'processo-s-apensado-s': ['0000159-51.2010.8.19.0045'],
469 |                 'processo-s-no-tribunal-de-justica':
470 |                 ['0002346-95.2011.8.19.0045'],
471 |                 'protocolo-s-no-tribunal-de-justica':
472 |                 ['201500617620 - Data: 26/10/2015'],
473 |                 'localizacao-na-serventia':
474 |                 ['Aguardando Arquivamento']
475 |             }]
476 |         }
477 | 
478 |         for chave, valor in esperado['itens'][-1].items():
479 |             with self.subTest():
480 |                 self.assertEqual(itens['itens'][-1][chave], valor)
481 | 
482 |     @patch('robotj.extrator.crawler.parser.cria_hash_do_movimento',
483 |            return_value='1234')
484 |     def test_parse_process_com_mandado_pagamento(self, _chdm):
485 |         soup = BeautifulSoup(process_com_mandado_pagamento, 'lxml')
486 |         itens = parse_itens(
487 |             soup,
488 |             '0166627-93.2017.8.19.0001',
489 |             inicio_itens=29
490 |         )
491 |         esperado = {
492 |             'numero-processo':
493 |             '0166627-93.2017.8.19.0001',
494 |             'itens': [{
495 |                 'data-da-distribuicao': ['04/07/2017'],
496 |                 'hash':
497 |                 '1234',
498 |                 'localizacao-na-serventia': ['Arquivado na Serventia'],
499 |                 'processo-s-no-conselho-recursal': ['Não há.'],
500 |                 'serventia': [
501 |                     'Cartório do 6º Juizado Especial Cível - '
502 |                     'Lagoa - 6º Juizado Especial Cível - Lagoa'
503 |                 ],
504 |                 'tipo-do-movimento':
505 |                 'Distribuição Sorteio'
506 |             }, {
507 |                 'hash': '1234',
508 |                 'no-mandado': ['742474'],
509 |                 'situacao-mandado': ['Pago'],
510 |                 'tipo-do-movimento': 'Mandado de Pagamento:'
511 |             }, {
512 |                 'data-pagamento': ['18/10/2017'],
513 |                 'hash': '1234',
514 |                 'no-guia': ['081010000041111402'],
515 |                 'situacao-da-guia': ['Disponível'],
516 |                 'tipo-do-movimento': 'Guia de Depósito:',
517 |                 'valor-pago': ['R$ 2.585,33']
518 |             }],
519 |         }
520 | 
521 |         for chave, valor in esperado['itens'][-1].items():
522 |             with self.subTest():
523 |                 self.assertEqual(itens['itens'][-1][chave], valor)
524 | 


--------------------------------------------------------------------------------