├── duralex ├── __init__.py ├── DeleteUUIDVisitor.py ├── DeleteParentVisitor.py ├── DeleteEmptyChildrenVisitor.py ├── SwapDefinitionAndReferenceVisitor.py ├── RemoveQuotePrefixVisitor.py ├── AddParentVisitor.py ├── ForkReferenceVisitor.py ├── ForkEditVisitor.py ├── FixMissingCodeOrLawReferenceVisitor.py ├── SortReferencesVisitor.py ├── alinea_lexer.py ├── ResolveFullyQualifiedDefinitionsVisitor.py ├── amendment_parser.py ├── diff_parser.py ├── ResolveFullyQualifiedReferencesVisitor.py ├── AbstractVisitor.py ├── tree.py └── bill_parser.py ├── requirements.txt ├── article_to_json.jpg ├── setup.py ├── .travis.yml ├── tests ├── ParseHeader3Test.py ├── ParseHeader2Test.py ├── ParseCodePartReferenceTest.py ├── ParseBookReferenceTest.py ├── ParseTitleReferenceTest.py ├── ParseChapterReferenceTest.py ├── ParseSectionReferenceTest.py ├── ParseParagraphReferenceTest.py ├── ParseSubSectionReferenceTest.py ├── ParseMultiplicativeAdverbTest.py ├── ParseSubParagraphDefinitionTest.py ├── ForkReferenceVisitorTest.py ├── ParseArticleDefinitionTest.py ├── ParseMentionDefinitionTest.py ├── ParseHeader2ReferenceTest.py ├── ParseSentenceDefinitionTest.py ├── ParseHeader3ReferenceTest.py ├── main.py ├── ParseAlineaDefinitionTest.py ├── ParseHeader3DefinitionTest.py ├── ParseCodeReferenceTest.py ├── DuralexTestCase.py ├── ParseHeader1DefinitionTest.py ├── ParseWordDefinitionTest.py ├── ParseSentenceReferenceTest.py ├── ParseHeader2DefinitionTest.py ├── ParseLawReferenceTest.py ├── ParseDefinitionListTest.py ├── SortReferencesVisitorTest.py ├── ParseHeader1Test.py ├── ParseArticleReferenceTest.py ├── ParseRawContentTest.py ├── ParseWordReferenceTest.py ├── ForkEditVisitorTest.py ├── ParseAlineaReferenceTest.py └── ResolveFullyQualifiedReferencesVisitorTest.py ├── .gitignore ├── README.md └── bin └── duralex /duralex/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | colorama 2 | html5lib 3 | beautifulsoup4 4 | requests 5 | unidiff 6 | -------------------------------------------------------------------------------- /article_to_json.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Legilibre/DuraLex/HEAD/article_to_json.jpg -------------------------------------------------------------------------------- /duralex/DeleteUUIDVisitor.py: -------------------------------------------------------------------------------- 1 | from duralex.AbstractVisitor import AbstractVisitor 2 | 3 | class DeleteUUIDVisitor(AbstractVisitor): 4 | def visit_node(self, node): 5 | if 'uuid' in node: 6 | del node['uuid'] 7 | 8 | super(DeleteUUIDVisitor, self).visit_node(node) 9 | -------------------------------------------------------------------------------- /duralex/DeleteParentVisitor.py: -------------------------------------------------------------------------------- 1 | from duralex.AbstractVisitor import AbstractVisitor 2 | 3 | class DeleteParentVisitor(AbstractVisitor): 4 | def visit_node(self, node): 5 | if 'parent' in node: 6 | del node['parent'] 7 | 8 | super(DeleteParentVisitor, self).visit_node(node) 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='DuraLex', 5 | version='0.2', 6 | install_requires=[ 7 | 'html5lib', 8 | 'simplejson', 9 | 'beautifulsoup4' 10 | ], 11 | packages=[ 12 | 'duralex' 13 | ], 14 | scripts=[ 15 | 'bin/duralex' 16 | ] 17 | ) 18 | -------------------------------------------------------------------------------- /duralex/DeleteEmptyChildrenVisitor.py: 
-------------------------------------------------------------------------------- 1 | from duralex.AbstractVisitor import AbstractVisitor 2 | 3 | class DeleteEmptyChildrenVisitor(AbstractVisitor): 4 | def visit_node(self, node): 5 | if 'children' in node and len(node['children']) == 0: 6 | del node['children'] 7 | 8 | super(DeleteEmptyChildrenVisitor, self).visit_node(node) 9 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.5" 4 | 5 | # command to install dependencies 6 | install: 7 | - pip install -r requirements.txt 8 | 9 | # command to run tests 10 | script: 11 | - cd tests 12 | - python main.py 13 | 14 | notifications: 15 | webhooks: 16 | urls: 17 | - https://webhooks.gitter.im/e/e8aba838e8d75ff07b0f 18 | on_success: change 19 | on_failure: always 20 | on_start: never 21 | -------------------------------------------------------------------------------- /duralex/SwapDefinitionAndReferenceVisitor.py: -------------------------------------------------------------------------------- 1 | from duralex.AbstractVisitor import AbstractVisitor 2 | 3 | import duralex.tree as tree 4 | 5 | class SwapDefinitionAndReferenceVisitor(AbstractVisitor): 6 | def visit_edit_node(self, node, post): 7 | defs = filter(lambda n: tree.is_definition(n), node['children']) 8 | 9 | for d in defs: 10 | tree.remove_node(node, d) 11 | tree.push_node(node, d) 12 | -------------------------------------------------------------------------------- /duralex/RemoveQuotePrefixVisitor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from duralex.AbstractVisitor import AbstractVisitor 4 | 5 | from duralex.alinea_parser import * 6 | 7 | class RemoveQuotePrefixVisitor(AbstractVisitor): 8 | def visit_quote_node(self, node, post): 9 | if post: 10 | return 11 | 12 | # Art. 
{articleId}. - 13 | node['words'] = re.sub(r'^Art\. .*?\. - ', '', node['words'], 0, re.UNICODE | re.MULTILINE) 14 | -------------------------------------------------------------------------------- /duralex/AddParentVisitor.py: -------------------------------------------------------------------------------- 1 | # -*- coding=utf-8 -*- 2 | 3 | from duralex.AbstractVisitor import AbstractVisitor 4 | 5 | class AddParentVisitor(AbstractVisitor): 6 | def __init__(self): 7 | self.parent = [] 8 | 9 | super(AddParentVisitor, self).__init__() 10 | 11 | def visit_node(self, node): 12 | if 'parent' not in node and len(self.parent): 13 | node['parent'] = self.parent[-1] 14 | 15 | self.parent.append(node) 16 | 17 | super(AddParentVisitor, self).visit_node(node) 18 | 19 | del self.parent[-1] 20 | -------------------------------------------------------------------------------- /duralex/ForkReferenceVisitor.py: -------------------------------------------------------------------------------- 1 | from duralex.AbstractVisitor import AbstractVisitor 2 | 3 | from duralex.alinea_parser import * 4 | 5 | import duralex.tree 6 | 7 | class ForkReferenceVisitor(AbstractVisitor): 8 | def visit_node(self, node): 9 | if duralex.tree.is_reference(node) and 'children' in node and len(node['children']) > 1: 10 | ref_nodes = [n for n in node['children'] if duralex.tree.is_reference(n)] 11 | for i in range(1, len(ref_nodes)): 12 | ref = ref_nodes[i] 13 | fork = copy_node(node, recursive=False) 14 | remove_node(node, ref) 15 | push_node(fork, ref) 16 | push_node(node['parent'], fork) 17 | 18 | super(ForkReferenceVisitor, self).visit_node(node) 19 | -------------------------------------------------------------------------------- /tests/ParseHeader3Test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from DuralexTestCase import DuralexTestCase 4 | 5 | import duralex.alinea_parser as parser 6 | 7 | class 
ParseHeader3Test(DuralexTestCase): 8 | def test_header3_raw_content(self): 9 | self.assertEqualAST( 10 | self.call_parse_func( 11 | parser.parse_header2, 12 | u"b) Ceci est un header3." 13 | ), 14 | {'children':[ 15 | { 16 | 'type': u'header3', 17 | 'order': 2, 18 | 'children': [ 19 | { 20 | 'type': u'raw-content', 21 | 'content': u'Ceci est un header3.' 22 | } 23 | ] 24 | } 25 | ]} 26 | ) 27 | -------------------------------------------------------------------------------- /tests/ParseHeader2Test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from DuralexTestCase import DuralexTestCase 4 | 5 | import duralex.alinea_parser as parser 6 | 7 | class ParseHeader2Test(DuralexTestCase): 8 | def test_header2_raw_content(self): 9 | self.assertEqualAST( 10 | self.call_parse_func( 11 | parser.parse_header2, 12 | u"42° Ceci est un header2." 13 | ), 14 | {'children':[ 15 | { 16 | 'type': u'header2', 17 | 'order': 42, 18 | 'children': [ 19 | { 20 | 'content': u'Ceci est un header2.', 21 | 'type': u'raw-content' 22 | } 23 | ], 24 | } 25 | ]} 26 | ) 27 | -------------------------------------------------------------------------------- /tests/ParseCodePartReferenceTest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from DuralexTestCase import DuralexTestCase 4 | 5 | import duralex.alinea_parser as parser 6 | 7 | class ParseCodePartReferenceTest(DuralexTestCase): 8 | def test_code_part(self): 9 | self.assertEqualAST( 10 | self.call_parse_func( 11 | parser.parse_code_part_reference, 12 | u"la troisième partie du code de l'éducation" 13 | ), 14 | {'children': [ 15 | { 16 | 'type': u'code-part-reference', 17 | 'order': 3, 18 | 'children': [ 19 | { 20 | 'type': u'code-reference', 21 | 'id': u'code de l\'éducation' 22 | } 23 | ] 24 | } 25 | ]} 26 | ) 27 | -------------------------------------------------------------------------------- 
/tests/ParseBookReferenceTest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from DuralexTestCase import DuralexTestCase 4 | 5 | import duralex.alinea_parser as parser 6 | 7 | class ParseBookReferenceTest(DuralexTestCase): 8 | def test_book(self): 9 | self.assertEqualAST( 10 | self.call_parse_func( 11 | parser.parse_book_reference, 12 | u"le livre III" 13 | ), 14 | {'children': [ 15 | { 16 | 'type': u'book-reference', 17 | 'order': 3 18 | } 19 | ]} 20 | ) 21 | 22 | def test_book_2(self): 23 | self.assertEqualAST( 24 | self.call_parse_func( 25 | parser.parse_book_reference, 26 | u"du livre V" 27 | ), 28 | {'children': [ 29 | { 30 | 'type': u'book-reference', 31 | 'order': 5 32 | } 33 | ]} 34 | ) 35 | -------------------------------------------------------------------------------- /tests/ParseTitleReferenceTest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from DuralexTestCase import DuralexTestCase 4 | 5 | import duralex.alinea_parser as parser 6 | 7 | class ParseTitleReferenceTest(DuralexTestCase): 8 | def test_title(self): 9 | self.assertEqualAST( 10 | self.call_parse_func( 11 | parser.parse_title_reference, 12 | u"le titre IV" 13 | ), 14 | {'children': [ 15 | { 16 | 'type': u'title-reference', 17 | 'order': 4 18 | } 19 | ]} 20 | ) 21 | 22 | def test_title_2(self): 23 | self.assertEqualAST( 24 | self.call_parse_func( 25 | parser.parse_title_reference, 26 | u"du titre IV" 27 | ), 28 | {'children': [ 29 | { 30 | 'type': u'title-reference', 31 | 'order': 4 32 | } 33 | ]} 34 | ) 35 | -------------------------------------------------------------------------------- /tests/ParseChapterReferenceTest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from DuralexTestCase import DuralexTestCase 4 | 5 | import duralex.alinea_parser as parser 6 | 7 | class 
ParseChapterReferenceTest(DuralexTestCase): 8 | def test_chapter(self): 9 | self.assertEqualAST( 10 | self.call_parse_func( 11 | parser.parse_chapter_reference, 12 | u"le chapitre IV" 13 | ), 14 | {'children': [ 15 | { 16 | 'type': u'chapter-reference', 17 | 'order': 4 18 | } 19 | ]} 20 | ) 21 | 22 | def test_chapter_2(self): 23 | self.assertEqualAST( 24 | self.call_parse_func( 25 | parser.parse_chapter_reference, 26 | u"du chapitre IV" 27 | ), 28 | {'children': [ 29 | { 30 | 'type': u'chapter-reference', 31 | 'order': 4 32 | } 33 | ]} 34 | ) 35 | -------------------------------------------------------------------------------- /tests/ParseSectionReferenceTest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from DuralexTestCase import DuralexTestCase 4 | 5 | import duralex.alinea_parser as parser 6 | 7 | class ParseSectionReferenceTest(DuralexTestCase): 8 | def test_the_section_order(self): 9 | self.assertEqualAST( 10 | self.call_parse_func( 11 | parser.parse_section_reference, 12 | "la section 2" 13 | ), 14 | {'children':[ 15 | { 16 | 'type': u'section-reference', 17 | 'order': 2 18 | } 19 | ]} 20 | ) 21 | 22 | def test_of_the_section_order(self): 23 | self.assertEqualAST( 24 | self.call_parse_func( 25 | parser.parse_section_reference, 26 | "de la section 2" 27 | ), 28 | {'children':[ 29 | { 30 | 'type': u'section-reference', 31 | 'order': 2 32 | } 33 | ]} 34 | ) 35 | -------------------------------------------------------------------------------- /tests/ParseParagraphReferenceTest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from DuralexTestCase import DuralexTestCase 4 | 5 | import duralex.alinea_parser as parser 6 | 7 | class ParseParagraphReferenceTest(DuralexTestCase): 8 | def test_paragraph(self): 9 | self.assertEqualAST( 10 | self.call_parse_func( 11 | parser.parse_paragraph_reference, 12 | u"le paragraphe 42" 
13 | ), 14 | {'children': [ 15 | { 16 | 'type': u'paragraph-reference', 17 | 'order': 42 18 | } 19 | ]} 20 | ) 21 | 22 | def test_paragraph_2(self): 23 | self.assertEqualAST( 24 | self.call_parse_func( 25 | parser.parse_paragraph_reference, 26 | u"du paragraphe 42" 27 | ), 28 | {'children': [ 29 | { 30 | 'type': u'paragraph-reference', 31 | 'order': 42 32 | } 33 | ]} 34 | ) 35 | -------------------------------------------------------------------------------- /tests/ParseSubSectionReferenceTest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from DuralexTestCase import DuralexTestCase 4 | 5 | import duralex.alinea_parser as parser 6 | 7 | class ParseSubSectionReferenceTest(DuralexTestCase): 8 | def test_the_subsection_order(self): 9 | self.assertEqualAST( 10 | self.call_parse_func( 11 | parser.parse_subsection_reference, 12 | "la sous-section 2" 13 | ), 14 | {'children':[ 15 | { 16 | 'type': u'subsection-reference', 17 | 'order': 2 18 | } 19 | ]} 20 | ) 21 | 22 | def test_of_the_subsection_order(self): 23 | self.assertEqualAST( 24 | self.call_parse_func( 25 | parser.parse_subsection_reference, 26 | "de la sous-section 2" 27 | ), 28 | {'children':[ 29 | { 30 | 'type': u'subsection-reference', 31 | 'order': 2 32 | } 33 | ]} 34 | ) 35 | -------------------------------------------------------------------------------- /duralex/ForkEditVisitor.py: -------------------------------------------------------------------------------- 1 | from duralex.AbstractVisitor import AbstractVisitor 2 | 3 | from duralex.alinea_parser import * 4 | 5 | import duralex.tree 6 | 7 | class ForkEditVisitor(AbstractVisitor): 8 | def visit_node(self, node): 9 | if 'type' in node and node['type'] == 'edit' and 'children' in node and len(node['children']) > 1: 10 | ref_nodes = [n for n in node['children'] if duralex.tree.is_reference(n)] 11 | def_nodes = [n for n in node['children'] if duralex.tree.is_definition(n)] 12 | 
edit_node = copy_node(node, recursive=False) 13 | parent = node['parent'] 14 | remove_node(parent, node) 15 | for ref_node in ref_nodes: 16 | if len(def_nodes) > 0: 17 | for def_node in def_nodes: 18 | ref_node = copy_node(ref_node) 19 | def_node = copy_node(def_node) 20 | fork = copy_node(edit_node) 21 | push_node(fork, ref_node) 22 | push_node(fork, def_node) 23 | push_node(parent, fork) 24 | else: 25 | ref_node = copy_node(ref_node) 26 | fork = copy_node(edit_node) 27 | push_node(fork, ref_node) 28 | push_node(parent, fork) 29 | else: 30 | super(ForkEditVisitor, self).visit_node(node) 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | data 92 | -------------------------------------------------------------------------------- /tests/ParseMultiplicativeAdverbTest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from DuralexTestCase import DuralexTestCase 4 | 5 | import duralex.alinea_parser as parser 6 | 7 | class ParseMultiplicativeAdverbTest(DuralexTestCase): 8 | def test_header2_bis(self): 9 | self.assertEqualAST( 10 | self.call_parse_func( 11 | parser.parse_header2_definition, 12 | "un 3° bis" 13 | ), 14 | {'children':[ 15 | { 16 | 'type': u'header2-definition', 17 | 'order': 3, 18 | 'isBis': True 19 | } 20 | ]} 21 | ) 22 | 23 | def test_header2_ter(self): 24 | self.assertEqualAST( 25 | self.call_parse_func( 26 | parser.parse_header2_definition, 27 | "un 3° ter" 28 | ), 29 | {'children':[ 30 | { 31 | 'type': u'header2-definition', 32 | 'order': 3, 33 | 'isTer': True 34 | } 35 | ]} 36 | ) 37 | 38 | def test_header2_quater(self): 39 | self.assertEqualAST( 40 | self.call_parse_func( 41 | parser.parse_header2_definition, 
42 | "un 3° quater" 43 | ), 44 | {'children':[ 45 | { 46 | 'type': u'header2-definition', 47 | 'order': 3, 48 | 'isQuater': True 49 | } 50 | ]} 51 | ) 52 | -------------------------------------------------------------------------------- /duralex/FixMissingCodeOrLawReferenceVisitor.py: -------------------------------------------------------------------------------- 1 | # -*- coding=utf-8 -*- 2 | 3 | from duralex.AbstractVisitor import AbstractVisitor 4 | 5 | from duralex.tree import * 6 | 7 | # If an edit reference does not feature a code-reference or law-reference node, we won't be able to find the actual 8 | # original texte to apply the edits to. To fix this, this visitor will : 9 | # * target only article-reference nodes with no law-reference and code-reference ancestor/descendant, 10 | # * find, copy and insert as it's own child the first previous law-reference or code-reference whichever comes first in 11 | # reversed traversal 12 | class FixMissingCodeOrLawReferenceVisitor(AbstractVisitor): 13 | def __init__(self): 14 | self.law_or_code_ref = None 15 | super(FixMissingCodeOrLawReferenceVisitor, self).__init__() 16 | 17 | def visit_law_reference_node(self, node, post): 18 | if post: 19 | return 20 | self.law_or_code_ref = node 21 | 22 | def visit_code_reference_node(self, node, post): 23 | if post: 24 | return 25 | self.law_or_code_ref = node 26 | 27 | def visit_article_reference_node(self, node, post): 28 | if post: 29 | return 30 | ancestor_refs = [n for n in get_node_ancestors(node) + get_node_descendants(node) if 31 | not is_root(n) and n['type'] in [TYPE_CODE_REFERENCE, TYPE_LAW_REFERENCE] 32 | ] 33 | if len(ancestor_refs) == 0: 34 | while len(node['children']) != 0: 35 | node = node['children'][0] 36 | node['children'] = [copy_node(self.law_or_code_ref, False)] 37 | -------------------------------------------------------------------------------- /tests/ParseSubParagraphDefinitionTest.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from DuralexTestCase import DuralexTestCase 4 | 5 | import duralex.alinea_parser as parser 6 | 7 | class ParseSubParagraphDefinitionTest(DuralexTestCase): 8 | def test_one_subparagraph_with_quote(self): 9 | self.assertEqualAST( 10 | self.call_parse_func( 11 | parser.parse_subparagraph_definition, 12 | ("un sous-paragraphe ainsi rédigé : \n" 13 | "\"sous-paragraphe 1\"") 14 | ), 15 | {'children': [ 16 | { 17 | 'type': u'subparagraph-definition', 18 | 'children': [ 19 | { 20 | 'type': u'quote', 21 | 'words': u'sous-paragraphe 1' 22 | } 23 | ], 24 | } 25 | ]} 26 | ) 27 | 28 | def test_one_subparagraph_order_with_quote(self): 29 | self.assertEqualAST( 30 | self.call_parse_func( 31 | parser.parse_subparagraph_definition, 32 | ("un sous-paragraphe 3 ainsi rédigé : \n" 33 | "\"sous-paragraphe 1\"") 34 | ), 35 | {'children': [ 36 | { 37 | 'type': u'subparagraph-definition', 38 | 'order': 3, 39 | 'children': [ 40 | { 41 | 'type': u'quote', 42 | 'words': u'sous-paragraphe 1' 43 | } 44 | ], 45 | } 46 | ]} 47 | ) 48 | -------------------------------------------------------------------------------- /tests/ForkReferenceVisitorTest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from DuralexTestCase import DuralexTestCase 4 | 5 | from duralex.ForkReferenceVisitor import ForkReferenceVisitor 6 | 7 | class ForkReferenceVisitorTest(DuralexTestCase): 8 | def test(self): 9 | self.assertEqualAST( 10 | self.call_visitor(ForkReferenceVisitor, self.make_tree({'children': [ 11 | { 12 | 'type': u'alinea-reference', 13 | 'order': 3, 14 | 'children': [ 15 | { 16 | 'type': u'article-reference', 17 | 'id': u'2' 18 | }, 19 | { 20 | 'type': u'article-reference', 21 | 'id': u'3' 22 | } 23 | ] 24 | } 25 | ]})), 26 | {'children': [ 27 | { 28 | 'type': u'alinea-reference', 29 | 'order': 3, 30 | 'children': [ 31 | 
{ 32 | 'id': u'2', 33 | 'type': u'article-reference' 34 | } 35 | ], 36 | }, 37 | { 38 | 'order': 3, 39 | 'type': u'alinea-reference', 40 | 'children': [ 41 | { 42 | 'id': u'3', 43 | 'type': u'article-reference' 44 | } 45 | ] 46 | } 47 | ]} 48 | ) 49 | -------------------------------------------------------------------------------- /duralex/SortReferencesVisitor.py: -------------------------------------------------------------------------------- 1 | from duralex.alinea_parser import * 2 | 3 | from duralex.AbstractVisitor import AbstractVisitor 4 | 5 | import duralex.tree 6 | 7 | class SortReferencesVisitor(AbstractVisitor): 8 | def visit_node(self, node): 9 | if not self.sort_references(node): 10 | super(SortReferencesVisitor, self).visit_node(node) 11 | 12 | def sort_references(self, node): 13 | root_refs = filter_nodes(node, lambda n: duralex.tree.is_reference(n) and 'parent' in n and (not duralex.tree.is_reference(n['parent']))) 14 | 15 | if len(root_refs) == 0: 16 | return False 17 | 18 | for root_ref in root_refs: 19 | root_ref_parent = root_ref['parent'] 20 | refs = filter_nodes(root_ref, lambda n: duralex.tree.is_reference(n)) 21 | sorted_refs = sorted(refs, key=lambda r: duralex.tree.TYPE_REFERENCE.index(r['type'])) 22 | filtered_refs = [sorted_refs[0]] 23 | for ref in sorted_refs: 24 | if 'parent' in ref: 25 | remove_node(ref['parent'], ref) 26 | # the deepest *-reference of the same type wins 27 | # FIXME: should we raise because we're not supposed to have the same *-reference twice? 
28 | if ref['type'] == filtered_refs[-1]['type']: 29 | filtered_refs.pop() 30 | filtered_refs.append(ref) 31 | for i in range(0, len(filtered_refs)): 32 | ref = filtered_refs[i] 33 | if i == 0: 34 | push_node(root_ref_parent, ref) 35 | else: 36 | push_node(filtered_refs[i - 1], ref) 37 | 38 | return True 39 | -------------------------------------------------------------------------------- /tests/ParseArticleDefinitionTest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from DuralexTestCase import DuralexTestCase 4 | 5 | import duralex.alinea_parser as parser 6 | 7 | class ParseArticleDefinitionTest(DuralexTestCase): 8 | def test_an_article(self): 9 | self.assertEqualAST( 10 | self.call_parse_func( 11 | parser.parse_article_definition, 12 | "un article" 13 | ), 14 | {'children':[ 15 | { 16 | 'type': u'article-definition' 17 | } 18 | ]} 19 | ) 20 | 21 | def test_an_article_2(self): 22 | self.assertEqualAST( 23 | self.call_parse_func( 24 | parser.parse_article_definition, 25 | "l'article" 26 | ), 27 | {'children':[ 28 | { 29 | 'type': u'article-definition' 30 | } 31 | ]} 32 | ) 33 | 34 | def test_an_article_with_id(self): 35 | self.assertEqualAST( 36 | self.call_parse_func( 37 | parser.parse_article_definition, 38 | "un article 42" 39 | ), 40 | {'children':[ 41 | { 42 | 'type': u'article-definition', 43 | 'id': u'42' 44 | } 45 | ]} 46 | ) 47 | 48 | def test_an_article_with_id_2(self): 49 | self.assertEqualAST( 50 | self.call_parse_func( 51 | parser.parse_article_definition, 52 | "l'article 42" 53 | ), 54 | {'children':[ 55 | { 56 | 'type': u'article-definition', 57 | 'id': u'42' 58 | } 59 | ]} 60 | ) 61 | -------------------------------------------------------------------------------- /tests/ParseMentionDefinitionTest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from DuralexTestCase import DuralexTestCase 4 | 5 | import 
duralex.alinea_parser as parser 6 | 7 | class ParseMentionDefinitionTest(DuralexTestCase): 8 | def test_mention_with_single_quote(self): 9 | self.assertEqualAST( 10 | self.call_parse_func( 11 | parser.parse_mention_definition, 12 | ("la mention : \"ceci est une mention\"") 13 | ), 14 | {'children': [ 15 | { 16 | 'type': u'mention-definition', 17 | 'children': [ 18 | { 19 | 'type': u'quote', 20 | 'words': u'ceci est une mention' 21 | } 22 | ] 23 | } 24 | ]} 25 | ) 26 | 27 | def test_mention_with_n_quotes(self): 28 | self.assertEqualAST( 29 | self.call_parse_func( 30 | parser.parse_mention_definition, 31 | ("la mention : \n" 32 | "\"ceci est le début de la mention\"\n" 33 | "\"ceci est la suite de la mention\"") 34 | ), 35 | {'children': [ 36 | { 37 | 'type': u'mention-definition', 38 | 'children': [ 39 | { 40 | 'type': u'quote', 41 | 'words': u'ceci est le début de la mention' 42 | }, 43 | { 44 | 'type': u'quote', 45 | 'words': u'ceci est la suite de la mention' 46 | } 47 | ] 48 | } 49 | ]} 50 | ) 51 | -------------------------------------------------------------------------------- /tests/ParseHeader2ReferenceTest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from DuralexTestCase import DuralexTestCase 4 | 5 | import duralex.alinea_parser as parser 6 | 7 | class ParseHeader2ReferenceTest(DuralexTestCase): 8 | def test_header2_number(self): 9 | self.assertEqualAST( 10 | self.call_parse_func( 11 | parser.parse_header2_reference, 12 | "au 42°" 13 | ), 14 | {'children':[ 15 | { 16 | 'type': u'header2-reference', 17 | 'order': 42 18 | } 19 | ]} 20 | ) 21 | 22 | def test_before_header2_number(self): 23 | self.assertEqualAST( 24 | self.call_parse_func( 25 | parser.parse_header2_reference, 26 | "avant le 1°" 27 | ), 28 | {'children':[ 29 | { 30 | 'type': u'header2-reference', 31 | 'position': u'before', 32 | 'order': 1 33 | } 34 | ]} 35 | ) 36 | 37 | def 
# -*- coding: utf-8 -*-
"""Tokenizer and token-skipping helpers for alinea text.

``tokenize`` splits a text into a flat list of string tokens in which every
delimiter (whitespace, non-breaking space, parentheses, dot, exclamation
mark, single quote, comma, double quote) is kept as its own token.  The
``skip_*`` helpers take such a token list plus a start index and return the
index of the first token at which they stop (``len(tokens)`` when they run
off the end).
"""

import re

# The capturing group makes ``split`` keep each delimiter as a token.
TOKEN_DELIMITERS = re.compile(r"""(\xa0|\s|\(|\)|\.|\!|'|,|")""")
TOKEN_NEW_LINE = '\n'
TOKEN_SINGLE_QUOTE = u'\''
TOKEN_DOUBLE_QUOTE_OPEN = u'"'
TOKEN_DOUBLE_QUOTE_CLOSE = u'"'
TOKEN_MONTH_NAMES = [
    u'janvier',
    u'février',
    u'mars',
    u'avril',
    u'mai',
    u'juin',
    u'juillet',
    u'août',
    u'septembre',
    u'octobre',
    u'novembre',
    u'décembre'
]
TOKEN_MULTIPLICATIVE_ADVERBS = [
    u'bis',
    u'ter',
    u'quater',
    u'quinquies',
    u'sexies',
    u'septies',
    u'octies',
    u'novies',
    u'decies',
    u'undecies',
    u'duodecies',
    u'terdecies',
    u'quaterdecies',
    u'quindecies',
    u'sexdecies',
    u'septdecies',
    u'octodecies',
    u'novodecies',
    u'vicies',
    u'unvicies',
    u'duovicies',
    u'tervicies',
    u'quatervicies',
    u'quinvicies',
    u'sexvicies',
    u'septvicies'
]

# Compiled once at import time instead of once per tested token.
_SPACES = re.compile(r'\s+')
_WORD_START = re.compile(r'[\wà]+', re.IGNORECASE | re.UNICODE)


def tokenize(text):
    """Split *text* into tokens, keeping every delimiter as a token.

    *text* may be a text string or UTF-8 encoded bytes.  Decoding is
    best-effort: when the value has no ``decode`` method or cannot be
    decoded, it is split as given.  Empty strings produced by adjacent
    delimiters are dropped.

    Returns a list of non-empty strings.
    """
    try:
        text = text.decode('utf-8')
    except (AttributeError, UnicodeError):
        # Already text (no ``decode``) or not decodable: keep it as is.
        # Only these are silenced, so KeyboardInterrupt and friends
        # propagate instead of being swallowed.
        pass

    return [token for token in TOKEN_DELIMITERS.split(text) if token]


def skip_tokens(tokens, i, f):
    """Return the first index >= *i* whose token makes *f* falsy.

    *f* is called with one token.  Returns ``len(tokens)`` when every
    token from *i* onwards satisfies *f*, and *i* unchanged when it is
    already past the end.
    """
    while i < len(tokens) and f(tokens[i]):
        i += 1
    return i


def skip_spaces(tokens, i):
    """Return the index of the first token at or after *i* that does not
    start with whitespace."""
    return skip_tokens(tokens, i, _SPACES.match)


def skip_to_next_word(tokens, i):
    """Return the index of the first token at or after *i* that starts
    with a word character (or ``à``)."""
    return skip_tokens(tokens, i, lambda t: not _WORD_START.match(t))


def skip_to_token(tokens, i, token):
    """Return the index of the first token at or after *i* equal to
    *token*, or ``len(tokens)`` when there is none."""
    return skip_tokens(tokens, i, lambda t: t != token)


def skip_to_end_of_line(tokens, i):
    """Return the index of the next new-line token at or after *i*.

    When the token just before *i* is itself a new-line token, *i* is
    returned unchanged.
    """
    if 0 < i < len(tokens) and tokens[i - 1] == TOKEN_NEW_LINE:
        return i

    return skip_to_token(tokens, i, TOKEN_NEW_LINE)


def skip_to_quote_start(tokens, i):
    """Return the index of the next opening double-quote token at or
    after *i*, or ``len(tokens)`` when there is none."""
    return skip_to_token(tokens, i, TOKEN_DOUBLE_QUOTE_OPEN)
class ResolveFullyQualifiedDefinitionsVisitor(AbstractVisitor):
    """Merge the partial definitions of an edit into fully qualified ones.

    When a single 'edit' node holds several definition nodes, the ones without
    children are treated as type qualifiers and the ones with children as the
    actual content.  Each content definition is re-attached to the edit below
    a chain made of copies of the qualifiers and of itself, ordered by
    ``duralex.tree.TYPE_DEFINITION``.
    """

    def visit_node(self, node):
        """Resolve *node* first, then continue the regular traversal."""
        self.resolve_fully_qualified_definitions(node)
        super(ResolveFullyQualifiedDefinitionsVisitor, self).visit_node(node)

    def resolve_fully_qualified_definitions(self, node):
        """Rewrite the definitions found under *node* if it is an 'edit' node.

        Nodes of any other type, and edits holding at most one definition,
        are left untouched.
        """
        if 'type' not in node or node['type'] != 'edit':
            return

        def_nodes = filter_nodes(node, lambda x: duralex.tree.is_definition(x))
        # if we have more than 1 definition in a single edit, we assume:
        # - they have different types
        # - the final type of definition is the combination of all those types
        if len(def_nodes) <= 1:
            return

        # Real lists rather than lazy filter() objects: the two groups are
        # decided once, before the tree starts being modified.
        content_nodes = [d for d in def_nodes if len(d['children']) > 0]
        type_nodes = [d for d in def_nodes if len(d['children']) == 0]

        types = []
        for type_node in type_nodes:
            remove_node(node, type_node)
            types.append(type_node)
            # Not every qualifier carries a 'count'; a missing key must not
            # abort the whole resolution.
            type_node.pop('count', None)
            # FIXME: when 'count' is present and differs from
            # len(content_nodes) we should issue a warning, because the type
            # qualifier cannot apply.

        for content_node in content_nodes:
            # Snapshot the children before detaching them: removing entries
            # from the list being iterated would skip some of them.
            children = list(content_node['children'])
            for child in children:
                remove_node(content_node, child)
            remove_node(node, content_node)

            sorted_types = sorted(
                types + [content_node],
                key=lambda x: duralex.tree.TYPE_DEFINITION.index(x['type'])
            )
            # Build the chain edit -> type -> ... -> innermost type.
            type_node = node
            for sorted_type in sorted_types:
                t = copy_node(sorted_type)
                push_node(type_node, t)
                type_node = t
            # The original content ends up under the innermost definition.
            for child in children:
                push_node(type_node, child)
711-2', 60 | 'type': u'article-reference' 61 | } 62 | ], 63 | } 64 | ] 65 | } 66 | ]} 67 | ) 68 | -------------------------------------------------------------------------------- /tests/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import unittest 5 | 6 | from ParseHeader1Test import ParseHeader1Test 7 | from ParseHeader2Test import ParseHeader2Test 8 | from ParseHeader3Test import ParseHeader3Test 9 | from ParseRawContentTest import ParseRawContentTest 10 | from ParseArticleReferenceTest import ParseArticleReferenceTest 11 | from ParseEditTest import ParseEditTest 12 | from ParseWordReferenceTest import ParseWordReferenceTest 13 | from ParseAlineaReferenceTest import ParseAlineaReferenceTest 14 | from ParseAlineaDefinitionTest import ParseAlineaDefinitionTest 15 | from ParseHeader2ReferenceTest import ParseHeader2ReferenceTest 16 | from ParseHeader2DefinitionTest import ParseHeader2DefinitionTest 17 | from ParseCodeReferenceTest import ParseCodeReferenceTest 18 | from ParseLawReferenceTest import ParseLawReferenceTest 19 | from ParseMultiplicativeAdverbTest import ParseMultiplicativeAdverbTest 20 | from ParseSentenceDefinitionTest import ParseSentenceDefinitionTest 21 | from ParseSentenceReferenceTest import ParseSentenceReferenceTest 22 | from ParseWordDefinitionTest import ParseWordDefinitionTest 23 | from ParseArticleDefinitionTest import ParseArticleDefinitionTest 24 | from ParseMentionDefinitionTest import ParseMentionDefinitionTest 25 | from ParseHeader1DefinitionTest import ParseHeader1DefinitionTest 26 | from ParseDefinitionListTest import ParseDefinitionListTest 27 | from ParseHeader3DefinitionTest import ParseHeader3DefinitionTest 28 | from ParseHeader3ReferenceTest import ParseHeader3ReferenceTest 29 | from ParseSectionReferenceTest import ParseSectionReferenceTest 30 | from ParseSubSectionReferenceTest import ParseSubSectionReferenceTest 31 | from 
class ParseAlineaDefinitionTest(DuralexTestCase):
    """Checks the AST produced by ``parser.parse_alinea_definition``."""

    def test_one_alinea_with_quote(self):
        """One announced alinea followed by one quoted line gives a single
        'alinea-definition' node holding that quote."""
        self.assertEqualAST(
            self.call_parse_func(
                parser.parse_alinea_definition,
                ("un alinéa ainsi rédigé : \n"
                 "\"alinéa 1\"")
            ),
            {'children': [
                {
                    'type': u'alinea-definition',
                    'children': [
                        {
                            'type': u'quote',
                            'words': u'alinéa 1'
                        }
                    ],
                }
            ]}
        )

    def test_n_alineas_with_n_quotes(self):
        """Four announced alineas followed by four quoted lines give four
        sibling 'alinea-definition' nodes, one quote each, in input order."""
        self.assertEqualAST(
            self.call_parse_func(
                parser.parse_alinea_definition,
                ("quatre alinéas ainsi rédigés : \n"
                 "\"alinéa 1\"\n"
                 "\"alinéa 2\"\n"
                 "\"alinéa 3\"\n"
                 "\"alinéa 4\"")
            ),
            {'children': [
                {
                    'type': u'alinea-definition',
                    'children': [
                        {
                            'type': u'quote',
                            'words': u'alinéa 1'
                        }
                    ],
                },
                {
                    'type': u'alinea-definition',
                    'children': [
                        {
                            'type': u'quote',
                            'words': u'alinéa 2'
                        }
                    ],
                },
                {
                    'type': u'alinea-definition',
                    'children': [
                        {
                            'type': u'quote',
                            'words': u'alinéa 3'
                        }
                    ],
                },
                {
                    'type': u'alinea-definition',
                    'children': [
                        {
                            'type': u'quote',
                            'words': u'alinéa 4'
                        }
                    ],
                }
            ]}
        )
66 | { 67 | 'type': u'header3-definition', 68 | 'order': 5, 69 | 'children': [ 70 | { 71 | 'type': u'quote', 72 | 'words': u'ceci est le contenu du header3 5' 73 | } 74 | ], 75 | } 76 | ]} 77 | ) 78 | -------------------------------------------------------------------------------- /tests/ParseCodeReferenceTest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from DuralexTestCase import DuralexTestCase 4 | 5 | import duralex.alinea_parser as parser 6 | 7 | class ParseCodeReferenceTest(DuralexTestCase): 8 | def test_code_with_name(self): 9 | self.assertEqualAST( 10 | self.call_parse_func( 11 | parser.parse_code_reference, 12 | "le code de l\'éducation" 13 | ), 14 | {'children':[ 15 | { 16 | 'id': u'code de l\'éducation', 17 | 'type': u'code-reference' 18 | } 19 | ]} 20 | ) 21 | 22 | def test_code_with_name_2(self): 23 | self.assertEqualAST( 24 | self.call_parse_func( 25 | parser.parse_code_reference, 26 | "du code de l\'éducation" 27 | ), 28 | {'children':[ 29 | { 30 | 'id': u'code de l\'éducation', 31 | 'type': u'code-reference' 32 | } 33 | ]} 34 | ) 35 | 36 | def test_the_same_code(self): 37 | self.assertEqualAST( 38 | self.call_parse_func( 39 | parser.parse_code_reference, 40 | "le même code", 41 | {'children':[ 42 | { 43 | 'id': u'code de l\'éducation', 44 | 'type': u'code-reference' 45 | } 46 | ]} 47 | ), 48 | {'children':[ 49 | { 50 | 'id': u'code de l\'éducation', 51 | 'type': u'code-reference' 52 | }, 53 | { 54 | 'id': u'code de l\'éducation', 55 | 'type': u'code-reference' 56 | } 57 | ]} 58 | ) 59 | 60 | def test_the_same_code_2(self): 61 | self.assertEqualAST( 62 | self.call_parse_func( 63 | parser.parse_code_reference, 64 | "du même code", 65 | {'children':[ 66 | { 67 | 'id': u'code de l\'éducation', 68 | 'type': u'code-reference' 69 | } 70 | ]} 71 | ), 72 | {'children':[ 73 | { 74 | 'id': u'code de l\'éducation', 75 | 'type': u'code-reference' 76 | }, 77 | { 78 | 'id': u'code de 
# Maps the French status label of an amendment (lower-cased) to the status
# stored on the 'amendment' node.
AMENDMENT_STATUS = {
    u'rejeté': 'rejected',
    u'retiré': 'removed',
    u'non soutenu': 'undefended',
    u'retiré avant séance': 'removed',
    u'adopté': 'approved'
}


def parse(data, tree):
    """Parse every amendment listed in *data* under *tree*.

    *data* must provide an ``'amendements'`` list whose items each hold the
    raw amendment under the ``'amendement'`` key.  Returns *tree*.
    """
    for amendement in data['amendements']:
        parse_amendment(amendement['amendement'], tree)
    return tree


def parse_amendment(data, parent):
    """Create an 'amendment' node under *parent* from the raw *data* dict.

    Reads the ``sujet``, ``texte``, ``numero``, ``sort``, ``expose``,
    ``signataires`` and ``source`` keys.  Raises ``KeyError`` when one of them
    is missing or when the status is not listed in ``AMENDMENT_STATUS``.
    Returns the new node.
    """
    subject = data['sujet']
    text = clean_html(data['texte'])

    tokens = lexer.tokenize(subject + '\n' + text)
    node = create_node(parent, {
        'type': 'amendment',
        'id': data['numero'],
        'content': text,
        'status': AMENDMENT_STATUS[data['sort'].lower()],
        'description': clean_html(data['expose']),
        'signatories': [{'name': s.strip()} for s in data['signataires'].split(', ')],
        'url': data['source']
    })

    # The "subject" declares the target bill article reference for this amendment.
    # That reference will be referenced later on using syntaxes such as "cet article" ("this article").
    parse_subject(tokens, 0, node)
    parse_alineas(node['content'], node)
    # If the amendment content actually needs that bill article reference, it has already been copied by now.
    # So we simply remove it.
    remove_node(node, node['children'][0])

    return node


def parse_subject(tokens, i, parent):
    """Parse the amendment subject (e.g. ``ART. PREMIER``, ``APRÈS ART. 3``).

    A bill-article-reference node is always created under *parent*; its
    ``position`` and ``order`` are filled in when the tokens provide them.
    Returns the index of the token where parsing stopped.
    """
    node = create_node(parent, {
        'type': TYPE_BILL_ARTICLE_REFERENCE
    })

    i = parse_ref_position(tokens, i, node)

    # The article number is read three tokens after 'ART'; make sure that
    # token exists before looking at it.
    if i + 3 < len(tokens) and tokens[i] == 'ART':
        order_token = tokens[i + 3]
        # ART. PREMIER
        if is_number_word(order_token):
            node['order'] = word_to_number(order_token)
            # skip_to_end_of_line() returns an absolute index: assign it, do
            # not add it to the current one.
            i = lexer.skip_to_end_of_line(tokens, i)
        # ART. {order}
        elif is_number(order_token):
            node['order'] = parse_int(order_token)
            i = lexer.skip_to_end_of_line(tokens, i)

    return i


def parse_ref_position(tokens, i, node):
    """Consume an optional ``AVANT``/``APRÈS`` prefix at index *i*.

    Sets ``node['position']`` to ``'before'`` or ``'after'`` and skips the
    keyword plus the token following it.  Returns the index of the next token
    to read, which is *i* itself when there is no prefix or no token left.
    """
    if i >= len(tokens):
        # Always hand back a usable index: callers index tokens with it.
        return i

    if tokens[i] == u'AVANT':
        node['position'] = u'before'
        i += 2
    elif tokens[i] == u'APRÈS':
        node['position'] = u'after'
        i += 2

    return i
def parse_hunk(hunk, parent, ref):
    """Turn one diff hunk into 'edit' nodes attached to *parent*.

    Consecutive lines sharing the same ``line_type`` are grouped: each group
    gets its own edit node holding a copy of *ref* and a word-definition node
    with one quote per line.  Only groups of added ('+') or removed ('-')
    lines receive an ``editType`` and are pushed to *parent*; other groups
    are built and then discarded.
    """
    ast = duralex.tree

    def attach(candidate):
        # Only edits that were given an editType belong in the result.
        if candidate and "editType" in candidate:
            ast.push_node(parent, candidate)

    previous_type = ''
    edit = None
    word_def = None

    for line in hunk:
        if line.line_type != previous_type:
            # A new run of lines starts: flush the previous edit, open a new one.
            attach(edit)
            edit = ast.create_node(None, {
                'type': ast.TYPE_EDIT,
            })
            ast.push_node(edit, ast.copy_node(ref))
            word_def = ast.create_node(edit, {
                'type': ast.TYPE_WORD_DEFINITION,
            })
            if line.line_type == '+':
                edit['editType'] = 'add'
            elif line.line_type == '-':
                edit['editType'] = 'delete'
            previous_type = line.line_type

        ast.create_node(word_def, {
            'type': ast.TYPE_QUOTE,
            'words': line.value,
        })

    # Flush the last run.
    attach(edit)
class DuralexTestCase(unittest.TestCase):
    """Base test case providing AST construction and comparison helpers."""

    def pretty_diff_output(self, lines):
        """Return the diff *lines* as one string, removed lines in red and
        added lines in green, each followed by a colour reset and a newline."""
        palette = {'-': Fore.RED, '+': Fore.GREEN}
        pieces = ['\n']
        for line in lines:
            pieces.append(palette.get(line[0], Fore.RESET) + line + Fore.RESET + '\n')
        return ''.join(pieces)

    def call_parse_func(self, fn, data, tree=None):
        """Tokenize *data*, run the parse function *fn* on it from index 0 and
        return the tree it filled (a fresh root node unless *tree* is given)."""
        root = tree if tree else duralex.tree.create_node(None, {})
        fn(lexer.tokenize(data), 0, root)
        return root

    def add_parent(self, tree):
        """Run ``AddParentVisitor`` over *tree* and return it."""
        AddParentVisitor().visit(tree)
        return tree

    def add_children(self, tree):
        """Give every node of *tree* a 'children' list and return *tree*."""
        if 'children' not in tree:
            tree['children'] = []
        for sub in tree['children']:
            self.add_children(sub)
        return tree

    def add_uuid(self, tree):
        """Give every node of *tree* lacking one a random 'uuid' and return
        *tree*.  Every node must already have a 'children' list."""
        if 'uuid' not in tree:
            tree['uuid'] = str(uuid.uuid4())
        for sub in tree['children']:
            self.add_uuid(sub)
        return tree

    def make_tree(self, tree):
        """Complete a hand-written tree: parents, then children lists, then uuids."""
        return self.add_uuid(self.add_children(self.add_parent(tree)))

    def call_visitor(self, visitor, tree):
        """Complete *tree*, run a new instance of the *visitor* class over it
        and return the resulting tree."""
        prepared = self.make_tree(tree)
        visitor().visit(prepared)
        return prepared

    def assertEqualAST(self, a, b):
        """Fail unless the trees *a* (computed) and *b* (expected) are equal
        once parents, empty children lists and uuids are stripped from both.
        Both trees are modified in place."""
        cleaners = (DeleteParentVisitor, DeleteEmptyChildrenVisitor, DeleteUUIDVisitor)
        for ast in (a, b):
            for cleaner in cleaners:
                cleaner().visit(ast)

        a = json.dumps(a, sort_keys=True, indent=2, ensure_ascii=False)
        b = json.dumps(b, sort_keys=True, indent=2, ensure_ascii=False)

        diff_lines = list(difflib.unified_diff(
            a.splitlines(),
            b.splitlines(),
            fromfile='computed',
            tofile='expected'
        ))
        self.assertEqual(len(diff_lines), 0, '\n' + a + self.pretty_diff_output(diff_lines))
22 | if (not duralex.tree.is_reference(node) and len(node['children']) >= 1 and node['children'][0]['type'] == 'edit' 23 | and node['children'][0]['editType'] == 'edit' 24 | and len(filter_nodes(node, lambda n: 'type' in n and n['type'] == 'edit')) > 1): 25 | context = node['children'][0]['children'][0] 26 | remove_node(node, node['children'][0]) 27 | self.ctx.append([copy_node(ctx_node, False) for ctx_node in filter_nodes(context, lambda x: duralex.tree.is_reference(x))]) 28 | for child in node['children']: 29 | self.visit_node(child) 30 | self.ctx.pop() 31 | return True 32 | # If we have a context and there is no ref type at all and we're not on a 'swap' edit 33 | elif len(self.ctx) > 0 and node['type'] == 'edit' and len(filter_nodes(node, lambda x : duralex.tree.is_reference(x))) == 0: 34 | n = [copy_node(item) for sublist in self.ctx for item in sublist] 35 | n = sorted(n, key=lambda x : duralex.tree.TYPE_REFERENCE.index(x['type'])) 36 | unshift_node(node, n[0]) 37 | for i in range(1, len(n)): 38 | unshift_node(n[i - 1], n[i]) 39 | return True 40 | # If we have a context and we're on root ref type 41 | elif len(self.ctx) > 0 and duralex.tree.is_reference(node) and not duralex.tree.is_reference(node['parent']): 42 | n = [copy_node(item) for sublist in self.ctx for item in sublist] 43 | n = sorted(n, key=lambda x : duralex.tree.TYPE_REFERENCE.index(x['type'])) 44 | unshift_node(node['parent'], n[0]) 45 | for i in range(1, len(n)): 46 | unshift_node(n[i - 1], n[i]) 47 | remove_node(node['parent'], node) 48 | if node['type'] == 'incomplete-reference': 49 | if 'position' in node: 50 | n[len(n) - 1]['position'] = node['position'] 51 | else: 52 | unshift_node(n[len(n) - 1], node) 53 | return True 54 | 55 | return False 56 | -------------------------------------------------------------------------------- /duralex/AbstractVisitor.py: -------------------------------------------------------------------------------- 1 | import duralex.tree as tree 2 | 3 | class 
AbstractVisitor(object): 4 | def __init__(self): 5 | self.visitors = { 6 | tree.TYPE_EDIT: self.visit_edit_node, 7 | tree.TYPE_CODE_REFERENCE: self.visit_code_reference_node, 8 | tree.TYPE_BOOK_REFERENCE: self.visit_book_reference_node, 9 | tree.TYPE_LAW_REFERENCE: self.visit_law_reference_node, 10 | tree.TYPE_TITLE_REFERENCE: self.visit_title_reference_node, 11 | tree.TYPE_ARTICLE_REFERENCE: self.visit_article_reference_node, 12 | tree.TYPE_HEADER1_REFERENCE: self.visit_header1_reference_node, 13 | tree.TYPE_HEADER2_REFERENCE: self.visit_header2_reference_node, 14 | tree.TYPE_HEADER3_REFERENCE: self.visit_header3_reference_node, 15 | tree.TYPE_ALINEA_REFERENCE: self.visit_alinea_reference_node, 16 | tree.TYPE_SENTENCE_REFERENCE: self.visit_sentence_reference_node, 17 | tree.TYPE_WORD_REFERENCE: self.visit_words_reference_node, 18 | tree.TYPE_WORD_DEFINITION: self.visit_words_definition_node, 19 | tree.TYPE_ARTICLE_DEFINITION: self.visit_article_definition_node, 20 | tree.TYPE_QUOTE: self.visit_quote_node, 21 | tree.TYPE_BILL_ARTICLE_REFERENCE: self.visit_bill_article_reference_node, 22 | tree.TYPE_BILL_ARTICLE: self.visit_bill_article_node, 23 | } 24 | 25 | def visit_code_reference_node(self, node, post): 26 | pass 27 | 28 | def visit_book_reference_node(self, node, post): 29 | pass 30 | 31 | def visit_law_reference_node(self, node, post): 32 | pass 33 | 34 | def visit_title_reference_node(self, node, post): 35 | pass 36 | 37 | def visit_article_reference_node(self, node, post): 38 | pass 39 | 40 | def visit_header1_reference_node(self, node, post): 41 | pass 42 | 43 | def visit_header2_reference_node(self, node, post): 44 | pass 45 | 46 | def visit_header3_reference_node(self, node, post): 47 | pass 48 | 49 | def visit_alinea_reference_node(self, node, post): 50 | pass 51 | 52 | def visit_sentence_reference_node(self, node, post): 53 | pass 54 | 55 | def visit_words_reference_node(self, node, post): 56 | pass 57 | 58 | def visit_edit_node(self, node, post): 59 | 
pass 60 | 61 | def visit_words_definition_node(self, node, post): 62 | pass 63 | 64 | def visit_article_definition_node(self, node, post): 65 | pass 66 | 67 | def visit_quote_node(self, node, post): 68 | pass 69 | 70 | def visit_bill_article_reference_node(self, node, post): 71 | pass 72 | 73 | def visit_bill_article_node(self, node, post): 74 | pass 75 | 76 | def visit_node(self, node): 77 | if 'type' in node and node['type'] in self.visitors: 78 | self.visitors[node['type']](node, False) 79 | 80 | if 'children' in node: 81 | for child in node['children']: 82 | self.visit_node(child) 83 | 84 | if 'type' in node and node['type'] in self.visitors: 85 | self.visitors[node['type']](node, True) 86 | 87 | def visit(self, node): 88 | self.visit_node(node) 89 | -------------------------------------------------------------------------------- /tests/ParseHeader1DefinitionTest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from DuralexTestCase import DuralexTestCase 4 | 5 | import duralex.alinea_parser as parser 6 | 7 | class ParseHeader1DefinitionTest(DuralexTestCase): 8 | def test_header1(self): 9 | self.assertEqualAST( 10 | self.call_parse_func( 11 | parser.parse_header1_definition, 12 | ("un I") 13 | ), 14 | {'children': [ 15 | { 16 | 'type': u'header1-definition', 17 | 'order': 1 18 | } 19 | ]} 20 | ) 21 | 22 | def test_header1_2(self): 23 | self.assertEqualAST( 24 | self.call_parse_func( 25 | parser.parse_header1_definition, 26 | ("un IV") 27 | ), 28 | {'children': [ 29 | { 30 | 'type': u'header1-definition', 31 | 'order': 4 32 | } 33 | ]} 34 | ) 35 | 36 | def test_header1_with_quote(self): 37 | self.assertEqualAST( 38 | self.call_parse_func( 39 | parser.parse_header1_definition, 40 | ("un III ainsi rédigé :\n" 41 | "\"ceci est le contenu du header1\"") 42 | ), 43 | {'children': [ 44 | { 45 | 'type': u'header1-definition', 46 | 'order': 3, 47 | 'children': [ 48 | { 49 | 'type': u'quote', 50 | 
'words': u'ceci est le contenu du header1' 51 | } 52 | ], 53 | } 54 | ]} 55 | ) 56 | 57 | def test_scope_with_quotes(self): 58 | self.assertEqualAST( 59 | self.call_parse_func( 60 | parser.parse_header1_definition, 61 | (u"des III à V ainsi rédigés :\n" 62 | u"\"ceci est le contenu du header1 3\"\n" 63 | u"\"ceci est le contenu du header1 4\"\n" 64 | u"\"ceci est le contenu du header1 5\"") 65 | ), 66 | {'children': [ 67 | { 68 | 'type': u'header1-definition', 69 | 'order': 3, 70 | 'children': [ 71 | { 72 | 'type': u'quote', 73 | 'words': u'ceci est le contenu du header1 3' 74 | } 75 | ], 76 | }, 77 | { 78 | 'type': u'header1-definition', 79 | 'order': 4, 80 | 'children': [ 81 | { 82 | 'type': u'quote', 83 | 'words': u'ceci est le contenu du header1 4' 84 | } 85 | ], 86 | }, 87 | { 88 | 'type': u'header1-definition', 89 | 'order': 5, 90 | 'children': [ 91 | { 92 | 'type': u'quote', 93 | 'words': u'ceci est le contenu du header1 5' 94 | } 95 | ], 96 | } 97 | ]} 98 | ) 99 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DuraLex 2 | 3 | [![Build Status](https://img.shields.io/travis/Legilibre/DuraLex.svg)](https://travis-ci.org/Legilibre/DuraLex) 4 | [![Gitter](https://img.shields.io/gitter/room/nwjs/nw.js.svg)](https://gitter.im/Legilibre/DuraLex) 5 | 6 | DuraLex is a French bill compiler. It takes an official bill document written in plain natural French and transforms 7 | it into an automatable semantic data structure. This data structure describes the content of the bill, including but 8 | not limited to: 9 | 10 | * the id and type of the bill 11 | * articles and sections/headers 12 | * each edit with the corresponding operators (add, remove, replace...) and operands (words, articles...) 13 | * references to existing laws, codes, articles, headers... 14 | * definition of new articles, headers... 
15 | 16 | DuraLex is the backend for [SedLex](https://github.com/Legilibre/SedLex). 17 | 18 | ## Installation 19 | 20 | Requirements: 21 | 22 | * Python 3+ 23 | * pip 24 | 25 | ```bash 26 | pip install -r requirements.txt 27 | ``` 28 | 29 | ## Usage 30 | 31 | ```bash 32 | usage: duralex [-h] [--file FILE] [--url URL] [--amendments] [--quiet] [--uuid] 33 | 34 | optional arguments: 35 | -h, --help show this help message and exit 36 | --file FILE the path of the bill to process 37 | --url URL the URL of the bill to process 38 | --quiet no stdout output 39 | --uuid add a unique ID on each node 40 | --amendments fetch and include amendments for the specified bill 41 | ``` 42 | 43 | Examples: 44 | 45 | ```bash 46 | ./duralex --file pion1561.html 47 | ``` 48 | ```bash 49 | ./duralex --url http://www.assemblee-nationale.fr/14/propositions/pion1561.asp 50 | ``` 51 | ```bash 52 | curl -s http://www.assemblee-nationale.fr/14/propositions/pion1561.asp | ./duralex 53 | ``` 54 | ```bash 55 | cat pion1561.html | ./duralex 56 | ``` 57 | 58 | ## Intermediary representation 59 | 60 | ### Principle 61 | 62 | DuraLex turns plain text into a standardized intermediary representation: a JSON tree structure. 63 | This standardized intermediary representation can then be used as an input for other (third-party) tools. 64 | 65 | ![article to json](article_to_json.jpg) 66 | 67 | ### Example 68 | 69 | The following bill article: 70 | 71 | ``` 72 | L'article 11 de la loi n° 78-753 du 17 juillet 1978 portant diverses mesures d'amélioration des relations entre l'administration et le public et diverses dispositions d'ordre administratif, social et fiscal est abrogé. 
73 | ``` 74 | 75 | will give the following intermediary representation: 76 | 77 | ```json 78 | { 79 | "children": [ 80 | { 81 | "children": [ 82 | { 83 | "children": [ 84 | { 85 | "children": [ 86 | { 87 | "id": "11", 88 | "type": "article-reference" 89 | } 90 | ], 91 | "lawDate": "1978-7-17", 92 | "id": "78-753", 93 | "type": "law-reference" 94 | } 95 | ], 96 | "editType": "delete", 97 | "type": "edit" 98 | } 99 | ], 100 | "isNew": false, 101 | "order": 1, 102 | "type": "article" 103 | } 104 | ] 105 | } 106 | ``` 107 | 108 | ## Tests 109 | 110 | ```bash 111 | cd tests 112 | python main.py 113 | ``` 114 | 115 | ## Related projects 116 | 117 | * https://github.com/Legilibre/SedLex 118 | * https://github.com/Legilibre/NuitCodeCitoyen 119 | * https://github.com/Legilibre/Archeo-Lex 120 | * https://github.com/regardscitoyens/the-law-factory-parser 121 | -------------------------------------------------------------------------------- /tests/ParseWordDefinitionTest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from DuralexTestCase import DuralexTestCase 4 | 5 | import duralex.alinea_parser as parser 6 | 7 | class ParseWordDefinitionTest(DuralexTestCase): 8 | def test_the_word(self): 9 | self.assertEqualAST( 10 | self.call_parse_func( 11 | parser.parse_word_definition, 12 | ("le mot \"test\"") 13 | ), 14 | {'children':[ 15 | { 16 | 'type': u'word-definition', 17 | 'children': [ 18 | { 19 | 'type': u'quote', 20 | 'words': u'test' 21 | } 22 | ] 23 | } 24 | ]} 25 | ) 26 | 27 | def test_the_words(self): 28 | self.assertEqualAST( 29 | self.call_parse_func( 30 | parser.parse_word_definition, 31 | ("les mots \"ceci est un test\"") 32 | ), 33 | {'children':[ 34 | { 35 | 'type': u'word-definition', 36 | 'children': [ 37 | { 38 | 'type': u'quote', 39 | 'words': u'ceci est un test' 40 | } 41 | ] 42 | } 43 | ]} 44 | ) 45 | 46 | def test_the_number(self): 47 | self.assertEqualAST( 48 | self.call_parse_func( 49 
class ParseSentenceReferenceTest(DuralexTestCase):
    """Tests for ``parser.parse_sentence_reference``.

    Each test feeds a French sentence reference to the parser through
    ``call_parse_func`` and compares the produced tree with the expected
    AST using ``assertEqualAST``.
    """

    def test_position_sentence(self):
        """An ordinal sentence reference yields its 1-based ``order``."""
        self.assertEqualAST(
            self.call_parse_func(
                parser.parse_sentence_reference,
                u"la première phrase"
            ),
            {'children': [
                {
                    'type': u'sentence-reference',
                    'order': 1
                }
            ]}
        )

    def test_position_sentence_2(self):
        """Same reference as above, introduced by the preposition "à"."""
        self.assertEqualAST(
            self.call_parse_func(
                parser.parse_sentence_reference,
                u"à la première phrase"
            ),
            {'children': [
                {
                    'type': u'sentence-reference',
                    'order': 1
                }
            ]}
        )

    def test_position_sentence_3(self):
        """"seconde" is expected to map to order 2."""
        self.assertEqualAST(
            self.call_parse_func(
                parser.parse_sentence_reference,
                u"la seconde phrase"
            ),
            {'children': [
                {
                    'type': u'sentence-reference',
                    'order': 2
                }
            ]}
        )

    # This test used to share its name with the next one
    # (test_position_sentence_article_id_code_name), so the later definition
    # replaced it in the class namespace and it was never executed.
    def test_position_sentence_article_id(self):
        """A sentence reference followed by an article id (no code name)."""
        self.assertEqualAST(
            self.call_parse_func(
                parser.parse_sentence_reference,
                u"la première phrase de l'article L. 114-5"
            ),
            {'children': [
                {
                    'type': u'sentence-reference',
                    'order': 1,
                    'children': [
                        {
                            'type': u'article-reference',
                            'id': u'L. 114-5'
                        }
                    ]
                }
            ]}
        )

    def test_position_sentence_article_id_code_name(self):
        """A sentence reference followed by an article id and a code name."""
        self.assertEqualAST(
            self.call_parse_func(
                parser.parse_sentence_reference,
                u"la première phrase de l'article L. 114-5 du code de la recherche"
            ),
            {'children': [
                {
                    'type': u'sentence-reference',
                    'order': 1,
                    'children': [
                        {
                            'type': u'article-reference',
                            'id': u'L. 114-5',
                            'children': [
                                {
                                    'type': u'code-reference',
                                    'id': u'code de la recherche'
                                }
                            ]
                        }
                    ]
                }
            ]}
        )

    def test_the_end_of_the_nth_sentence(self):
        """"la fin de" is expected to add ``'scope': 'end'`` to the reference."""
        self.assertEqualAST(
            self.call_parse_func(
                parser.parse_sentence_reference,
                u"la fin de la première phrase"
            ),
            {'children': [
                {
                    'type': u'sentence-reference',
                    'scope': 'end',
                    'order': 1,
                }
            ]}
        )

    def test_the_first_two_sentences(self):
        """A counted plural reference is expected as the pair ``[0, 2]``."""
        self.assertEqualAST(
            self.call_parse_func(
                parser.parse_sentence_reference,
                u"les deux premières phrases"
            ),
            {'children': [
                {
                    'type': u'sentence-reference',
                    'order': [0, 2]
                }
            ]}
        )
° ainsi rédigé : \n\"ceci est un test\"" 33 | ), 34 | {'children': [ 35 | { 36 | 'type': u'header2-definition', 37 | 'order': '...', 38 | 'children': [ 39 | { 40 | 'type': u'quote', 41 | 'words': u'ceci est un test' 42 | } 43 | ], 44 | } 45 | ]} 46 | ) 47 | 48 | def test_header2_order_suborder(self): 49 | self.assertEqualAST( 50 | self.call_parse_func( 51 | parser.parse_header2_definition, 52 | "un 1° A bis" 53 | ), 54 | {'children': [ 55 | { 56 | 'type': u'header2-definition', 57 | 'order': 1, 58 | 'isBis': True, 59 | 'subOrder': 'A' 60 | } 61 | ]} 62 | ) 63 | 64 | def test_scope_with_quotes(self): 65 | self.assertEqualAST( 66 | self.call_parse_func( 67 | parser.parse_header2_definition, 68 | (u"des 5° à 8° ainsi rédigés :\n" 69 | u"\"ceci est le contenu du header2 5\"\n" 70 | u"\"ceci est le contenu du header2 6\"\n" 71 | u"\"ceci est le contenu du header2 7\"\n" 72 | u"\"ceci est le contenu du header2 8\"") 73 | ), 74 | {'children': [ 75 | { 76 | 'type': u'header2-definition', 77 | 'order': 5, 78 | 'children': [ 79 | { 80 | 'type': u'quote', 81 | 'words': u'ceci est le contenu du header2 5' 82 | } 83 | ], 84 | }, 85 | { 86 | 'type': u'header2-definition', 87 | 'order': 6, 88 | 'children': [ 89 | { 90 | 'type': u'quote', 91 | 'words': u'ceci est le contenu du header2 6' 92 | } 93 | ], 94 | }, 95 | { 96 | 'type': u'header2-definition', 97 | 'order': 7, 98 | 'children': [ 99 | { 100 | 'type': u'quote', 101 | 'words': u'ceci est le contenu du header2 7' 102 | } 103 | ], 104 | }, 105 | { 106 | 'type': u'header2-definition', 107 | 'order': 8, 108 | 'children': [ 109 | { 110 | 'type': u'quote', 111 | 'words': u'ceci est le contenu du header2 8' 112 | } 113 | ], 114 | }, 115 | ]} 116 | ) 117 | -------------------------------------------------------------------------------- /bin/duralex: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding=utf-8 -*- 3 | 4 | import codecs 5 | import os 6 | import json 
def decode(data, encoding=None):
    """Return ``data`` as text, decoding it only when it is a byte string.

    Args:
        data: the raw content. ``bytes``/``bytearray`` values are decoded;
            anything else (typically a ``str`` read from a file opened in
            text mode) is returned unchanged.
        encoding: optional codec name. When given it is used as is and
            decoding errors propagate to the caller.

    Returns:
        The decoded text, or ``data`` itself when it is not a byte string.
    """
    if not isinstance(data, (bytes, bytearray)):
        # The previous implementation relied on a bare ``except`` swallowing
        # the AttributeError raised by ``str.decode``; make that case explicit.
        return data

    if encoding:
        return data.decode(encoding)

    try:
        return data.decode('utf-8')
    except UnicodeDecodeError:
        # ISO-8859-1 maps every possible byte to a code point, so this
        # fallback cannot fail.
        return data.decode('iso-8859-1')
def main(argv=None):
    """Command-line entry point: read a bill (or diff) and process it.

    Args:
        argv: list of command-line arguments (without the program name).
            ``None`` makes argparse fall back to ``sys.argv[1:]``.

    Returns:
        0 once ``handle_data`` has completed.
    """
    parser = argparse.ArgumentParser(prog='duralex')
    parser.add_argument('--file', help='the path of the bill to process', type=argparse.FileType('r'), default='-')
    parser.add_argument('--url', help='the URL of the bill to process')
    parser.add_argument('--quiet', action='store_true', help='no stdout output')
    parser.add_argument('--uuid', action='store_true', help='add a unique ID on each node')
    parser.add_argument('--amendments', nargs='?', const='-', default=False, help='fetch and parse amendments')
    parser.add_argument('--debug', action='store_true')

    # Honour the ``argv`` parameter; it used to be accepted but ignored.
    args = parser.parse_args(argv)

    if args.url:
        res = requests.get(args.url)
        data = decode(res.content, res.apparent_encoding)
    elif args.file:
        data = decode(args.file.read())
    else:
        # Guard against ``data`` being unbound below; ``--file`` defaults to
        # stdin so this is not expected to trigger in normal use.
        parser.error('either --file or --url is required')

    handle_data(data, args)

    return 0
# -*- coding: utf-8 -*-
"""Helpers to build, copy and query the tree of nodes used by DuraLex.

A node is a plain ``dict``. The helpers below maintain three conventional
keys: ``children`` (list of child nodes), ``parent`` (the owning node, absent
on a root) and ``uuid`` (a random identifier assigned by ``create_node``).
"""

import uuid

TYPE_HEADER1 = u'header1'
TYPE_HEADER2 = u'header2'
TYPE_HEADER3 = u'header3'
TYPE_BILL_ARTICLE = u'bill-article'
TYPE_AMENDMENT = u'amendment'
TYPE_EDIT = u'edit'
TYPE_QUOTE = u'quote'
TYPE_LAW_PROJECT = u'law-project'
TYPE_LAW_PROPOSAL = u'law-proposal'

TYPE_TITLE_DEFINITION = u'title-definition'
TYPE_ARTICLE_DEFINITION = u'article-definition'
TYPE_HEADER1_DEFINITION = u'header1-definition'
TYPE_HEADER2_DEFINITION = u'header2-definition'
TYPE_HEADER3_DEFINITION = u'header3-definition'
TYPE_SUBPARAGRAPH_DEFINITION = u'subparagraph-definition'
TYPE_ALINEA_DEFINITION = u'alinea-definition'
TYPE_SENTENCE_DEFINITION = u'sentence-definition'
TYPE_MENTION_DEFINITION = u'mention-definition'
TYPE_WORD_DEFINITION = u'word-definition'

# Every node type that is_definition() recognises.
TYPE_DEFINITION = [
    TYPE_TITLE_DEFINITION,
    TYPE_ARTICLE_DEFINITION,
    TYPE_HEADER1_DEFINITION,
    TYPE_HEADER2_DEFINITION,
    TYPE_HEADER3_DEFINITION,
    TYPE_SUBPARAGRAPH_DEFINITION,
    TYPE_ALINEA_DEFINITION,
    TYPE_SENTENCE_DEFINITION,
    TYPE_MENTION_DEFINITION,
    TYPE_WORD_DEFINITION,
]

TYPE_BILL_ARTICLE_REFERENCE = u'bill-article-reference'
TYPE_CODE_REFERENCE = u'code-reference'
TYPE_CODE_PART_REFERENCE = u'code-part-reference'
TYPE_BOOK_REFERENCE = u'book-reference'
TYPE_LAW_REFERENCE = u'law-reference'
TYPE_TITLE_REFERENCE = u'title-reference'
TYPE_CHAPTER_REFERENCE = u'chapter-reference'
TYPE_SECTION_REFERENCE = u'section-reference'
TYPE_SUBSECTION_REFERENCE = u'subsection-reference'
TYPE_PARAGRAPH_REFERENCE = u'paragraph-reference'
TYPE_ARTICLE_REFERENCE = u'article-reference'
TYPE_HEADER1_REFERENCE = u'header1-reference'
TYPE_HEADER2_REFERENCE = u'header2-reference'
TYPE_HEADER3_REFERENCE = u'header3-reference'
TYPE_ALINEA_REFERENCE = u'alinea-reference'
TYPE_SENTENCE_REFERENCE = u'sentence-reference'
TYPE_WORD_REFERENCE = u'word-reference'
TYPE_INCOMPLETE_REFERENCE = u'incomplete-reference'

# Every node type that is_reference() recognises.
TYPE_REFERENCE = [
    TYPE_CODE_REFERENCE,
    TYPE_CODE_PART_REFERENCE,
    TYPE_BOOK_REFERENCE,
    TYPE_LAW_REFERENCE,
    TYPE_TITLE_REFERENCE,
    TYPE_CHAPTER_REFERENCE,
    TYPE_SECTION_REFERENCE,
    TYPE_SUBSECTION_REFERENCE,
    TYPE_PARAGRAPH_REFERENCE,
    TYPE_ARTICLE_REFERENCE,
    TYPE_HEADER1_REFERENCE,
    TYPE_HEADER2_REFERENCE,
    TYPE_HEADER3_REFERENCE,
    TYPE_ALINEA_REFERENCE,
    TYPE_SENTENCE_REFERENCE,
    TYPE_WORD_REFERENCE,
    TYPE_INCOMPLETE_REFERENCE,
    TYPE_BILL_ARTICLE_REFERENCE,
]


def unshift_node(parent, node):
    """Insert ``node`` as the first child of ``parent``.

    NOTE(review): unlike ``push_node`` this does not detach ``node`` from a
    previous parent; that behaviour is kept as is because callers may rely
    on it -- confirm before changing.
    """
    node['parent'] = parent
    # Rebind to a new list (as before) rather than mutating in place.
    parent['children'] = [node] + parent.get('children', [])


def push_node(parent, node):
    """Append ``node`` to ``parent``, detaching it from its current parent first."""
    if 'parent' in node:
        remove_node(node['parent'], node)
    node['parent'] = parent
    parent.setdefault('children', []).append(node)


def create_node(parent, node):
    """Initialise ``node`` (children list, fresh uuid) and attach it to ``parent``.

    Args:
        parent: the node to attach to, or ``None`` to create a root.
        node: the dict to initialise; it is modified in place.

    Returns:
        ``node`` itself.
    """
    node.setdefault('children', [])
    node['uuid'] = str(uuid.uuid4())

    # Test against None explicitly: an empty dict is a valid parent but is
    # falsy, so ``if parent:`` silently left the node unattached.
    if parent is not None:
        push_node(parent, node)

    return node


def compare_nodes(a, b):
    """Tell whether ``a`` and ``b`` are the same node.

    Nodes are compared by ``uuid`` when both have one, by dict equality
    otherwise. Identity is checked first to skip the structural comparison.
    """
    if a is b:
        return True
    if 'uuid' in a and 'uuid' in b:
        return a['uuid'] == b['uuid']
    return a == b


def remove_node(parent, node):
    """Remove ``node`` from the children of ``parent``.

    Returns:
        True when the node was found and removed (its ``parent`` key is then
        deleted), False when it is not among the children of ``parent``.

    Raises:
        Exception: if ``parent`` is falsy or is not the parent of ``node``.
    """
    if not parent:
        raise Exception('invalid parent')
    if 'parent' not in node or (node['parent'] is not parent and node['parent'] != parent):
        raise Exception('parent node does not match')

    # A parent without a 'children' key simply has nothing to remove.
    for index, child in enumerate(parent.get('children', [])):
        if compare_nodes(node, child):
            # Safe although we are iterating: we return right after deleting.
            del parent['children'][index]
            del node['parent']
            return True

    return False


def copy_node(node, recursive=True):
    """Return a detached copy of ``node`` with a fresh uuid.

    The copy is shallow with respect to attribute values. Children are copied
    (recursively) only when ``recursive`` is true; otherwise the copy has an
    empty ``children`` list.
    """
    c = node.copy()
    if 'uuid' in c:
        c['uuid'] = str(uuid.uuid4())
    c.pop('parent', None)
    c['children'] = []
    if recursive:
        for child in node.get('children', []):
            push_node(c, copy_node(child))
    return c


def get_node_depth(node):
    """Return the number of ``parent`` links between ``node`` and its root."""
    # Iterative on purpose: the recursive form fails on very deep chains.
    depth = 0
    while 'parent' in node:
        node = node['parent']
        depth += 1
    return depth


def get_root(node):
    """Follow ``parent`` links upwards and return the topmost node."""
    while 'parent' in node:
        node = node['parent']

    return node


def filter_nodes(root, fn):
    """Return, in pre-order, every node of the subtree at ``root`` for which ``fn`` is true."""
    return filter_nodes_rec(root, fn, [])


def filter_nodes_rec(root, fn, results):
    """Recursive worker of ``filter_nodes``; appends matches to ``results`` and returns it."""
    if fn(root):
        results.append(root)

    for child in root.get('children', []):
        filter_nodes_rec(child, fn, results)

    return results


def is_definition(node):
    """Tell whether ``node`` has a type listed in ``TYPE_DEFINITION``."""
    return 'type' in node and node['type'] in TYPE_DEFINITION


def is_reference(node):
    """Tell whether ``node`` has a type listed in ``TYPE_REFERENCE``."""
    return 'type' in node and node['type'] in TYPE_REFERENCE


def is_root(node):
    """Tell whether ``node`` has no ``parent`` key."""
    return 'parent' not in node


def get_node_descendants(node):
    """Return ``node`` followed by all of its descendants, in pre-order."""
    return filter_nodes(node, lambda n: True)


def get_node_ancestors(node):
    """Return the ancestors of ``node``, nearest first.

    The walk stops at the first ancestor without a ``type`` key, which is
    therefore not included in the result.
    """
    ancestors = []
    current = node.get('parent')
    while current and 'type' in current:
        ancestors.append(current)
        current = current.get('parent')
    return ancestors
DuralexTestCase import DuralexTestCase 4 | 5 | import duralex.alinea_parser as parser 6 | 7 | class ParseLawReferenceTest(DuralexTestCase): 8 | def test_ordonnance_with_id(self): 9 | self.assertEqualAST( 10 | self.call_parse_func( 11 | parser.parse_law_reference, 12 | u"l'ordonnance n° 2008-1305 du 11 décembre 2008" 13 | ), 14 | {'children':[ 15 | { 16 | 'type': u'law-reference', 17 | 'id': u'2008-1305', 18 | 'lawDate': u'2008-12-11', 19 | 'lawType': u'ordonnance' 20 | } 21 | ]} 22 | ) 23 | 24 | def test_ordonnance_with_id_2(self): 25 | self.assertEqualAST( 26 | self.call_parse_func( 27 | parser.parse_law_reference, 28 | u"de l'ordonnance n° 2008-1305 du 11 décembre 2008" 29 | ), 30 | {'children':[ 31 | { 32 | 'type': u'law-reference', 33 | 'id': u'2008-1305', 34 | 'lawDate': u'2008-12-11', 35 | 'lawType': u'ordonnance' 36 | } 37 | ]} 38 | ) 39 | 40 | def test_law_with_id(self): 41 | self.assertEqualAST( 42 | self.call_parse_func( 43 | parser.parse_law_reference, 44 | u"la loi n° 2007-1199" 45 | ), 46 | {'children':[ 47 | { 48 | 'type': u'law-reference', 49 | 'id': u'2007-1199' 50 | } 51 | ]} 52 | ) 53 | 54 | def test_the_same_law(self): 55 | self.assertEqualAST( 56 | self.call_parse_func( 57 | parser.parse_law_reference, 58 | u"de la même loi", 59 | {'children':[ 60 | { 61 | 'type': u'law-reference', 62 | 'id': u'2007-1199' 63 | } 64 | ]} 65 | ), 66 | {'children':[ 67 | { 68 | 'type': u'law-reference', 69 | 'id': u'2007-1199' 70 | }, 71 | { 72 | 'type': u'law-reference', 73 | 'id': u'2007-1199' 74 | } 75 | ]} 76 | ) 77 | 78 | def test_the_same_law_word_ref(self): 79 | self.assertEqualAST( 80 | self.call_parse_func( 81 | parser.parse_law_reference, 82 | u"de la même loi, les mots \"ceci est un test\"", 83 | {'children':[ 84 | { 85 | 'type': u'law-reference', 86 | 'id': u'2007-1199' 87 | } 88 | ]} 89 | ), 90 | {'children':[ 91 | { 92 | 'type': u'law-reference', 93 | 'id': u'2007-1199' 94 | }, 95 | { 96 | 'type': u'law-reference', 97 | 'id': u'2007-1199', 98 | 
class ParseDefinitionListTest(DuralexTestCase):
    """Tests for ``parser.parse_definition_list``.

    Every case parses a list of definitions followed by quoted contents and
    compares the result with the expected AST through ``assertEqualAST``.
    """

    @staticmethod
    def _expected_definition(kind, words, **attributes):
        """Build the expected ``kind`` node holding a single quote with ``words``."""
        expected = dict(attributes)
        expected['type'] = kind
        expected['children'] = [{'type': u'quote', 'words': words}]
        return expected

    def _assert_definition_list(self, text, expected_children):
        """Parse ``text`` as a definition list and compare it with the expected children."""
        actual = self.call_parse_func(parser.parse_definition_list, text)
        self.assertEqualAST(actual, {'children': expected_children})

    def test_n_sentences_and_n_alineas_with_quotes(self):
        """Five counted sentences, then five alineas each bound to one quote."""
        text = (
            u"cinq phrases et cinq alinéas ainsi rédigés : \n"
            u"\"alinéa 1\"\n"
            u"\"alinéa 2\"\n"
            u"\"alinéa 3\"\n"
            u"\"alinéa 4\"\n"
            u"\"alinéa 5\"\n"
        )
        expected = [{'count': 5, 'type': u'sentence-definition'}]
        expected.extend(
            self._expected_definition(u'alinea-definition', u'alinéa %d' % rank)
            for rank in range(1, 6)
        )
        self._assert_definition_list(text, expected)

    def test_n_header1_with_n_quotes(self):
        """Two header1 definitions (III and IV), each bound to one quote."""
        text = (
            "un III et un IV ainsi rédigés :\n"
            "\"ceci est le contenu du premier header1\"\n"
            "\"ceci est le contenu du second header1\""
        )
        self._assert_definition_list(text, [
            self._expected_definition(
                u'header1-definition',
                u'ceci est le contenu du premier header1',
                order=3
            ),
            self._expected_definition(
                u'header1-definition',
                u'ceci est le contenu du second header1',
                order=4
            ),
        ])

    def test_n_header2_with_n_quotes(self):
        """Two header2 definitions (2° and 3°), each bound to one quote."""
        text = (
            "un 2° et un 3° ainsi rédigés :\n"
            "\"ceci est le contenu du premier header2\"\n"
            "\"ceci est le contenu du second header2\""
        )
        self._assert_definition_list(text, [
            self._expected_definition(
                u'header2-definition',
                u'ceci est le contenu du premier header2',
                order=2
            ),
            self._expected_definition(
                u'header2-definition',
                u'ceci est le contenu du second header2',
                order=3
            ),
        ])

    def test_n_alineas_with_quotes(self):
        """Three alineas, each bound to one quote."""
        text = (
            u"trois alinéas ainsi rédigés : \n"
            u"\"alinéa 1\"\n"
            u"\"alinéa 2\"\n"
            u"\"alinéa 3\""
        )
        self._assert_definition_list(text, [
            self._expected_definition(u'alinea-definition', u'alinéa %d' % rank)
            for rank in range(1, 4)
        ])
'type': u'law-reference', 15 | 'children': [ 16 | { 17 | 'type': u'article-reference', 18 | 'id': u'11', 19 | } 20 | ] 21 | } 22 | ]}), 23 | {'children': [ 24 | { 25 | 'lawDate': u'1978-7-17', 26 | 'id': u'78-753', 27 | 'type': u'law-reference', 28 | 'children': [ 29 | { 30 | 'type': u'article-reference', 31 | 'id': u'11' 32 | } 33 | ] 34 | } 35 | ]} 36 | ) 37 | 38 | def test_article_law(self): 39 | self.assertEqualAST( 40 | self.call_visitor(SortReferencesVisitor, {'children': [ 41 | { 42 | 'type': u'article-reference', 43 | 'id': u'11', 44 | 'children': [ 45 | { 46 | 'lawDate': u'1978-7-17', 47 | 'id': u'78-753', 48 | 'type': u'law-reference', 49 | } 50 | ] 51 | } 52 | ]}), 53 | {'children': [ 54 | { 55 | 'lawDate': u'1978-7-17', 56 | 'id': u'78-753', 57 | 'type': u'law-reference', 58 | 'children': [ 59 | { 60 | 'type': u'article-reference', 61 | 'id': u'11' 62 | } 63 | ] 64 | } 65 | ]} 66 | ) 67 | 68 | def test_paragraph_subsection_section_chapter_title_book(self): 69 | self.assertEqualAST( 70 | self.call_visitor(SortReferencesVisitor, {'children': [ 71 | { 72 | 'children': [ 73 | { 74 | 'children': [ 75 | { 76 | 'children': [ 77 | { 78 | 'children': [ 79 | { 80 | 'children': [ 81 | { 82 | 'order': 1, 83 | 'type': u'book-reference' 84 | } 85 | ], 86 | 'order': 3, 87 | 'type': u'title-reference' 88 | } 89 | ], 90 | 'order': 2, 91 | 'type': u'chapter-reference' 92 | } 93 | ], 94 | 'order': 2, 95 | 'type': u'section-reference' 96 | } 97 | ], 98 | 'order': 2, 99 | 'type': u'subsection-reference' 100 | } 101 | ], 102 | 'order': 3, 103 | 'type': u'paragraph-reference' 104 | } 105 | ]}), 106 | {'children': [ 107 | { 108 | 'children': [ 109 | { 110 | 'children': [ 111 | { 112 | 'children': [ 113 | { 114 | 'children': [ 115 | { 116 | 'children': [ 117 | { 118 | 'order': 3, 119 | 'type': u'paragraph-reference' 120 | } 121 | ], 122 | 'order': 2, 123 | 'type': u'subsection-reference' 124 | } 125 | ], 126 | 'order': 2, 127 | 'type': u'section-reference' 128 | } 129 | ], 130 
class ParseHeader1Test(DuralexTestCase):
    """Tests for ``parser.parse_header1``.

    The expected subtrees shared by several cases are produced by the
    private builders below so that each test only spells out what is
    specific to it.
    """

    @staticmethod
    def _expected_article_edit():
        """Expected node for "L'article L. 123-5 du code de l'éducation est ainsi modifié"."""
        return {
            'editType': u'edit',
            'type': u'edit',
            'children': [
                {
                    'id': u'L. 123-5',
                    'type': u'article-reference',
                    'children': [
                        {
                            'type': u'code-reference',
                            'id': u'code de l\'éducation'
                        }
                    ]
                }
            ]
        }

    @staticmethod
    def _expected_replace_edit():
        """Expected node for the word replacement in the first sentence."""
        origin = {
            'type': u'word-reference',
            'children': [{'type': u'quote', 'words': u'mots d\'origine'}]
        }
        replacement = {
            'type': u'word-definition',
            'children': [{'type': u'quote', 'words': u'mots de remplacement'}]
        }
        return {
            'editType': u'replace',
            'type': u'edit',
            'children': [
                {
                    'type': u'sentence-reference',
                    'order': 1,
                    'children': [origin]
                },
                replacement
            ]
        }

    def _assert_header1(self, text, expected_children):
        """Parse ``text`` with ``parse_header1`` and compare it with the expected children."""
        actual = self.call_parse_func(parser.parse_header1, text)
        self.assertEqualAST(actual, {'children': expected_children})

    def test_header1_raw_content(self):
        """A numbered header1 followed by plain text."""
        self._assert_header1(
            u"I. Ceci est un header1.",
            [
                {
                    'type': u'header1',
                    'order': 1,
                    'children': [
                        {
                            'content': u'Ceci est un header1.',
                            'type': u'raw-content'
                        }
                    ]
                }
            ]
        )

    def test_header1_incomplete_edit_header2_edit(self):
        """An incomplete edit followed by a header2 carrying the actual edit."""
        text = (
            u"L'article L. 123-5 du code de l'éducation est ainsi modifié :\n"
            u"1° À la première phrase, les mots : \"mots d'origine\" sont remplacés par les mots : \"mots de remplacement\"."
        )
        self._assert_header1(text, [
            self._expected_article_edit(),
            {
                'type': u'header2',
                'order': 1,
                'children': [self._expected_replace_edit()]
            }
        ])

    def test_header1_incomplete_edit_header2_incomplete_edit_header3_edit(self):
        """Two nested incomplete edits, the actual edit being in a header3."""
        text = (
            u"L'article L. 123-5 du code de l'éducation est ainsi modifié :\n"
            u"1° L'avant-dernier alinéa est ainsi modifié :\n"
            u"a) À la première phrase, les mots : \"mots d'origine\" sont remplacés par les mots : \"mots de remplacement\"."
        )
        alinea_edit = {
            'editType': u'edit',
            'type': u'edit',
            'children': [
                {
                    'order': -2,
                    'type': u'alinea-reference'
                }
            ]
        }
        header3 = {
            'type': u'header3',
            'order': 1,
            'children': [self._expected_replace_edit()]
        }
        self._assert_header1(text, [
            self._expected_article_edit(),
            {
                'type': u'header2',
                'order': 1,
                'children': [alinea_edit, header3]
            }
        ])
121-3' 32 | } 33 | ]} 34 | ) 35 | 36 | def test_article_id_2(self): 37 | self.assertEqualAST( 38 | self.call_parse_func( 39 | parser.parse_article_reference, 40 | "à l'article L. 121-3" 41 | ), 42 | {'children':[ 43 | { 44 | 'type': u'article-reference', 45 | 'id': u'L. 121-3' 46 | } 47 | ]} 48 | ) 49 | 50 | def test_article_id_law_id(self): 51 | self.assertEqualAST( 52 | self.call_parse_func( 53 | parser.parse_article_reference, 54 | u"l'article 11 de la loi n° 78-753" 55 | ), 56 | {'children':[ 57 | { 58 | 'type': u'article-reference', 59 | 'id': u'11', 60 | 'children': [ 61 | { 62 | 'id': u'78-753', 63 | 'type': u'law-reference', 64 | } 65 | ] 66 | } 67 | ]} 68 | ) 69 | 70 | def test_article_id_law_id_law_date(self): 71 | self.assertEqualAST( 72 | self.call_parse_func( 73 | parser.parse_article_reference, 74 | u"l'article 11 de la loi n° 78-753 du 17 juillet 1978" 75 | ), 76 | {'children':[ 77 | { 78 | 'type': u'article-reference', 79 | 'id': u'11', 80 | 'children': [ 81 | { 82 | 'lawDate': u'1978-7-17', 83 | 'id': u'78-753', 84 | 'type': u'law-reference', 85 | } 86 | ] 87 | } 88 | ]} 89 | ) 90 | 91 | def test_article_id_code_name(self): 92 | self.assertEqualAST( 93 | self.call_parse_func( 94 | parser.parse_article_reference, 95 | u"l'article L. 111-5 du code de l'éducation" 96 | ), 97 | {'children': [ 98 | { 99 | 'id': u'L. 111-5', 100 | 'type': u'article-reference', 101 | 'children': [ 102 | { 103 | 'id': u'code de l\'éducation', 104 | 'type': u'code-reference' 105 | } 106 | ] 107 | } 108 | ]} 109 | ) 110 | 111 | def test_the_same_article(self): 112 | self.assertEqualAST( 113 | self.call_parse_func( 114 | parser.parse_article_reference, 115 | u"le même article", 116 | {'children':[ 117 | { 118 | 'id': u'L. 111-5', 119 | 'type': u'article-reference' 120 | } 121 | ]} 122 | ), 123 | {'children':[ 124 | { 125 | 'id': u'L. 111-5', 126 | 'type': u'article-reference' 127 | }, 128 | { 129 | 'id': u'L. 
111-5', 130 | 'type': u'article-reference' 131 | } 132 | ]} 133 | ) 134 | 135 | def test_the_same_article_2(self): 136 | self.assertEqualAST( 137 | self.call_parse_func( 138 | parser.parse_article_reference, 139 | u"du même article", 140 | {'children':[ 141 | { 142 | 'id': u'L. 111-5', 143 | 'type': u'article-reference' 144 | } 145 | ]} 146 | ), 147 | {'children':[ 148 | { 149 | 'id': u'L. 111-5', 150 | 'type': u'article-reference' 151 | }, 152 | { 153 | 'id': u'L. 111-5', 154 | 'type': u'article-reference' 155 | } 156 | ]} 157 | ) 158 | 159 | def test_article_id_same_code(self): 160 | self.assertEqualAST( 161 | self.call_parse_func( 162 | parser.parse_article_reference, 163 | u"l'article L. 123-2 du même code", 164 | {'children':[ 165 | { 166 | 'id': u'code de l\'éducation', 167 | 'type': u'code-reference' 168 | } 169 | ]} 170 | ), 171 | {'children': [ 172 | { 173 | 'id': u'code de l\'éducation', 174 | 'type': u'code-reference' 175 | }, 176 | { 177 | 'id': u'L. 123-2', 178 | 'type': u'article-reference', 179 | 'children': [ 180 | { 181 | 'id': u'code de l\'éducation', 182 | 'type': u'code-reference' 183 | } 184 | ] 185 | } 186 | ]} 187 | ) 188 | 189 | def test_article_id_list(self): 190 | self.assertEqualAST( 191 | self.call_parse_func( 192 | parser.parse_article_reference, 193 | u"les articles 3, 4 et 5" 194 | ), 195 | {'children':[ 196 | { 197 | 'type': u'article-reference', 198 | 'id': u'3' 199 | }, 200 | { 201 | 'type': u'article-reference', 202 | 'id': u'4' 203 | }, 204 | { 205 | 'type': u'article-reference', 206 | 'id': u'5' 207 | } 208 | ]} 209 | ) 210 | 211 | def test_article_id_list_code_name(self): 212 | self.assertEqualAST( 213 | self.call_parse_func( 214 | parser.parse_article_reference, 215 | u"les articles 3, 4 et 5 du code de l'éducation" 216 | ), 217 | {'children':[ 218 | { 219 | 'type': u'article-reference', 220 | 'id': u'3', 221 | 'children': [ 222 | { 223 | 'type': u'code-reference', 224 | 'id': u'code de l\'éducation' 225 | } 226 | ] 227 | }, 
228 | { 229 | 'type': u'article-reference', 230 | 'id': u'4', 231 | 'children': [ 232 | { 233 | 'type': u'code-reference', 234 | 'id': u'code de l\'éducation' 235 | } 236 | ] 237 | }, 238 | { 239 | 'type': u'article-reference', 240 | 'id': u'5', 241 | 'children': [ 242 | { 243 | 'type': u'code-reference', 244 | 'id': u'code de l\'éducation' 245 | } 246 | ] 247 | } 248 | ]} 249 | ) 250 | 251 | def test_the_end_of_article_number(self): 252 | self.assertEqualAST( 253 | self.call_parse_func( 254 | parser.parse_article_reference, 255 | "la fin de l'article 3" 256 | ), 257 | {'children':[ 258 | { 259 | 'scope': 'end', 260 | 'type': u'article-reference', 261 | 'id': u'3' 262 | } 263 | ]} 264 | ) 265 | -------------------------------------------------------------------------------- /tests/ParseRawContentTest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from DuralexTestCase import DuralexTestCase 4 | 5 | import duralex.alinea_parser as parser 6 | 7 | class ParseRawContentTest(DuralexTestCase): 8 | def test_header1_raw_content_header2_raw_content(self): 9 | self.assertEqualAST( 10 | self.call_parse_func( 11 | lambda tokens, i, parent: parser.parse_for_each(parser.parse_header1, tokens, 0, parent), 12 | (u"I. - Contenu du header1 :\n" 13 | u"1° Contenu du header2.") 14 | ), 15 | {'children':[ 16 | { 17 | 'order': 1, 18 | 'type': u'header1', 19 | 'children': [ 20 | { 21 | 'content': u'Contenu du header1 :', 22 | 'type': u'raw-content' 23 | }, 24 | { 25 | 'type': u'header2', 26 | 'order': 1, 27 | 'children': [ 28 | { 29 | 'type': u'raw-content', 30 | 'content': u'Contenu du header2.' 31 | } 32 | ] 33 | } 34 | ] 35 | } 36 | ]} 37 | ) 38 | 39 | def test_header1_raw_content_header2_raw_content_header3_raw_content(self): 40 | self.assertEqualAST( 41 | self.call_parse_func( 42 | lambda tokens, i, parent: parser.parse_for_each(parser.parse_header1, tokens, 0, parent), 43 | (u"I. 
- Contenu du header1 :\n" 44 | u"1° Contenu du header2 :\n" 45 | u"a) Contenu du header3") 46 | ), 47 | {'children':[ 48 | { 49 | 'order': 1, 50 | 'type': u'header1', 51 | 'children': [ 52 | { 53 | 'content': u'Contenu du header1 :', 54 | 'type': u'raw-content' 55 | }, 56 | { 57 | 'type': u'header2', 58 | 'order': 1, 59 | 'children': [ 60 | { 61 | 'type': u'raw-content', 62 | 'content': u'Contenu du header2 :' 63 | }, 64 | { 65 | 'order': 1, 66 | 'type': u'header3', 67 | 'children': [ 68 | { 69 | 'content': u'Contenu du header3', 70 | 'type': u'raw-content' 71 | } 72 | ] 73 | } 74 | ] 75 | } 76 | ] 77 | } 78 | ]} 79 | ) 80 | 81 | def test_n_header1_raw_content_n_header2_raw_content_n_header3_raw_content(self): 82 | self.assertEqualAST( 83 | self.call_parse_func( 84 | lambda tokens, i, parent: parser.parse_for_each(parser.parse_header1, tokens, 0, parent), 85 | (u"I. - Contenu du grand 1 :\n" 86 | u"1° Contenu du grand 1 petit 1 :\n" 87 | u"a) Contenu du grand 1 petit 1 a\n" 88 | u"b) Contenu du grand 1 petit 1 b\n" 89 | u"2° Contenu du grand 1 petit 2.\n" 90 | u"II. 
- Contenu du grand 2 :\n" 91 | u"1° Contenu du grand 2 petit 1.\n" 92 | u"a) Contenu du grand 2 petit 1 a\n" 93 | u"b) Contenu du grand 2 petit 1 b\n" 94 | u"c) Contenu du grand 2 petit 1 c\n") 95 | ), 96 | {'children':[ 97 | { 98 | 'order': 1, 99 | 'type': u'header1', 100 | 'children': [ 101 | { 102 | 'content': u'Contenu du grand 1 :', 103 | 'type': u'raw-content' 104 | }, 105 | { 106 | 'order': 1, 107 | 'type': u'header2', 108 | 'children': [ 109 | { 110 | 'content': u'Contenu du grand 1 petit 1 :', 111 | 'type': u'raw-content' 112 | }, 113 | { 114 | 'order': 1, 115 | 'type': u'header3', 116 | 'children': [ 117 | { 118 | 'content': u'Contenu du grand 1 petit 1 a', 119 | 'type': u'raw-content' 120 | } 121 | ] 122 | }, 123 | { 124 | 'order': 2, 125 | 'type': u'header3', 126 | 'children': [ 127 | { 128 | 'content': u'Contenu du grand 1 petit 1 b', 129 | 'type': u'raw-content' 130 | } 131 | ] 132 | } 133 | ] 134 | }, 135 | { 136 | 'order': 2, 137 | 'type': u'header2', 138 | 'children': [ 139 | { 140 | 'content': u'Contenu du grand 1 petit 2.', 141 | 'type': u'raw-content' 142 | } 143 | ] 144 | } 145 | ] 146 | }, 147 | { 148 | 'order': 2, 149 | 'type': u'header1', 150 | 'children': [ 151 | { 152 | 'content': u'Contenu du grand 2 :', 153 | 'type': u'raw-content' 154 | }, 155 | { 156 | 'order': 1, 157 | 'type': u'header2', 158 | 'children': [ 159 | { 160 | 'content': u'Contenu du grand 2 petit 1.', 161 | 'type': u'raw-content' 162 | }, 163 | { 164 | 'order': 1, 165 | 'type': u'header3', 166 | 'children': [ 167 | { 168 | 'content': u'Contenu du grand 2 petit 1 a', 169 | 'type': u'raw-content' 170 | } 171 | ] 172 | }, 173 | { 174 | 'order': 2, 175 | 'type': u'header3', 176 | 'children': [ 177 | { 178 | 'content': u'Contenu du grand 2 petit 1 b', 179 | 'type': u'raw-content' 180 | } 181 | ] 182 | }, 183 | { 184 | 'order': 3, 185 | 'type': u'header3', 186 | 'children': [ 187 | { 188 | 'content': u'Contenu du grand 2 petit 1 c', 189 | 'type': u'raw-content' 190 | } 191 | ] 
192 | } 193 | ] 194 | } 195 | ] 196 | } 197 | ]} 198 | ) 199 | -------------------------------------------------------------------------------- /tests/ParseWordReferenceTest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from DuralexTestCase import DuralexTestCase 4 | 5 | import duralex.alinea_parser as parser 6 | 7 | class ParseWordReferenceTest(DuralexTestCase): 8 | def test_single_word(self): 9 | self.assertEqualAST( 10 | self.call_parse_func( 11 | parser.parse_word_reference, 12 | u"le mot : \"test\"" 13 | ), 14 | {'children':[ 15 | { 16 | 'type': u'word-reference', 17 | 'children': [ 18 | { 19 | 'type': u'quote', 20 | 'words': u'test' 21 | } 22 | ] 23 | } 24 | ]} 25 | ) 26 | 27 | def test_words(self): 28 | self.assertEqualAST( 29 | self.call_parse_func( 30 | parser.parse_word_reference, 31 | u"les mots : \"ceci est un test\"" 32 | ), 33 | {'children':[ 34 | { 35 | 'type': u'word-reference', 36 | 'children': [ 37 | { 38 | 'type': u'quote', 39 | 'words': u'ceci est un test' 40 | } 41 | ] 42 | } 43 | ]} 44 | ) 45 | 46 | def test_reference(self): 47 | self.assertEqualAST( 48 | self.call_parse_func( 49 | parser.parse_word_reference, 50 | u"la référence : \"L. 321-5\"" 51 | ), 52 | {'children':[ 53 | { 54 | 'type': u'word-reference', 55 | 'children': [ 56 | { 57 | 'type': u'quote', 58 | 'words': u'L. 
321-5' 59 | } 60 | ] 61 | } 62 | ]} 63 | ) 64 | 65 | def test_references(self): 66 | self.assertEqualAST( 67 | self.call_parse_func( 68 | parser.parse_word_reference, 69 | u"les références : \"ceci est un test\"" 70 | ), 71 | {'children':[ 72 | { 73 | 'type': u'word-reference', 74 | 'children': [ 75 | { 76 | 'type': u'quote', 77 | 'words': u'ceci est un test' 78 | } 79 | ] 80 | } 81 | ]} 82 | ) 83 | 84 | def test_after_words(self): 85 | self.assertEqualAST( 86 | self.call_parse_func( 87 | parser.parse_word_reference, 88 | u"après les mots : \"ceci est un test\"" 89 | ), 90 | {'children':[ 91 | { 92 | 'type': u'word-reference', 93 | 'position': u'after', 94 | 'children': [ 95 | { 96 | 'type': u'quote', 97 | 'words': u'ceci est un test' 98 | } 99 | ] 100 | } 101 | ]} 102 | ) 103 | 104 | def test_after_word(self): 105 | self.assertEqualAST( 106 | self.call_parse_func( 107 | parser.parse_word_reference, 108 | u"Après le mot : \"candidats\"" 109 | ), 110 | {'children':[ 111 | { 112 | 'type': u'word-reference', 113 | 'position': u'after', 114 | 'children': [ 115 | { 116 | 'type': u'quote', 117 | 'words': u'candidats' 118 | } 119 | ] 120 | } 121 | ]} 122 | ) 123 | 124 | def test_words_reference_position_in_article(self): 125 | self.assertEqualAST( 126 | self.call_parse_func( 127 | parser.parse_word_reference, 128 | u"après les mots : \"aux dispositions de l'article L. 123-5\", la fin de l'article L. 112-3 du code de la recherche" 129 | ), 130 | {'children':[ 131 | { 132 | 'type': u'word-reference', 133 | 'position': u'after', 134 | 'children': [ 135 | { 136 | 'type': u'quote', 137 | 'words': u'aux dispositions de l\'article L. 123-5' 138 | }, 139 | { 140 | 'type': u'article-reference', 141 | 'id': u'L. 
112-3', 142 | 'scope': 'end', 143 | 'children': [ 144 | { 145 | 'type': u'code-reference', 146 | 'id': u'code de la recherche' 147 | } 148 | ] 149 | } 150 | ] 151 | } 152 | ]} 153 | ) 154 | 155 | def test_alinea_ref_word_ref(self): 156 | self.assertEqualAST( 157 | self.call_parse_func( 158 | parser.parse_reference, 159 | u"au deuxième alinéa, le mot : \"test\"" 160 | ), 161 | {'children':[ 162 | { 163 | 'type': u'alinea-reference', 164 | 'order': 2, 165 | 'children': [ 166 | { 167 | 'type': u'word-reference', 168 | 'children': [ 169 | { 170 | 'type': u'quote', 171 | 'words': u'test' 172 | } 173 | ] 174 | } 175 | ] 176 | } 177 | ]} 178 | ) 179 | 180 | def test_alinea_ref_article_ref_word_ref(self): 181 | self.assertEqualAST( 182 | self.call_parse_func( 183 | parser.parse_reference, 184 | u"au deuxième alinéa de l'article L. 42, le mot : \"test\"" 185 | ), 186 | {'children':[ 187 | { 188 | 'type': u'alinea-reference', 189 | 'order': 2, 190 | 'children': [ 191 | { 192 | 'type': u'article-reference', 193 | 'id': u'L. 42', 194 | 'children': [ 195 | { 196 | 'type': u'word-reference', 197 | 'children': [ 198 | { 199 | 'type': u'quote', 200 | 'words': u'test' 201 | } 202 | ] 203 | } 204 | ] 205 | } 206 | ] 207 | } 208 | ]} 209 | ) 210 | 211 | def test_alinea_ref_article_ref_law_ref_word_ref(self): 212 | self.assertEqualAST( 213 | self.call_parse_func( 214 | parser.parse_reference, 215 | u"au deuxième alinéa de l'article L. 42 de la loi n° 77-729, le mot : \"test\"" 216 | ), 217 | {'children':[ 218 | { 219 | 'type': u'alinea-reference', 220 | 'order': 2, 221 | 'children': [ 222 | { 223 | 'type': u'article-reference', 224 | 'id': u'L. 
42', 225 | 'children': [ 226 | { 227 | 'type': u'law-reference', 228 | 'id': u'77-729', 229 | 'children': [ 230 | { 231 | 'type': u'word-reference', 232 | 'children': [ 233 | { 234 | 'type': u'quote', 235 | 'words': u'test' 236 | } 237 | ] 238 | } 239 | ] 240 | 241 | } 242 | ] 243 | } 244 | ] 245 | } 246 | ]} 247 | ) 248 | -------------------------------------------------------------------------------- /tests/ForkEditVisitorTest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from DuralexTestCase import DuralexTestCase 4 | 5 | from duralex.ForkEditVisitor import ForkEditVisitor 6 | 7 | class ForkEditVisitorTest(DuralexTestCase): 8 | def test(self): 9 | self.assertEqualAST( 10 | self.call_visitor(ForkEditVisitor, self.make_tree({'children': [ 11 | { 12 | 'type': 'edit', 13 | 'children': [ 14 | { 15 | 'type': u'alinea-reference', 16 | 'order': 3, 17 | 'children': [ 18 | { 19 | 'id': u'2', 20 | 'type': u'article-reference' 21 | } 22 | ], 23 | }, 24 | { 25 | 'order': 3, 26 | 'type': u'alinea-reference', 27 | 'children': [ 28 | { 29 | 'id': u'3', 30 | 'type': u'article-reference' 31 | } 32 | ] 33 | } 34 | ] 35 | } 36 | ]})), 37 | {'children': [ 38 | { 39 | 'type': 'edit', 40 | 'children': [ 41 | { 42 | 'type': u'alinea-reference', 43 | 'order': 3, 44 | 'children': [ 45 | { 46 | 'id': u'2', 47 | 'type': u'article-reference' 48 | } 49 | ], 50 | } 51 | ] 52 | }, 53 | { 54 | 'type': 'edit', 55 | 'children': [ 56 | { 57 | 'order': 3, 58 | 'type': u'alinea-reference', 59 | 'children': [ 60 | { 61 | 'id': u'3', 62 | 'type': u'article-reference' 63 | } 64 | ] 65 | } 66 | ] 67 | } 68 | ]} 69 | ) 70 | 71 | def test_2(self): 72 | self.assertEqualAST( 73 | self.call_visitor(ForkEditVisitor, self.make_tree({'children': [ 74 | { 75 | 'type': 'edit', 76 | 'children': [ 77 | { 78 | 'type': u'alinea-reference', 79 | 'order': 3, 80 | 'children': [ 81 | { 82 | 'id': u'2', 83 | 'type': u'article-reference' 84 | } 85 | 
], 86 | }, 87 | { 88 | 'order': 3, 89 | 'type': u'alinea-reference', 90 | 'children': [ 91 | { 92 | 'id': u'3', 93 | 'type': u'article-reference' 94 | } 95 | ] 96 | }, 97 | { 98 | 'order': 4, 99 | 'type': u'alinea-reference', 100 | 'children': [ 101 | { 102 | 'id': u'3', 103 | 'type': u'article-reference' 104 | } 105 | ] 106 | } 107 | ] 108 | } 109 | ]})), 110 | {'children': [ 111 | { 112 | 'type': 'edit', 113 | 'children': [ 114 | { 115 | 'type': u'alinea-reference', 116 | 'order': 3, 117 | 'children': [ 118 | { 119 | 'id': u'2', 120 | 'type': u'article-reference' 121 | } 122 | ], 123 | } 124 | ] 125 | }, 126 | { 127 | 'type': 'edit', 128 | 'children': [ 129 | { 130 | 'order': 3, 131 | 'type': u'alinea-reference', 132 | 'children': [ 133 | { 134 | 'id': u'3', 135 | 'type': u'article-reference' 136 | } 137 | ] 138 | } 139 | ] 140 | }, 141 | { 142 | 'type': 'edit', 143 | 'children': [ 144 | { 145 | 'order': 4, 146 | 'type': u'alinea-reference', 147 | 'children': [ 148 | { 149 | 'id': u'3', 150 | 'type': u'article-reference' 151 | } 152 | ] 153 | } 154 | ] 155 | } 156 | ]} 157 | ) 158 | 159 | def test(self): 160 | self.assertEqualAST( 161 | self.call_visitor(ForkEditVisitor, self.make_tree({'children': [ 162 | { 163 | 'type': 'edit', 164 | 'children': [ 165 | { 166 | 'type': u'alinea-reference', 167 | 'order': 3, 168 | 'children': [ 169 | { 170 | 'id': u'2', 171 | 'type': u'article-reference' 172 | } 173 | ], 174 | }, 175 | { 176 | 'order': 3, 177 | 'type': u'alinea-reference', 178 | 'children': [ 179 | { 180 | 'id': u'3', 181 | 'type': u'article-reference' 182 | } 183 | ] 184 | }, 185 | { 186 | 'type': u'word-definition', 187 | 'children': [ 188 | { 189 | 'type': u'quote', 190 | 'words': u'ceci est un test' 191 | } 192 | ] 193 | } 194 | ] 195 | } 196 | ]})), 197 | {'children': [ 198 | { 199 | 'type': 'edit', 200 | 'children': [ 201 | { 202 | 'type': u'alinea-reference', 203 | 'order': 3, 204 | 'children': [ 205 | { 206 | 'id': u'2', 207 | 'type': 
u'article-reference' 208 | } 209 | ], 210 | }, 211 | { 212 | 'type': u'word-definition', 213 | 'children': [ 214 | { 215 | 'type': u'quote', 216 | 'words': u'ceci est un test' 217 | } 218 | ] 219 | } 220 | ] 221 | }, 222 | { 223 | 'type': 'edit', 224 | 'children': [ 225 | { 226 | 'order': 3, 227 | 'type': u'alinea-reference', 228 | 'children': [ 229 | { 230 | 'id': u'3', 231 | 'type': u'article-reference' 232 | } 233 | ] 234 | }, 235 | { 236 | 'type': u'word-definition', 237 | 'children': [ 238 | { 239 | 'type': u'quote', 240 | 'words': u'ceci est un test' 241 | } 242 | ] 243 | } 244 | ] 245 | } 246 | ]} 247 | ) 248 | -------------------------------------------------------------------------------- /tests/ParseAlineaReferenceTest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from DuralexTestCase import DuralexTestCase 4 | 5 | import duralex.alinea_parser as parser 6 | 7 | class ParseAlineaReferenceTest(DuralexTestCase): 8 | def test_alinea(self): 9 | self.assertEqualAST( 10 | self.call_parse_func( 11 | parser.parse_alinea_reference, 12 | u"l'alinéa 42" 13 | ), 14 | {'children': [ 15 | { 16 | 'type': u'alinea-reference', 17 | 'order': 42 18 | } 19 | ]} 20 | ) 21 | 22 | def test_alinea_number(self): 23 | self.assertEqualAST( 24 | self.call_parse_func( 25 | parser.parse_alinea_reference, 26 | u"alinéa 3" 27 | ), 28 | {'children': [ 29 | { 30 | 'order': 3, 31 | 'type': u'alinea-reference' 32 | } 33 | ]} 34 | ) 35 | 36 | def test_last_alinea(self): 37 | self.assertEqualAST( 38 | self.call_parse_func( 39 | parser.parse_alinea_reference, 40 | u"du dernier alinéa" 41 | ), 42 | {'children': [ 43 | { 44 | 'order': -1, 45 | 'type': u'alinea-reference' 46 | } 47 | ]} 48 | ) 49 | 50 | def test_last_alinea_2(self): 51 | self.assertEqualAST( 52 | self.call_parse_func( 53 | parser.parse_alinea_reference, 54 | u"au dernier alinéa" 55 | ), 56 | {'children': [ 57 | { 58 | 'order': -1, 59 | 'type': 
u'alinea-reference' 60 | } 61 | ]} 62 | ) 63 | 64 | def test_last_alinea_3(self): 65 | self.assertEqualAST( 66 | self.call_parse_func( 67 | parser.parse_alinea_reference, 68 | u"le dernier alinéa" 69 | ), 70 | {'children': [ 71 | { 72 | 'order': -1, 73 | 'type': u'alinea-reference' 74 | } 75 | ]} 76 | ) 77 | 78 | def test_before_the_last_alinea(self): 79 | self.assertEqualAST( 80 | self.call_parse_func( 81 | parser.parse_alinea_reference, 82 | u"avant le dernier alinéa" 83 | ), 84 | {'children': [ 85 | { 86 | 'type': u'alinea-reference', 87 | 'order': -1, 88 | 'position': u'before' 89 | } 90 | ]} 91 | ) 92 | 93 | def test_before_last_alinea(self): 94 | self.assertEqualAST( 95 | self.call_parse_func( 96 | parser.parse_alinea_reference, 97 | u"à l'avant dernier alinéa" 98 | ), 99 | {'children': [ 100 | { 101 | 'order': -2, 102 | 'type': u'alinea-reference' 103 | } 104 | ]} 105 | ) 106 | 107 | def test_before_last_alinea_2(self): 108 | self.assertEqualAST( 109 | self.call_parse_func( 110 | parser.parse_alinea_reference, 111 | u"l'avant-dernier alinéa" 112 | ), 113 | {'children': [ 114 | { 115 | 'order': -2, 116 | 'type': u'alinea-reference' 117 | } 118 | ]} 119 | ) 120 | 121 | def test_before_last_alinea_3(self): 122 | self.assertEqualAST( 123 | self.call_parse_func( 124 | parser.parse_alinea_reference, 125 | u"à l'avant-dernier alinéa" 126 | ), 127 | {'children': [ 128 | { 129 | 'order': -2, 130 | 'type': u'alinea-reference' 131 | } 132 | ]} 133 | ) 134 | 135 | def test_number_word_alinea(self): 136 | self.assertEqualAST( 137 | self.call_parse_func( 138 | parser.parse_alinea_reference, 139 | u"au premier alinéa" 140 | ), 141 | {'children': [ 142 | { 143 | 'order': 1, 144 | 'type': u'alinea-reference' 145 | } 146 | ]} 147 | ) 148 | 149 | def test_number_word_alinea_2(self): 150 | self.assertEqualAST( 151 | self.call_parse_func( 152 | parser.parse_alinea_reference, 153 | u"le second alinéa" 154 | ), 155 | {'children': [ 156 | { 157 | 'order': 2, 158 | 'type': 
u'alinea-reference' 159 | } 160 | ]} 161 | ) 162 | 163 | def test_number_word_alinea_3(self): 164 | self.assertEqualAST( 165 | self.call_parse_func( 166 | parser.parse_alinea_reference, 167 | u"du troisième alinéa" 168 | ), 169 | {'children': [ 170 | { 171 | 'order': 3, 172 | 'type': u'alinea-reference' 173 | } 174 | ]} 175 | ) 176 | 177 | def test_number_word_alinea_article_id(self): 178 | self.assertEqualAST( 179 | self.call_parse_func( 180 | parser.parse_alinea_reference, 181 | u"le deuxième alinéa de l'article L. 121-3" 182 | ), 183 | {'children': [ 184 | { 185 | 'order': 2, 186 | 'type': u'alinea-reference', 187 | 'children': [ 188 | { 189 | 'id': u'L. 121-3', 190 | 'type': u'article-reference' 191 | } 192 | ] 193 | } 194 | ]} 195 | ) 196 | 197 | def test_number_word_alinea_header1_article_id(self): 198 | self.assertEqualAST( 199 | self.call_parse_func( 200 | parser.parse_alinea_reference, 201 | u"le premier alinéa du II de l'article L. 121-3" 202 | ), 203 | {'children': [ 204 | { 205 | 'order': 1, 206 | 'type': u'alinea-reference', 207 | 'children': [ 208 | { 209 | 'order': 2, 210 | 'type': u'header1-reference', 211 | 'children': [ 212 | { 213 | 'id': u'L. 121-3', 214 | 'type': u'article-reference' 215 | } 216 | ] 217 | } 218 | ] 219 | } 220 | ]} 221 | ) 222 | 223 | def test_number_word_alinea_header1_article_id_code(self): 224 | self.assertEqualAST( 225 | self.call_parse_func( 226 | parser.parse_alinea_reference, 227 | u"le premier alinéa du II de l'article L. 121-3 du code de l'éducation" 228 | ), 229 | {'children': [ 230 | { 231 | 'order': 1, 232 | 'type': u'alinea-reference', 233 | 'children': [ 234 | { 235 | 'order': 2, 236 | 'type': u'header1-reference', 237 | 'children': [ 238 | { 239 | 'id': u'L. 
121-3', 240 | 'type': u'article-reference', 241 | 'children': [ 242 | { 243 | 'id': u'code de l\'éducation', 244 | 'type': u'code-reference' 245 | } 246 | ] 247 | } 248 | ] 249 | } 250 | ] 251 | } 252 | ]} 253 | ) 254 | 255 | def test_the_same_alinea(self): 256 | self.assertEqualAST( 257 | self.call_parse_func( 258 | parser.parse_alinea_reference, 259 | u"le même alinéa", 260 | {'children':[ 261 | { 262 | 'type': u'alinea-reference', 263 | 'order': 42 264 | } 265 | ]} 266 | ), 267 | {'children':[ 268 | { 269 | 'type': u'alinea-reference', 270 | 'order': 42 271 | }, 272 | { 273 | 'type': u'alinea-reference', 274 | 'order': 42 275 | } 276 | ]} 277 | ) 278 | 279 | def test_before_the_last_alinea_article_ref(self): 280 | self.assertEqualAST( 281 | self.call_parse_func( 282 | parser.parse_alinea_reference, 283 | u"avant le dernier alinéa" 284 | ), 285 | {'children':[ 286 | { 287 | 'type': u'alinea-reference', 288 | 'position': u'before', 289 | 'order': -1 290 | } 291 | ]} 292 | ) 293 | 294 | def test_alinea_id_list(self): 295 | self.assertEqualAST( 296 | self.call_parse_func( 297 | parser.parse_alinea_reference, 298 | u"les alinéas 3, 4 et 5" 299 | ), 300 | {'children':[ 301 | { 302 | 'type': u'alinea-reference', 303 | 'order': 3 304 | }, 305 | { 306 | 'type': u'alinea-reference', 307 | 'order': 4 308 | }, 309 | { 310 | 'type': u'alinea-reference', 311 | 'order': 5 312 | } 313 | ]} 314 | ) 315 | 316 | def test_alinea_id_of_article_id_list(self): 317 | self.assertEqualAST( 318 | self.call_parse_func( 319 | parser.parse_alinea_reference, 320 | u"l'alinéa 3 des articles 2 et 3" 321 | ), 322 | {'children':[ 323 | { 324 | 'type': u'alinea-reference', 325 | 'order': 3, 326 | 'children': [ 327 | { 328 | 'type': u'article-reference', 329 | 'id': u'2' 330 | }, 331 | { 332 | 'type': u'article-reference', 333 | 'id': u'3' 334 | } 335 | ] 336 | } 337 | ]} 338 | ) 339 | -------------------------------------------------------------------------------- 
/tests/ResolveFullyQualifiedReferencesVisitorTest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from DuralexTestCase import DuralexTestCase 4 | 5 | from duralex.ResolveFullyQualifiedReferencesVisitor import ResolveFullyQualifiedReferencesVisitor 6 | 7 | class ResolveFullyQualifiedReferencesVisitorTest(DuralexTestCase): 8 | def test_code_danling_reference(self): 9 | self.assertEqualAST( 10 | self.call_visitor(ResolveFullyQualifiedReferencesVisitor, self.make_tree({'children': [ 11 | { 12 | 'editType': u'edit', 13 | 'type': u'edit', 14 | 'children': [ 15 | { 16 | 'id': u'code de l\'éducation', 17 | 'type': u'code-reference' 18 | } 19 | ] 20 | }, 21 | { 22 | 'editType': u'replace', 23 | 'type': u'edit', 24 | 'children': [ 25 | { 26 | 'type': u'word-definition', 27 | 'children': [ 28 | { 29 | 'type': u'quote', 30 | 'words': u'mots de remplacement' 31 | } 32 | ] 33 | }, 34 | { 35 | 'type': u'word-reference', 36 | 'children': [ 37 | { 38 | 'type': u'quote', 39 | 'words': u'mots d\'origine' 40 | } 41 | ] 42 | } 43 | ] 44 | } 45 | ]})), 46 | {'children': [ 47 | { 48 | 'editType': u'replace', 49 | 'type': u'edit', 50 | 'children': [ 51 | { 52 | 'id': u'code de l\'éducation', 53 | 'type': u'code-reference', 54 | 'children': [ 55 | { 56 | 'type': u'word-reference', 57 | 'children': [ 58 | { 59 | 'type': u'quote', 60 | 'words': u'mots d\'origine' 61 | } 62 | ] 63 | } 64 | ] 65 | }, 66 | { 67 | 'type': u'word-definition', 68 | 'children': [ 69 | { 70 | 'type': u'quote', 71 | 'words': u'mots de remplacement' 72 | } 73 | ] 74 | } 75 | ] 76 | } 77 | ]} 78 | ) 79 | 80 | def test_code_danling_reference_2(self): 81 | self.assertEqualAST( 82 | self.call_visitor(ResolveFullyQualifiedReferencesVisitor, self.make_tree({'children': [ 83 | { 84 | 'editType': u'edit', 85 | 'type': u'edit', 86 | 'children': [ 87 | { 88 | 'id': u'L. 
42', 89 | 'type': u'article-reference', 90 | 'children': [ 91 | { 92 | 'id': u'code de l\'éducation', 93 | 'type': u'code-reference' 94 | } 95 | ] 96 | } 97 | ] 98 | }, 99 | { 100 | 'editType': u'replace', 101 | 'type': u'edit', 102 | 'children': [ 103 | { 104 | 'type': u'word-definition', 105 | 'children': [ 106 | { 107 | 'type': u'quote', 108 | 'words': u'mots de remplacement' 109 | } 110 | ] 111 | }, 112 | { 113 | 'type': u'word-reference', 114 | 'children': [ 115 | { 116 | 'type': u'quote', 117 | 'words': u'mots d\'origine' 118 | } 119 | ] 120 | } 121 | ] 122 | } 123 | ]})), 124 | {'children': [ 125 | { 126 | 'editType': u'replace', 127 | 'type': u'edit', 128 | 'children': [ 129 | { 130 | 'id': u'code de l\'éducation', 131 | 'type': u'code-reference', 132 | 'children': [ 133 | { 134 | 'id': u'L. 42', 135 | 'type': u'article-reference', 136 | 'children': [ 137 | { 138 | 'type': u'word-reference', 139 | 'children': [ 140 | { 141 | 'type': u'quote', 142 | 'words': u'mots d\'origine' 143 | } 144 | ] 145 | } 146 | ] 147 | } 148 | ] 149 | }, 150 | { 151 | 'type': u'word-definition', 152 | 'children': [ 153 | { 154 | 'type': u'quote', 155 | 'words': u'mots de remplacement' 156 | } 157 | ] 158 | } 159 | ] 160 | } 161 | ]} 162 | ) 163 | 164 | def test_code_danling_reference_3(self): 165 | self.assertEqualAST( 166 | self.call_visitor(ResolveFullyQualifiedReferencesVisitor, self.make_tree({'children': [ 167 | { 168 | 'type': u'header1-definition', 169 | 'order': 1, 170 | 'children': [ 171 | { 172 | 'editType': u'edit', 173 | 'type': u'edit', 174 | 'children': [ 175 | { 176 | 'id': u'L. 
42', 177 | 'type': u'article-reference' 178 | } 179 | ] 180 | }, 181 | { 182 | 'type': u'header2-definition', 183 | 'order': 1, 184 | 'children': [ 185 | { 186 | 'editType': u'edit', 187 | 'type': u'edit', 188 | 'children': [ 189 | { 190 | 'order': 42, 191 | 'type': u'alinea-reference' 192 | } 193 | ] 194 | }, 195 | { 196 | 'type': u'header3-definition', 197 | 'order': 1, 198 | 'children': [ 199 | { 200 | 'editType': u'replace', 201 | 'type': u'edit', 202 | 'children': [ 203 | { 204 | 'type': u'word-definition', 205 | 'children': [ 206 | { 207 | 'type': u'quote', 208 | 'words': u'mots de remplacement' 209 | } 210 | ] 211 | }, 212 | { 213 | 'type': u'word-reference', 214 | 'children': [ 215 | { 216 | 'type': u'quote', 217 | 'words': u'mots d\'origine' 218 | } 219 | ] 220 | } 221 | ] 222 | } 223 | ] 224 | } 225 | ] 226 | } 227 | ] 228 | } 229 | ]})), 230 | {'children': [ 231 | { 232 | 'type': u'header1-definition', 233 | 'order': 1, 234 | 'children': [ 235 | { 236 | 'type': u'header2-definition', 237 | 'order': 1, 238 | 'children': [ 239 | { 240 | 'type': u'header3-definition', 241 | 'order': 1, 242 | 'children': [ 243 | { 244 | 'editType': u'replace', 245 | 'type': u'edit', 246 | 'children': [ 247 | { 248 | 'id': u'L. 
42', 249 | 'type': u'article-reference', 250 | 'children': [ 251 | { 252 | 'order': 42, 253 | 'type': u'alinea-reference', 254 | 'children': [ 255 | { 256 | 'type': u'word-reference', 257 | 'children': [ 258 | { 259 | 'type': u'quote', 260 | 'words': u'mots d\'origine' 261 | } 262 | ] 263 | } 264 | ] 265 | } 266 | ] 267 | }, 268 | { 269 | 'type': u'word-definition', 270 | 'children': [ 271 | { 272 | 'type': u'quote', 273 | 'words': u'mots de remplacement' 274 | } 275 | ] 276 | } 277 | ] 278 | } 279 | ] 280 | } 281 | ] 282 | } 283 | ] 284 | } 285 | ]} 286 | ) 287 | 288 | def test_do_nothing_when_no_nested_edits(self): 289 | self.assertEqualAST( 290 | self.call_visitor(ResolveFullyQualifiedReferencesVisitor, self.make_tree({'children': [ 291 | { 292 | 'children': [ 293 | { 294 | 'children': [ 295 | { 296 | 'children': [ 297 | { 298 | 'children': [ 299 | { 300 | 'type': u'quote', 301 | 'words': u'Art. 4. - Le territoire de la République forme une circonscription unique.' 302 | } 303 | ], 304 | 'type': u'word-definition' 305 | }, 306 | { 307 | 'children': [ 308 | { 309 | 'id': u'4', 310 | 'type': u'article-reference' 311 | } 312 | ], 313 | 'lawDate': u'1977-7-7', 314 | 'id': u'77-729', 315 | 'type': u'law-reference' 316 | } 317 | ], 318 | 'editType': u'edit', 319 | 'type': u'edit' 320 | } 321 | ], 322 | 'order': 1, 323 | 'type': u'header1-definition' 324 | } 325 | ], 326 | 'isNew': False, 327 | 'order': 2, 328 | 'type': u'article-definition' 329 | } 330 | ]})), 331 | {'children': [ 332 | { 333 | 'children': [ 334 | { 335 | 'children': [ 336 | { 337 | 'children': [ 338 | { 339 | 'children': [ 340 | { 341 | 'type': u'quote', 342 | 'words': u'Art. 4. - Le territoire de la République forme une circonscription unique.' 
343 | } 344 | ], 345 | 'type': u'word-definition' 346 | }, 347 | { 348 | 'children': [ 349 | { 350 | 'id': u'4', 351 | 'type': u'article-reference' 352 | } 353 | ], 354 | 'lawDate': u'1977-7-7', 355 | 'id': u'77-729', 356 | 'type': u'law-reference' 357 | } 358 | ], 359 | 'editType': u'edit', 360 | 'type': u'edit' 361 | } 362 | ], 363 | 'order': 1, 364 | 'type': u'header1-definition' 365 | } 366 | ], 367 | 'isNew': False, 368 | 'order': 2, 369 | 'type': u'article-definition' 370 | } 371 | ]} 372 | ) 373 | -------------------------------------------------------------------------------- /duralex/bill_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding=utf-8 -*- 2 | """ 3 | Original code by RegardsCitoyen (https://github.com/RegardsCitoyens) for the-law-factory-parser 4 | (https://github.com/regardscitoyens/the-law-factory-parser). 5 | """ 6 | 7 | import sys, re, html5lib 8 | from bs4 import BeautifulSoup 9 | 10 | from duralex.alinea_parser import word_to_number, month_to_number 11 | 12 | import duralex.tree 13 | 14 | bister = u'(un|duo|tre|bis|qua|quin[tqu]*|sex|sept|octo?|novo?|non|dec|vic|ter|ies)+' 15 | 16 | ORDER = '' 17 | 18 | # Warning changing parenthesis in this regexp has multiple consequences throughout the code 19 | section_titles = u"((chap|t)itre|volume|livre|tome|(sous-)?section)" 20 | 21 | re_definitif = re.compile(r']*align[=:\s\-]*center"?>\(?<(b|strong)>\(?texte d[^f]*finitif\)?\)?

', re.I) 22 | 23 | clean_texte_regexps = [ 24 | (re.compile(r'[\n\t\r\s]+'), ' '), 25 | (re.compile(r'(]*>) ?

]*> ?'), r'\1'), 26 | (re.compile(r' ?

?()'), r'\1'), 27 | (re.compile(r'(>%s\s*[\dIVXLCDM]+([eE][rR]?)?)\s+-\s+([^<]*?)\s*

' % section_titles.upper()), r'\1

\6

'), 28 | ] 29 | 30 | re_clean_title_legif = re.compile("[\s|]*l[eé]gifrance(.gouv.fr)?$", re.I) 31 | clean_legifrance_regexps = [ 32 | (re.compile(r'[\n\t\r\s]+'), ' '), 33 | (re.compile(r']*>\s*En savoir plus sur ce[^<]*', re.I), ''), 34 | (re.compile(r']*>', re.I), ''), 35 | (re.compile(r'\s*
\s*', re.I), '

'), 36 | (re.compile(r']*class="titreSection[^>]*>\s*(%s\s+[\dIVXLCDM]+e?r?)\s*:\s*([^<]*?)\s*' % section_titles, re.I), r'

\1

\5

'), 37 | (re.compile(r']*class="titreArt[^>]*>(.*?)\s*', re.I), r'

\1

'), 38 | ] 39 | 40 | # Convert from roman numbers 41 | re_mat_romans = re.compile(r"[IVXCLDM]+", re.I) 42 | romans_map = zip( 43 | (1000, 900, 500, 400 , 100, 90 , 50 , 40 , 10 , 9 , 5 , 4 , 1), 44 | ( u'M', u'CM', u'D', u'CD', u'C', u'XC', u'L', u'XL', u'X', u'IX', u'V', u'IV', u'I') 45 | ) 46 | 47 | def romans(n): 48 | n = n.upper() 49 | i = res = 0 50 | for d, r in romans_map: 51 | while n[i:i + len(r)] == r: 52 | res += d 53 | i += len(r) 54 | return res 55 | 56 | upcase_accents = u"ÇÀÂÄÉÈÊËÎÏÔÖÙÛÜ" 57 | locase_accents = u"çàâäéèêëîïôöùûü" 58 | 59 | 60 | def real_lower(text): 61 | for a in upcase_accents: 62 | text = text.replace(a, locase_accents[upcase_accents.find(a)]) 63 | return text.lower() 64 | 65 | 66 | def lower_but_first(text): 67 | return text[0].upper() + real_lower(text[1:]) 68 | 69 | 70 | re_fullupcase = re.compile(r"^([\W0-9]*)([A-Z%s][\W0-9A-Z%s]*)$" % (upcase_accents, upcase_accents), re.U) 71 | 72 | 73 | def clean_full_upcase(text): 74 | mat = re_fullupcase.match(text) 75 | if mat: 76 | text = mat.group(1) + lower_but_first(mat.group(2)) 77 | return text 78 | 79 | re_clean_premier = re.compile(r'((PREM)?)(1|I)ER?') 80 | re_clean_bister = re.compile(r'([IXV\d]+e?r?)\s+(%s)' % bister, re.I) 81 | re_clean_subsec_space = re.compile(r'^("?[IVX0-9]{1,4}(\s+[a-z]+)?(\s+[A-Z]{1,4})?)\s*([\.°\-]+)\s*([^\s\)])', re.I) 82 | re_clean_subsec_space2 = re.compile(r'^("?[IVX0-9]{1,4})\s*([a-z]*)\s*([A-H]{1,4})([\.°\-])', re.I) 83 | re_clean_punc_space = re.compile(u'([°«»:;,\.!\?\]\)%€&\$])([^\s\)\.,\d"])') 84 | re_clean_spaces = re.compile(r'(\s|\xc2\xa0|\xa0)+') 85 | re_clean_coord = re.compile(r'^["\(]*(pour)?\s*coordination[\)\s\.]*$', re.I) 86 | # Clean html and special chars 87 | lower_inner_title = lambda x: x.group(1)+lower_but_first(x.group(3))+" " 88 | html_replace = [ 89 | (re_clean_spaces, " "), 90 | (re.compile(r"\s*\n+\s*"), " "), 91 | (re.compile(r'

'), u'\n'), 92 | (re.compile(r"−"), "-"), 93 | (re.compile(r" "), " "), 94 | (re.compile(r"", re.I), ""), 95 | # (re.compile(r"[«\"\s]+", re.I), " "), 96 | (re.compile(r'(«\s+|\s+»)'), '"'), 97 | (re.compile(r'(«|»|“|”|„|‟|❝|❞|"|〟|〞|〝)'), '"'), 98 | (re.compile(r"(’|'|’|ߴ|՚|ʼ|❛|❜)"), "'"), 99 | (re.compile(r"(‒|–|—|―|⁓|‑|‐|⁃|⏤)"), "-"), 100 | (re.compile(r"(]*>"), r"\1>"), 101 | (re.compile(r"(", re.I), r"\1i>"), 102 | (re.compile(r"(", re.I), r"\1b>"), 103 | (re.compile(r"<(![^>]*|/?(p|span))>", re.I), ""), 104 | (re.compile(r"<[^>]*>]*>"), ""), 105 | (re.compile(r"^", re.I), ""), 106 | (re.compile(r"(\s*)", re.I), r"\1"), 107 | (re.compile(r"", re.I), ""), 108 | (re.compile(r"^((<[bi]>)*)\((S|AN)[12]\)\s*", re.I), r"\1"), 109 | (re.compile(r"^(Article\s*)\d+\s*\s*", re.I), r"\1"), 110 | (re.compile(r"(.*)", re.I), ""), 111 | (re.compile(r"", re.I), ""), 112 | (re.compile(r"\s*\s*", re.I), ""), 113 | (re.compile(r"œ([A-Z])"), r"OE\1"), 114 | (re.compile(r"œ\s*", re.I), "oe"), 115 | (re.compile(r'^((<[^>]*>)*")%s ' % section_titles, re.I), lower_inner_title), 116 | (re.compile(r' pr..?liminaire', re.I), ' préliminaire'), 117 | (re.compile(r'[^<]*', re.I), ''), 118 | (re.compile(r'^(\w)', re.I), r"\1"), 119 | ] 120 | 121 | 122 | def clean_html(t): 123 | for regex, repl in html_replace: 124 | t = regex.sub(repl, t) 125 | return t.strip() 126 | 127 | re_clean_et = re.compile(r'(,|\s+et)\s+', re.I) 128 | 129 | def cleanup(dic): 130 | # Clean empty articles with only "Supprimé" as text 131 | if not dic: 132 | return 133 | if 'alineas' in dic: 134 | if len(dic['alineas']) == 1 and dic['alineas']['001'].startswith("(Supprimé)"): 135 | dic['statut'] = "supprimé" 136 | dic['alineas'] = {'001': ''} 137 | elif dic['statut'].startswith('conforme') and not len(dic['alineas']): 138 | dic['alineas'] = {'001': '(Non modifié)'} 139 | multiples = re_clean_et.sub(',', dic['titre']).split(',') 140 | if len(multiples) > 1: 141 | for d in multiples: 142 | new = dict(dic) 143 | 
new['titre'] = d 144 | return new 145 | 146 | return dic 147 | 148 | def save_text(txt): 149 | if "done" not in txt: 150 | return cleanup(txt) 151 | txt["done"] = True 152 | return txt 153 | 154 | blank_none = lambda x: x if x else "" 155 | re_cl_html = re.compile(r"<[^>]+>") 156 | re_cl_html_except_tables = re.compile(r"]*>", re.I) 157 | re_fix_missing_table = re.compile(r'(\W*)$', re.I) 158 | cl_html_except_tables = lambda x: re_fix_missing_table.sub(r'\1', re_cl_html_except_tables.sub('', x)).strip().replace('> ', '>').replace(' <', '<').replace('', '') 159 | re_cl_par = re.compile(r"[()]") 160 | re_cl_uno = re.compile(r"(premie?r?|unique?)", re.I) 161 | re_cl_sec_uno = re.compile(r"^[Ii1][eE][rR]?") 162 | re_mat_sec = re.compile(r"%s(\s+(.+)e?r?)" % section_titles, re.I) 163 | re_mat_n = re.compile(r"((pr..?)?limin|unique|premier|[IVX\d]+)", re.I) 164 | re_mat_art = re.compile(r"articles?\s*([^(]*)(\([^)]*\))?$", re.I) 165 | re_mat_ppl = re.compile(r"()?pro.* loi", re.I) 166 | re_mat_tco = re.compile(r"\s*\s*(ANNEXE[^:]*:\s*|\d+\)\s+)?TEXTES?\s*(ADOPTÉS?\s*PAR|DE)\s*LA\s*COMMISSION.*\s*$") 167 | re_mat_exp = re.compile(r"()?expos[eéÉ]", re.I) 168 | re_mat_end = re.compile(r"(()?Délibéré en|()?NB[\s:<]+|()?RAPPORT ANNEX|Fait à .*, le|\s*©|\s*N.?B.?\s*:|()*[1*]\s*()*\(\)()*|\(1\)\s*Nota[\s:]+|\*\s*()?1)", re.I) 169 | re_mat_ann = re.compile(r"\s*\s*ANNEXES?[\s<]+") 170 | re_mat_dots = re.compile(r"^()?[.…]+()?$") 171 | re_mat_st = re.compile(r"(|\()+\s*(conform|non[\s\-]*modif|suppr|nouveau).{0,10}$", re.I) 172 | re_mat_new = re.compile(r"\s*\(\s*nouveau\s*\)\s*", re.I) 173 | re_mat_texte = re.compile(r'\(texte (modifié|élaboré|d(u|e l))', re.I) 174 | re_mat_single_char = re.compile(r'^\s*[LMN]\s*$') 175 | re_clean_idx_spaces = re.compile(r'^([IVXLCDM0-9]+)\s*\.\s*') 176 | re_clean_art_spaces = re.compile(r'^\s*("?)\s+') 177 | re_clean_art_spaces2 = re.compile(r'\s+\.\s*-\s+') 178 | re_clean_conf = re.compile(r"\((conforme|non[\s-]*modifi..?)s?\)", re.I) 179 | 
re_clean_supr = re.compile(r'\((dispositions?\s*d..?clar..?es?\s*irrecevable.*article 4.*Constitution.*|(maintien de la )?suppr(ession|im..?s?)(\s*(conforme|maintenue|par la commission mixte paritaire))*)\)["\s]*$', re.I)
re_echec_hemi = re.compile(r"L('Assemblée nationale|e Sénat) (a rejeté|n'a pas adopté)[, ]+", re.I)
re_echec_hemi2 = re.compile(r"de loi a été rejetée par l('Assemblée nationale|e Sénat)\.$", re.I)
re_echec_com = re.compile(r" la commission .*(effet est d'entraîner le rejet|demande de rejeter|a rejeté|n'a pas adopté)[dleau\s]*(projet|proposition|texte)[.\s]", re.I)
re_echec_cmp = re.compile(r" (a conclu à l'échec de ses travaux|(ne|pas) .*parven(u[es]?|ir) à (élaborer )?un texte commun)", re.I)
re_rap_mult = re.compile(r'[\s<>/ai]*N[°\s]*\d+\s*(,|et)\s*[N°\s]*\d+', re.I)
re_src_mult = re.compile(r'^- L(?:A PROPOSITION|E PROJET) DE LOI n°\s*(\d+)\D')
re_clean_mult_1 = re.compile(r'\s*et\s*', re.I)
re_clean_mult_2 = re.compile(r'[^,\d]', re.I)
re_clean_footer_notes = re.compile(r"[\.\s]*\(*\d*\([\d\*]+[\)\d\*\.\s]*$")
re_sep_text = re.compile(r'\s*\s*(article|%s)\s*(I|uniqu|pr..?limina|1|prem)[ier]*\s*\s*$' % section_titles, re.I)
re_stars = re.compile(r'^[\s*_]+$')
re_art_uni = re.compile(r'\s*article\s*unique\s*$', re.I)
re_all_caps = re.compile(r'[A-Z' + upcase_accents + r' ]+')
# Shared, module-level section state: parse_bill() mutates this dict in place.
section = {"type": "section", "id": ""}

# Header patterns used once per line by parse_bill(); compiled once here
# instead of on every loop iteration.
_re_bill_number = re.compile(r'^N°\D+(\d+)$', re.MULTILINE)
_re_legislature = re.compile(r'^(.*) LÉGISLATURE$', re.MULTILINE)
_re_registered = re.compile(r'Enregistré à la Présidence (du |de l\')(.*) le (\d+) (\w+) (\d{4})')
# "(12) alinea text": explicit alinea number followed by its content.
_re_numbered_alinea = re.compile(r"^\((\d+)\) (.*)$", re.MULTILINE)


def _bill_id_from_url(url):
    """Return ``(text_id, numero)`` derived from *url*; each is None when unknown."""
    if "legifrance.gouv.fr" in url:
        m = re.search(r"cidTexte=(JORFTEXT\d+)(\D|$)", url, re.I)
        if m is None:
            return None, None
        return ORDER + m.group(1), None

    if re.search(r"assemblee-?nationale", url, re.I):
        m = re.search(r"/(\d+)/.+/(ta)?[\w\-]*(\d{4})[\.\-]", url, re.I)
        if m is None:
            return None, None
        numero = int(m.group(3))
        text_id = ORDER + "A" + m.group(1) + "-"
        if m.group(2) is not None:
            text_id += m.group(2)
        return text_id + str(numero), numero

    # Any other URL is handled with the two remaining patterns.
    m = re.search(r"(ta|l)?s?(\d\d)-(\d{1,3})\d?\.", url, re.I)
    if m is None:
        m = re.search(r"/(-)?20(\d+)-\d+/(\d+).html", url, re.I)
    if m is None:
        return None, None
    numero = int(m.group(3))
    text_id = ORDER + "S" + m.group(2) + "-"
    if m.group(1) is not None:
        text_id += m.group(1)
    return text_id + "%03d" % numero, numero


def parse_bill(string, url):
    """Parse the text of a bill into a dictionary.

    The input is walked line by line (``<p>`` elements when it is treated as
    HTML, newline-separated lines otherwise).  A small state variable,
    ``read``, selects how each line is interpreted:

    * ``0``  -- nothing specific is being read;
    * ``1``  -- a section header was seen, the next title line is its title;
    * ``2``  -- inside an article, lines are stored as alineas;
    * ``3``  -- accumulating the bill description;
    * ``-1`` -- skipping (report / "exposé" parts).

    Args:
        string: raw content of the bill.
        url: URL the bill was fetched from, or a falsy value.  When it is
            recognised, it is used to derive ``texte["id"]``; an unrecognised
            URL simply leaves the id unset instead of raising.

    Returns:
        The ``texte`` dict with at least the keys ``type``, ``definitive``,
        ``articles``, ``url`` and ``expose``; ``id``, ``legislature``,
        ``date``, ``place`` and ``description`` are added when the matching
        header lines are found.  Each entry of ``articles`` is a dict with
        ``type``, ``order``, ``alineas`` (keyed by zero-padded number),
        ``statut``, ``titre`` and optionally ``source_text`` / ``section``.

    Side effects:
        Mutates the module-level ``section`` dict.
    """
    # NOTE(review): this function was re-indented from a flattened copy of the
    # file.  Nesting was rebuilt from the data flow; the spots where it is
    # genuinely ambiguous are flagged below -- check them against upstream.
    curtext = -1
    srclst = []
    article = None
    read = art_num = ali_num = 0
    indextext = -1
    # Report number taken from the URL; stays None when the URL gives none, so
    # the multi-report comparison below no longer hits an unbound local.
    numero = None

    definitif = re_definitif.search(string) is not None
    soup = BeautifulSoup(string, "html5lib")

    texte = {
        "type": "projet de loi",
        "definitive": definitif,
        "articles": [],
        "url": url,
        "expose": ""
    }
    expose = False

    if url:
        # Strip any proxy-style prefix and undo the two percent-escapes handled here.
        url = re.sub(r"^.*/http", "http", url)
        url = re.sub(r"%3A", ":", re.sub(r"%2F", "/", url))
        # Generate Senat or AN ID from URL
        text_id, numero = _bill_id_from_url(url)
        if text_id is not None:
            texte["id"] = text_id

    # NOTE(review): both literals passed to find() are empty, which makes this
    # test true for every input; they look like they lost a markup prefix --
    # confirm the intended values before relying on the plain-text branch.
    is_html = string.find('') == 0 or string.find('') == 0
    lines = soup.body.find_all('p') if is_html else string.split(u'\n')

    for line in lines:
        line = clean_html(line.text if is_html else line)

        if re_stars.match(line):
            continue

        # --- Header metadata -------------------------------------------------
        match = _re_bill_number.search(line)
        if match:
            texte['id'] = int(match.group(1))

        match = _re_legislature.search(line)
        if match:
            texte['legislature'] = word_to_number(match.group(1))

        match = _re_registered.search(line)
        if match:
            texte['date'] = match.group(5) + '-' + str(month_to_number(match.group(4))) + '-' + match.group(3)
            texte['place'] = match.group(2).lower()

        if line == u'PROPOSITION DE LOI':
            texte['type'] = duralex.tree.TYPE_LAW_PROPOSAL
        elif line == u'PROJET DE LOI':
            texte['type'] = duralex.tree.TYPE_LAW_PROJECT

        # --- Description -----------------------------------------------------
        if 'description' not in texte and line in [u'PROPOSITION DE LOI', u'PROJET DE LOI']:
            texte['description'] = line.lower()
            read = 3
            continue
        if read == 3:
            lowered = real_lower(line)
            if lowered.startswith(u'transmise par') or lowered.startswith(u'présentée par'):
                read = 0
            else:
                # NOTE(review): nesting of the next statements (comma removal
                # and `continue`) is inferred -- verify against upstream.
                if re_all_caps.match(line):
                    line = real_lower(line)
                line = line.replace(',', '')
                if line:
                    texte['description'] += ' ' + line
                continue

        # --- Text selection / skipping ---------------------------------------
        if line == "RAPPORT" or line == "Mesdames, Messieurs,":
            read = -1
        if (srclst or indextext != -1) and re_sep_text.match(line):
            curtext += 1
            art_num = 0
        srcl = re_src_mult.search(line)
        cl_line = re_cl_html.sub("", line).strip()
        if srcl and read < 1:
            srclst.append(int(srcl.group(1)))
            continue
        elif re_rap_mult.match(line):
            # Reduce the line to a comma-separated list of numbers.
            line = cl_line
            line = re_clean_mult_1.sub(",", line)
            line = re_clean_mult_2.sub("", line)
            cl_line = re_cl_html.sub("", line).strip()
            for n_t in line.split(','):
                indextext += 1
                if int(n_t) == numero:
                    break
        elif re_mat_ppl.match(line) or re_mat_tco.match(line):
            read = 0
            texte = save_text(texte)
        elif re_mat_exp.match(line):
            read = -1  # Deactivate description lecture
            expose = True
        elif re_echec_cmp.search(cl_line) or re_echec_com.search(cl_line) or re_echec_hemi.match(cl_line) or re_echec_hemi2.search(cl_line):
            texte = save_text(texte)
            cleanup({"type": "echec", "texte": cl_line})
            break
        elif read == -1 or (indextext != -1 and curtext != indextext):
            continue

        # Identify section zones
        m = re_mat_sec.match(line)
        if m:
            read = 1  # Activate titles lecture
            section["type_section"] = real_lower(m.group(1))
            section_typ = m.group(1).upper()[0]
            if m.group(3) is not None:
                section_typ += "S"

            if " LIMINAIRE" in line:
                section_num = "L"
            else:
                section_num = re_cl_uno.sub('1', re_cl_sec_uno.sub('1', re_cl_html.sub('', m.group(5).strip())).strip())
                section_num = re_clean_bister.sub(lambda m: m.group(1)+" "+real_lower(m.group(2)), section_num)
                section_num = re_mat_new.sub('', section_num).strip()
                m2 = re_mat_romans.match(section_num)
                if m2:
                    rest = section_num.replace(m2.group(0), '')
                    section_num = romans(m2.group(0))
                    if rest:
                        section_num = str(section_num) + rest
            # Get parent section id to build current section id
            section_par = re.sub(section_typ + r"[\dL].*$", "", section["id"])
            section["id"] = section_par + section_typ + str(section_num)

        # Identify titles and new article zones
        elif (not expose and re_mat_end.match(line)) or (read == 2 and re_mat_ann.match(line)):
            break
        # NOTE(review): r"()?" matches the empty string, so this branch is taken
        # for every line that reaches it; the pattern looks like it lost its
        # content -- confirm the intended expression.
        elif re.match(r"()?", line) or re_art_uni.match(line) or re.match(r"^Articles? ", line):
            line = cl_line
            # Read a new article
            if re_mat_art.match(line):
                if article is not None:
                    texte = save_text(texte)
                    cleanup(article)
                read = 2  # Activate alineas lecture
                expose = False
                art_num += 1
                ali_num = 0
                article = {"type": "article", "order": art_num, "alineas": {}, "statut": "none"}
                texte['articles'].append(article)
                if srclst:
                    article["source_text"] = srclst[curtext]
                # `m` is rebound on purpose: the `not m` test below then keeps
                # the article header itself from being stored as an alinea.
                m = re_mat_art.match(line)
                article["titre"] = re_cl_uno.sub("1er", re_cl_sec_uno.sub("1er", m.group(1).strip())).strip(" -'")
                if m.group(2) is not None:
                    article["statut"] = re_cl_par.sub("", real_lower(m.group(2))).strip()
                if section["id"] != "":
                    article["section"] = section["id"]
            # Read a section's title
            elif read == 1:
                texte = save_text(texte)
                section["titre"] = lower_but_first(line)
                if article is not None:
                    cleanup(article)
                    article = None
                cleanup(section)
                read = 0

        # Read articles with alineas
        if read == 2 and not m:
            # Find extra status information
            if ali_num == 0 and re_mat_st.match(line):
                article["statut"] = re_cl_html.sub("", re_cl_par.sub("", real_lower(line)).strip())
                continue
            if re_mat_dots.match(line):
                continue
            # NOTE(review): the literal is empty, so this condition is always
            # true; it looks like a tag name was lost -- confirm.
            if "" in line:
                cl_line = cl_html_except_tables(line)
            line = re_clean_art_spaces2.sub('. - ', re_clean_art_spaces.sub(r'\1', re_clean_idx_spaces.sub(r'\1. ', re_mat_new.sub(" ", cl_line).strip())))
            # Clean low/upcase issues with BIS TER etc.
            line = line.replace("oeUVRE", "OEUVRE")
            line = clean_full_upcase(line)
            line = re_clean_premier.sub(lambda m: (real_lower(m.group(0)) if m.group(1) else "")+m.group(3)+"er", line)
            line = re_clean_bister.sub(lambda m: m.group(1)+" "+real_lower(m.group(2)), line)
            # Clean different versions of same comment.
            line = re_clean_supr.sub('(Supprimé)', line)
            line = re_clean_conf.sub('(Non modifié)', line)
            line = re_clean_coord.sub('', line)
            line = re_clean_subsec_space.sub(r'\1\4 \5', line)
            line = re_clean_subsec_space2.sub(r'\1 \2 \3\4', line)
            line = re_clean_punc_space.sub(r'\1 \2', line)
            line = re_clean_spaces.sub(' ', line)
            line = re_mat_sec.sub(lambda x: lower_but_first(x.group(1))+x.group(4) if re_mat_n.match(x.group(4)) else x.group(0), line)
            line = re_clean_footer_notes.sub(".", line)
            # Clean comments (Texte du Sénat), (Texte de la Commission), ...
            if ali_num == 0 and re_mat_texte.match(line):
                continue
            line = re_mat_single_char.sub("", line)
            line = line.strip()
            if line:
                ali_num += 1
                # An explicit "(n) content" prefix overrides the running
                # counter; any number of digits is accepted so that alineas
                # numbered 10 and above lose their prefix too.
                numbered = _re_numbered_alinea.match(line)
                if numbered:
                    ali_num = int(numbered.group(1))
                    line = numbered.group(2)
                article["alineas"]["%03d" % ali_num] = line
        else:
            # metas
            continue

    cleanup(texte)

    return texte