├── duralex
    ├── __init__.py
    ├── DeleteUUIDVisitor.py
    ├── DeleteParentVisitor.py
    ├── DeleteEmptyChildrenVisitor.py
    ├── SwapDefinitionAndReferenceVisitor.py
    ├── RemoveQuotePrefixVisitor.py
    ├── AddParentVisitor.py
    ├── ForkReferenceVisitor.py
    ├── ForkEditVisitor.py
    ├── FixMissingCodeOrLawReferenceVisitor.py
    ├── SortReferencesVisitor.py
    ├── alinea_lexer.py
    ├── ResolveFullyQualifiedDefinitionsVisitor.py
    ├── amendment_parser.py
    ├── diff_parser.py
    ├── ResolveFullyQualifiedReferencesVisitor.py
    ├── AbstractVisitor.py
    ├── tree.py
    └── bill_parser.py
├── requirements.txt
├── article_to_json.jpg
├── setup.py
├── .travis.yml
├── tests
    ├── ParseHeader3Test.py
    ├── ParseHeader2Test.py
    ├── ParseCodePartReferenceTest.py
    ├── ParseBookReferenceTest.py
    ├── ParseTitleReferenceTest.py
    ├── ParseChapterReferenceTest.py
    ├── ParseSectionReferenceTest.py
    ├── ParseParagraphReferenceTest.py
    ├── ParseSubSectionReferenceTest.py
    ├── ParseMultiplicativeAdverbTest.py
    ├── ParseSubParagraphDefinitionTest.py
    ├── ForkReferenceVisitorTest.py
    ├── ParseArticleDefinitionTest.py
    ├── ParseMentionDefinitionTest.py
    ├── ParseHeader2ReferenceTest.py
    ├── ParseSentenceDefinitionTest.py
    ├── ParseHeader3ReferenceTest.py
    ├── main.py
    ├── ParseAlineaDefinitionTest.py
    ├── ParseHeader3DefinitionTest.py
    ├── ParseCodeReferenceTest.py
    ├── DuralexTestCase.py
    ├── ParseHeader1DefinitionTest.py
    ├── ParseWordDefinitionTest.py
    ├── ParseSentenceReferenceTest.py
    ├── ParseHeader2DefinitionTest.py
    ├── ParseLawReferenceTest.py
    ├── ParseDefinitionListTest.py
    ├── SortReferencesVisitorTest.py
    ├── ParseHeader1Test.py
    ├── ParseArticleReferenceTest.py
    ├── ParseRawContentTest.py
    ├── ParseWordReferenceTest.py
    ├── ForkEditVisitorTest.py
    ├── ParseAlineaReferenceTest.py
    └── ResolveFullyQualifiedReferencesVisitorTest.py
├── .gitignore
├── README.md
└── bin
    └── duralex


/duralex/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | colorama
2 | html5lib
3 | beautifulsoup4
4 | requests
5 | unidiff
6 | 


--------------------------------------------------------------------------------
/article_to_json.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Legilibre/DuraLex/HEAD/article_to_json.jpg


--------------------------------------------------------------------------------
/duralex/DeleteUUIDVisitor.py:
--------------------------------------------------------------------------------
1 | from duralex.AbstractVisitor import AbstractVisitor
2 | 
3 | class DeleteUUIDVisitor(AbstractVisitor):
4 |     def visit_node(self, node):
5 |         if 'uuid' in node:
6 |             del node['uuid']
7 | 
8 |         super(DeleteUUIDVisitor, self).visit_node(node)
9 | 


--------------------------------------------------------------------------------
/duralex/DeleteParentVisitor.py:
--------------------------------------------------------------------------------
1 | from duralex.AbstractVisitor import AbstractVisitor
2 | 
3 | class DeleteParentVisitor(AbstractVisitor):
4 |     def visit_node(self, node):
5 |         if 'parent' in node:
6 |             del node['parent']
7 | 
8 |         super(DeleteParentVisitor, self).visit_node(node)
9 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | setup(
 4 |     name='DuraLex',
 5 |     version='0.2',
 6 |     install_requires=[
 7 |         'html5lib',
 8 |         'simplejson',
 9 |         'beautifulsoup4'
10 |     ],
11 |     packages=[
12 |         'duralex'
13 |     ],
14 |     scripts=[
15 |         'bin/duralex'
16 |     ]
17 | )
18 | 


--------------------------------------------------------------------------------
/duralex/DeleteEmptyChildrenVisitor.py:
--------------------------------------------------------------------------------
1 | from duralex.AbstractVisitor import AbstractVisitor
2 | 
3 | class DeleteEmptyChildrenVisitor(AbstractVisitor):
4 |     def visit_node(self, node):
5 |         if 'children' in node and len(node['children']) == 0:
6 |             del node['children']
7 | 
8 |         super(DeleteEmptyChildrenVisitor, self).visit_node(node)
9 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python:
 3 |   - "3.5"
 4 | 
 5 | # command to install dependencies
 6 | install:
 7 |     - pip install -r requirements.txt
 8 | 
 9 | # command to run tests
10 | script:
11 |     - cd tests
12 |     - python main.py
13 | 
14 | notifications:
15 |   webhooks:
16 |     urls:
17 |     - https://webhooks.gitter.im/e/e8aba838e8d75ff07b0f
18 |     on_success: change
19 |     on_failure: always
20 |     on_start: never
21 | 


--------------------------------------------------------------------------------
/duralex/SwapDefinitionAndReferenceVisitor.py:
--------------------------------------------------------------------------------
 1 | from duralex.AbstractVisitor import AbstractVisitor
 2 | 
 3 | import duralex.tree as tree
 4 | 
 5 | class SwapDefinitionAndReferenceVisitor(AbstractVisitor):
 6 |     def visit_edit_node(self, node, post):
 7 |         defs = filter(lambda n: tree.is_definition(n), node['children'])
 8 | 
 9 |         for d in defs:
10 |             tree.remove_node(node, d)
11 |             tree.push_node(node, d)
12 | 


--------------------------------------------------------------------------------
/duralex/RemoveQuotePrefixVisitor.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from duralex.AbstractVisitor import AbstractVisitor
 4 | 
 5 | from duralex.alinea_parser import *
 6 | 
 7 | class RemoveQuotePrefixVisitor(AbstractVisitor):
 8 |     def visit_quote_node(self, node, post):
 9 |         if post:
10 |             return
11 | 
12 |         # Art. {articleId}. -
13 |         node['words'] = re.sub(r'^Art\. .*?\. - ', '', node['words'], 0, re.UNICODE | re.MULTILINE)
14 | 


--------------------------------------------------------------------------------
/duralex/AddParentVisitor.py:
--------------------------------------------------------------------------------
 1 | # -*- coding=utf-8 -*-
 2 | 
 3 | from duralex.AbstractVisitor import AbstractVisitor
 4 | 
 5 | class AddParentVisitor(AbstractVisitor):
 6 |     def __init__(self):
 7 |         self.parent = []
 8 | 
 9 |         super(AddParentVisitor, self).__init__()
10 | 
11 |     def visit_node(self, node):
12 |         if 'parent' not in node and len(self.parent):
13 |             node['parent'] = self.parent[-1]
14 | 
15 |         self.parent.append(node)
16 | 
17 |         super(AddParentVisitor, self).visit_node(node)
18 | 
19 |         del self.parent[-1]
20 | 


--------------------------------------------------------------------------------
/duralex/ForkReferenceVisitor.py:
--------------------------------------------------------------------------------
 1 | from duralex.AbstractVisitor import AbstractVisitor
 2 | 
 3 | from duralex.alinea_parser import *
 4 | 
 5 | import duralex.tree
 6 | 
 7 | class ForkReferenceVisitor(AbstractVisitor):
 8 |     def visit_node(self, node):
 9 |         if duralex.tree.is_reference(node) and 'children' in node and len(node['children']) > 1:
10 |             ref_nodes = [n for n in node['children'] if duralex.tree.is_reference(n)]
11 |             for i in range(1, len(ref_nodes)):
12 |                 ref = ref_nodes[i]
13 |                 fork = copy_node(node, recursive=False)
14 |                 remove_node(node, ref)
15 |                 push_node(fork, ref)
16 |                 push_node(node['parent'], fork)
17 | 
18 |         super(ForkReferenceVisitor, self).visit_node(node)
19 | 


--------------------------------------------------------------------------------
/tests/ParseHeader3Test.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from DuralexTestCase import DuralexTestCase
 4 | 
 5 | import duralex.alinea_parser as parser
 6 | 
 7 | class ParseHeader3Test(DuralexTestCase):
 8 |     def test_header3_raw_content(self):
 9 |         self.assertEqualAST(
10 |             self.call_parse_func(
11 |                 parser.parse_header2,
12 |                 u"b) Ceci est un header3."
13 |             ),
14 |             {'children':[
15 |                 {
16 |                     'type': u'header3',
17 |                     'order': 2,
18 |                     'children': [
19 |                         {
20 |                             'type': u'raw-content',
21 |                             'content': u'Ceci est un header3.'
22 |                         }
23 |                     ]
24 |                 }
25 |             ]}
26 |         )
27 | 


--------------------------------------------------------------------------------
/tests/ParseHeader2Test.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from DuralexTestCase import DuralexTestCase
 4 | 
 5 | import duralex.alinea_parser as parser
 6 | 
 7 | class ParseHeader2Test(DuralexTestCase):
 8 |     def test_header2_raw_content(self):
 9 |         self.assertEqualAST(
10 |             self.call_parse_func(
11 |                 parser.parse_header2,
12 |                 u"42° Ceci est un header2."
13 |             ),
14 |             {'children':[
15 |                 {
16 |                     'type': u'header2',
17 |                     'order': 42,
18 |                     'children': [
19 |                         {
20 |                             'content': u'Ceci est un header2.',
21 |                             'type': u'raw-content'
22 |                         }
23 |                     ],
24 |                 }
25 |             ]}
26 |         )
27 | 


--------------------------------------------------------------------------------
/tests/ParseCodePartReferenceTest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from DuralexTestCase import DuralexTestCase
 4 | 
 5 | import duralex.alinea_parser as parser
 6 | 
 7 | class ParseCodePartReferenceTest(DuralexTestCase):
 8 |     def test_code_part(self):
 9 |         self.assertEqualAST(
10 |             self.call_parse_func(
11 |                 parser.parse_code_part_reference,
12 |                 u"la troisième partie du code de l'éducation"
13 |             ),
14 |             {'children': [
15 |                 {
16 |                     'type': u'code-part-reference',
17 |                     'order': 3,
18 |                     'children': [
19 |                         {
20 |                             'type': u'code-reference',
21 |                             'id': u'code de l\'éducation'
22 |                         }
23 |                     ]
24 |                 }
25 |             ]}
26 |         )
27 | 


--------------------------------------------------------------------------------
/tests/ParseBookReferenceTest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from DuralexTestCase import DuralexTestCase
 4 | 
 5 | import duralex.alinea_parser as parser
 6 | 
 7 | class ParseBookReferenceTest(DuralexTestCase):
 8 |     def test_book(self):
 9 |         self.assertEqualAST(
10 |             self.call_parse_func(
11 |                 parser.parse_book_reference,
12 |                 u"le livre III"
13 |             ),
14 |             {'children': [
15 |                 {
16 |                     'type': u'book-reference',
17 |                     'order': 3
18 |                 }
19 |             ]}
20 |         )
21 | 
22 |     def test_book_2(self):
23 |         self.assertEqualAST(
24 |             self.call_parse_func(
25 |                 parser.parse_book_reference,
26 |                 u"du livre V"
27 |             ),
28 |             {'children': [
29 |                 {
30 |                     'type': u'book-reference',
31 |                     'order': 5
32 |                 }
33 |             ]}
34 |         )
35 | 


--------------------------------------------------------------------------------
/tests/ParseTitleReferenceTest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from DuralexTestCase import DuralexTestCase
 4 | 
 5 | import duralex.alinea_parser as parser
 6 | 
 7 | class ParseTitleReferenceTest(DuralexTestCase):
 8 |     def test_title(self):
 9 |         self.assertEqualAST(
10 |             self.call_parse_func(
11 |                 parser.parse_title_reference,
12 |                 u"le titre IV"
13 |             ),
14 |             {'children': [
15 |                 {
16 |                     'type': u'title-reference',
17 |                     'order': 4
18 |                 }
19 |             ]}
20 |         )
21 | 
22 |     def test_title_2(self):
23 |         self.assertEqualAST(
24 |             self.call_parse_func(
25 |                 parser.parse_title_reference,
26 |                 u"du titre IV"
27 |             ),
28 |             {'children': [
29 |                 {
30 |                     'type': u'title-reference',
31 |                     'order': 4
32 |                 }
33 |             ]}
34 |         )
35 | 


--------------------------------------------------------------------------------
/tests/ParseChapterReferenceTest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from DuralexTestCase import DuralexTestCase
 4 | 
 5 | import duralex.alinea_parser as parser
 6 | 
 7 | class ParseChapterReferenceTest(DuralexTestCase):
 8 |     def test_chapter(self):
 9 |         self.assertEqualAST(
10 |             self.call_parse_func(
11 |                 parser.parse_chapter_reference,
12 |                 u"le chapitre IV"
13 |             ),
14 |             {'children': [
15 |                 {
16 |                     'type': u'chapter-reference',
17 |                     'order': 4
18 |                 }
19 |             ]}
20 |         )
21 | 
22 |     def test_chapter_2(self):
23 |         self.assertEqualAST(
24 |             self.call_parse_func(
25 |                 parser.parse_chapter_reference,
26 |                 u"du chapitre IV"
27 |             ),
28 |             {'children': [
29 |                 {
30 |                     'type': u'chapter-reference',
31 |                     'order': 4
32 |                 }
33 |             ]}
34 |         )
35 | 


--------------------------------------------------------------------------------
/tests/ParseSectionReferenceTest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from DuralexTestCase import DuralexTestCase
 4 | 
 5 | import duralex.alinea_parser as parser
 6 | 
 7 | class ParseSectionReferenceTest(DuralexTestCase):
 8 |     def test_the_section_order(self):
 9 |         self.assertEqualAST(
10 |             self.call_parse_func(
11 |                 parser.parse_section_reference,
12 |                 "la section 2"
13 |             ),
14 |             {'children':[
15 |                 {
16 |                     'type': u'section-reference',
17 |                     'order': 2
18 |                 }
19 |             ]}
20 |         )
21 | 
22 |     def test_of_the_section_order(self):
23 |         self.assertEqualAST(
24 |             self.call_parse_func(
25 |                 parser.parse_section_reference,
26 |                 "de la section 2"
27 |             ),
28 |             {'children':[
29 |                 {
30 |                     'type': u'section-reference',
31 |                     'order': 2
32 |                 }
33 |             ]}
34 |         )
35 | 


--------------------------------------------------------------------------------
/tests/ParseParagraphReferenceTest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from DuralexTestCase import DuralexTestCase
 4 | 
 5 | import duralex.alinea_parser as parser
 6 | 
 7 | class ParseParagraphReferenceTest(DuralexTestCase):
 8 |     def test_paragraph(self):
 9 |         self.assertEqualAST(
10 |             self.call_parse_func(
11 |                 parser.parse_paragraph_reference,
12 |                 u"le paragraphe 42"
13 |             ),
14 |             {'children': [
15 |                 {
16 |                     'type': u'paragraph-reference',
17 |                     'order': 42
18 |                 }
19 |             ]}
20 |         )
21 | 
22 |     def test_paragraph_2(self):
23 |         self.assertEqualAST(
24 |             self.call_parse_func(
25 |                 parser.parse_paragraph_reference,
26 |                 u"du paragraphe 42"
27 |             ),
28 |             {'children': [
29 |                 {
30 |                     'type': u'paragraph-reference',
31 |                     'order': 42
32 |                 }
33 |             ]}
34 |         )
35 | 


--------------------------------------------------------------------------------
/tests/ParseSubSectionReferenceTest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from DuralexTestCase import DuralexTestCase
 4 | 
 5 | import duralex.alinea_parser as parser
 6 | 
 7 | class ParseSubSectionReferenceTest(DuralexTestCase):
 8 |     def test_the_subsection_order(self):
 9 |         self.assertEqualAST(
10 |             self.call_parse_func(
11 |                 parser.parse_subsection_reference,
12 |                 "la sous-section 2"
13 |             ),
14 |             {'children':[
15 |                 {
16 |                     'type': u'subsection-reference',
17 |                     'order': 2
18 |                 }
19 |             ]}
20 |         )
21 | 
22 |     def test_of_the_subsection_order(self):
23 |         self.assertEqualAST(
24 |             self.call_parse_func(
25 |                 parser.parse_subsection_reference,
26 |                 "de la sous-section 2"
27 |             ),
28 |             {'children':[
29 |                 {
30 |                     'type': u'subsection-reference',
31 |                     'order': 2
32 |                 }
33 |             ]}
34 |         )
35 | 


--------------------------------------------------------------------------------
/duralex/ForkEditVisitor.py:
--------------------------------------------------------------------------------
 1 | from duralex.AbstractVisitor import AbstractVisitor
 2 | 
 3 | from duralex.alinea_parser import *
 4 | 
 5 | import duralex.tree
 6 | 
 7 | class ForkEditVisitor(AbstractVisitor):
 8 |     def visit_node(self, node):
 9 |         if 'type' in node and node['type'] == 'edit' and 'children' in node and len(node['children']) > 1:
10 |             ref_nodes = [n for n in node['children'] if duralex.tree.is_reference(n)]
11 |             def_nodes = [n for n in node['children'] if duralex.tree.is_definition(n)]
12 |             edit_node = copy_node(node, recursive=False)
13 |             parent = node['parent']
14 |             remove_node(parent, node)
15 |             for ref_node in ref_nodes:
16 |                 if len(def_nodes) > 0:
17 |                     for def_node in def_nodes:
18 |                         ref_node = copy_node(ref_node)
19 |                         def_node = copy_node(def_node)
20 |                         fork = copy_node(edit_node)
21 |                         push_node(fork, ref_node)
22 |                         push_node(fork, def_node)
23 |                         push_node(parent, fork)
24 |                 else:
25 |                     ref_node = copy_node(ref_node)
26 |                     fork = copy_node(edit_node)
27 |                     push_node(fork, ref_node)
28 |                     push_node(parent, fork)
29 |         else:
30 |             super(ForkEditVisitor, self).visit_node(node)
31 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 
91 | data
92 | 


--------------------------------------------------------------------------------
/tests/ParseMultiplicativeAdverbTest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from DuralexTestCase import DuralexTestCase
 4 | 
 5 | import duralex.alinea_parser as parser
 6 | 
 7 | class ParseMultiplicativeAdverbTest(DuralexTestCase):
 8 |     def test_header2_bis(self):
 9 |         self.assertEqualAST(
10 |             self.call_parse_func(
11 |                 parser.parse_header2_definition,
12 |                 "un 3° bis"
13 |             ),
14 |             {'children':[
15 |                 {
16 |                     'type': u'header2-definition',
17 |                     'order': 3,
18 |                     'isBis': True
19 |                 }
20 |             ]}
21 |         )
22 | 
23 |     def test_header2_ter(self):
24 |         self.assertEqualAST(
25 |             self.call_parse_func(
26 |                 parser.parse_header2_definition,
27 |                 "un 3° ter"
28 |             ),
29 |             {'children':[
30 |                 {
31 |                     'type': u'header2-definition',
32 |                     'order': 3,
33 |                     'isTer': True
34 |                 }
35 |             ]}
36 |         )
37 | 
38 |     def test_header2_quater(self):
39 |         self.assertEqualAST(
40 |             self.call_parse_func(
41 |                 parser.parse_header2_definition,
42 |                 "un 3° quater"
43 |             ),
44 |             {'children':[
45 |                 {
46 |                     'type': u'header2-definition',
47 |                     'order': 3,
48 |                     'isQuater': True
49 |                 }
50 |             ]}
51 |         )
52 | 


--------------------------------------------------------------------------------
/duralex/FixMissingCodeOrLawReferenceVisitor.py:
--------------------------------------------------------------------------------
 1 | # -*- coding=utf-8 -*-
 2 | 
 3 | from duralex.AbstractVisitor import AbstractVisitor
 4 | 
 5 | from duralex.tree import *
 6 | 
 7 | # If an edit reference does not feature a code-reference or law-reference node, we won't be able to find the actual
 8 | # original text to apply the edits to. To fix this, this visitor will:
 9 | # * target only article-reference nodes with no law-reference or code-reference ancestor/descendant,
10 | # * find, copy and insert as its own child the first previous law-reference or code-reference, whichever comes first in
11 | #   reversed traversal.
12 | class FixMissingCodeOrLawReferenceVisitor(AbstractVisitor):
13 |     def __init__(self):
14 |         self.law_or_code_ref = None
15 |         super(FixMissingCodeOrLawReferenceVisitor, self).__init__()
16 | 
17 |     def visit_law_reference_node(self, node, post):
18 |         if post:
19 |             return
20 |         self.law_or_code_ref = node
21 | 
22 |     def visit_code_reference_node(self, node, post):
23 |         if post:
24 |             return
25 |         self.law_or_code_ref = node
26 | 
27 |     def visit_article_reference_node(self, node, post):
28 |         if post:
29 |             return
30 |         ancestor_refs = [n for n in get_node_ancestors(node) + get_node_descendants(node) if
31 |             not is_root(n) and n['type'] in [TYPE_CODE_REFERENCE, TYPE_LAW_REFERENCE]
32 |         ]
33 |         if len(ancestor_refs) == 0:
34 |             while len(node['children']) != 0:
35 |                 node = node['children'][0]
36 |             node['children'] = [copy_node(self.law_or_code_ref, False)]
37 | 


--------------------------------------------------------------------------------
/tests/ParseSubParagraphDefinitionTest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from DuralexTestCase import DuralexTestCase
 4 | 
 5 | import duralex.alinea_parser as parser
 6 | 
 7 | class ParseSubParagraphDefinitionTest(DuralexTestCase):
 8 |     def test_one_subparagraph_with_quote(self):
 9 |         self.assertEqualAST(
10 |             self.call_parse_func(
11 |                 parser.parse_subparagraph_definition,
12 |                 ("un sous-paragraphe ainsi rédigé : \n"
13 |                 "\"sous-paragraphe 1\"")
14 |             ),
15 |             {'children': [
16 |                 {
17 |                     'type': u'subparagraph-definition',
18 |                     'children': [
19 |                         {
20 |                             'type': u'quote',
21 |                             'words': u'sous-paragraphe 1'
22 |                         }
23 |                     ],
24 |                 }
25 |             ]}
26 |         )
27 | 
28 |     def test_one_subparagraph_order_with_quote(self):
29 |         self.assertEqualAST(
30 |             self.call_parse_func(
31 |                 parser.parse_subparagraph_definition,
32 |                 ("un sous-paragraphe 3 ainsi rédigé : \n"
33 |                 "\"sous-paragraphe 1\"")
34 |             ),
35 |             {'children': [
36 |                 {
37 |                     'type': u'subparagraph-definition',
38 |                     'order': 3,
39 |                     'children': [
40 |                         {
41 |                             'type': u'quote',
42 |                             'words': u'sous-paragraphe 1'
43 |                         }
44 |                     ],
45 |                 }
46 |             ]}
47 |         )
48 | 


--------------------------------------------------------------------------------
/tests/ForkReferenceVisitorTest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from DuralexTestCase import DuralexTestCase
 4 | 
 5 | from duralex.ForkReferenceVisitor import ForkReferenceVisitor
 6 | 
 7 | class ForkReferenceVisitorTest(DuralexTestCase):
 8 |     def test(self):
 9 |         self.assertEqualAST(
10 |             self.call_visitor(ForkReferenceVisitor, self.make_tree({'children': [
11 |                 {
12 |                     'type': u'alinea-reference',
13 |                     'order': 3,
14 |                     'children': [
15 |                         {
16 |                             'type': u'article-reference',
17 |                             'id': u'2'
18 |                         },
19 |                         {
20 |                             'type': u'article-reference',
21 |                             'id': u'3'
22 |                         }
23 |                     ]
24 |                 }
25 |             ]})),
26 |             {'children': [
27 |                 {
28 |                     'type': u'alinea-reference',
29 |                     'order': 3,
30 |                     'children': [
31 |                         {
32 |                             'id': u'2',
33 |                             'type': u'article-reference'
34 |                         }
35 |                     ],
36 |                 },
37 |                 {
38 |                     'order': 3,
39 |                     'type': u'alinea-reference',
40 |                     'children': [
41 |                         {
42 |                             'id': u'3',
43 |                             'type': u'article-reference'
44 |                         }
45 |                     ]
46 |                 }
47 |             ]}
48 |         )
49 | 


--------------------------------------------------------------------------------
/duralex/SortReferencesVisitor.py:
--------------------------------------------------------------------------------
 1 | from duralex.alinea_parser import *
 2 | 
 3 | from duralex.AbstractVisitor import AbstractVisitor
 4 | 
 5 | import duralex.tree
 6 | 
 7 | class SortReferencesVisitor(AbstractVisitor):
 8 |     def visit_node(self, node):
 9 |         if not self.sort_references(node):
10 |             super(SortReferencesVisitor, self).visit_node(node)
11 | 
12 |     def sort_references(self, node):
13 |         root_refs = filter_nodes(node, lambda n: duralex.tree.is_reference(n) and 'parent' in n and (not duralex.tree.is_reference(n['parent'])))
14 | 
15 |         if len(root_refs) == 0:
16 |             return False
17 | 
18 |         for root_ref in root_refs:
19 |             root_ref_parent = root_ref['parent']
20 |             refs = filter_nodes(root_ref, lambda n: duralex.tree.is_reference(n))
21 |             sorted_refs = sorted(refs, key=lambda r: duralex.tree.TYPE_REFERENCE.index(r['type']))
22 |             filtered_refs = [sorted_refs[0]]
23 |             for ref in sorted_refs:
24 |                 if 'parent' in ref:
25 |                     remove_node(ref['parent'], ref)
26 |                     # the deepest *-reference of the same type wins
27 |                     # FIXME: should we raise because we're not supposed to have the same *-reference twice?
28 |                     if ref['type'] == filtered_refs[-1]['type']:
29 |                         filtered_refs.pop()
30 |                     filtered_refs.append(ref)
31 |             for i in range(0, len(filtered_refs)):
32 |                 ref = filtered_refs[i]
33 |                 if i == 0:
34 |                     push_node(root_ref_parent, ref)
35 |                 else:
36 |                     push_node(filtered_refs[i - 1], ref)
37 | 
38 |         return True
39 | 


--------------------------------------------------------------------------------
/tests/ParseArticleDefinitionTest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from DuralexTestCase import DuralexTestCase
 4 | 
 5 | import duralex.alinea_parser as parser
 6 | 
 7 | class ParseArticleDefinitionTest(DuralexTestCase):
 8 |     def test_an_article(self):
 9 |         self.assertEqualAST(
10 |             self.call_parse_func(
11 |                 parser.parse_article_definition,
12 |                 "un article"
13 |             ),
14 |             {'children':[
15 |                 {
16 |                     'type': u'article-definition'
17 |                 }
18 |             ]}
19 |         )
20 | 
21 |     def test_an_article_2(self):
22 |         self.assertEqualAST(
23 |             self.call_parse_func(
24 |                 parser.parse_article_definition,
25 |                 "l'article"
26 |             ),
27 |             {'children':[
28 |                 {
29 |                     'type': u'article-definition'
30 |                 }
31 |             ]}
32 |         )
33 | 
34 |     def test_an_article_with_id(self):
35 |         self.assertEqualAST(
36 |             self.call_parse_func(
37 |                 parser.parse_article_definition,
38 |                 "un article 42"
39 |             ),
40 |             {'children':[
41 |                 {
42 |                     'type': u'article-definition',
43 |                     'id': u'42'
44 |                 }
45 |             ]}
46 |         )
47 | 
48 |     def test_an_article_with_id_2(self):
49 |         self.assertEqualAST(
50 |             self.call_parse_func(
51 |                 parser.parse_article_definition,
52 |                 "l'article 42"
53 |             ),
54 |             {'children':[
55 |                 {
56 |                     'type': u'article-definition',
57 |                     'id': u'42'
58 |                 }
59 |             ]}
60 |         )
61 | 


--------------------------------------------------------------------------------
/tests/ParseMentionDefinitionTest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from DuralexTestCase import DuralexTestCase
 4 | 
 5 | import duralex.alinea_parser as parser
 6 | 
 7 | class ParseMentionDefinitionTest(DuralexTestCase):
 8 |     def test_mention_with_single_quote(self):
 9 |         self.assertEqualAST(
10 |             self.call_parse_func(
11 |                 parser.parse_mention_definition,
12 |                 ("la mention : \"ceci est une mention\"")
13 |             ),
14 |             {'children': [
15 |                 {
16 |                     'type': u'mention-definition',
17 |                     'children': [
18 |                         {
19 |                             'type': u'quote',
20 |                             'words': u'ceci est une mention'
21 |                         }
22 |                     ]
23 |                 }
24 |             ]}
25 |         )
26 | 
27 |     def test_mention_with_n_quotes(self):
28 |         self.assertEqualAST(
29 |             self.call_parse_func(
30 |                 parser.parse_mention_definition,
31 |                 ("la mention : \n"
32 |                 "\"ceci est le début de la mention\"\n"
33 |                 "\"ceci est la suite de la mention\"")
34 |             ),
35 |             {'children': [
36 |                 {
37 |                     'type': u'mention-definition',
38 |                     'children': [
39 |                         {
40 |                             'type': u'quote',
41 |                             'words': u'ceci est le début de la mention'
42 |                         },
43 |                         {
44 |                             'type': u'quote',
45 |                             'words': u'ceci est la suite de la mention'
46 |                         }
47 |                     ]
48 |                 }
49 |             ]}
50 |         )
51 | 


--------------------------------------------------------------------------------
/tests/ParseHeader2ReferenceTest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from DuralexTestCase import DuralexTestCase
 4 | 
 5 | import duralex.alinea_parser as parser
 6 | 
 7 | class ParseHeader2ReferenceTest(DuralexTestCase):
 8 |     def test_header2_number(self):
 9 |         self.assertEqualAST(
10 |             self.call_parse_func(
11 |                 parser.parse_header2_reference,
12 |                 "au 42°"
13 |             ),
14 |             {'children':[
15 |                 {
16 |                     'type': u'header2-reference',
17 |                     'order': 42
18 |                 }
19 |             ]}
20 |         )
21 | 
22 |     def test_before_header2_number(self):
23 |         self.assertEqualAST(
24 |             self.call_parse_func(
25 |                 parser.parse_header2_reference,
26 |                 "avant le 1°"
27 |             ),
28 |             {'children':[
29 |                 {
30 |                     'type': u'header2-reference',
31 |                     'position': u'before',
32 |                     'order': 1
33 |                 }
34 |             ]}
35 |         )
36 | 
37 |     def test_header2_order_letter_adverb_article_code(self):
38 |         self.assertEqualAST(
39 |             self.call_parse_func(
40 |                 parser.parse_header2_reference,
41 |                 "le 3° de l'article L. 711-2 du code de l'éducation"
42 |             ),
43 |             {'children': [
44 |                 {
45 |                     'order': 3,
46 |                     'type': u'header2-reference',
47 |                     'children': [
48 |                         {
49 |                             'children': [
50 |                                 {
51 |                                     'id': u'code de l\'éducation',
52 |                                     'type': u'code-reference'
53 |                                 }
54 |                             ],
55 |                             'id': u'L. 711-2',
56 |                             'type': u'article-reference'
57 |                         }
58 |                     ],
59 |                 }
60 |             ]}
61 |         )
62 | 


--------------------------------------------------------------------------------
/duralex/alinea_lexer.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import re
 4 | 
 5 | TOKEN_DELIMITERS = re.compile(u'(\xa0|\s|\(|\)|\.|\!|\'|,|")')
 6 | TOKEN_NEW_LINE = '\n'
 7 | TOKEN_SINGLE_QUOTE = u'\''
 8 | TOKEN_DOUBLE_QUOTE_OPEN = u'"'
 9 | TOKEN_DOUBLE_QUOTE_CLOSE = u'"'
10 | TOKEN_MONTH_NAMES = [
11 |     u'janvier',
12 |     u'février',
13 |     u'mars',
14 |     u'avril',
15 |     u'mai',
16 |     u'juin',
17 |     u'juillet',
18 |     u'août',
19 |     u'septembre',
20 |     u'octobre',
21 |     u'novembre',
22 |     u'décembre'
23 | ]
24 | TOKEN_MULTIPLICATIVE_ADVERBS = [
25 |     u'bis',
26 |     u'ter',
27 |     u'quater',
28 |     u'quinquies',
29 |     u'sexies',
30 |     u'septies',
31 |     u'octies',
32 |     u'novies',
33 |     u'decies',
34 |     u'undecies',
35 |     u'duodecies',
36 |     u'terdecies',
37 |     u'quaterdecies',
38 |     u'quindecies',
39 |     u'sexdecies',
40 |     u'septdecies',
41 |     u'octodecies',
42 |     u'novodecies',
43 |     u'vicies',
44 |     u'unvicies',
45 |     u'duovicies',
46 |     u'tervicies',
47 |     u'quatervicies',
48 |     u'quinvicies',
49 |     u'sexvicies',
50 |     u'septvicies'
51 | ]
52 | 
def tokenize(text):
    """Split ``text`` into a list of tokens.

    The text is split on ``TOKEN_DELIMITERS``; delimiters are kept as tokens
    and empty strings are dropped. UTF-8 decoding of ``text`` is attempted
    first and skipped if it is not possible.
    """
    try:
        text = text.decode('utf-8')
    except (AttributeError, UnicodeError):
        # Best effort: the value has no decode() method (already text) or
        # cannot be decoded; tokenize it as it is.
        pass

    tokens = TOKEN_DELIMITERS.split(text)
    # remove empty strings
    return [s for s in tokens if s != '']
63 | 
def skip_tokens(tokens, i, f):
    """Return the first index at or after ``i`` whose token fails predicate ``f``.

    If every remaining token satisfies ``f``, or ``i`` is already past the
    end, the index where scanning stopped is returned.
    """
    position = i
    while True:
        if position >= len(tokens):
            break
        if not f(tokens[position]):
            break
        position += 1
    return position
68 | 
def skip_spaces(tokens, i):
    """Return the index of the first token at or after ``i`` that does not start with whitespace."""
    # Raw string avoids the invalid "\s" string escape; compiled once per call.
    spaces = re.compile(r'\s+')
    return skip_tokens(tokens, i, lambda t: spaces.match(t))
71 | 
def skip_to_next_word(tokens, i):
    """Return the index of the first token at or after ``i`` that starts with a word character."""
    # Unicode literal with an escaped backslash: no invalid "\w" string escape,
    # and "à" is a single character rather than raw bytes on Python 2.
    word = re.compile(u'[\\wà]+', re.IGNORECASE | re.UNICODE)
    return skip_tokens(tokens, i, lambda t: not word.match(t))
74 | 
def skip_to_token(tokens, i, token):
    """Return the index of the first token at or after ``i`` that equals ``token``."""
    def differs(candidate):
        return candidate != token

    return skip_tokens(tokens, i, differs)
77 | 
def skip_to_end_of_line(tokens, i):
    """Move to the next ``TOKEN_NEW_LINE`` token.

    ``i`` is returned unchanged when the token just before it is already a
    new line; otherwise the result of ``skip_to_token`` is returned.
    """
    just_after_new_line = (
        0 < i < len(tokens) and tokens[i - 1] == TOKEN_NEW_LINE
    )
    if just_after_new_line:
        return i
    return skip_to_token(tokens, i, TOKEN_NEW_LINE)
83 | 
def skip_to_quote_start(tokens, i):
    """Return the index of the next opening double quote at or after ``i``."""
    opening_quote = TOKEN_DOUBLE_QUOTE_OPEN
    return skip_to_token(tokens, i, opening_quote)
86 | 


--------------------------------------------------------------------------------
/tests/ParseSentenceDefinitionTest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from DuralexTestCase import DuralexTestCase
 4 | 
 5 | import duralex.alinea_parser as parser
 6 | 
 7 | class ParseSentenceDefinitionTest(DuralexTestCase):
 8 |     def test_one_sentence_with_quotes(self):
 9 |         self.assertEqualAST(
10 |             self.call_parse_func(
11 |                 parser.parse_sentence_definition,
12 |                 ("une phrase ainsi rédigée :\n"
13 |                 "\"phrase 1\"\n")
14 |             ),
15 |             {'children':[
16 |                 {
17 |                     'children': [
18 |                         {
19 |                             'type': 'quote',
20 |                             'words': 'phrase 1'
21 |                         }
22 |                     ],
23 |                     'type': 'sentence-definition'
24 |                 }
25 |             ]}
26 |         )
27 | 
28 |     def test_three_sentences_with_quotes(self):
29 |         self.assertEqualAST(
30 |             self.call_parse_func(
31 |                 parser.parse_sentence_definition,
32 |                 ("trois phrases ainsi rédigées :\n"
33 |                 "\"phrase 1\"\n"
34 |                 "\"phrase 2\"\n"
35 |                 "\"phrase 3\"\n")
36 |             ),
37 |             {'children':[
38 |                 {
39 |                     'children': [
40 |                         {
41 |                             'type': 'quote',
42 |                             'words': 'phrase 1'
43 |                         }
44 |                     ],
45 |                     'type': 'sentence-definition'
46 |                 },
47 |                 {
48 |                     'children': [
49 |                         {
50 |                             'type': 'quote',
51 |                             'words': 'phrase 2'
52 |                         }
53 |                     ],
54 |                     'type': 'sentence-definition'
55 |                 },
56 |                 {
57 |                     'children': [
58 |                         {
59 |                             'type': 'quote',
60 |                             'words': 'phrase 3'
61 |                         }
62 |                     ],
63 |                     'type': 'sentence-definition'
64 |                 }
65 |             ]}
66 |         )
67 | 


--------------------------------------------------------------------------------
/duralex/ResolveFullyQualifiedDefinitionsVisitor.py:
--------------------------------------------------------------------------------
 1 | from duralex.alinea_parser import *
 2 | 
 3 | from duralex.AbstractVisitor import AbstractVisitor
 4 | 
 5 | import duralex.tree
 6 | 
 7 | class ResolveFullyQualifiedDefinitionsVisitor(AbstractVisitor):
 8 |     def visit_node(self, node):
 9 |         self.resolve_fully_qualified_definitions(node)
10 |         super(ResolveFullyQualifiedDefinitionsVisitor, self).visit_node(node)
11 | 
12 |     def resolve_fully_qualified_definitions(self, node):
13 |         if 'type' in node and node['type'] == 'edit':
14 |             def_nodes = filter_nodes(node, lambda x : duralex.tree.is_definition(x))
15 |             # if we have more than 1 definition in a single edit, we assume:
16 |             # - they have different types
17 |             # - the final type of definition is the combination of all those types
18 |             if len(def_nodes) > 1:
19 |                 content_nodes = filter(lambda x : len(x['children']) > 0, def_nodes)
20 |                 type_nodes = filter(lambda x : len(x['children']) == 0, def_nodes)
21 |                 types = []
22 |                 for type_node in type_nodes:
23 |                     remove_node(node, type_node)
24 |                     types.append(type_node)
25 |                     del type_node['count']
26 |                     # if 'count' in type_node and type_node['count'] == len(content_nodes):
27 |                     # FIXME: else we should issue a warning because the count doesn't match and the type qualifier cannot
28 |                     # apply
29 |                 for content_node in content_nodes:
30 |                     children = []
31 |                     for child in content_node['children']:
32 |                         children.append(child)
33 |                         remove_node(content_node, child)
34 |                     remove_node(node, content_node)
35 |                     sorted_types = sorted(types + [content_node], key=lambda x : duralex.tree.TYPE_DEFINITION.index(x['type']))
36 |                     type_node = node
37 |                     for sorted_type in sorted_types:
38 |                         t = copy_node(sorted_type)
39 |                         push_node(type_node, t)
40 |                         type_node = t
41 |                     for child in children:
42 |                         push_node(type_node, child)
43 | 


--------------------------------------------------------------------------------
/tests/ParseHeader3ReferenceTest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from DuralexTestCase import DuralexTestCase
 4 | 
 5 | import duralex.alinea_parser as parser
 6 | 
 7 | class ParseHeader3ReferenceTest(DuralexTestCase):
 8 |     def test_header3(self):
 9 |         self.assertEqualAST(
10 |             self.call_parse_func(
11 |                 parser.parse_header3_reference,
12 |                 "au e"
13 |             ),
14 |             {'children':[
15 |                 {
16 |                     'type': u'header3-reference',
17 |                     'order': 5
18 |                 }
19 |             ]}
20 |         )
21 | 
22 |     def test_before_header3(self):
23 |         self.assertEqualAST(
24 |             self.call_parse_func(
25 |                 parser.parse_header3_reference,
26 |                 "avant le d"
27 |             ),
28 |             {'children':[
29 |                 {
30 |                     'type': u'header3-reference',
31 |                     'position': u'before',
32 |                     'order': 4
33 |                 }
34 |             ]}
35 |         )
36 | 
37 |     def test_header3_header2_article_code(self):
38 |         self.assertEqualAST(
39 |             self.call_parse_func(
40 |                 parser.parse_header3_reference,
41 |                 "le b du 3° de l'article L. 711-2 du code de l'éducation"
42 |             ),
43 |             {'children': [
44 |                 {
45 |                     'order': 2,
46 |                     'type': u'header3-reference',
47 |                     'children': [
48 |                         {
49 |                             'order': 3,
50 |                             'type': u'header2-reference',
51 |                             'children': [
52 |                                 {
53 |                                     'children': [
54 |                                         {
55 |                                             'id': u'code de l\'éducation',
56 |                                             'type': u'code-reference'
57 |                                         }
58 |                                     ],
59 |                                     'id': u'L. 711-2',
60 |                                     'type': u'article-reference'
61 |                                 }
62 |                             ],
63 |                         }
64 |                     ]
65 |                 }
66 |             ]}
67 |         )
68 | 


--------------------------------------------------------------------------------
/tests/main.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import unittest
 5 | 
 6 | from ParseHeader1Test import ParseHeader1Test
 7 | from ParseHeader2Test import ParseHeader2Test
 8 | from ParseHeader3Test import ParseHeader3Test
 9 | from ParseRawContentTest import ParseRawContentTest
10 | from ParseArticleReferenceTest import ParseArticleReferenceTest
11 | from ParseEditTest import ParseEditTest
12 | from ParseWordReferenceTest import ParseWordReferenceTest
13 | from ParseAlineaReferenceTest import ParseAlineaReferenceTest
14 | from ParseAlineaDefinitionTest import ParseAlineaDefinitionTest
15 | from ParseHeader2ReferenceTest import ParseHeader2ReferenceTest
16 | from ParseHeader2DefinitionTest import ParseHeader2DefinitionTest
17 | from ParseCodeReferenceTest import ParseCodeReferenceTest
18 | from ParseLawReferenceTest import ParseLawReferenceTest
19 | from ParseMultiplicativeAdverbTest import ParseMultiplicativeAdverbTest
20 | from ParseSentenceDefinitionTest import ParseSentenceDefinitionTest
21 | from ParseSentenceReferenceTest import ParseSentenceReferenceTest
22 | from ParseWordDefinitionTest import ParseWordDefinitionTest
23 | from ParseArticleDefinitionTest import ParseArticleDefinitionTest
24 | from ParseMentionDefinitionTest import ParseMentionDefinitionTest
25 | from ParseHeader1DefinitionTest import ParseHeader1DefinitionTest
26 | from ParseDefinitionListTest import ParseDefinitionListTest
27 | from ParseHeader3DefinitionTest import ParseHeader3DefinitionTest
28 | from ParseHeader3ReferenceTest import ParseHeader3ReferenceTest
29 | from ParseSectionReferenceTest import ParseSectionReferenceTest
30 | from ParseSubSectionReferenceTest import ParseSubSectionReferenceTest
31 | from ParseChapterReferenceTest import ParseChapterReferenceTest
32 | from ParseParagraphReferenceTest import ParseParagraphReferenceTest
33 | from ParseSubParagraphDefinitionTest import ParseSubParagraphDefinitionTest
34 | from ParseCodePartReferenceTest import ParseCodePartReferenceTest
35 | from ParseTitleReferenceTest import ParseTitleReferenceTest
36 | from ParseBookReferenceTest import ParseBookReferenceTest
37 | from ResolveFullyQualifiedReferencesVisitorTest import ResolveFullyQualifiedReferencesVisitorTest
38 | from SortReferencesVisitorTest import SortReferencesVisitorTest
39 | from ForkReferenceVisitorTest import ForkReferenceVisitorTest
40 | from ForkEditVisitorTest import ForkEditVisitorTest
41 | 
# Only run the suite when this file is executed as a script; unittest.main()
# picks up the TestCase classes imported above.
if __name__ == "__main__":
    unittest.main()
44 | 


--------------------------------------------------------------------------------
/tests/ParseAlineaDefinitionTest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from DuralexTestCase import DuralexTestCase
 4 | 
 5 | import duralex.alinea_parser as parser
 6 | 
 7 | class ParseAlineaDefinitionTest(DuralexTestCase):
 8 |     def test_one_alinea_with_quote(self):
 9 |         self.assertEqualAST(
10 |             self.call_parse_func(
11 |                 parser.parse_alinea_definition,
12 |                 ("un alinéa ainsi rédigé : \n"
13 |                 "\"alinéa 1\"")
14 |             ),
15 |             {'children': [
16 |                 {
17 |                     'type': u'alinea-definition',
18 |                     'children': [
19 |                         {
20 |                             'type': u'quote',
21 |                             'words': u'alinéa 1'
22 |                         }
23 |                     ],
24 |                 }
25 |             ]}
26 |         )
27 | 
28 |     def test_n_alineas_with_n_quotes(self):
29 |         self.assertEqualAST(
30 |             self.call_parse_func(
31 |                 parser.parse_alinea_definition,
32 |                 ("quatre alinéas ainsi rédigés : \n"
33 |                 "\"alinéa 1\"\n"
34 |                 "\"alinéa 2\"\n"
35 |                 "\"alinéa 3\"\n"
36 |                 "\"alinéa 4\"")
37 |             ),
38 |             {'children': [
39 |                 {
40 |                     'type': u'alinea-definition',
41 |                     'children': [
42 |                         {
43 |                             'type': u'quote',
44 |                             'words': u'alinéa 1'
45 |                         }
46 |                     ],
47 |                 },
48 |                 {
49 |                     'type': u'alinea-definition',
50 |                     'children': [
51 |                         {
52 |                             'type': u'quote',
53 |                             'words': u'alinéa 2'
54 |                         }
55 |                     ],
56 |                 },
57 |                 {
58 |                     'type': u'alinea-definition',
59 |                     'children': [
60 |                         {
61 |                             'type': u'quote',
62 |                             'words': u'alinéa 3'
63 |                         }
64 |                     ],
65 |                 },
66 |                 {
67 |                     'type': u'alinea-definition',
68 |                     'children': [
69 |                         {
70 |                             'type': u'quote',
71 |                             'words': u'alinéa 4'
72 |                         }
73 |                     ],
74 |                 }
75 |             ]}
76 |         )
77 | 


--------------------------------------------------------------------------------
/tests/ParseHeader3DefinitionTest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from DuralexTestCase import DuralexTestCase
 4 | 
 5 | import duralex.alinea_parser as parser
 6 | 
 7 | class ParseHeader3DefinitionTest(DuralexTestCase):
 8 |     def test_header3(self):
 9 |         self.assertEqualAST(
10 |             self.call_parse_func(
11 |                 parser.parse_header3_definition,
12 |                 ("un a")
13 |             ),
14 |             {'children': [
15 |                 {
16 |                     'type': u'header3-definition',
17 |                     'order': 1
18 |                 }
19 |             ]}
20 |         )
21 | 
22 |     def test_header3_2(self):
23 |         self.assertEqualAST(
24 |             self.call_parse_func(
25 |                 parser.parse_header3_definition,
26 |                 ("un e")
27 |             ),
28 |             {'children': [
29 |                 {
30 |                     'type': u'header3-definition',
31 |                     'order': 5
32 |                 }
33 |             ]}
34 |         )
35 | 
36 |     def test_scope_with_quotes(self):
37 |         self.assertEqualAST(
38 |             self.call_parse_func(
39 |                 parser.parse_header3_definition,
40 |                 (u"des c à e ainsi rédigés :\n"
41 |                 u"\"ceci est le contenu du header3 3\"\n"
42 |                 u"\"ceci est le contenu du header3 4\"\n"
43 |                 u"\"ceci est le contenu du header3 5\"")
44 |             ),
45 |             {'children': [
46 |                 {
47 |                     'type': u'header3-definition',
48 |                     'order': 3,
49 |                     'children': [
50 |                         {
51 |                             'type': u'quote',
52 |                             'words': u'ceci est le contenu du header3 3'
53 |                         }
54 |                     ],
55 |                 },
56 |                 {
57 |                     'type': u'header3-definition',
58 |                     'order': 4,
59 |                     'children': [
60 |                         {
61 |                             'type': u'quote',
62 |                             'words': u'ceci est le contenu du header3 4'
63 |                         }
64 |                     ],
65 |                 },
66 |                 {
67 |                     'type': u'header3-definition',
68 |                     'order': 5,
69 |                     'children': [
70 |                         {
71 |                             'type': u'quote',
72 |                             'words': u'ceci est le contenu du header3 5'
73 |                         }
74 |                     ],
75 |                 }
76 |             ]}
77 |         )
78 | 


--------------------------------------------------------------------------------
/tests/ParseCodeReferenceTest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from DuralexTestCase import DuralexTestCase
 4 | 
 5 | import duralex.alinea_parser as parser
 6 | 
 7 | class ParseCodeReferenceTest(DuralexTestCase):
 8 |     def test_code_with_name(self):
 9 |         self.assertEqualAST(
10 |             self.call_parse_func(
11 |                 parser.parse_code_reference,
12 |                 "le code de l\'éducation"
13 |             ),
14 |             {'children':[
15 |                 {
16 |                     'id': u'code de l\'éducation',
17 |                     'type': u'code-reference'
18 |                 }
19 |             ]}
20 |         )
21 | 
22 |     def test_code_with_name_2(self):
23 |         self.assertEqualAST(
24 |             self.call_parse_func(
25 |                 parser.parse_code_reference,
26 |                 "du code de l\'éducation"
27 |             ),
28 |             {'children':[
29 |                 {
30 |                     'id': u'code de l\'éducation',
31 |                     'type': u'code-reference'
32 |                 }
33 |             ]}
34 |         )
35 | 
36 |     def test_the_same_code(self):
37 |         self.assertEqualAST(
38 |             self.call_parse_func(
39 |                 parser.parse_code_reference,
40 |                 "le même code",
41 |                 {'children':[
42 |                     {
43 |                         'id': u'code de l\'éducation',
44 |                         'type': u'code-reference'
45 |                     }
46 |                 ]}
47 |             ),
48 |             {'children':[
49 |                 {
50 |                     'id': u'code de l\'éducation',
51 |                     'type': u'code-reference'
52 |                 },
53 |                 {
54 |                     'id': u'code de l\'éducation',
55 |                     'type': u'code-reference'
56 |                 }
57 |             ]}
58 |         )
59 | 
60 |     def test_the_same_code_2(self):
61 |         self.assertEqualAST(
62 |             self.call_parse_func(
63 |                 parser.parse_code_reference,
64 |                 "du même code",
65 |                 {'children':[
66 |                     {
67 |                         'id': u'code de l\'éducation',
68 |                         'type': u'code-reference'
69 |                     }
70 |                 ]}
71 |             ),
72 |             {'children':[
73 |                 {
74 |                     'id': u'code de l\'éducation',
75 |                     'type': u'code-reference'
76 |                 },
77 |                 {
78 |                     'id': u'code de l\'éducation',
79 |                     'type': u'code-reference'
80 |                 }
81 |             ]}
82 |         )
83 | 


--------------------------------------------------------------------------------
/duralex/amendment_parser.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import duralex.alinea_lexer as lexer
 4 | 
 5 | from duralex.bill_parser import clean_html
 6 | from duralex.tree import *
 7 | from duralex.alinea_parser import is_number_word, word_to_number, is_number, parse_int, parse_alineas
 8 | 
 9 | AMENDMENT_STATUS = {
10 |     u'rejeté': 'rejected',
11 |     u'retiré': 'removed',
12 |     u'non soutenu': 'undefended',
13 |     u'retiré avant séance': 'removed',
14 |     u'adopté': 'approved'
15 | }
16 | 
def parse(data, tree):
    """Parse every amendment listed in ``data`` with ``tree`` as its parent.

    ``data['amendements']`` is iterated and the ``'amendement'`` entry of each
    item is passed to ``parse_amendment``. Returns ``tree``.
    """
    for amendement in data['amendements']:
        parse_amendment(amendement['amendement'], tree)
    return tree
23 | 
def parse_amendment(data, parent):
    """Create an ``amendment`` node under ``parent`` from one amendment record.

    Returns the created node.
    """
    subject = data['sujet']
    text = clean_html(data['texte'])
    tokens = lexer.tokenize(subject + '\n' + text)

    signatories = []
    attributes = {
        'type': 'amendment',
        'id': data['numero'],
        'content': text,
        'status': AMENDMENT_STATUS[data['sort'].lower()],
        'description': clean_html(data['expose']),
        'signatories': signatories,
        'url': data['source']
    }
    for name in data['signataires'].split(', '):
        signatories.append({'name': name.strip()})
    node = create_node(parent, attributes)

    # The subject names the bill article this amendment targets. It is parsed
    # first so that the content can refer back to it, e.g. with "cet article"
    # ("this article").
    parse_subject(tokens, 0, node)
    parse_alineas(node['content'], node)
    # Whatever needed that bill article reference has its own copy by now, so
    # the node created for the subject is removed again.
    subject_node = node['children'][0]
    remove_node(node, subject_node)

    return node
48 | 
def parse_subject(tokens, i, parent):
    """Parse the amendment subject into a bill article reference node.

    A node of type ``TYPE_BILL_ARTICLE_REFERENCE`` is created under
    ``parent``. An optional position prefix is handled by
    ``parse_ref_position``; then ``ART. PREMIER`` / ``ART. {order}`` sets the
    node's ``'order'``.

    Returns the token index reached: the result of
    ``lexer.skip_to_end_of_line`` when an article order was parsed, otherwise
    the index after the optional position prefix.
    """
    node = create_node(parent, {
        'type': TYPE_BILL_ARTICLE_REFERENCE
    })

    # Defensive: keep the current index if no index came back.
    position = parse_ref_position(tokens, i, node)
    if position is not None:
        i = position

    # The order is read three tokens after "ART" (past "." and the space
    # that follows it), so make sure that token exists before reading it.
    if i + 3 < len(tokens) and tokens[i] == 'ART':
        order_token = tokens[i + 3]
        # ART. PREMIER
        if is_number_word(order_token):
            node['order'] = word_to_number(order_token)
            # skip_to_end_of_line returns an absolute index: assign, don't add.
            i = lexer.skip_to_end_of_line(tokens, i)
        # ART. {order}
        elif is_number(order_token):
            node['order'] = parse_int(order_token)
            i = lexer.skip_to_end_of_line(tokens, i)

    return i
66 | 
def parse_ref_position(tokens, i, node):
    """Parse an optional ``AVANT`` / ``APRÈS`` prefix at ``tokens[i]``.

    When the token is ``AVANT`` or ``APRÈS``, ``node['position']`` is set to
    ``'before'`` or ``'after'`` and two tokens are consumed (the keyword and
    the token after it).

    Returns the index of the next token to parse. ``i`` is returned unchanged
    when there is no prefix or when ``i`` is past the end of ``tokens``, so
    the result can always be used as an index by the caller.
    """
    if i >= len(tokens):
        return i

    if tokens[i] == u'AVANT':
        node['position'] = u'before'
        i += 2
    elif tokens[i] == u'APRÈS':
        node['position'] = u'after'
        i += 2

    return i
79 | 


--------------------------------------------------------------------------------
/duralex/diff_parser.py:
--------------------------------------------------------------------------------
 1 | # -*- coding=utf-8 -*-
 2 | 
 3 | import re
 4 | 
 5 | from unidiff import PatchSet
 6 | 
 7 | import duralex.tree
 8 | 
 9 | def parse(data, tree):
10 |     patches = PatchSet.from_string(data)
11 |     for patch in patches:
12 |         parse_patch(patch, tree)
13 | 
def parse_article_reference(patch, tree):
    """Build a law reference node with a nested article reference for ``patch``.

    The law and article ids are extracted from the patched file's name.
    Returns the law reference node, which is created under ``tree``.
    """
    # For an added file the source side is '/dev/null' and carries no ids;
    # the target file name is used instead.
    filename = patch.source_file
    if filename == '/dev/null':
        filename = patch.target_file

    law_ref = duralex.tree.create_node(tree, {
        'type': duralex.tree.TYPE_LAW_REFERENCE,
        'id': parse_law_id(filename),
    })

    # The article reference is created as a child of the law reference.
    duralex.tree.create_node(law_ref, {
        'type': duralex.tree.TYPE_ARTICLE_REFERENCE,
        'id': parse_article_id(filename),
    })

    return law_ref
26 | 
def parse_law_id(filename):
    """Return the law id found after ``loi_`` in ``filename``.

    The id is the run of digits and dashes that follows ``loi_``.
    """
    match = re.search(r"loi_([-0-9]+)", filename)
    return match.group(1)
29 | 
def parse_article_id(filename):
    """Return the article id found after ``Article_`` in ``filename``.

    The id is the run of digits and dashes located between ``Article_`` and
    the following dot.
    """
    match = re.search(r"Article_([-0-9]+)\.", filename)
    return match.group(1)
32 | 
def parse_patch(patch, tree):
    """Turn one file patch into an amendment node attached to ``tree``.

    A deleted file (target is ``/dev/null``) yields a single 'delete' edit and
    an added file (source is ``/dev/null``) a single 'add' edit, each holding
    the law reference of the patch. Any other patch is processed hunk by hunk
    with ``parse_hunk``.
    """
    amendment = duralex.tree.create_node(tree, {
        'type': duralex.tree.TYPE_AMENDMENT,
        'id': '1',
    })
    law_ref = parse_article_reference(patch, None)

    # Whole-file operations map to one edit; the deletion check comes first.
    if patch.target_file == '/dev/null':
        whole_file_edit = 'delete'
    elif patch.source_file == '/dev/null':
        whole_file_edit = 'add'
    else:
        whole_file_edit = None

    if whole_file_edit is None:
        for hunk in patch:
            parse_hunk(hunk, amendment, law_ref)
        return

    edit = duralex.tree.create_node(amendment, {
        'type': duralex.tree.TYPE_EDIT,
        'editType': whole_file_edit,
    })
    duralex.tree.push_node(edit, law_ref)
57 | 
def parse_hunk(hunk, parent, ref):
    """Convert the lines of a diff hunk into edit nodes pushed onto ``parent``.

    Consecutive lines sharing the same line type are grouped into one edit
    node that holds a copy of ``ref`` and a word definition with one quote per
    line. Runs of '+' lines become 'add' edits and runs of '-' lines become
    'delete' edits; runs of any other type get no ``editType`` and are
    therefore never pushed onto ``parent``.
    """
    edit_type_by_line_type = {'+': 'add', '-': 'delete'}

    def flush(pending):
        # Only edits that were given an editType are kept.
        if pending and "editType" in pending:
            duralex.tree.push_node(parent, pending)

    previous_type = ''
    current_edit = None
    words = None

    for line in hunk:
        if line.line_type != previous_type:
            # A new run starts: emit the previous edit and open a fresh one.
            flush(current_edit)
            current_edit = duralex.tree.create_node(None, {
                'type': duralex.tree.TYPE_EDIT,
            })
            duralex.tree.push_node(current_edit, duralex.tree.copy_node(ref))
            words = duralex.tree.create_node(current_edit, {
                'type': duralex.tree.TYPE_WORD_DEFINITION,
            })
            if line.line_type in edit_type_by_line_type:
                current_edit['editType'] = edit_type_by_line_type[line.line_type]
            previous_type = line.line_type

        duralex.tree.create_node(words, {
            'type': duralex.tree.TYPE_QUOTE,
            'words': line.value,
        })

    # The last run is still pending once the loop is over.
    flush(current_edit)
87 | 


--------------------------------------------------------------------------------
/tests/DuralexTestCase.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import unittest
 4 | import sys
 5 | import os
 6 | import json
 7 | import difflib
 8 | import uuid
 9 | 
10 | sys.path.insert(0, os.path.join(os.path.realpath(os.path.dirname(__file__)), '..'))
11 | 
12 | import duralex.alinea_parser as parser
13 | import duralex.alinea_lexer as lexer
14 | import duralex.tree
15 | 
16 | from duralex.DeleteEmptyChildrenVisitor import DeleteEmptyChildrenVisitor
17 | from duralex.DeleteParentVisitor import DeleteParentVisitor
18 | from duralex.DeleteUUIDVisitor import DeleteUUIDVisitor
19 | from duralex.AddParentVisitor import AddParentVisitor
20 | 
21 | from colorama import init, Fore
22 | 
23 | init()
24 | 
class DuralexTestCase(unittest.TestCase):
    """Base test case providing helpers to build, visit and compare AST dicts."""

    def pretty_diff_output(self, lines):
        """Return the diff ``lines`` as one colored, newline-separated string.

        Lines starting with '-' are red, lines starting with '+' are green and
        every line is followed by a color reset.
        """
        def colorize(line):
            marker = line[0]
            if marker == '-':
                color = Fore.RED
            elif marker == '+':
                color = Fore.GREEN
            else:
                color = Fore.RESET
            return color + line + Fore.RESET + '\n'

        return '\n' + ''.join(colorize(line) for line in lines)

    def call_parse_func(self, fn, data, tree=None):
        """Tokenize ``data``, run parse function ``fn`` on it and return the tree.

        A fresh root node is created when no (or a falsy) ``tree`` is given.
        """
        root = tree if tree else duralex.tree.create_node(None, {})
        fn(lexer.tokenize(data), 0, root)
        return root

    def add_parent(self, tree):
        """Run ``AddParentVisitor`` on ``tree`` and return it."""
        visitor = AddParentVisitor()
        visitor.visit(tree)
        return tree

    def add_children(self, tree):
        """Recursively give every node a 'children' list; return ``tree``."""
        for child in tree.setdefault('children', []):
            self.add_children(child)
        return tree

    def add_uuid(self, tree):
        """Recursively give every node lacking one a 'uuid'; return ``tree``."""
        if 'uuid' not in tree:
            tree['uuid'] = str(uuid.uuid4())
        for child in tree['children']:
            self.add_uuid(child)
        return tree

    def make_tree(self, tree):
        """Complete a hand-written tree with parents, children lists and uuids."""
        return self.add_uuid(self.add_children(self.add_parent(tree)))

    def call_visitor(self, visitor, tree):
        """Complete ``tree``, apply a new ``visitor`` instance and return it."""
        prepared = self.make_tree(tree)
        visitor().visit(prepared)
        return prepared

    def assertEqualAST(self, a, b):
        """Assert that the computed AST ``a`` equals the expected AST ``b``.

        Parents, empty children lists and uuids are stripped from both trees
        (in place) before their JSON dumps are compared; on mismatch the
        failure message holds the computed JSON and a colored diff.
        """
        for ast in (a, b):
            DeleteParentVisitor().visit(ast)
            DeleteEmptyChildrenVisitor().visit(ast)
            DeleteUUIDVisitor().visit(ast)

        computed = json.dumps(a, sort_keys=True, indent=2, ensure_ascii=False)
        expected = json.dumps(b, sort_keys=True, indent=2, ensure_ascii=False)

        diff_lines = list(difflib.unified_diff(
            computed.splitlines(),
            expected.splitlines(),
            fromfile='computed',
            tofile='expected'
        ))
        self.assertEqual(len(diff_lines), 0, '\n' + computed + self.pretty_diff_output(diff_lines))
89 | 


--------------------------------------------------------------------------------
/duralex/ResolveFullyQualifiedReferencesVisitor.py:
--------------------------------------------------------------------------------
 1 | from duralex.alinea_parser import *
 2 | 
 3 | from duralex.AbstractVisitor import AbstractVisitor
 4 | 
class ResolveFullyQualifiedReferencesVisitor(AbstractVisitor):
    """Visitor that gives nested edits the references of their context edit.

    When a node starts with an 'edit' child whose editType is 'edit', the
    references found in that child are kept on a context stack while the
    remaining children are visited; edits and root references met in the
    meantime are re-rooted under copies of those context references.
    """
    def __init__(self):
        # Stack of reference lists, one entry per context edit being processed.
        self.ctx = []
        super(ResolveFullyQualifiedReferencesVisitor, self).__init__()

    def visit_node(self, node):
        """Resolve ``node``; fall back to the default traversal if untouched."""
        if not self.resolve_fully_qualified_references(node):
            super(ResolveFullyQualifiedReferencesVisitor, self).visit_node(node)

    def resolve_fully_qualified_references(self, node):
        """Apply the context references to ``node`` when one of the cases fits.

        Returns True when ``node`` was handled here (the caller then skips the
        default traversal of its children), False otherwise.
        """
        # If we are on an edit node that has edit ancestors
        # if 'type' in node and len(filter(lambda x : x['type'] == 'edit', get_node_ancestors(node))) > 0:
        #     # FIXME
        #     None

        # If we have an 'edit' node in an 'edit' node, the parent gives its
        # context to its descendants.
        if (not duralex.tree.is_reference(node) and len(node['children']) >= 1 and node['children'][0]['type'] == 'edit'
            and node['children'][0]['editType'] == 'edit'
            and len(filter_nodes(node, lambda n: 'type' in n and n['type'] == 'edit')) > 1):
            # The first child of the leading edit holds the context references.
            context = node['children'][0]['children'][0]
            # The leading edit only carries context: detach it from the tree.
            remove_node(node, node['children'][0])
            # NOTE(review): the meaning of copy_node's second argument (False)
            # is not visible here -- confirm against duralex.tree.
            self.ctx.append([copy_node(ctx_node, False) for ctx_node in filter_nodes(context, lambda x: duralex.tree.is_reference(x))])
            for child in node['children']:
                self.visit_node(child)
            # The context only applies to the children visited above.
            self.ctx.pop()
            return True
        # If we have a context and there is no ref type at all
        # NOTE(review): the original comment also said "and we're not on a
        # 'swap' edit", but no such check is made in the condition below.
        elif len(self.ctx) > 0 and node['type'] == 'edit' and len(filter_nodes(node, lambda x : duralex.tree.is_reference(x))) == 0:
            # Flatten the whole context stack into fresh copies, ordered by the
            # position of their type in TYPE_REFERENCE.
            n = [copy_node(item) for sublist in self.ctx for item in sublist]
            n = sorted(n, key=lambda x : duralex.tree.TYPE_REFERENCE.index(x['type']))
            # Chain the references: the first goes under the edit, each
            # following one under its predecessor.
            unshift_node(node, n[0])
            for i in range(1, len(n)):
                unshift_node(n[i - 1], n[i])
            return True
        # If we have a context and we're on root ref type
        elif len(self.ctx) > 0 and duralex.tree.is_reference(node) and not duralex.tree.is_reference(node['parent']):
            # Same chain of context references as above, rooted in the parent.
            n = [copy_node(item) for sublist in self.ctx for item in sublist]
            n = sorted(n, key=lambda x : duralex.tree.TYPE_REFERENCE.index(x['type']))
            unshift_node(node['parent'], n[0])
            for i in range(1, len(n)):
                unshift_node(n[i - 1], n[i])
            # The original reference leaves its parent...
            remove_node(node['parent'], node)
            if node['type'] == 'incomplete-reference':
                # ...an incomplete reference only passes on its position to the
                # last reference of the chain...
                if 'position' in node:
                    n[len(n) - 1]['position'] = node['position']
            else:
                # ...while any other reference is re-attached under the last
                # reference of the chain.
                unshift_node(n[len(n) - 1], node)
            return True

        return False
56 | 


--------------------------------------------------------------------------------
/duralex/AbstractVisitor.py:
--------------------------------------------------------------------------------
 1 | import duralex.tree as tree
 2 | 
 3 | class AbstractVisitor(object):
 4 |     def __init__(self):
 5 |         self.visitors = {
 6 |             tree.TYPE_EDIT: self.visit_edit_node,
 7 |             tree.TYPE_CODE_REFERENCE: self.visit_code_reference_node,
 8 |             tree.TYPE_BOOK_REFERENCE: self.visit_book_reference_node,
 9 |             tree.TYPE_LAW_REFERENCE: self.visit_law_reference_node,
10 |             tree.TYPE_TITLE_REFERENCE: self.visit_title_reference_node,
11 |             tree.TYPE_ARTICLE_REFERENCE: self.visit_article_reference_node,
12 |             tree.TYPE_HEADER1_REFERENCE: self.visit_header1_reference_node,
13 |             tree.TYPE_HEADER2_REFERENCE: self.visit_header2_reference_node,
14 |             tree.TYPE_HEADER3_REFERENCE: self.visit_header3_reference_node,
15 |             tree.TYPE_ALINEA_REFERENCE: self.visit_alinea_reference_node,
16 |             tree.TYPE_SENTENCE_REFERENCE: self.visit_sentence_reference_node,
17 |             tree.TYPE_WORD_REFERENCE: self.visit_words_reference_node,
18 |             tree.TYPE_WORD_DEFINITION: self.visit_words_definition_node,
19 |             tree.TYPE_ARTICLE_DEFINITION: self.visit_article_definition_node,
20 |             tree.TYPE_QUOTE: self.visit_quote_node,
21 |             tree.TYPE_BILL_ARTICLE_REFERENCE: self.visit_bill_article_reference_node,
22 |             tree.TYPE_BILL_ARTICLE: self.visit_bill_article_node,
23 |         }
24 | 
25 |     def visit_code_reference_node(self, node, post):
26 |         pass
27 | 
28 |     def visit_book_reference_node(self, node, post):
29 |         pass
30 | 
31 |     def visit_law_reference_node(self, node, post):
32 |         pass
33 | 
34 |     def visit_title_reference_node(self, node, post):
35 |         pass
36 | 
37 |     def visit_article_reference_node(self, node, post):
38 |         pass
39 | 
40 |     def visit_header1_reference_node(self, node, post):
41 |         pass
42 | 
43 |     def visit_header2_reference_node(self, node, post):
44 |         pass
45 | 
46 |     def visit_header3_reference_node(self, node, post):
47 |         pass
48 | 
49 |     def visit_alinea_reference_node(self, node, post):
50 |         pass
51 | 
52 |     def visit_sentence_reference_node(self, node, post):
53 |         pass
54 | 
55 |     def visit_words_reference_node(self, node, post):
56 |         pass
57 | 
58 |     def visit_edit_node(self, node, post):
59 |         pass
60 | 
61 |     def visit_words_definition_node(self, node, post):
62 |         pass
63 | 
64 |     def visit_article_definition_node(self, node, post):
65 |         pass
66 | 
67 |     def visit_quote_node(self, node, post):
68 |         pass
69 | 
70 |     def visit_bill_article_reference_node(self, node, post):
71 |         pass
72 | 
73 |     def visit_bill_article_node(self, node, post):
74 |         pass
75 | 
76 |     def visit_node(self, node):
77 |         if 'type' in node and node['type'] in self.visitors:
78 |             self.visitors[node['type']](node, False)
79 | 
80 |         if 'children' in node:
81 |             for child in node['children']:
82 |                 self.visit_node(child)
83 | 
84 |         if 'type' in node and node['type'] in self.visitors:
85 |             self.visitors[node['type']](node, True)
86 | 
87 |     def visit(self, node):
88 |         self.visit_node(node)
89 | 


--------------------------------------------------------------------------------
/tests/ParseHeader1DefinitionTest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from DuralexTestCase import DuralexTestCase
 4 | 
 5 | import duralex.alinea_parser as parser
 6 | 
 7 | class ParseHeader1DefinitionTest(DuralexTestCase):
 8 |     def test_header1(self):
 9 |         self.assertEqualAST(
10 |             self.call_parse_func(
11 |                 parser.parse_header1_definition,
12 |                 ("un I")
13 |             ),
14 |             {'children': [
15 |                 {
16 |                     'type': u'header1-definition',
17 |                     'order': 1
18 |                 }
19 |             ]}
20 |         )
21 | 
22 |     def test_header1_2(self):
23 |         self.assertEqualAST(
24 |             self.call_parse_func(
25 |                 parser.parse_header1_definition,
26 |                 ("un IV")
27 |             ),
28 |             {'children': [
29 |                 {
30 |                     'type': u'header1-definition',
31 |                     'order': 4
32 |                 }
33 |             ]}
34 |         )
35 | 
36 |     def test_header1_with_quote(self):
37 |         self.assertEqualAST(
38 |             self.call_parse_func(
39 |                 parser.parse_header1_definition,
40 |                 ("un III ainsi rédigé :\n"
41 |                 "\"ceci est le contenu du header1\"")
42 |             ),
43 |             {'children': [
44 |                 {
45 |                     'type': u'header1-definition',
46 |                     'order': 3,
47 |                     'children': [
48 |                         {
49 |                             'type': u'quote',
50 |                             'words': u'ceci est le contenu du header1'
51 |                         }
52 |                     ],
53 |                 }
54 |             ]}
55 |         )
56 | 
57 |     def test_scope_with_quotes(self):
58 |         self.assertEqualAST(
59 |             self.call_parse_func(
60 |                 parser.parse_header1_definition,
61 |                 (u"des III à V ainsi rédigés :\n"
62 |                 u"\"ceci est le contenu du header1 3\"\n"
63 |                 u"\"ceci est le contenu du header1 4\"\n"
64 |                 u"\"ceci est le contenu du header1 5\"")
65 |             ),
66 |             {'children': [
67 |                 {
68 |                     'type': u'header1-definition',
69 |                     'order': 3,
70 |                     'children': [
71 |                         {
72 |                             'type': u'quote',
73 |                             'words': u'ceci est le contenu du header1 3'
74 |                         }
75 |                     ],
76 |                 },
77 |                 {
78 |                     'type': u'header1-definition',
79 |                     'order': 4,
80 |                     'children': [
81 |                         {
82 |                             'type': u'quote',
83 |                             'words': u'ceci est le contenu du header1 4'
84 |                         }
85 |                     ],
86 |                 },
87 |                 {
88 |                     'type': u'header1-definition',
89 |                     'order': 5,
90 |                     'children': [
91 |                         {
92 |                             'type': u'quote',
93 |                             'words': u'ceci est le contenu du header1 5'
94 |                         }
95 |                     ],
96 |                 }
97 |             ]}
98 |         )
99 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # DuraLex
  2 | 
  3 | [![Build Status](https://img.shields.io/travis/Legilibre/DuraLex.svg)](https://travis-ci.org/Legilibre/DuraLex)
  4 | [![Gitter](https://img.shields.io/gitter/room/nwjs/nw.js.svg)](https://gitter.im/Legilibre/DuraLex)
  5 | 
  6 | DuraLex is a French bill compiler. It takes an official bill document written in plain natural French and transforms
  7 | it into an automatable semantic data structure. This data structure describes the content of the bill, including but
  8 | not limited to:
  9 | 
 10 | * the id and type of the bill
 11 | * articles and sections/headers
 12 | * each edit with the corresponding operators (add, remove, replace...) and operands (words, articles...)
 13 | * references to existing laws, codes, articles, headers...
 14 | * definition of new articles, headers...
 15 | 
 16 | DuraLex is the backend for [SedLex](https://github.com/Legilibre/SedLex).
 17 | 
 18 | ## Installation
 19 | 
 20 | Requirements:
 21 | 
 22 | * Python 3+
 23 | * pip
 24 | 
 25 | ```bash
 26 | pip install -r requirements.txt
 27 | ```
 28 | 
 29 | ## Usage
 30 | 
 31 | ```bash
 32 | usage: duralex [-h] [--file FILE] [--url URL] [--amendments] [--quiet] [--uuid]
 33 | 
 34 | optional arguments:
 35 |   -h, --help            show this help message and exit
 36 |   --file FILE           the path of the bill to process
 37 |   --url URL             the URL of the bill to process
 38 |   --quiet               no stdout output
 39 |   --uuid                add a unique ID on each node
 40 |   --amendments          fetch and include amendments for the specified bill
 41 | ```
 42 | 
 43 | Examples:
 44 | 
 45 | ```bash
 46 | ./duralex --file pion1561.html
 47 | ```
 48 | ```bash
 49 | ./duralex --url http://www.assemblee-nationale.fr/14/propositions/pion1561.asp
 50 | ```
 51 | ```bash
 52 | curl -s http://www.assemblee-nationale.fr/14/propositions/pion1561.asp | ./duralex
 53 | ```
 54 | ```bash
 55 | cat pion1561.html | ./duralex
 56 | ```
 57 | 
 58 | ## Intermediary representation
 59 | 
 60 | ### Principle
 61 | 
 62 | DuraLex turns plain text into a standardized intermediary representation: a JSON tree structure.
 63 | This standardized intermediary representation can then be used as an input for other (third party) tools.
 64 | 
 65 | ![article to json](article_to_json.jpg)
 66 | 
 67 | ### Example
 68 | 
 69 | The following bill article:
 70 | 
 71 | ```
 72 | L'article 11 de la loi n° 78-753 du 17 juillet 1978 portant diverses mesures d'amélioration des relations entre l'administration et le public et diverses dispositions d'ordre administratif, social et fiscal est abrogé.
 73 | ```
 74 | 
 75 | will give the following intermediary representation:
 76 | 
 77 | ```json
 78 | {
 79 |   "children": [
 80 |     {
 81 |       "children": [
 82 |         {
 83 |           "children": [
 84 |             {
 85 |               "children": [
 86 |                 {
 87 |                   "id": "11",
 88 |                   "type": "article-reference"
 89 |                 }
 90 |               ],
 91 |               "lawDate": "1978-7-17",
 92 |               "id": "78-753",
 93 |               "type": "law-reference"
 94 |             }
 95 |           ],
 96 |           "editType": "delete",
 97 |           "type": "edit"
 98 |         }
 99 |       ],
100 |       "isNew": false,
101 |       "order": 1,
102 |       "type": "article"
103 |     }
104 |   ]
105 | }
106 | ```
107 | 
108 | ## Tests
109 | 
110 | ```bash
111 | cd tests
112 | python main.py
113 | ```
114 | 
115 | ## Related projects
116 | 
117 | * https://github.com/Legilibre/SedLex
118 | * https://github.com/Legilibre/NuitCodeCitoyen
119 | * https://github.com/Legilibre/Archeo-Lex
120 | * https://github.com/regardscitoyens/the-law-factory-parser
121 | 


--------------------------------------------------------------------------------
/tests/ParseWordDefinitionTest.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from DuralexTestCase import DuralexTestCase
  4 | 
  5 | import duralex.alinea_parser as parser
  6 | 
  7 | class ParseWordDefinitionTest(DuralexTestCase):
  8 |     def test_the_word(self):
  9 |         self.assertEqualAST(
 10 |             self.call_parse_func(
 11 |                 parser.parse_word_definition,
 12 |                 ("le mot \"test\"")
 13 |             ),
 14 |             {'children':[
 15 |                 {
 16 |                     'type': u'word-definition',
 17 |                     'children': [
 18 |                         {
 19 |                             'type': u'quote',
 20 |                             'words': u'test'
 21 |                         }
 22 |                     ]
 23 |                 }
 24 |             ]}
 25 |         )
 26 | 
 27 |     def test_the_words(self):
 28 |         self.assertEqualAST(
 29 |             self.call_parse_func(
 30 |                 parser.parse_word_definition,
 31 |                 ("les mots \"ceci est un test\"")
 32 |             ),
 33 |             {'children':[
 34 |                 {
 35 |                     'type': u'word-definition',
 36 |                     'children': [
 37 |                         {
 38 |                             'type': u'quote',
 39 |                             'words': u'ceci est un test'
 40 |                         }
 41 |                     ]
 42 |                 }
 43 |             ]}
 44 |         )
 45 | 
 46 |     def test_the_number(self):
 47 |         self.assertEqualAST(
 48 |             self.call_parse_func(
 49 |                 parser.parse_word_definition,
 50 |                 ("le nombre \"42\"")
 51 |             ),
 52 |             {'children':[
 53 |                 {
 54 |                     'type': u'word-definition',
 55 |                     'children': [
 56 |                         {
 57 |                             'type': u'quote',
 58 |                             'words': u'42'
 59 |                         }
 60 |                     ]
 61 |                 }
 62 |             ]}
 63 |         )
 64 | 
 65 |     def test_the_figure(self):
 66 |         self.assertEqualAST(
 67 |             self.call_parse_func(
 68 |                 parser.parse_word_definition,
 69 |                 ("le nombre \"4\"")
 70 |             ),
 71 |             {'children':[
 72 |                 {
 73 |                     'type': u'word-definition',
 74 |                     'children': [
 75 |                         {
 76 |                             'type': u'quote',
 77 |                             'words': u'4'
 78 |                         }
 79 |                     ]
 80 |                 }
 81 |             ]}
 82 |         )
 83 | 
 84 |     def test_the_reference(self):
 85 |         self.assertEqualAST(
 86 |             self.call_parse_func(
 87 |                 parser.parse_word_definition,
 88 |                 ("la référence \"ceci est une référence\"")
 89 |             ),
 90 |             {'children':[
 91 |                 {
 92 |                     'type': u'word-definition',
 93 |                     'children': [
 94 |                         {
 95 |                             'type': u'quote',
 96 |                             'words': u'ceci est une référence'
 97 |                         }
 98 |                     ]
 99 |                 }
100 |             ]}
101 |         )
102 | 
103 |     def test_the_references(self):
104 |         self.assertEqualAST(
105 |             self.call_parse_func(
106 |                 parser.parse_word_definition,
107 |                 ("la références \"ceci est une référence\"")
108 |             ),
109 |             {'children':[
110 |                 {
111 |                     'type': u'word-definition',
112 |                     'children': [
113 |                         {
114 |                             'type': u'quote',
115 |                             'words': u'ceci est une référence'
116 |                         }
117 |                     ]
118 |                 }
119 |             ]}
120 |         )
121 | 


--------------------------------------------------------------------------------
/tests/ParseSentenceReferenceTest.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from DuralexTestCase import DuralexTestCase
  4 | 
  5 | import duralex.alinea_parser as parser
  6 | 
  7 | class ParseSentenceReferenceTest(DuralexTestCase):
  8 |     def test_position_sentence(self):
  9 |         self.assertEqualAST(
 10 |             self.call_parse_func(
 11 |                 parser.parse_sentence_reference,
 12 |                 u"la première phrase"
 13 |             ),
 14 |             {'children':[
 15 |                 {
 16 |                     'type': u'sentence-reference',
 17 |                     'order': 1
 18 |                 }
 19 |             ]}
 20 |         )
 21 | 
 22 |     def test_position_sentence_2(self):
 23 |         self.assertEqualAST(
 24 |             self.call_parse_func(
 25 |                 parser.parse_sentence_reference,
 26 |                 u"à la première phrase"
 27 |             ),
 28 |             {'children':[
 29 |                 {
 30 |                     'type': u'sentence-reference',
 31 |                     'order': 1
 32 |                 }
 33 |             ]}
 34 |         )
 35 | 
 36 |     def test_position_sentence_3(self):
 37 |         self.assertEqualAST(
 38 |             self.call_parse_func(
 39 |                 parser.parse_sentence_reference,
 40 |                 u"la seconde phrase"
 41 |             ),
 42 |             {'children':[
 43 |                 {
 44 |                     'type': u'sentence-reference',
 45 |                     'order': 2
 46 |                 }
 47 |             ]}
 48 |         )
 49 | 
 50 |     def test_position_sentence_article_id_code_name(self):
 51 |         self.assertEqualAST(
 52 |             self.call_parse_func(
 53 |                 parser.parse_sentence_reference,
 54 |                 u"la première phrase de l'article L. 114-5"
 55 |             ),
 56 |             {'children':[
 57 |                 {
 58 |                     'type': u'sentence-reference',
 59 |                     'order': 1,
 60 |                     'children': [
 61 |                         {
 62 |                             'type': u'article-reference',
 63 |                             'id': u'L. 114-5'
 64 |                         }
 65 |                     ]
 66 |                 }
 67 |             ]}
 68 |         )
 69 | 
 70 |     def test_position_sentence_article_id_code_name(self):
 71 |         self.assertEqualAST(
 72 |             self.call_parse_func(
 73 |                 parser.parse_sentence_reference,
 74 |                 u"la première phrase de l'article L. 114-5 du code de la recherche"
 75 |             ),
 76 |             {'children':[
 77 |                 {
 78 |                     'type': u'sentence-reference',
 79 |                     'order': 1,
 80 |                     'children': [
 81 |                         {
 82 |                             'type': u'article-reference',
 83 |                             'id': u'L. 114-5',
 84 |                             'children': [
 85 |                                 {
 86 |                                     'type': u'code-reference',
 87 |                                     'id': u'code de la recherche'
 88 |                                 }
 89 |                             ]
 90 |                         }
 91 |                     ]
 92 |                 }
 93 |             ]}
 94 |         )
 95 | 
 96 |     def test_the_end_of_the_nth_sentence(self):
 97 |         self.assertEqualAST(
 98 |             self.call_parse_func(
 99 |                 parser.parse_sentence_reference,
100 |                 u"la fin de la première phrase"
101 |             ),
102 |             {'children':[
103 |                 {
104 |                     'type': u'sentence-reference',
105 |                     'scope': 'end',
106 |                     'order': 1,
107 |                 }
108 |             ]}
109 |         )
110 | 
111 |     def test_the_first_two_sentences(self):
112 |         self.assertEqualAST(
113 |             self.call_parse_func(
114 |                 parser.parse_sentence_reference,
115 |                 u"les deux premières phrases"
116 |             ),
117 |             {'children':[
118 |                 {
119 |                     'type': u'sentence-reference',
120 |                     'order': [0, 2]
121 |                 }
122 |             ]}
123 |         )
124 | 


--------------------------------------------------------------------------------
/tests/ParseHeader2DefinitionTest.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from DuralexTestCase import DuralexTestCase
  4 | 
  5 | import duralex.alinea_parser as parser
  6 | 
  7 | class ParseHeader2DefinitionTest(DuralexTestCase):
  8 |     def test_header2_order_with_quote(self):
  9 |         self.assertEqualAST(
 10 |             self.call_parse_func(
 11 |                 parser.parse_header2_definition,
 12 |                 "un 1° ainsi rédigé : \n\"ceci est un test\""
 13 |             ),
 14 |             {'children': [
 15 |                 {
 16 |                     'type': u'header2-definition',
 17 |                     'order': 1,
 18 |                     'children': [
 19 |                         {
 20 |                             'type': u'quote',
 21 |                             'words': u'ceci est un test'
 22 |                         }
 23 |                     ],
 24 |                 }
 25 |             ]}
 26 |         )
 27 | 
 28 |     def test_header2_ellipsis_with_quote(self):
 29 |         self.assertEqualAST(
 30 |             self.call_parse_func(
 31 |                 parser.parse_header2_definition,
 32 |                 "un ... ° ainsi rédigé : \n\"ceci est un test\""
 33 |             ),
 34 |             {'children': [
 35 |                 {
 36 |                     'type': u'header2-definition',
 37 |                     'order': '...',
 38 |                     'children': [
 39 |                         {
 40 |                             'type': u'quote',
 41 |                             'words': u'ceci est un test'
 42 |                         }
 43 |                     ],
 44 |                 }
 45 |             ]}
 46 |         )
 47 | 
 48 |     def test_header2_order_suborder(self):
 49 |         self.assertEqualAST(
 50 |             self.call_parse_func(
 51 |                 parser.parse_header2_definition,
 52 |                 "un 1° A bis"
 53 |             ),
 54 |             {'children': [
 55 |                 {
 56 |                     'type': u'header2-definition',
 57 |                     'order': 1,
 58 |                     'isBis': True,
 59 |                     'subOrder': 'A'
 60 |                 }
 61 |             ]}
 62 |         )
 63 | 
 64 |     def test_scope_with_quotes(self):
 65 |         self.assertEqualAST(
 66 |             self.call_parse_func(
 67 |                 parser.parse_header2_definition,
 68 |                 (u"des 5° à 8° ainsi rédigés :\n"
 69 |                 u"\"ceci est le contenu du header2 5\"\n"
 70 |                 u"\"ceci est le contenu du header2 6\"\n"
 71 |                 u"\"ceci est le contenu du header2 7\"\n"
 72 |                 u"\"ceci est le contenu du header2 8\"")
 73 |             ),
 74 |             {'children': [
 75 |                 {
 76 |                     'type': u'header2-definition',
 77 |                     'order': 5,
 78 |                     'children': [
 79 |                         {
 80 |                             'type': u'quote',
 81 |                             'words': u'ceci est le contenu du header2 5'
 82 |                         }
 83 |                     ],
 84 |                 },
 85 |                 {
 86 |                     'type': u'header2-definition',
 87 |                     'order': 6,
 88 |                     'children': [
 89 |                         {
 90 |                             'type': u'quote',
 91 |                             'words': u'ceci est le contenu du header2 6'
 92 |                         }
 93 |                     ],
 94 |                 },
 95 |                 {
 96 |                     'type': u'header2-definition',
 97 |                     'order': 7,
 98 |                     'children': [
 99 |                         {
100 |                             'type': u'quote',
101 |                             'words': u'ceci est le contenu du header2 7'
102 |                         }
103 |                     ],
104 |                 },
105 |                 {
106 |                     'type': u'header2-definition',
107 |                     'order': 8,
108 |                     'children': [
109 |                         {
110 |                             'type': u'quote',
111 |                             'words': u'ceci est le contenu du header2 8'
112 |                         }
113 |                     ],
114 |                 },
115 |             ]}
116 |         )
117 | 


--------------------------------------------------------------------------------
/bin/duralex:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding=utf-8 -*-
  3 | 
  4 | import codecs
  5 | import os
  6 | import json
  7 | import sys
  8 | import argparse
  9 | 
 10 | import requests
 11 | 
 12 | sys.path.insert(0, os.path.join(os.path.realpath(os.path.dirname(__file__)), '..'))
 13 | 
 14 | import duralex.alinea_parser
 15 | import duralex.bill_parser
 16 | import duralex.amendment_parser
 17 | import duralex.diff_parser
 18 | from duralex.DeleteEmptyChildrenVisitor import DeleteEmptyChildrenVisitor
 19 | from duralex.DeleteParentVisitor import DeleteParentVisitor
 20 | from duralex.DeleteUUIDVisitor import DeleteUUIDVisitor
 21 | from duralex.ForkReferenceVisitor import ForkReferenceVisitor
 22 | from duralex.SortReferencesVisitor import SortReferencesVisitor
 23 | from duralex.ResolveFullyQualifiedReferencesVisitor import ResolveFullyQualifiedReferencesVisitor
 24 | from duralex.ResolveFullyQualifiedDefinitionsVisitor import ResolveFullyQualifiedDefinitionsVisitor
 25 | from duralex.RemoveQuotePrefixVisitor import RemoveQuotePrefixVisitor
 26 | from duralex.FixMissingCodeOrLawReferenceVisitor import FixMissingCodeOrLawReferenceVisitor
 27 | from duralex.SwapDefinitionAndReferenceVisitor import SwapDefinitionAndReferenceVisitor
 28 | 
 29 | def decode(data, encoding = None):
 30 |     if encoding:
 31 |         return data.decode(encoding)
 32 | 
 33 |     try:
 34 |         data = data.decode('utf-8')
 35 |     except:
 36 |         try:
 37 |             data = data.decode('iso-8859-1')
 38 |         except:
 39 |             pass
 40 | 
 41 |     return data
 42 | 
 43 | def handle_data(data, args):
 44 |     if data.startswith('diff'):
 45 |         tree = duralex.tree.create_node(None, {})
 46 |         duralex.diff_parser.parse(data, tree)
 47 |     else:
 48 |         bill_data = duralex.bill_parser.parse_bill(data, args.url)
 49 |         tree = duralex.tree.create_node(None, {})
 50 |         for field in ['id', 'type', 'legislature', 'url', 'description', 'date', 'place']:
 51 |             if field in bill_data:
 52 |                 tree[field] = bill_data[field]
 53 | 
 54 |         duralex.alinea_parser.parse(bill_data, tree)
 55 | 
 56 |         if args.amendments:
 57 |             if args.amendments == '-':
 58 |                 amendment_url = (
 59 |                     'https://www.nosdeputes.fr/'
 60 |                     + str(bill_data['legislature'])
 61 |                     + '/amendements/'
 62 |                     + str(bill_data['id'])
 63 |                     + '/json'
 64 |                 )
 65 |                 amendments = requests.get(amendment_url).text
 66 |             else:
 67 |                 amendments = open(args.amendments, 'r').read()
 68 |             amendments = decode(amendments)
 69 |             amendments = json.loads(amendments)
 70 |             duralex.amendment_parser.parse(amendments, tree)
 71 | 
 72 |     ForkReferenceVisitor().visit(tree)
 73 |     ResolveFullyQualifiedDefinitionsVisitor().visit(tree)
 74 |     ResolveFullyQualifiedReferencesVisitor().visit(tree)
 75 |     FixMissingCodeOrLawReferenceVisitor().visit(tree)
 76 |     SortReferencesVisitor().visit(tree)
 77 |     SwapDefinitionAndReferenceVisitor().visit(tree)
 78 |     RemoveQuotePrefixVisitor().visit(tree)
 79 | 
 80 |     if not args.uuid:
 81 |         DeleteUUIDVisitor().visit(tree)
 82 | 
 83 |     DeleteParentVisitor().visit(tree)
 84 |     DeleteEmptyChildrenVisitor().visit(tree)
 85 | 
 86 |     if not args.quiet:
 87 |         json_data = json.dumps(tree, sort_keys=True, indent=2, ensure_ascii=False)
 88 |         sys.stdout.write(json_data)
 89 | 
 90 | def main(argv=None):
 91 |     parser = argparse.ArgumentParser(prog='duralex')
 92 |     parser.add_argument('--file', help='the path of the bill to process', type=argparse.FileType('r'), default='-')
 93 |     parser.add_argument('--url', help='the URL of the bill to process')
 94 |     parser.add_argument('--quiet', action='store_true', help='no stdout output')
 95 |     parser.add_argument('--uuid', action='store_true', help='add a unique ID on each node')
 96 |     parser.add_argument('--amendments', nargs='?', const='-', default=False, help='fetch and parse amendements')
 97 |     parser.add_argument('--debug', action='store_true')
 98 | 
 99 |     args = parser.parse_args()
100 | 
101 |     if args.url:
102 |         res = requests.get(args.url)
103 |         data = decode(res.content, res.apparent_encoding)
104 |     elif args.file:
105 |         data = decode(args.file.read())
106 | 
107 |     handle_data(data, args)
108 | 
109 |     return 0
110 | 
if __name__ == "__main__":
    # Fail early with an explicit message when run by a Python 2 interpreter.
    if sys.version_info < (3, 0):
        current_version = '.'.join(str(part) for part in sys.version_info[:2])
        raise Exception(
            'DuraLex requires Python 3.0+, current version is ' + current_version
        )

    exit_status = main()
    sys.exit(exit_status)
119 | 


--------------------------------------------------------------------------------
/duralex/tree.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | import uuid
  4 | 
# String identifiers for node types that are neither definitions nor
# references.
TYPE_HEADER1        = u'header1'
TYPE_HEADER2        = u'header2'
TYPE_HEADER3        = u'header3'
TYPE_BILL_ARTICLE   = u'bill-article'
TYPE_AMENDMENT      = u'amendment'
TYPE_EDIT           = u'edit'
TYPE_QUOTE          = u'quote'
TYPE_LAW_PROJECT    = u'law-project'
TYPE_LAW_PROPOSAL   = u'law-proposal'

# String identifiers for "definition" node types.
TYPE_TITLE_DEFINITION           = u'title-definition'
TYPE_ARTICLE_DEFINITION         = u'article-definition'
TYPE_HEADER1_DEFINITION         = u'header1-definition'
TYPE_HEADER2_DEFINITION         = u'header2-definition'
TYPE_HEADER3_DEFINITION         = u'header3-definition'
TYPE_SUBPARAGRAPH_DEFINITION    = u'subparagraph-definition'
TYPE_ALINEA_DEFINITION          = u'alinea-definition'
TYPE_SENTENCE_DEFINITION        = u'sentence-definition'
TYPE_MENTION_DEFINITION         = u'mention-definition'
TYPE_WORD_DEFINITION            = u'word-definition'

# Every definition type; is_definition() tests a node's 'type' against it.
TYPE_DEFINITION = [
    TYPE_TITLE_DEFINITION,
    TYPE_ARTICLE_DEFINITION,
    TYPE_HEADER1_DEFINITION,
    TYPE_HEADER2_DEFINITION,
    TYPE_HEADER3_DEFINITION,
    TYPE_SUBPARAGRAPH_DEFINITION,
    TYPE_ALINEA_DEFINITION,
    TYPE_SENTENCE_DEFINITION,
    TYPE_MENTION_DEFINITION,
    TYPE_WORD_DEFINITION,
]

# String identifiers for "reference" node types.
TYPE_BILL_ARTICLE_REFERENCE = u'bill-article-reference'
TYPE_CODE_REFERENCE         = u'code-reference'
TYPE_CODE_PART_REFERENCE    = u'code-part-reference'
TYPE_BOOK_REFERENCE         = u'book-reference'
TYPE_LAW_REFERENCE          = u'law-reference'
TYPE_TITLE_REFERENCE        = u'title-reference'
TYPE_CHAPTER_REFERENCE      = u'chapter-reference'
TYPE_SECTION_REFERENCE      = u'section-reference'
TYPE_SUBSECTION_REFERENCE   = u'subsection-reference'
TYPE_PARAGRAPH_REFERENCE    = u'paragraph-reference'
TYPE_ARTICLE_REFERENCE      = u'article-reference'
TYPE_HEADER1_REFERENCE      = u'header1-reference'
TYPE_HEADER2_REFERENCE      = u'header2-reference'
TYPE_HEADER3_REFERENCE      = u'header3-reference'
TYPE_ALINEA_REFERENCE       = u'alinea-reference'
TYPE_SENTENCE_REFERENCE     = u'sentence-reference'
TYPE_WORD_REFERENCE         = u'word-reference'
TYPE_INCOMPLETE_REFERENCE   = u'incomplete-reference'

# Every reference type; is_reference() tests a node's 'type' against it.
TYPE_REFERENCE = [
    TYPE_CODE_REFERENCE,
    TYPE_CODE_PART_REFERENCE,
    TYPE_BOOK_REFERENCE,
    TYPE_LAW_REFERENCE,
    TYPE_TITLE_REFERENCE,
    TYPE_CHAPTER_REFERENCE,
    TYPE_SECTION_REFERENCE,
    TYPE_SUBSECTION_REFERENCE,
    TYPE_PARAGRAPH_REFERENCE,
    TYPE_ARTICLE_REFERENCE,
    TYPE_HEADER1_REFERENCE,
    TYPE_HEADER2_REFERENCE,
    TYPE_HEADER3_REFERENCE,
    TYPE_ALINEA_REFERENCE,
    TYPE_SENTENCE_REFERENCE,
    TYPE_WORD_REFERENCE,
    TYPE_INCOMPLETE_REFERENCE,
    TYPE_BILL_ARTICLE_REFERENCE,
]
 78 | 
 79 | def unshift_node(parent, node):
 80 |     node['parent'] = parent
 81 |     if 'children' not in parent:
 82 |         parent['children'] = []
 83 |     parent['children'] = [node] + parent['children']
 84 | 
 85 | def push_node(parent, node):
 86 |     if 'parent' in node:
 87 |         remove_node(node['parent'], node)
 88 |     node['parent'] = parent
 89 |     if 'children' not in parent:
 90 |         parent['children'] = []
 91 |     parent['children'].append(node)
 92 | 
 93 | def create_node(parent, node):
 94 |     if 'children' not in node:
 95 |         node['children'] = []
 96 |     node['uuid'] = str(uuid.uuid4())
 97 | 
 98 |     if parent:
 99 |         push_node(parent, node)
100 | 
101 |     return node
102 | 
def compare_nodes(a, b):
    """Tell whether *a* and *b* are the same node.

    When both carry a 'uuid' only the uuids are compared; otherwise the two
    objects are compared with ``==``.
    """
    if 'uuid' in a and 'uuid' in b:
        return a['uuid'] == b['uuid']
    return a == b
105 | 
def remove_node(parent, node):
    """Detach *node* from *parent*.

    Returns True when a matching child (per compare_nodes) was found and
    removed, in which case ``node['parent']`` is deleted too; returns False
    when no child matched.

    Raises:
        Exception: when *parent* is falsy, or when *node* has no 'parent' or
            its 'parent' differs from *parent*.
    """
    if not parent:
        raise Exception('invalid parent')
    if 'parent' not in node:
        raise Exception('parent node does not match')
    if node['parent'] != parent:
        raise Exception('parent node does not match')

    siblings = parent['children']
    for index, sibling in enumerate(siblings):
        if compare_nodes(node, sibling):
            # Only the first match is removed; stop right after mutating.
            del siblings[index]
            del node['parent']
            return True

    return False
119 | 
def copy_node(node, recursive=True):
    """Return a detached copy of *node*.

    The copy is a shallow copy of the node's own keys, with a new 'uuid'
    (only when the original has one), no 'parent', and a fresh 'children'
    list. With *recursive* true, each child is copied in turn and pushed
    onto the copy; otherwise the copy has no children.
    """
    clone = node.copy()
    if 'uuid' in clone:
        clone['uuid'] = str(uuid.uuid4())
    clone.pop('parent', None)
    clone['children'] = []

    if recursive and 'children' in node:
        for child in node['children']:
            child_clone = copy_node(child)
            push_node(clone, child_clone)

    return clone
131 | 
def get_node_depth(node):
    """Return how many 'parent' links separate *node* from its root (0 for a root)."""
    depth = 0
    current = node
    while 'parent' in current:
        depth += 1
        current = current['parent']
    return depth
136 | 
def get_root(node):
    """Follow 'parent' links from *node* and return the first node without one."""
    current = node
    while True:
        if 'parent' not in current:
            return current
        current = current['parent']
142 | 
def filter_nodes(root, fn):
    """Return a new list of the nodes under *root* (itself included) accepted by *fn*."""
    matches = []
    filter_nodes_rec(root, fn, matches)
    return matches
145 | 
def filter_nodes_rec(root, fn, results):
    """Append to *results* every node of the subtree at *root* accepted by *fn*.

    The traversal is depth-first, parent before children, following each
    'children' list in order. Returns the same *results* list.
    """
    accepted = fn(root)
    if accepted:
        results.append(root)

    if 'children' not in root:
        return results

    for child in root['children']:
        filter_nodes_rec(child, fn, results)
    return results
155 | 
def is_definition(node):
    """Tell whether *node* has a 'type' that is listed in TYPE_DEFINITION."""
    if 'type' not in node:
        return False
    return node['type'] in TYPE_DEFINITION
158 | 
def is_reference(node):
    """Tell whether *node* has a 'type' that is listed in TYPE_REFERENCE."""
    if 'type' not in node:
        return False
    return node['type'] in TYPE_REFERENCE
161 | 
def is_root(node):
    """Tell whether *node* has no 'parent' key."""
    has_parent = 'parent' in node
    return not has_parent
164 | 
def get_node_descendants(node):
    """Return every node of the subtree rooted at *node*, *node* itself included."""
    def accept_all(_candidate):
        return True

    return filter_nodes(node, accept_all)
167 | 
def get_node_ancestors(node):
    """Return the ancestors of *node*, nearest first.

    The walk follows 'parent' links and stops at the first ancestor that is
    falsy or has no 'type' key; that ancestor is not included.
    """
    ancestors = []
    current = node['parent'] if 'parent' in node else None
    while current and 'type' in current:
        ancestors.append(current)
        current = current['parent'] if 'parent' in current else None
    return ancestors
179 | 


--------------------------------------------------------------------------------
/tests/ParseLawReferenceTest.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from DuralexTestCase import DuralexTestCase
  4 | 
  5 | import duralex.alinea_parser as parser
  6 | 
  7 | class ParseLawReferenceTest(DuralexTestCase):
  8 |     def test_ordonnance_with_id(self):
  9 |         self.assertEqualAST(
 10 |             self.call_parse_func(
 11 |                 parser.parse_law_reference,
 12 |                 u"l'ordonnance n° 2008-1305 du 11 décembre 2008"
 13 |             ),
 14 |             {'children':[
 15 |                 {
 16 |                     'type': u'law-reference',
 17 |                     'id': u'2008-1305',
 18 |                     'lawDate': u'2008-12-11',
 19 |                     'lawType': u'ordonnance'
 20 |                 }
 21 |             ]}
 22 |         )
 23 | 
 24 |     def test_ordonnance_with_id_2(self):
 25 |         self.assertEqualAST(
 26 |             self.call_parse_func(
 27 |                 parser.parse_law_reference,
 28 |                 u"de l'ordonnance n° 2008-1305 du 11 décembre 2008"
 29 |             ),
 30 |             {'children':[
 31 |                 {
 32 |                     'type': u'law-reference',
 33 |                     'id': u'2008-1305',
 34 |                     'lawDate': u'2008-12-11',
 35 |                     'lawType': u'ordonnance'
 36 |                 }
 37 |             ]}
 38 |         )
 39 | 
 40 |     def test_law_with_id(self):
 41 |         self.assertEqualAST(
 42 |             self.call_parse_func(
 43 |                 parser.parse_law_reference,
 44 |                 u"la loi n° 2007-1199"
 45 |             ),
 46 |             {'children':[
 47 |                 {
 48 |                     'type': u'law-reference',
 49 |                     'id': u'2007-1199'
 50 |                 }
 51 |             ]}
 52 |         )
 53 | 
 54 |     def test_the_same_law(self):
 55 |         self.assertEqualAST(
 56 |             self.call_parse_func(
 57 |                 parser.parse_law_reference,
 58 |                 u"de la même loi",
 59 |                 {'children':[
 60 |                     {
 61 |                         'type': u'law-reference',
 62 |                         'id': u'2007-1199'
 63 |                     }
 64 |                 ]}
 65 |             ),
 66 |             {'children':[
 67 |                 {
 68 |                     'type': u'law-reference',
 69 |                     'id': u'2007-1199'
 70 |                 },
 71 |                 {
 72 |                     'type': u'law-reference',
 73 |                     'id': u'2007-1199'
 74 |                 }
 75 |             ]}
 76 |         )
 77 | 
 78 |     def test_the_same_law_word_ref(self):
 79 |         self.assertEqualAST(
 80 |             self.call_parse_func(
 81 |                 parser.parse_law_reference,
 82 |                 u"de la même loi, les mots \"ceci est un test\"",
 83 |                 {'children':[
 84 |                     {
 85 |                         'type': u'law-reference',
 86 |                         'id': u'2007-1199'
 87 |                     }
 88 |                 ]}
 89 |             ),
 90 |             {'children':[
 91 |                 {
 92 |                     'type': u'law-reference',
 93 |                     'id': u'2007-1199'
 94 |                 },
 95 |                 {
 96 |                     'type': u'law-reference',
 97 |                     'id': u'2007-1199',
 98 |                     'children': [
 99 |                         {
100 |                             'type': u'word-reference',
101 |                             'children': [
102 |                                 {
103 |                                     'type': u'quote',
104 |                                     'words': u'ceci est un test'
105 |                                 }
106 |                             ]
107 |                         }
108 |                     ]
109 |                 }
110 |             ]}
111 |         )
112 | 
113 |     def test_law_with_id_2(self):
114 |         self.assertEqualAST(
115 |             self.call_parse_func(
116 |                 parser.parse_law_reference,
117 |                 u"de la loi n° 2007-1199"
118 |             ),
119 |             {'children':[
120 |                 {
121 |                     'type': u'law-reference',
122 |                     'id': u'2007-1199'
123 |                 }
124 |             ]}
125 |         )
126 | 
127 |     def test_law_with_id_and_date(self):
128 |         self.assertEqualAST(
129 |             self.call_parse_func(
130 |                 parser.parse_law_reference,
131 |                 u"la loi n° 2007-1199 du 10 août 2007"
132 |             ),
133 |             {'children':[
134 |                 {
135 |                     'type': u'law-reference',
136 |                     'id': u'2007-1199',
137 |                     'lawDate': u'2007-8-10'
138 |                 }
139 |             ]}
140 |         )
141 | 
142 |     def test_law_with_id_and_date_2(self):
143 |         self.assertEqualAST(
144 |             self.call_parse_func(
145 |                 parser.parse_law_reference,
146 |                 u"de la loi n° 2007-1199 du 10 août 2007"
147 |             ),
148 |             {'children':[
149 |                 {
150 |                     'type': u'law-reference',
151 |                     'id': u'2007-1199',
152 |                     'lawDate': u'2007-8-10'
153 |                 }
154 |             ]}
155 |         )
156 | 
157 |     def test_law_with_id_and_code_name(self):
158 |         self.assertEqualAST(
159 |             self.call_parse_func(
160 |                 parser.parse_law_reference,
161 |                 u"l'ordonnance n° 2008-1305 du 11 décembre 2008 modifiant la partie législative du code de la recherche"
162 |             ),
163 |             {'children':[
164 |                 {
165 |                     'type': u'law-reference',
166 |                     'lawType': u'ordonnance',
167 |                     'id': u'2008-1305',
168 |                     'lawDate': u'2008-12-11',
169 |                     'children': [
170 |                         {
171 |                             'type': u'code-reference',
172 |                             'id': u'code de la recherche'
173 |                         }
174 |                     ]
175 |                 }
176 |             ]}
177 |         )
178 | 


--------------------------------------------------------------------------------
/tests/ParseDefinitionListTest.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from DuralexTestCase import DuralexTestCase
  4 | 
  5 | import duralex.alinea_parser as parser
  6 | 
  7 | class ParseDefinitionListTest(DuralexTestCase):
  8 |     def test_n_sentences_and_n_alineas_with_quotes(self):
  9 |         self.assertEqualAST(
 10 |             self.call_parse_func(
 11 |                 parser.parse_definition_list,
 12 |                 (u"cinq phrases et cinq alinéas ainsi rédigés : \n"
 13 |                 u"\"alinéa 1\"\n"
 14 |                 u"\"alinéa 2\"\n"
 15 |                 u"\"alinéa 3\"\n"
 16 |                 u"\"alinéa 4\"\n"
 17 |                 u"\"alinéa 5\"\n")
 18 |             ),
 19 |             {'children': [
 20 |                 {
 21 |                     'count': 5,
 22 |                     'type': u'sentence-definition'
 23 |                 },
 24 |                 {
 25 |                     'children': [
 26 |                         {
 27 |                             'type': u'quote',
 28 |                             'words': u'alinéa 1'
 29 |                         }
 30 |                     ],
 31 |                     'type': u'alinea-definition'
 32 |                 },
 33 |                 {
 34 |                     'children': [
 35 |                         {
 36 |                             'type': u'quote',
 37 |                             'words': u'alinéa 2'
 38 |                         }
 39 |                     ],
 40 |                     'type': u'alinea-definition'
 41 |                 },
 42 |                 {
 43 |                     'children': [
 44 |                         {
 45 |                             'type': u'quote',
 46 |                             'words': u'alinéa 3'
 47 |                         }
 48 |                     ],
 49 |                     'type': u'alinea-definition'
 50 |                 },
 51 |                 {
 52 |                     'children': [
 53 |                         {
 54 |                             'type': u'quote',
 55 |                             'words': u'alinéa 4'
 56 |                         }
 57 |                     ],
 58 |                     'type': u'alinea-definition'
 59 |                 },
 60 |                 {
 61 |                     'children': [
 62 |                         {
 63 |                             'type': u'quote',
 64 |                             'words': u'alinéa 5'
 65 |                         }
 66 |                     ],
 67 |                     'type': u'alinea-definition'
 68 |                 }
 69 |             ]}
 70 |         )
 71 | 
 72 |     def test_n_header1_with_n_quotes(self):
 73 |         self.assertEqualAST(
 74 |             self.call_parse_func(
 75 |                 parser.parse_definition_list,
 76 |                 ("un III et un IV ainsi rédigés :\n"
 77 |                 "\"ceci est le contenu du premier header1\"\n"
 78 |                 "\"ceci est le contenu du second header1\"")
 79 |             ),
 80 |             {'children': [
 81 |                 {
 82 |                     'type': u'header1-definition',
 83 |                     'order': 3,
 84 |                     'children': [
 85 |                         {
 86 |                             'type': u'quote',
 87 |                             'words': u'ceci est le contenu du premier header1'
 88 |                         }
 89 |                     ],
 90 |                 },
 91 |                 {
 92 |                     'type': u'header1-definition',
 93 |                     'order': 4,
 94 |                     'children': [
 95 |                         {
 96 |                             'type': u'quote',
 97 |                             'words': u'ceci est le contenu du second header1'
 98 |                         }
 99 |                     ],
100 |                 }
101 |             ]}
102 |         )
103 | 
104 |     def test_n_header2_with_n_quotes(self):
105 |         self.assertEqualAST(
106 |             self.call_parse_func(
107 |                 parser.parse_definition_list,
108 |                 ("un 2° et un 3° ainsi rédigés :\n"
109 |                 "\"ceci est le contenu du premier header2\"\n"
110 |                 "\"ceci est le contenu du second header2\"")
111 |             ),
112 |             {'children': [
113 |                 {
114 |                     'type': u'header2-definition',
115 |                     'order': 2,
116 |                     'children': [
117 |                         {
118 |                             'type': u'quote',
119 |                             'words': u'ceci est le contenu du premier header2'
120 |                         }
121 |                     ],
122 |                 },
123 |                 {
124 |                     'type': u'header2-definition',
125 |                     'order': 3,
126 |                     'children': [
127 |                         {
128 |                             'type': u'quote',
129 |                             'words': u'ceci est le contenu du second header2'
130 |                         }
131 |                     ],
132 |                 }
133 |             ]}
134 |         )
135 | 
136 |     def test_n_alineas_with_quotes(self):
137 |         self.assertEqualAST(
138 |             self.call_parse_func(
139 |                 parser.parse_definition_list,
140 |                 (u"trois alinéas ainsi rédigés : \n"
141 |                 u"\"alinéa 1\"\n"
142 |                 u"\"alinéa 2\"\n"
143 |                 u"\"alinéa 3\"")
144 |             ),
145 |             {'children': [
146 |                 {
147 |                     'children': [
148 |                         {
149 |                             'type': u'quote',
150 |                             'words': u'alinéa 1'
151 |                         }
152 |                     ],
153 |                     'type': u'alinea-definition'
154 |                 },
155 |                 {
156 |                     'children': [
157 |                         {
158 |                             'type': u'quote',
159 |                             'words': u'alinéa 2'
160 |                         }
161 |                     ],
162 |                     'type': u'alinea-definition'
163 |                 },
164 |                 {
165 |                     'children': [
166 |                         {
167 |                             'type': u'quote',
168 |                             'words': u'alinéa 3'
169 |                         }
170 |                     ],
171 |                     'type': u'alinea-definition'
172 |                 }
173 |             ]}
174 |         )
175 |     
176 | 


--------------------------------------------------------------------------------
/tests/SortReferencesVisitorTest.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from DuralexTestCase import DuralexTestCase
  4 | 
  5 | from duralex.SortReferencesVisitor import SortReferencesVisitor
  6 | 
  7 | class SortReferencesVisitorTest(DuralexTestCase):
  8 |     def test_law_article(self):
  9 |         self.assertEqualAST(
 10 |             self.call_visitor(SortReferencesVisitor, {'children': [
 11 |                 {
 12 |                     'lawDate': u'1978-7-17',
 13 |                     'id': u'78-753',
 14 |                     'type': u'law-reference',
 15 |                     'children': [
 16 |                         {
 17 |                             'type': u'article-reference',
 18 |                             'id': u'11',
 19 |                         }
 20 |                     ]
 21 |                 }
 22 |             ]}),
 23 |             {'children': [
 24 |                 {
 25 |                     'lawDate': u'1978-7-17',
 26 |                     'id': u'78-753',
 27 |                     'type': u'law-reference',
 28 |                     'children': [
 29 |                         {
 30 |                             'type': u'article-reference',
 31 |                             'id': u'11'
 32 |                         }
 33 |                     ]
 34 |                 }
 35 |             ]}
 36 |         )
 37 | 
 38 |     def test_article_law(self):
 39 |         self.assertEqualAST(
 40 |             self.call_visitor(SortReferencesVisitor, {'children': [
 41 |                 {
 42 |                     'type': u'article-reference',
 43 |                     'id': u'11',
 44 |                     'children': [
 45 |                         {
 46 |                             'lawDate': u'1978-7-17',
 47 |                             'id': u'78-753',
 48 |                             'type': u'law-reference',
 49 |                         }
 50 |                     ]
 51 |                 }
 52 |             ]}),
 53 |             {'children': [
 54 |                 {
 55 |                     'lawDate': u'1978-7-17',
 56 |                     'id': u'78-753',
 57 |                     'type': u'law-reference',
 58 |                     'children': [
 59 |                         {
 60 |                             'type': u'article-reference',
 61 |                             'id': u'11'
 62 |                         }
 63 |                     ]
 64 |                 }
 65 |             ]}
 66 |         )
 67 | 
 68 |     def test_paragraph_subsection_section_chapter_title_book(self):
 69 |         self.assertEqualAST(
 70 |             self.call_visitor(SortReferencesVisitor, {'children': [
 71 |                 {
 72 |                     'children': [
 73 |                         {
 74 |                             'children': [
 75 |                                 {
 76 |                                     'children': [
 77 |                                         {
 78 |                                             'children': [
 79 |                                                 {
 80 |                                                     'children': [
 81 |                                                         {
 82 |                                                             'order': 1,
 83 |                                                             'type': u'book-reference'
 84 |                                                         }
 85 |                                                     ],
 86 |                                                     'order': 3,
 87 |                                                     'type': u'title-reference'
 88 |                                                 }
 89 |                                             ],
 90 |                                             'order': 2,
 91 |                                             'type': u'chapter-reference'
 92 |                                         }
 93 |                                     ],
 94 |                                     'order': 2,
 95 |                                     'type': u'section-reference'
 96 |                                 }
 97 |                             ],
 98 |                             'order': 2,
 99 |                             'type': u'subsection-reference'
100 |                         }
101 |                     ],
102 |                     'order': 3,
103 |                     'type': u'paragraph-reference'
104 |                 }
105 |             ]}),
106 |             {'children': [
107 |                 {
108 |                     'children': [
109 |                         {
110 |                             'children': [
111 |                                 {
112 |                                     'children': [
113 |                                         {
114 |                                             'children': [
115 |                                                 {
116 |                                                     'children': [
117 |                                                         {
118 |                                                             'order': 3,
119 |                                                             'type': u'paragraph-reference'
120 |                                                         }
121 |                                                     ],
122 |                                                     'order': 2,
123 |                                                     'type': u'subsection-reference'
124 |                                                 }
125 |                                             ],
126 |                                             'order': 2,
127 |                                             'type': u'section-reference'
128 |                                         }
129 |                                     ],
130 |                                     'order': 2,
131 |                                     'type': u'chapter-reference'
132 |                                 }
133 |                             ],
134 |                             'order': 3,
135 |                             'type': u'title-reference'
136 |                         }
137 |                     ],
138 |                     'order': 1,
139 |                     'type': u'book-reference'
140 |                 }
141 |             ]}
142 |         )
143 | 
144 |     def test_article_ref_article_ref(self):  # article-reference (id 11) nesting another article-reference (id 42)
145 |         self.assertEqualAST(
146 |             self.call_visitor(SortReferencesVisitor, {'children': [
147 |                 {
148 |                     'type': u'article-reference',
149 |                     'id': u'11',
150 |                     'children': [
151 |                         {
152 |                             'type': u'article-reference',
153 |                             'id': u'42'
154 |                         }
155 |                     ]
156 |                 }
157 |             ]}),
158 |             {'children': [  # expected tree: only the id-42 reference remains, as the single top-level child
159 |                 {
160 |                     'type': u'article-reference',
161 |                     'id': u'42'
162 |                 }
163 |             ]}
164 |         )
165 | 


--------------------------------------------------------------------------------
/tests/ParseHeader1Test.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from DuralexTestCase import DuralexTestCase
  4 | 
  5 | import duralex.alinea_parser as parser
  6 | 
  7 | class ParseHeader1Test(DuralexTestCase):
  8 |     def test_header1_raw_content(self):  # "I. <text>" -> header1 (order 1) wrapping one raw-content child
  9 |         self.assertEqualAST(
 10 |             self.call_parse_func(
 11 |                 parser.parse_header1,
 12 |                 (u"I. Ceci est un header1.")
 13 |             ),
 14 |             {'children':[
 15 |                 {
 16 |                     'type': u'header1',
 17 |                     'order': 1,
 18 |                     'children': [
 19 |                         {
 20 |                             'content': u'Ceci est un header1.',  # the "I." marker is not part of the content
 21 |                             'type': u'raw-content'
 22 |                         }
 23 |                     ]
 24 |                 }
 25 |             ]}
 26 |         )
 27 | 
 28 |     def test_header1_incomplete_edit_header2_edit(self):  # "... est ainsi modifié :" line followed by a "1°" line holding the actual replacement
 29 |         self.assertEqualAST(
 30 |             self.call_parse_func(
 31 |                 parser.parse_header1,
 32 |                 (u"L'article L. 123-5 du code de l'éducation est ainsi modifié :\n"
 33 |                 u"1° À la première phrase, les mots : \"mots d'origine\" sont remplacés par les mots : \"mots de remplacement\".")
 34 |             ),
 35 |             {'children':[  # expected: an 'edit' node on the article reference, then a sibling header2
 36 |                 {
 37 |                     'editType': u'edit',
 38 |                     'type': u'edit',
 39 |                     'children': [
 40 |                         {
 41 |                             'id': u'L. 123-5',
 42 |                             'type': u'article-reference',
 43 |                             'children': [
 44 |                                 {
 45 |                                     'type': u'code-reference',
 46 |                                     'id': u'code de l\'éducation'
 47 |                                 }
 48 |                             ]
 49 |                         }
 50 |                     ]
 51 |                 },
 52 |                 {
 53 |                     'type': u'header2',
 54 |                     'order': 1,
 55 |                     'children': [
 56 |                         {
 57 |                             'editType': u'replace',
 58 |                             'type': u'edit',
 59 |                             'children': [
 60 |                                 {
 61 |                                     'type': u'sentence-reference',
 62 |                                     'order': 1,
 63 |                                     'children': [
 64 |                                         {
 65 |                                             'type': u'word-reference',
 66 |                                             'children': [
 67 |                                                 {
 68 |                                                     'type': u'quote',
 69 |                                                     'words': u'mots d\'origine'
 70 |                                                 }
 71 |                                             ]
 72 |                                         }
 73 |                                     ]
 74 |                                 },
 75 |                                 {
 76 |                                     'type': u'word-definition',
 77 |                                     'children': [
 78 |                                         {
 79 |                                             'type': u'quote',
 80 |                                             'words': u'mots de remplacement'
 81 |                                         }
 82 |                                     ]
 83 |                                 }
 84 |                             ]
 85 |                         }
 86 |                     ]
 87 |                 }
 88 |             ]}
 89 |         )
 90 | 
 91 |     def test_header1_incomplete_edit_header2_incomplete_edit_header3_edit(self):  # three nested levels: article edit, "1°" alinea edit, "a)" replacement
 92 |         self.assertEqualAST(
 93 |             self.call_parse_func(
 94 |                 parser.parse_header1,
 95 |                 (u"L'article L. 123-5 du code de l'éducation est ainsi modifié :\n"
 96 |                 u"1° L'avant-dernier alinéa est ainsi modifié :\n"
 97 |                 u"a) À la première phrase, les mots : \"mots d'origine\" sont remplacés par les mots : \"mots de remplacement\".")
 98 |             ),
 99 |             {'children':[
100 |                 {
101 |                     'editType': u'edit',
102 |                     'type': u'edit',
103 |                     'children': [
104 |                         {
105 |                             'id': u'L. 123-5',
106 |                             'type': u'article-reference',
107 |                             'children': [
108 |                                 {
109 |                                     'type': u'code-reference',
110 |                                     'id': u'code de l\'éducation'
111 |                                 }
112 |                             ]
113 |                         }
114 |                     ]
115 |                 },
116 |                 {
117 |                     'type': u'header2',
118 |                     'order': 1,
119 |                     'children': [
120 |                         {
121 |                             'editType': u'edit',
122 |                             'type': u'edit',
123 |                             'children': [
124 |                                 {
125 |                                     'order': -2,  # expected order for "L'avant-dernier alinéa" (second to last)
126 |                                     'type': u'alinea-reference'
127 |                                 }
128 |                             ]
129 |                         },
130 |                         {
131 |                             'type': u'header3',
132 |                             'order': 1,
133 |                             'children': [
134 |                                 {
135 |                                     'editType': u'replace',
136 |                                     'type': u'edit',
137 |                                     'children': [
138 |                                         {
139 |                                             'type': u'sentence-reference',
140 |                                             'order': 1,
141 |                                             'children': [
142 |                                                 {
143 |                                                     'type': u'word-reference',
144 |                                                     'children': [
145 |                                                         {
146 |                                                             'type': u'quote',
147 |                                                             'words': u'mots d\'origine'
148 |                                                         }
149 |                                                     ]
150 |                                                 }
151 |                                             ]
152 |                                         },
153 |                                         {
154 |                                             'type': u'word-definition',
155 |                                             'children': [
156 |                                                 {
157 |                                                     'type': u'quote',
158 |                                                     'words': u'mots de remplacement'
159 |                                                 }
160 |                                             ]
161 |                                         }
162 |                                     ]
163 |                                 }
164 |                             ]
165 |                         }
166 |                     ]
167 |                 }
168 |             ]}
169 |         )
170 | 


--------------------------------------------------------------------------------
/tests/ParseArticleReferenceTest.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from DuralexTestCase import DuralexTestCase
  4 | 
  5 | import duralex.alinea_parser as parser
  6 | 
  7 | class ParseArticleReferenceTest(DuralexTestCase):
  8 |     def test_article_number(self):
  9 |         self.assertEqualAST(
 10 |             self.call_parse_func(
 11 |                 parser.parse_article_reference,
 12 |                 "l'article 3"
 13 |             ),
 14 |             {'children':[
 15 |                 {
 16 |                     'type': u'article-reference',
 17 |                     'id': u'3'
 18 |                 }
 19 |             ]}
 20 |         )
 21 | 
 22 |     def test_article_id(self):
 23 |         self.assertEqualAST(
 24 |             self.call_parse_func(
 25 |                 parser.parse_article_reference,
 26 |                 "l'article L. 121-3"
 27 |             ),
 28 |             {'children':[
 29 |                 {
 30 |                     'type': u'article-reference',
 31 |                     'id': u'L. 121-3'
 32 |                 }
 33 |             ]}
 34 |         )
 35 | 
 36 |     def test_article_id_2(self):
 37 |         self.assertEqualAST(
 38 |             self.call_parse_func(
 39 |                 parser.parse_article_reference,
 40 |                 "à l'article L. 121-3"
 41 |             ),
 42 |             {'children':[
 43 |                 {
 44 |                     'type': u'article-reference',
 45 |                     'id': u'L. 121-3'
 46 |                 }
 47 |             ]}
 48 |         )
 49 | 
 50 |     def test_article_id_law_id(self):  # "l'article 11 de la loi n° 78-753": the law-reference is expected as a child of the article-reference
 51 |         self.assertEqualAST(
 52 |             self.call_parse_func(
 53 |                 parser.parse_article_reference,
 54 |                 u"l'article 11 de la loi n° 78-753"
 55 |             ),
 56 |             {'children':[
 57 |                 {
 58 |                     'type': u'article-reference',
 59 |                     'id': u'11',
 60 |                     'children': [
 61 |                         {
 62 |                             'id': u'78-753',
 63 |                             'type': u'law-reference',
 64 |                         }
 65 |                     ]
 66 |                 }
 67 |             ]}
 68 |         )
 69 | 
 70 |     def test_article_id_law_id_law_date(self):  # as test_article_id_law_id, plus a trailing law date in the input
 71 |         self.assertEqualAST(
 72 |             self.call_parse_func(
 73 |                 parser.parse_article_reference,
 74 |                 u"l'article 11 de la loi n° 78-753 du 17 juillet 1978"
 75 |             ),
 76 |             {'children':[
 77 |                 {
 78 |                     'type': u'article-reference',
 79 |                     'id': u'11',
 80 |                     'children': [
 81 |                         {
 82 |                             'lawDate': u'1978-7-17',  # expected form of "17 juillet 1978": year-month-day, month not zero-padded
 83 |                             'id': u'78-753',
 84 |                             'type': u'law-reference',
 85 |                         }
 86 |                     ]
 87 |                 }
 88 |             ]}
 89 |         )
 90 | 
 91 |     def test_article_id_code_name(self):  # "... du code de l'éducation": the code-reference is expected as a child of the article-reference
 92 |         self.assertEqualAST(
 93 |             self.call_parse_func(
 94 |                 parser.parse_article_reference,
 95 |                 u"l'article L. 111-5 du code de l'éducation"
 96 |             ),
 97 |             {'children': [
 98 |                 {
 99 |                     'id': u'L. 111-5',
100 |                     'type': u'article-reference',
101 |                     'children': [
102 |                         {
103 |                             'id': u'code de l\'éducation',
104 |                             'type': u'code-reference'
105 |                         }
106 |                     ]
107 |                 }
108 |             ]}
109 |         )
110 | 
111 |     def test_the_same_article(self):  # "le même article" ("the same article") with an article-reference already in the tree
112 |         self.assertEqualAST(
113 |             self.call_parse_func(
114 |                 parser.parse_article_reference,
115 |                 u"le même article",
116 |                 {'children':[  # presumably the initial tree handed to the parser -- confirm in DuralexTestCase.call_parse_func
117 |                     {
118 |                         'id': u'L. 111-5',
119 |                         'type': u'article-reference'
120 |                     }
121 |                 ]}
122 |             ),
123 |             {'children':[  # expected: two identical article-references with id 'L. 111-5'
124 |                 {
125 |                     'id': u'L. 111-5',
126 |                     'type': u'article-reference'
127 |                 },
128 |                 {
129 |                     'id': u'L. 111-5',
130 |                     'type': u'article-reference'
131 |                 }
132 |             ]}
133 |         )
134 | 
135 |     def test_the_same_article_2(self):  # same scenario as test_the_same_article with the "du même article" wording
136 |         self.assertEqualAST(
137 |             self.call_parse_func(
138 |                 parser.parse_article_reference,
139 |                 u"du même article",
140 |                 {'children':[  # presumably the initial tree handed to the parser -- confirm in DuralexTestCase.call_parse_func
141 |                     {
142 |                         'id': u'L. 111-5',
143 |                         'type': u'article-reference'
144 |                     }
145 |                 ]}
146 |             ),
147 |             {'children':[  # expected: two identical article-references with id 'L. 111-5'
148 |                 {
149 |                     'id': u'L. 111-5',
150 |                     'type': u'article-reference'
151 |                 },
152 |                 {
153 |                     'id': u'L. 111-5',
154 |                     'type': u'article-reference'
155 |                 }
156 |             ]}
157 |         )
158 | 
159 |     def test_article_id_same_code(self):  # "du même code" ("of the same code") with a code-reference already in the tree
160 |         self.assertEqualAST(
161 |             self.call_parse_func(
162 |                 parser.parse_article_reference,
163 |                 u"l'article L. 123-2 du même code",
164 |                 {'children':[  # presumably the initial tree handed to the parser -- confirm in DuralexTestCase.call_parse_func
165 |                     {
166 |                         'id': u'code de l\'éducation',
167 |                         'type': u'code-reference'
168 |                     }
169 |                 ]}
170 |             ),
171 |             {'children': [  # expected: the code-reference stays, and the new article-reference carries the same code as its child
172 |                 {
173 |                     'id': u'code de l\'éducation',
174 |                     'type': u'code-reference'
175 |                 },
176 |                 {
177 |                     'id': u'L. 123-2',
178 |                     'type': u'article-reference',
179 |                     'children': [
180 |                         {
181 |                             'id': u'code de l\'éducation',
182 |                             'type': u'code-reference'
183 |                         }
184 |                     ]
185 |                 }
186 |             ]}
187 |         )
188 | 
189 |     def test_article_id_list(self):  # enumeration "3, 4 et 5" -> three sibling article-references, in input order
190 |         self.assertEqualAST(
191 |             self.call_parse_func(
192 |                 parser.parse_article_reference,
193 |                 u"les articles 3, 4 et 5"
194 |             ),
195 |             {'children':[
196 |                 {
197 |                     'type': u'article-reference',
198 |                     'id': u'3'
199 |                 },
200 |                 {
201 |                     'type': u'article-reference',
202 |                     'id': u'4'
203 |                 },
204 |                 {
205 |                     'type': u'article-reference',
206 |                     'id': u'5'
207 |                 }
208 |             ]}
209 |         )
210 | 
211 |     def test_article_id_list_code_name(self):  # enumeration followed by a code name: every listed article is expected to get its own code-reference child
212 |         self.assertEqualAST(
213 |             self.call_parse_func(
214 |                 parser.parse_article_reference,
215 |                 u"les articles 3, 4 et 5 du code de l'éducation"
216 |             ),
217 |             {'children':[
218 |                 {
219 |                     'type': u'article-reference',
220 |                     'id': u'3',
221 |                     'children': [
222 |                         {
223 |                             'type': u'code-reference',
224 |                             'id': u'code de l\'éducation'
225 |                         }
226 |                     ]
227 |                 },
228 |                 {
229 |                     'type': u'article-reference',
230 |                     'id': u'4',
231 |                     'children': [
232 |                         {
233 |                             'type': u'code-reference',
234 |                             'id': u'code de l\'éducation'
235 |                         }
236 |                     ]
237 |                 },
238 |                 {
239 |                     'type': u'article-reference',
240 |                     'id': u'5',
241 |                     'children': [
242 |                         {
243 |                             'type': u'code-reference',
244 |                             'id': u'code de l\'éducation'
245 |                         }
246 |                     ]
247 |                 }
248 |             ]}
249 |         )
250 | 
251 |     def test_the_end_of_article_number(self):
252 |         self.assertEqualAST(
253 |             self.call_parse_func(
254 |                 parser.parse_article_reference,
255 |                 "la fin de l'article 3"
256 |             ),
257 |             {'children':[
258 |                 {
259 |                     'scope': 'end',
260 |                     'type': u'article-reference',
261 |                     'id': u'3'
262 |                 }
263 |             ]}
264 |         )
265 | 


--------------------------------------------------------------------------------
/tests/ParseRawContentTest.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from DuralexTestCase import DuralexTestCase
  4 | 
  5 | import duralex.alinea_parser as parser
  6 | 
  7 | class ParseRawContentTest(DuralexTestCase):
  8 |     def test_header1_raw_content_header2_raw_content(self):  # "I. - ..." line then "1° ..." line -> header1 holding raw-content plus a nested header2
  9 |         self.assertEqualAST(
 10 |             self.call_parse_func(
 11 |                 lambda tokens, i, parent: parser.parse_for_each(parser.parse_header1, tokens, 0, parent),  # NOTE(review): the lambda's own i is ignored; the start index is hard-coded to 0
 12 |                 (u"I. - Contenu du header1 :\n"
 13 |                 u"1° Contenu du header2.")
 14 |             ),
 15 |             {'children':[
 16 |                 {
 17 |                     'order': 1,
 18 |                     'type': u'header1',
 19 |                     'children': [
 20 |                         {
 21 |                             'content': u'Contenu du header1 :',
 22 |                             'type': u'raw-content'
 23 |                         },
 24 |                         {
 25 |                             'type': u'header2',
 26 |                             'order': 1,
 27 |                             'children': [
 28 |                                 {
 29 |                                     'type': u'raw-content',
 30 |                                     'content': u'Contenu du header2.'
 31 |                                 }
 32 |                             ]
 33 |                         }
 34 |                     ]
 35 |                 }
 36 |             ]}
 37 |         )
 38 | 
 39 |     def test_header1_raw_content_header2_raw_content_header3_raw_content(self):  # "I. -" / "1°" / "a)" lines -> header1 > header2 > header3, each with its raw-content
 40 |         self.assertEqualAST(
 41 |             self.call_parse_func(
 42 |                 lambda tokens, i, parent: parser.parse_for_each(parser.parse_header1, tokens, 0, parent),  # NOTE(review): the lambda's own i is ignored; the start index is hard-coded to 0
 43 |                 (u"I. - Contenu du header1 :\n"
 44 |                 u"1° Contenu du header2 :\n"
 45 |                 u"a) Contenu du header3")
 46 |             ),
 47 |             {'children':[
 48 |                 {
 49 |                     'order': 1,
 50 |                     'type': u'header1',
 51 |                     'children': [
 52 |                         {
 53 |                             'content': u'Contenu du header1 :',
 54 |                             'type': u'raw-content'
 55 |                         },
 56 |                         {
 57 |                             'type': u'header2',
 58 |                             'order': 1,
 59 |                             'children': [
 60 |                                 {
 61 |                                     'type': u'raw-content',
 62 |                                     'content': u'Contenu du header2 :'
 63 |                                 },
 64 |                                 {
 65 |                                     'order': 1,
 66 |                                     'type': u'header3',
 67 |                                     'children': [
 68 |                                         {
 69 |                                             'content': u'Contenu du header3',
 70 |                                             'type': u'raw-content'
 71 |                                         }
 72 |                                     ]
 73 |                                 }
 74 |                             ]
 75 |                         }
 76 |                     ]
 77 |                 }
 78 |             ]}
 79 |         )
 80 | 
 81 |     def test_n_header1_raw_content_n_header2_raw_content_n_header3_raw_content(self):  # several headers at each level: two header1 ("I.", "II."), with multiple header2/header3 below them
 82 |         self.assertEqualAST(
 83 |             self.call_parse_func(
 84 |                 lambda tokens, i, parent: parser.parse_for_each(parser.parse_header1, tokens, 0, parent),  # NOTE(review): the lambda's own i is ignored; the start index is hard-coded to 0
 85 |                 (u"I. - Contenu du grand 1 :\n"
 86 |                 u"1° Contenu du grand 1 petit 1 :\n"
 87 |                 u"a) Contenu du grand 1 petit 1 a\n"
 88 |                 u"b) Contenu du grand 1 petit 1 b\n"
 89 |                 u"2° Contenu du grand 1 petit 2.\n"
 90 |                 u"II. - Contenu du grand 2 :\n"
 91 |                 u"1° Contenu du grand 2 petit 1.\n"
 92 |                 u"a) Contenu du grand 2 petit 1 a\n"
 93 |                 u"b) Contenu du grand 2 petit 1 b\n"
 94 |                 u"c) Contenu du grand 2 petit 1 c\n")
 95 |             ),
 96 |             {'children':[
 97 |                 {
 98 |                     'order': 1,
 99 |                     'type': u'header1',
100 |                     'children': [
101 |                         {
102 |                             'content': u'Contenu du grand 1 :',
103 |                             'type': u'raw-content'
104 |                         },
105 |                         {
106 |                             'order': 1,
107 |                             'type': u'header2',
108 |                             'children': [
109 |                                 {
110 |                                     'content': u'Contenu du grand 1 petit 1 :',
111 |                                     'type': u'raw-content'
112 |                                 },
113 |                                 {
114 |                                     'order': 1,
115 |                                     'type': u'header3',
116 |                                     'children': [
117 |                                         {
118 |                                             'content': u'Contenu du grand 1 petit 1 a',
119 |                                             'type': u'raw-content'
120 |                                         }
121 |                                     ]
122 |                                 },
123 |                                 {
124 |                                     'order': 2,
125 |                                     'type': u'header3',
126 |                                     'children': [
127 |                                         {
128 |                                             'content': u'Contenu du grand 1 petit 1 b',
129 |                                             'type': u'raw-content'
130 |                                         }
131 |                                     ]
132 |                                 }
133 |                             ]
134 |                         },
135 |                         {
136 |                             'order': 2,
137 |                             'type': u'header2',
138 |                             'children': [
139 |                                 {
140 |                                     'content': u'Contenu du grand 1 petit 2.',
141 |                                     'type': u'raw-content'
142 |                                 }
143 |                             ]
144 |                         }
145 |                     ]
146 |                 },
147 |                 {
148 |                     'order': 2,
149 |                     'type': u'header1',
150 |                     'children': [
151 |                         {
152 |                             'content': u'Contenu du grand 2 :',
153 |                             'type': u'raw-content'
154 |                         },
155 |                         {
156 |                             'order': 1,
157 |                             'type': u'header2',
158 |                             'children': [
159 |                                 {
160 |                                     'content': u'Contenu du grand 2 petit 1.',
161 |                                     'type': u'raw-content'
162 |                                 },
163 |                                 {
164 |                                     'order': 1,
165 |                                     'type': u'header3',
166 |                                     'children': [
167 |                                         {
168 |                                             'content': u'Contenu du grand 2 petit 1 a',
169 |                                             'type': u'raw-content'
170 |                                         }
171 |                                     ]
172 |                                 },
173 |                                 {
174 |                                     'order': 2,
175 |                                     'type': u'header3',
176 |                                     'children': [
177 |                                         {
178 |                                             'content': u'Contenu du grand 2 petit 1 b',
179 |                                             'type': u'raw-content'
180 |                                         }
181 |                                     ]
182 |                                 },
183 |                                 {
184 |                                     'order': 3,
185 |                                     'type': u'header3',
186 |                                     'children': [
187 |                                         {
188 |                                             'content': u'Contenu du grand 2 petit 1 c',
189 |                                             'type': u'raw-content'
190 |                                         }
191 |                                     ]
192 |                                 }
193 |                             ]
194 |                         }
195 |                     ]
196 |                 }
197 |             ]}
198 |         )
199 | 


--------------------------------------------------------------------------------
/tests/ParseWordReferenceTest.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from DuralexTestCase import DuralexTestCase
  4 | 
  5 | import duralex.alinea_parser as parser
  6 | 
  7 | class ParseWordReferenceTest(DuralexTestCase):
  8 |     def test_single_word(self):  # singular "le mot : <quote>" -> word-reference wrapping one quote node
  9 |         self.assertEqualAST(
 10 |             self.call_parse_func(
 11 |                 parser.parse_word_reference,
 12 |                 u"le mot : \"test\""
 13 |             ),
 14 |             {'children':[
 15 |                 {
 16 |                     'type': u'word-reference',
 17 |                     'children': [
 18 |                         {
 19 |                             'type': u'quote',
 20 |                             'words': u'test'
 21 |                         }
 22 |                     ]
 23 |                 }
 24 |             ]}
 25 |         )
 26 | 
 27 |     def test_words(self):  # plural "les mots : <quote>" -> the whole quoted phrase is expected in a single quote node
 28 |         self.assertEqualAST(
 29 |             self.call_parse_func(
 30 |                 parser.parse_word_reference,
 31 |                 u"les mots : \"ceci est un test\""
 32 |             ),
 33 |             {'children':[
 34 |                 {
 35 |                     'type': u'word-reference',
 36 |                     'children': [
 37 |                         {
 38 |                             'type': u'quote',
 39 |                             'words': u'ceci est un test'
 40 |                         }
 41 |                     ]
 42 |                 }
 43 |             ]}
 44 |         )
 45 | 
 46 |     def test_reference(self):  # "la référence : <quote>" is expected to yield a word-reference too, same shape as "le mot"
 47 |         self.assertEqualAST(
 48 |             self.call_parse_func(
 49 |                 parser.parse_word_reference,
 50 |                 u"la référence : \"L. 321-5\""
 51 |             ),
 52 |             {'children':[
 53 |                 {
 54 |                     'type': u'word-reference',
 55 |                     'children': [
 56 |                         {
 57 |                             'type': u'quote',
 58 |                             'words': u'L. 321-5'
 59 |                         }
 60 |                     ]
 61 |                 }
 62 |             ]}
 63 |         )
 64 | 
 65 |     def test_references(self):  # plural "les références : <quote>" -> word-reference wrapping one quote node
 66 |         self.assertEqualAST(
 67 |             self.call_parse_func(
 68 |                 parser.parse_word_reference,
 69 |                 u"les références : \"ceci est un test\""
 70 |             ),
 71 |             {'children':[
 72 |                 {
 73 |                     'type': u'word-reference',
 74 |                     'children': [
 75 |                         {
 76 |                             'type': u'quote',
 77 |                             'words': u'ceci est un test'
 78 |                         }
 79 |                     ]
 80 |                 }
 81 |             ]}
 82 |         )
 83 | 
 84 |     def test_after_words(self):  # lower-case "après" before "les mots"
 85 |         self.assertEqualAST(
 86 |             self.call_parse_func(
 87 |                 parser.parse_word_reference,  # parse function under test
 88 |                 u"après les mots : \"ceci est un test\""  # input text handed to the parse function
 89 |             ),
 90 |             {'children':[  # expected tree: word-reference carrying a position
 91 |                 {
 92 |                     'type': u'word-reference',
 93 |                     'position': u'after',  # expected for the leading "après"
 94 |                     'children': [
 95 |                         {
 96 |                             'type': u'quote',
 97 |                             'words': u'ceci est un test'
 98 |                         }
 99 |                     ]
100 |                 }
101 |             ]}
102 |         )
103 | 
104 |     def test_after_word(self):  # capitalised "Après" before the singular "le mot"
105 |         self.assertEqualAST(
106 |             self.call_parse_func(
107 |                 parser.parse_word_reference,  # parse function under test
108 |                 u"Après le mot : \"candidats\""  # note the upper-case initial letter
109 |             ),
110 |             {'children':[  # expected tree: same as the lower-case "après" case
111 |                 {
112 |                     'type': u'word-reference',
113 |                     'position': u'after',  # expected for the leading "Après"
114 |                     'children': [
115 |                         {
116 |                             'type': u'quote',
117 |                             'words': u'candidats'
118 |                         }
119 |                     ]
120 |                 }
121 |             ]}
122 |         )
123 | 
124 |     def test_words_reference_position_in_article(self):  # word reference followed by an article location
125 |         self.assertEqualAST(
126 |             self.call_parse_func(
127 |                 parser.parse_word_reference,  # parse function under test
128 |                 u"après les mots : \"aux dispositions de l'article L. 123-5\", la fin de l'article L. 112-3 du code de la recherche"  # one article id inside the quote, another after it
129 |             ),
130 |             {'children':[  # expected tree: quote and article-reference are siblings
131 |                 {
132 |                     'type': u'word-reference',
133 |                     'position': u'after',  # expected for the leading "après"
134 |                     'children': [
135 |                         {
136 |                             'type': u'quote',
137 |                             'words': u'aux dispositions de l\'article L. 123-5'  # the id inside the quote stays plain text
138 |                         },
139 |                         {
140 |                             'type': u'article-reference',
141 |                             'id': u'L. 112-3',  # the id that follows the quote
142 |                             'scope': 'end',  # expected alongside the "la fin de l'article" wording
143 |                             'children': [
144 |                                 {
145 |                                     'type': u'code-reference',
146 |                                     'id': u'code de la recherche'  # code name nested under the article
147 |                                 }
148 |                             ]
149 |                         }
150 |                     ]
151 |                 }
152 |             ]}
153 |         )
154 | 
155 |     def test_alinea_ref_word_ref(self):  # alinea location followed by a word reference
156 |         self.assertEqualAST(
157 |             self.call_parse_func(
158 |                 parser.parse_reference,  # generic entry point, not parse_word_reference
159 |                 u"au deuxième alinéa, le mot : \"test\""  # input text handed to the parse function
160 |             ),
161 |             {'children':[  # expected tree: word-reference nested under the alinea
162 |                 {
163 |                     'type': u'alinea-reference',
164 |                     'order': 2,  # expected for "deuxième"
165 |                     'children': [
166 |                         {
167 |                             'type': u'word-reference',
168 |                             'children': [
169 |                                 {
170 |                                     'type': u'quote',
171 |                                     'words': u'test'
172 |                                 }
173 |                             ]
174 |                         }
175 |                     ]
176 |                 }
177 |             ]}
178 |         )
179 | 
180 |     def test_alinea_ref_article_ref_word_ref(self):  # alinea of an article, then a word reference
181 |         self.assertEqualAST(
182 |             self.call_parse_func(
183 |                 parser.parse_reference,  # generic entry point, not parse_word_reference
184 |                 u"au deuxième alinéa de l'article L. 42, le mot : \"test\""  # input text handed to the parse function
185 |             ),
186 |             {'children':[  # expected nesting: alinea > article > word-reference
187 |                 {
188 |                     'type': u'alinea-reference',
189 |                     'order': 2,  # expected for "deuxième"
190 |                     'children': [
191 |                         {
192 |                             'type': u'article-reference',
193 |                             'id': u'L. 42',
194 |                             'children': [
195 |                                 {
196 |                                     'type': u'word-reference',  # attached to the innermost reference
197 |                                     'children': [
198 |                                         {
199 |                                             'type': u'quote',
200 |                                             'words': u'test'
201 |                                         }
202 |                                     ]
203 |                                 }
204 |                             ]
205 |                         }
206 |                     ]
207 |                 }
208 |             ]}
209 |         )
210 | 
211 |     def test_alinea_ref_article_ref_law_ref_word_ref(self):  # alinea of an article of a law, then a word reference
212 |         self.assertEqualAST(
213 |             self.call_parse_func(
214 |                 parser.parse_reference,  # generic entry point, not parse_word_reference
215 |                 u"au deuxième alinéa de l'article L. 42 de la loi n° 77-729, le mot : \"test\""  # input text handed to the parse function
216 |             ),
217 |             {'children':[  # expected nesting: alinea > article > law > word-reference
218 |                 {
219 |                     'type': u'alinea-reference',
220 |                     'order': 2,  # expected for "deuxième"
221 |                     'children': [
222 |                         {
223 |                             'type': u'article-reference',
224 |                             'id': u'L. 42',
225 |                             'children': [
226 |                                 {
227 |                                     'type': u'law-reference',
228 |                                     'id': u'77-729',  # number only, without the "n°" prefix
229 |                                     'children': [
230 |                                         {
231 |                                             'type': u'word-reference',  # attached to the innermost reference
232 |                                             'children': [
233 |                                                 {
234 |                                                     'type': u'quote',
235 |                                                     'words': u'test'
236 |                                                 }
237 |                                             ]
238 |                                         }
239 |                                     ]
240 | 
241 |                                 }
242 |                             ]
243 |                         }
244 |                     ]
245 |                 }
246 |             ]}
247 |         )
248 | 


--------------------------------------------------------------------------------
/tests/ForkEditVisitorTest.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from DuralexTestCase import DuralexTestCase
  4 | 
  5 | from duralex.ForkEditVisitor import ForkEditVisitor
  6 | 
  7 | class ForkEditVisitorTest(DuralexTestCase):
  8 |     def test(self):
  9 |         self.assertEqualAST(
 10 |             self.call_visitor(ForkEditVisitor, self.make_tree({'children': [
 11 |                 {
 12 |                     'type': 'edit',
 13 |                     'children': [
 14 |                         {
 15 |                             'type': u'alinea-reference',
 16 |                             'order': 3,
 17 |                             'children': [
 18 |                                 {
 19 |                                     'id': u'2',
 20 |                                     'type': u'article-reference'
 21 |                                 }
 22 |                             ],
 23 |                         },
 24 |                         {
 25 |                             'order': 3,
 26 |                             'type': u'alinea-reference',
 27 |                             'children': [
 28 |                                 {
 29 |                                     'id': u'3',
 30 |                                     'type': u'article-reference'
 31 |                                 }
 32 |                             ]
 33 |                         }
 34 |                     ]
 35 |                 }
 36 |             ]})),
 37 |             {'children': [
 38 |                 {
 39 |                     'type': 'edit',
 40 |                     'children': [
 41 |                         {
 42 |                             'type': u'alinea-reference',
 43 |                             'order': 3,
 44 |                             'children': [
 45 |                                 {
 46 |                                     'id': u'2',
 47 |                                     'type': u'article-reference'
 48 |                                 }
 49 |                             ],
 50 |                         }
 51 |                     ]
 52 |                 },
 53 |                 {
 54 |                     'type': 'edit',
 55 |                     'children': [
 56 |                         {
 57 |                             'order': 3,
 58 |                             'type': u'alinea-reference',
 59 |                             'children': [
 60 |                                 {
 61 |                                     'id': u'3',
 62 |                                     'type': u'article-reference'
 63 |                                 }
 64 |                             ]
 65 |                         }
 66 |                     ]
 67 |                 }
 68 |             ]}
 69 |         )
 70 | 
 71 |     def test_2(self):
 72 |         self.assertEqualAST(
 73 |             self.call_visitor(ForkEditVisitor, self.make_tree({'children': [
 74 |                 {
 75 |                     'type': 'edit',
 76 |                     'children': [
 77 |                         {
 78 |                             'type': u'alinea-reference',
 79 |                             'order': 3,
 80 |                             'children': [
 81 |                                 {
 82 |                                     'id': u'2',
 83 |                                     'type': u'article-reference'
 84 |                                 }
 85 |                             ],
 86 |                         },
 87 |                         {
 88 |                             'order': 3,
 89 |                             'type': u'alinea-reference',
 90 |                             'children': [
 91 |                                 {
 92 |                                     'id': u'3',
 93 |                                     'type': u'article-reference'
 94 |                                 }
 95 |                             ]
 96 |                         },
 97 |                         {
 98 |                             'order': 4,
 99 |                             'type': u'alinea-reference',
100 |                             'children': [
101 |                                 {
102 |                                     'id': u'3',
103 |                                     'type': u'article-reference'
104 |                                 }
105 |                             ]
106 |                         }
107 |                     ]
108 |                 }
109 |             ]})),
110 |             {'children': [
111 |                 {
112 |                     'type': 'edit',
113 |                     'children': [
114 |                         {
115 |                             'type': u'alinea-reference',
116 |                             'order': 3,
117 |                             'children': [
118 |                                 {
119 |                                     'id': u'2',
120 |                                     'type': u'article-reference'
121 |                                 }
122 |                             ],
123 |                         }
124 |                     ]
125 |                 },
126 |                 {
127 |                     'type': 'edit',
128 |                     'children': [
129 |                         {
130 |                             'order': 3,
131 |                             'type': u'alinea-reference',
132 |                             'children': [
133 |                                 {
134 |                                     'id': u'3',
135 |                                     'type': u'article-reference'
136 |                                 }
137 |                             ]
138 |                         }
139 |                     ]
140 |                 },
141 |                 {
142 |                     'type': 'edit',
143 |                     'children': [
144 |                         {
145 |                             'order': 4,
146 |                             'type': u'alinea-reference',
147 |                             'children': [
148 |                                 {
149 |                                     'id': u'3',
150 |                                     'type': u'article-reference'
151 |                                 }
152 |                             ]
153 |                         }
154 |                     ]
155 |                 }
156 |             ]}
157 |         )
158 | 
159 |     def test(self):
160 |         self.assertEqualAST(
161 |             self.call_visitor(ForkEditVisitor, self.make_tree({'children': [
162 |                 {
163 |                     'type': 'edit',
164 |                     'children': [
165 |                         {
166 |                             'type': u'alinea-reference',
167 |                             'order': 3,
168 |                             'children': [
169 |                                 {
170 |                                     'id': u'2',
171 |                                     'type': u'article-reference'
172 |                                 }
173 |                             ],
174 |                         },
175 |                         {
176 |                             'order': 3,
177 |                             'type': u'alinea-reference',
178 |                             'children': [
179 |                                 {
180 |                                     'id': u'3',
181 |                                     'type': u'article-reference'
182 |                                 }
183 |                             ]
184 |                         },
185 |                         {
186 |                             'type': u'word-definition',
187 |                             'children': [
188 |                                 {
189 |                                     'type': u'quote',
190 |                                     'words': u'ceci est un test'
191 |                                 }
192 |                             ]
193 |                         }
194 |                     ]
195 |                 }
196 |             ]})),
197 |             {'children': [
198 |                 {
199 |                     'type': 'edit',
200 |                     'children': [
201 |                         {
202 |                             'type': u'alinea-reference',
203 |                             'order': 3,
204 |                             'children': [
205 |                                 {
206 |                                     'id': u'2',
207 |                                     'type': u'article-reference'
208 |                                 }
209 |                             ],
210 |                         },
211 |                         {
212 |                             'type': u'word-definition',
213 |                             'children': [
214 |                                 {
215 |                                     'type': u'quote',
216 |                                     'words': u'ceci est un test'
217 |                                 }
218 |                             ]
219 |                         }
220 |                     ]
221 |                 },
222 |                 {
223 |                     'type': 'edit',
224 |                     'children': [
225 |                         {
226 |                             'order': 3,
227 |                             'type': u'alinea-reference',
228 |                             'children': [
229 |                                 {
230 |                                     'id': u'3',
231 |                                     'type': u'article-reference'
232 |                                 }
233 |                             ]
234 |                         },
235 |                         {
236 |                             'type': u'word-definition',
237 |                             'children': [
238 |                                 {
239 |                                     'type': u'quote',
240 |                                     'words': u'ceci est un test'
241 |                                 }
242 |                             ]
243 |                         }
244 |                     ]
245 |                 }
246 |             ]}
247 |         )
248 | 


--------------------------------------------------------------------------------
/tests/ParseAlineaReferenceTest.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from DuralexTestCase import DuralexTestCase
  4 | 
  5 | import duralex.alinea_parser as parser
  6 | 
  7 | class ParseAlineaReferenceTest(DuralexTestCase):  # each case feeds French text to parse_alinea_reference and compares the resulting tree
  8 |     def test_alinea(self):  # explicit number after "l'alinéa"
  9 |         self.assertEqualAST(
 10 |             self.call_parse_func(
 11 |                 parser.parse_alinea_reference,
 12 |                 u"l'alinéa 42"
 13 |             ),
 14 |             {'children': [
 15 |                 {
 16 |                     'type': u'alinea-reference',
 17 |                     'order': 42  # expected as an int, not a string
 18 |                 }
 19 |             ]}
 20 |         )
 21 | 
 22 |     def test_alinea_number(self):  # same as above but without the leading article "l'"
 23 |         self.assertEqualAST(
 24 |             self.call_parse_func(
 25 |                 parser.parse_alinea_reference,
 26 |                 u"alinéa 3"
 27 |             ),
 28 |             {'children': [
 29 |                 {
 30 |                     'order': 3,
 31 |                     'type': u'alinea-reference'
 32 |                 }
 33 |             ]}
 34 |         )
 35 | 
 36 |     def test_last_alinea(self):  # "dernier" introduced by "du"
 37 |         self.assertEqualAST(
 38 |             self.call_parse_func(
 39 |                 parser.parse_alinea_reference,
 40 |                 u"du dernier alinéa"
 41 |             ),
 42 |             {'children': [
 43 |                 {
 44 |                     'order': -1,  # expected value for "dernier"
 45 |                     'type': u'alinea-reference'
 46 |                 }
 47 |             ]}
 48 |         )
 49 | 
 50 |     def test_last_alinea_2(self):  # "dernier" introduced by "au"
 51 |         self.assertEqualAST(
 52 |             self.call_parse_func(
 53 |                 parser.parse_alinea_reference,
 54 |                 u"au dernier alinéa"
 55 |             ),
 56 |             {'children': [
 57 |                 {
 58 |                     'order': -1,  # expected value for "dernier"
 59 |                     'type': u'alinea-reference'
 60 |                 }
 61 |             ]}
 62 |         )
 63 | 
 64 |     def test_last_alinea_3(self):  # "dernier" introduced by "le"
 65 |         self.assertEqualAST(
 66 |             self.call_parse_func(
 67 |                 parser.parse_alinea_reference,
 68 |                 u"le dernier alinéa"
 69 |             ),
 70 |             {'children': [
 71 |                 {
 72 |                     'order': -1,  # expected value for "dernier"
 73 |                     'type': u'alinea-reference'
 74 |                 }
 75 |             ]}
 76 |         )
 77 | 
 78 |     def test_before_the_last_alinea(self):  # "avant le dernier": a position before the last alinea, not the second-to-last
 79 |         self.assertEqualAST(
 80 |             self.call_parse_func(
 81 |                 parser.parse_alinea_reference,
 82 |                 u"avant le dernier alinéa"
 83 |             ),
 84 |             {'children': [
 85 |                 {
 86 |                     'type': u'alinea-reference',
 87 |                     'order': -1,  # still the last alinea
 88 |                     'position': u'before'  # expected for the leading "avant"
 89 |                 }
 90 |             ]}
 91 |         )
 92 | 
 93 |     def test_before_last_alinea(self):  # "avant dernier" written with a space
 94 |         self.assertEqualAST(
 95 |             self.call_parse_func(
 96 |                 parser.parse_alinea_reference,
 97 |                 u"à l'avant dernier alinéa"
 98 |             ),
 99 |             {'children': [
100 |                 {
101 |                     'order': -2,  # expected value for the second-to-last alinea
102 |                     'type': u'alinea-reference'
103 |                 }
104 |             ]}
105 |         )
106 | 
107 |     def test_before_last_alinea_2(self):  # hyphenated "avant-dernier" introduced by "l'"
108 |         self.assertEqualAST(
109 |             self.call_parse_func(
110 |                 parser.parse_alinea_reference,
111 |                 u"l'avant-dernier alinéa"
112 |             ),
113 |             {'children': [
114 |                 {
115 |                     'order': -2,  # expected value for the second-to-last alinea
116 |                     'type': u'alinea-reference'
117 |                 }
118 |             ]}
119 |         )
120 | 
121 |     def test_before_last_alinea_3(self):  # hyphenated "avant-dernier" introduced by "à l'"
122 |         self.assertEqualAST(
123 |             self.call_parse_func(
124 |                 parser.parse_alinea_reference,
125 |                 u"à l'avant-dernier alinéa"
126 |             ),
127 |             {'children': [
128 |                 {
129 |                     'order': -2,  # expected value for the second-to-last alinea
130 |                     'type': u'alinea-reference'
131 |                 }
132 |             ]}
133 |         )
134 | 
135 |     def test_number_word_alinea(self):  # ordinal word "premier"
136 |         self.assertEqualAST(
137 |             self.call_parse_func(
138 |                 parser.parse_alinea_reference,
139 |                 u"au premier alinéa"
140 |             ),
141 |             {'children': [
142 |                 {
143 |                     'order': 1,
144 |                     'type': u'alinea-reference'
145 |                 }
146 |             ]}
147 |         )
148 | 
149 |     def test_number_word_alinea_2(self):  # ordinal word "second"
150 |         self.assertEqualAST(
151 |             self.call_parse_func(
152 |                 parser.parse_alinea_reference,
153 |                 u"le second alinéa"
154 |             ),
155 |             {'children': [
156 |                 {
157 |                     'order': 2,
158 |                     'type': u'alinea-reference'
159 |                 }
160 |             ]}
161 |         )
162 | 
163 |     def test_number_word_alinea_3(self):  # ordinal word "troisième"
164 |         self.assertEqualAST(
165 |             self.call_parse_func(
166 |                 parser.parse_alinea_reference,
167 |                 u"du troisième alinéa"
168 |             ),
169 |             {'children': [
170 |                 {
171 |                     'order': 3,
172 |                     'type': u'alinea-reference'
173 |                 }
174 |             ]}
175 |         )
176 | 
177 |     def test_number_word_alinea_article_id(self):  # alinea located inside an article
178 |         self.assertEqualAST(
179 |             self.call_parse_func(
180 |                 parser.parse_alinea_reference,
181 |                 u"le deuxième alinéa de l'article L. 121-3"
182 |             ),
183 |             {'children': [
184 |                 {
185 |                     'order': 2,
186 |                     'type': u'alinea-reference',
187 |                     'children': [  # the containing article is expected as a child of the alinea
188 |                         {
189 |                             'id': u'L. 121-3',
190 |                             'type': u'article-reference'
191 |                         }
192 |                     ]
193 |                 }
194 |             ]}
195 |         )
196 | 
197 |     def test_number_word_alinea_header1_article_id(self):  # alinea inside a roman-numeral header of an article
198 |         self.assertEqualAST(
199 |             self.call_parse_func(
200 |                 parser.parse_alinea_reference,
201 |                 u"le premier alinéa du II de l'article L. 121-3"
202 |             ),
203 |             {'children': [  # expected nesting: alinea > header1 > article
204 |                 {
205 |                     'order': 1,
206 |                     'type': u'alinea-reference',
207 |                     'children': [
208 |                         {
209 |                             'order': 2,  # expected for the roman numeral "II"
210 |                             'type': u'header1-reference',
211 |                             'children': [
212 |                                 {
213 |                                     'id': u'L. 121-3',
214 |                                     'type': u'article-reference'
215 |                                 }
216 |                             ]
217 |                         }
218 |                     ]
219 |                 }
220 |             ]}
221 |         )
222 | 
223 |     def test_number_word_alinea_header1_article_id_code(self):  # same as above with a trailing code name
224 |         self.assertEqualAST(
225 |             self.call_parse_func(
226 |                 parser.parse_alinea_reference,
227 |                 u"le premier alinéa du II de l'article L. 121-3 du code de l'éducation"
228 |             ),
229 |             {'children': [  # expected nesting: alinea > header1 > article > code
230 |                 {
231 |                     'order': 1,
232 |                     'type': u'alinea-reference',
233 |                     'children': [
234 |                         {
235 |                             'order': 2,  # expected for the roman numeral "II"
236 |                             'type': u'header1-reference',
237 |                             'children': [
238 |                                 {
239 |                                     'id': u'L. 121-3',
240 |                                     'type': u'article-reference',
241 |                                     'children': [
242 |                                         {
243 |                                             'id': u'code de l\'éducation',  # full code name used as the id
244 |                                             'type': u'code-reference'
245 |                                         }
246 |                                     ]
247 |                                 }
248 |                             ]
249 |                         }
250 |                     ]
251 |                 }
252 |             ]}
253 |         )
254 | 
255 |     def test_the_same_alinea(self):  # "le même alinéa" with a tree that already holds an alinea-reference
256 |         self.assertEqualAST(
257 |             self.call_parse_func(
258 |                 parser.parse_alinea_reference,
259 |                 u"le même alinéa",
260 |                 {'children':[  # extra argument: presumably the tree parsed so far; confirm in DuralexTestCase.call_parse_func
261 |                     {
262 |                         'type': u'alinea-reference',
263 |                         'order': 42
264 |                     }
265 |                 ]}
266 |             ),
267 |             {'children':[  # expected tree: the original node plus a second one with the same order
268 |                 {
269 |                     'type': u'alinea-reference',
270 |                     'order': 42
271 |                 },
272 |                 {
273 |                     'type': u'alinea-reference',
274 |                     'order': 42  # same order as the node supplied above
275 |                 }
276 |             ]}
277 |         )
278 | 
279 |     def test_before_the_last_alinea_article_ref(self):  # NOTE(review): same input and expected tree as test_before_the_last_alinea; no article reference despite the name
280 |         self.assertEqualAST(
281 |             self.call_parse_func(
282 |                 parser.parse_alinea_reference,
283 |                 u"avant le dernier alinéa"
284 |             ),
285 |             {'children':[
286 |                 {
287 |                     'type': u'alinea-reference',
288 |                     'position': u'before',  # expected for the leading "avant"
289 |                     'order': -1
290 |                 }
291 |             ]}
292 |         )
293 | 
294 |     def test_alinea_id_list(self):  # plural "les alinéas" followed by an enumeration
295 |         self.assertEqualAST(
296 |             self.call_parse_func(
297 |             parser.parse_alinea_reference,
298 |                 u"les alinéas 3, 4 et 5"  # list separated by a comma and "et"
299 |             ),
300 |             {'children':[  # expected tree: one sibling alinea-reference per listed number
301 |                 {
302 |                     'type': u'alinea-reference',
303 |                     'order': 3
304 |                 },
305 |                 {
306 |                     'type': u'alinea-reference',
307 |                     'order': 4
308 |                 },
309 |                 {
310 |                     'type': u'alinea-reference',
311 |                     'order': 5
312 |                 }
313 |             ]}
314 |         )
315 | 
316 |     def test_alinea_id_of_article_id_list(self):  # a single alinea applied to a list of articles
317 |         self.assertEqualAST(
318 |             self.call_parse_func(
319 |             parser.parse_alinea_reference,
320 |                 u"l'alinéa 3 des articles 2 et 3"
321 |             ),
322 |             {'children':[  # expected tree: one alinea-reference holding both articles as children
323 |                 {
324 |                     'type': u'alinea-reference',
325 |                     'order': 3,
326 |                     'children': [
327 |                         {
328 |                             'type': u'article-reference',
329 |                             'id': u'2'  # ids are expected as strings
330 |                         },
331 |                         {
332 |                             'type': u'article-reference',
333 |                             'id': u'3'
334 |                         }
335 |                     ]
336 |                 }
337 |             ]}
338 |         )
339 | 


--------------------------------------------------------------------------------
/tests/ResolveFullyQualifiedReferencesVisitorTest.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from DuralexTestCase import DuralexTestCase
  4 | 
  5 | from duralex.ResolveFullyQualifiedReferencesVisitor import ResolveFullyQualifiedReferencesVisitor
  6 | 
  7 | class ResolveFullyQualifiedReferencesVisitorTest(DuralexTestCase):
  8 |     def test_code_danling_reference(self):  # NOTE(review): "danling" looks like a typo for "dangling"; a bare code-reference edit followed by a word replacement
  9 |         self.assertEqualAST(
 10 |             self.call_visitor(ResolveFullyQualifiedReferencesVisitor, self.make_tree({'children': [  # input tree: two edits
 11 |                 {
 12 |                     'editType': u'edit',
 13 |                     'type': u'edit',
 14 |                     'children': [
 15 |                         {
 16 |                             'id': u'code de l\'éducation',  # first edit holds only a code-reference
 17 |                             'type': u'code-reference'
 18 |                         }
 19 |                     ]
 20 |                 },
 21 |                 {
 22 |                     'editType': u'replace',
 23 |                     'type': u'edit',
 24 |                     'children': [
 25 |                         {
 26 |                             'type': u'word-definition',
 27 |                             'children': [
 28 |                                 {
 29 |                                     'type': u'quote',
 30 |                                     'words': u'mots de remplacement'
 31 |                                 }
 32 |                             ]
 33 |                         },
 34 |                         {
 35 |                             'type': u'word-reference',  # has no code or article parent in the input
 36 |                             'children': [
 37 |                                 {
 38 |                                     'type': u'quote',
 39 |                                     'words': u'mots d\'origine'
 40 |                                 }
 41 |                             ]
 42 |                         }
 43 |                     ]
 44 |                 }
 45 |             ]})),
 46 |             {'children': [  # expected tree: only the 'replace' edit remains
 47 |                 {
 48 |                     'editType': u'replace',
 49 |                     'type': u'edit',
 50 |                     'children': [
 51 |                         {
 52 |                             'id': u'code de l\'éducation',  # code-reference from the first edit now comes first
 53 |                             'type': u'code-reference',
 54 |                             'children': [
 55 |                                 {
 56 |                                     'type': u'word-reference',  # word-reference is now nested under the code-reference
 57 |                                     'children': [
 58 |                                         {
 59 |                                             'type': u'quote',
 60 |                                             'words': u'mots d\'origine'
 61 |                                         }
 62 |                                     ]
 63 |                                 }
 64 |                             ]
 65 |                         },
 66 |                         {
 67 |                             'type': u'word-definition',  # definition is expected after the reference
 68 |                             'children': [
 69 |                                 {
 70 |                                     'type': u'quote',
 71 |                                     'words': u'mots de remplacement'
 72 |                                 }
 73 |                             ]
 74 |                         }
 75 |                     ]
 76 |                 }
 77 |             ]}
 78 |         )
 79 | 
 80 |     def test_code_danling_reference_2(self):
 81 |         self.assertEqualAST(
 82 |             self.call_visitor(ResolveFullyQualifiedReferencesVisitor, self.make_tree({'children': [
 83 |                 {
 84 |                     'editType': u'edit',
 85 |                     'type': u'edit',
 86 |                     'children': [
 87 |                         {
 88 |                             'id': u'L. 42',
 89 |                             'type': u'article-reference',
 90 |                             'children': [
 91 |                                 {
 92 |                                     'id': u'code de l\'éducation',
 93 |                                     'type': u'code-reference'
 94 |                                 }
 95 |                             ]
 96 |                         }
 97 |                     ]
 98 |                 },
 99 |                 {
100 |                     'editType': u'replace',
101 |                     'type': u'edit',
102 |                     'children': [
103 |                         {
104 |                             'type': u'word-definition',
105 |                             'children': [
106 |                                 {
107 |                                     'type': u'quote',
108 |                                     'words': u'mots de remplacement'
109 |                                 }
110 |                             ]
111 |                         },
112 |                         {
113 |                             'type': u'word-reference',
114 |                             'children': [
115 |                                 {
116 |                                     'type': u'quote',
117 |                                     'words': u'mots d\'origine'
118 |                                 }
119 |                             ]
120 |                         }
121 |                     ]
122 |                 }
123 |             ]})),
124 |             {'children': [
125 |                 {
126 |                     'editType': u'replace',
127 |                     'type': u'edit',
128 |                     'children': [
129 |                         {
130 |                             'id': u'code de l\'éducation',
131 |                             'type': u'code-reference',
132 |                             'children': [
133 |                                 {
134 |                                     'id': u'L. 42',
135 |                                     'type': u'article-reference',
136 |                                     'children': [
137 |                                         {
138 |                                             'type': u'word-reference',
139 |                                             'children': [
140 |                                                 {
141 |                                                     'type': u'quote',
142 |                                                     'words': u'mots d\'origine'
143 |                                                 }
144 |                                             ]
145 |                                         }
146 |                                     ]
147 |                                 }
148 |                             ]
149 |                         },
150 |                         {
151 |                             'type': u'word-definition',
152 |                             'children': [
153 |                                 {
154 |                                     'type': u'quote',
155 |                                     'words': u'mots de remplacement'
156 |                                 }
157 |                             ]
158 |                         }
159 |                     ]
160 |                 }
161 |             ]}
162 |         )
163 | 
164 |     def test_code_danling_reference_3(self):
165 |         self.assertEqualAST(
166 |             self.call_visitor(ResolveFullyQualifiedReferencesVisitor, self.make_tree({'children': [
167 |                 {
168 |                     'type': u'header1-definition',
169 |                     'order': 1,
170 |                     'children': [
171 |                         {
172 |                             'editType': u'edit',
173 |                             'type': u'edit',
174 |                             'children': [
175 |                                 {
176 |                                     'id': u'L. 42',
177 |                                     'type': u'article-reference'
178 |                                 }
179 |                             ]
180 |                         },
181 |                         {
182 |                             'type': u'header2-definition',
183 |                             'order': 1,
184 |                             'children': [
185 |                                 {
186 |                                     'editType': u'edit',
187 |                                     'type': u'edit',
188 |                                     'children': [
189 |                                         {
190 |                                             'order': 42,
191 |                                             'type': u'alinea-reference'
192 |                                         }
193 |                                     ]
194 |                                 },
195 |                                 {
196 |                                     'type': u'header3-definition',
197 |                                     'order': 1,
198 |                                     'children': [
199 |                                         {
200 |                                             'editType': u'replace',
201 |                                             'type': u'edit',
202 |                                             'children': [
203 |                                                 {
204 |                                                     'type': u'word-definition',
205 |                                                     'children': [
206 |                                                         {
207 |                                                             'type': u'quote',
208 |                                                             'words': u'mots de remplacement'
209 |                                                         }
210 |                                                     ]
211 |                                                 },
212 |                                                 {
213 |                                                     'type': u'word-reference',
214 |                                                     'children': [
215 |                                                         {
216 |                                                             'type': u'quote',
217 |                                                             'words': u'mots d\'origine'
218 |                                                         }
219 |                                                     ]
220 |                                                 }
221 |                                             ]
222 |                                         }
223 |                                     ]
224 |                                 }
225 |                             ]
226 |                         }
227 |                     ]
228 |                 }
229 |             ]})),
230 |             {'children': [
231 |                 {
232 |                     'type': u'header1-definition',
233 |                     'order': 1,
234 |                     'children': [
235 |                         {
236 |                             'type': u'header2-definition',
237 |                             'order': 1,
238 |                             'children': [
239 |                                 {
240 |                                     'type': u'header3-definition',
241 |                                     'order': 1,
242 |                                     'children': [
243 |                                         {
244 |                                             'editType': u'replace',
245 |                                             'type': u'edit',
246 |                                             'children': [
247 |                                                 {
248 |                                                     'id': u'L. 42',
249 |                                                     'type': u'article-reference',
250 |                                                     'children': [
251 |                                                         {
252 |                                                             'order': 42,
253 |                                                             'type': u'alinea-reference',
254 |                                                             'children': [
255 |                                                                 {
256 |                                                                     'type': u'word-reference',
257 |                                                                     'children': [
258 |                                                                         {
259 |                                                                             'type': u'quote',
260 |                                                                             'words': u'mots d\'origine'
261 |                                                                         }
262 |                                                                     ]
263 |                                                                 }
264 |                                                             ]
265 |                                                         }
266 |                                                     ]
267 |                                                 },
268 |                                                 {
269 |                                                     'type': u'word-definition',
270 |                                                     'children': [
271 |                                                         {
272 |                                                             'type': u'quote',
273 |                                                             'words': u'mots de remplacement'
274 |                                                         }
275 |                                                     ]
276 |                                                 }
277 |                                             ]
278 |                                         }
279 |                                     ]
280 |                                 }
281 |                             ]
282 |                         }
283 |                     ]
284 |                 }
285 |             ]}
286 |         )
287 | 
288 |     def test_do_nothing_when_no_nested_edits(self):
289 |         self.assertEqualAST(
290 |             self.call_visitor(ResolveFullyQualifiedReferencesVisitor, self.make_tree({'children': [
291 |                 {
292 |                     'children': [
293 |                         {
294 |                             'children': [
295 |                                 {
296 |                                     'children': [
297 |                                         {
298 |                                             'children': [
299 |                                                 {
300 |                                                     'type': u'quote',
301 |                                                     'words': u'Art. 4. - Le territoire de la République forme une circonscription unique.'
302 |                                                 }
303 |                                             ],
304 |                                             'type': u'word-definition'
305 |                                         },
306 |                                         {
307 |                                             'children': [
308 |                                                 {
309 |                                                     'id': u'4',
310 |                                                     'type': u'article-reference'
311 |                                                 }
312 |                                             ],
313 |                                             'lawDate': u'1977-7-7',
314 |                                             'id': u'77-729',
315 |                                             'type': u'law-reference'
316 |                                         }
317 |                                     ],
318 |                                     'editType': u'edit',
319 |                                     'type': u'edit'
320 |                                 }
321 |                             ],
322 |                             'order': 1,
323 |                             'type': u'header1-definition'
324 |                         }
325 |                     ],
326 |                     'isNew': False,
327 |                     'order': 2,
328 |                     'type': u'article-definition'
329 |                 }
330 |             ]})),
331 |             {'children': [
332 |                 {
333 |                     'children': [
334 |                         {
335 |                             'children': [
336 |                                 {
337 |                                     'children': [
338 |                                         {
339 |                                             'children': [
340 |                                                 {
341 |                                                     'type': u'quote',
342 |                                                     'words': u'Art. 4. - Le territoire de la République forme une circonscription unique.'
343 |                                                 }
344 |                                             ],
345 |                                             'type': u'word-definition'
346 |                                         },
347 |                                         {
348 |                                             'children': [
349 |                                                 {
350 |                                                     'id': u'4',
351 |                                                     'type': u'article-reference'
352 |                                                 }
353 |                                             ],
354 |                                             'lawDate': u'1977-7-7',
355 |                                             'id': u'77-729',
356 |                                             'type': u'law-reference'
357 |                                         }
358 |                                     ],
359 |                                     'editType': u'edit',
360 |                                     'type': u'edit'
361 |                                 }
362 |                             ],
363 |                             'order': 1,
364 |                             'type': u'header1-definition'
365 |                         }
366 |                     ],
367 |                     'isNew': False,
368 |                     'order': 2,
369 |                     'type': u'article-definition'
370 |                 }
371 |             ]}
372 |         )
373 | 


--------------------------------------------------------------------------------
/duralex/bill_parser.py:
--------------------------------------------------------------------------------
  1 | # -*- coding=utf-8 -*-
  2 | """
  3 | Original code by RegardsCitoyen (https://github.com/RegardsCitoyens) for the-law-factory-parser
  4 | (https://github.com/regardscitoyens/the-law-factory-parser).
  5 | """
  6 | 
  7 | import sys, re, html5lib
  8 | from bs4 import BeautifulSoup
  9 | 
 10 | from duralex.alinea_parser import word_to_number, month_to_number
 11 | 
 12 | import duralex.tree
 13 | 
# Alternation of Latin multiplicative/ordinal fragments ("bis", "ter", "qua",
# "quin…", …). The trailing "+" lets several fragments chain. The pattern is
# interpolated into re_clean_bister further down.
bister = u'(un|duo|tre|bis|qua|quin[tqu]*|sex|sept|octo?|novo?|non|dec|vic|ter|ies)+'

# Prefix prepended to every text id built by parse_bill (empty by default).
ORDER = ''

# Warning: changing parentheses in this regexp has multiple consequences
# throughout the code. The pattern contributes three capture groups (the whole
# title word, "(chap|t)" and "(sous-)?"), and every pattern that interpolates it
# relies on that count for its own group numbers.
section_titles = u"((chap|t)itre|volume|livre|tome|(sous-)?section)"

# Detects a centred paragraph whose bold content is a "texte définitif" marker.
re_definitif = re.compile(r'<p[^>]*align[=:\s\-]*center"?>\(?<(b|strong)>\(?texte d[^f]*finitif\)?</(b|strong)>\)?</p>', re.I)

# (pattern, replacement) pairs. Not referenced in the part of the file visible
# here — presumably applied in order to raw HTML by a caller; verify.
clean_texte_regexps = [
    # Collapse every run of whitespace into a single space.
    (re.compile(r'[\n\t\r\s]+'), ' '),
    # Drop a <p ...> opened directly inside a table cell/row ...
    (re.compile(r'(<t[rdh][^>]*>) ?<p [^>]*> ?'), r'\1'),
    # ... and the matching </p> right before the cell/row closes.
    (re.compile(r' ?</p> ?(</t[rdh]>)'), r'\1'),
    # Split "<SECTION WORD> <number> - <title></p>" into the numbered heading and
    # a separate bold title paragraph. Group 1 is the heading, groups 2-4 come
    # from section_titles, group 5 is the optional <sup>, so the title is \6.
    (re.compile(r'(>%s\s*[\dIVXLCDM]+(<sup>[eE][rR]?</sup>)?)\s+-\s+([^<]*?)\s*</p>' % section_titles.upper()), r'\1</p><p><b>\6</b></p>'),
]

# Matches a trailing "legifrance" / "legifrance.gouv.fr" suffix, optionally
# preceded by spaces or "|".
# NOTE(review): the pattern is not a raw string ("\s" is an invalid string
# escape) and its dots are unescaped, so they match any character.
re_clean_title_legif = re.compile("[\s|]*l[eé]gifrance(.gouv.fr)?$", re.I)
# (pattern, replacement) pairs for Legifrance markup. Not referenced in the part
# of the file visible here — presumably applied in order by a caller; verify.
clean_legifrance_regexps = [
    # Collapse every run of whitespace into a single space.
    (re.compile(r'[\n\t\r\s]+'), ' '),
    # Remove "En savoir plus sur ce..." links entirely (tag and text).
    (re.compile(r'<a[^>]*>\s*En savoir plus sur ce[^<]*</a>', re.I), ''),
    # Strip any remaining <a>/</a> tags but keep their text.
    (re.compile(r'<a/?[^>]*>', re.I), ''),
    # Turn line breaks into paragraph boundaries.
    (re.compile(r'\s*<br/>\s*', re.I), '</p><p>'),
    # "titreSection" div -> heading paragraph + bold title paragraph. Group 1 is
    # the heading, groups 2-4 come from section_titles, so the title is \5.
    (re.compile(r'<div[^>]*class="titreSection[^>]*>\s*(%s\s+[\dIVXLCDM]+e?r?)\s*:\s*([^<]*?)\s*</div>' % section_titles, re.I), r'<p>\1</p><p><b>\5</b></p>'),
    # "titreArt" div -> bold paragraph.
    (re.compile(r'<div[^>]*class="titreArt[^>]*>(.*?)\s*</div>', re.I), r'<p><b>\1</b></p>'),
]
 39 | 
 40 | # Convert from roman numbers
 41 | re_mat_romans = re.compile(r"[IVXCLDM]+", re.I)
 42 | romans_map = zip(
 43 |     (1000,  900, 500, 400 , 100,  90 , 50 ,  40 , 10 ,   9 ,  5 ,  4  ,  1),
 44 |     ( u'M', u'CM', u'D', u'CD', u'C', u'XC', u'L', u'XL', u'X', u'IX', u'V', u'IV', u'I')
 45 | )
 46 | 
 47 | def romans(n):
 48 |     n = n.upper()
 49 |     i = res = 0
 50 |     for d, r in romans_map:
 51 |         while n[i:i + len(r)] == r:
 52 |             res += d
 53 |             i += len(r)
 54 |     return res
 55 | 
 56 | upcase_accents = u"ÇÀÂÄÉÈÊËÎÏÔÖÙÛÜ"
 57 | locase_accents = u"çàâäéèêëîïôöùûü"
 58 | 
 59 | 
 60 | def real_lower(text):
 61 |     for a in upcase_accents:
 62 |         text = text.replace(a, locase_accents[upcase_accents.find(a)])
 63 |     return text.lower()
 64 | 
 65 | 
 66 | def lower_but_first(text):
 67 |     return text[0].upper() + real_lower(text[1:])
 68 | 
 69 | 
 70 | re_fullupcase = re.compile(r"^([\W0-9]*)([A-Z%s][\W0-9A-Z%s]*)$" % (upcase_accents, upcase_accents), re.U)
 71 | 
 72 | 
 73 | def clean_full_upcase(text):
 74 |     mat = re_fullupcase.match(text)
 75 |     if mat:
 76 |         text = mat.group(1) + lower_but_first(mat.group(2))
 77 |     return text
 78 | 
# Assorted clean-up patterns. Only re_clean_bister and re_clean_spaces are used
# in the part of the file visible here; the others are presumably used further
# down — verify against their call sites.
re_clean_premier = re.compile(r'((PREM)?)(1|I)ER?')
# "<number> <latin suffix>": group 1 is the number (optionally followed by
# "er"), group 2 the suffix built from `bister`.
re_clean_bister = re.compile(r'([IXV\d]+e?r?)\s+(%s)' % bister, re.I)
re_clean_subsec_space = re.compile(r'^("?[IVX0-9]{1,4}(\s+[a-z]+)?(\s+[A-Z]{1,4})?)\s*([\.°\-]+)\s*([^\s\)])', re.I)
re_clean_subsec_space2 = re.compile(r'^("?[IVX0-9]{1,4})\s*([a-z]*)\s*([A-H]{1,4})([\.°\-])', re.I)
# A punctuation character immediately followed by a character that is not
# whitespace, ")", ".", ",", a digit or a double quote.
re_clean_punc_space = re.compile(u'([°«»:;,\.!\?\]\)%€&\$])([^\s\)\.,\d"])')
# Any run of whitespace, including non-breaking spaces.
re_clean_spaces = re.compile(r'(\s|\xc2\xa0|\xa0)+')
# A line consisting only of "(pour) coordination", optionally quoted or
# parenthesised.
re_clean_coord = re.compile(r'^["\(]*(pour)?\s*coordination[\)\s\.]*$', re.I)
# Clean html and special chars
# Replacement callback for the section-title entry of html_replace: keeps the
# leading tags/quote (group 1) and puts the section word (group 3, the outer
# group of section_titles) into sentence case, followed by a single space.
lower_inner_title = lambda x: x.group(1)+lower_but_first(x.group(3))+" "
# Ordered (pattern, replacement) pairs applied one after another by clean_html;
# later entries see the output of earlier ones, so the order matters.
html_replace = [
    # Whitespace and paragraph boundaries.
    (re_clean_spaces, " "),
    (re.compile(r"\s*\n+\s*"), " "),
    (re.compile(r'</p><p>'), u'\n'),
    (re.compile(r"−"), "-"),
    # NOTE(review): the pattern below may hold a non-breaking space that renders
    # like a plain space — confirm the actual bytes before touching this line.
    (re.compile(r" "), " "),
    # HTML comments.
    (re.compile(r"<!--.*?-->", re.I), ""),
    # (re.compile(r"</?br/?>[«\"\s]+", re.I), " "),
    # Normalise typographic quotes, apostrophes and dashes to ASCII.
    (re.compile(r'(«\s+|\s+»)'), '"'),
    (re.compile(r'(«|»|“|”|„|‟|❝|❞|＂|〟|〞|〝)'), '"'),
    (re.compile(r"(’|＇|’|ߴ|՚|ʼ|❛|❜)"), "'"),
    (re.compile(r"(‒|–|—|―|⁓|‑|‐|⁃|⏤)"), "-"),
    # Strip attributes from every tag, then unify <em>/<strong> as <i>/<b>.
    (re.compile(r"(</?\w+)[^>]*>"), r"\1>"),
    (re.compile(r"(</?)em>", re.I), r"\1i>"),
    (re.compile(r"(</?)strong>", re.I), r"\1b>"),
    # Drop declarations, <p>/<span> tags and empty tag pairs.
    (re.compile(r"<(![^>]*|/?(p|span))>", re.I), ""),
    (re.compile(r"<[^>]*></[^>]*>"), ""),
    # Canonical nesting order, and merge adjacent bold runs.
    (re.compile(r"^<b><i>", re.I), "<i><b>"),
    (re.compile(r"</b>(\s*)<b>", re.I), r"\1"),
    (re.compile(r"</?sup>", re.I), ""),
    # Leading "(S1)", "(AN2)"-style markers.
    (re.compile(r"^((<[bi]>)*)\((S|AN)[12]\)\s*", re.I), r"\1"),
    # Struck-through content: drop an old article number, then everything
    # between <s> and </s> (greedy), then any stray <s> tags.
    (re.compile(r"^(<b>Article\s*)\d+\s*<s>\s*", re.I), r"\1"),
    (re.compile(r"<s>(.*)</s>", re.I), ""),
    (re.compile(r"</?s>", re.I), ""),
    (re.compile(r"\s*</?img>\s*", re.I), ""),
    # Expand the "œ" ligature ("OE" before a capital letter, "oe" otherwise).
    (re.compile(r"œ([A-Z])"), r"OE\1"),
    (re.compile(r"œ\s*", re.I), "oe"),
    # Quoted section title at the start of the line -> sentence case (see
    # lower_inner_title for the group numbers).
    (re.compile(r'^((<[^>]*>)*")%s ' % section_titles, re.I), lower_inner_title),
    # Repair a mangled accent in "préliminaire".
    (re.compile(r' pr..?liminaire', re.I), ' préliminaire'),
    (re.compile(r'<strike>[^<]*</strike>', re.I), ''),
    # Leading <a> directly followed by a word character: drop the tag.
    (re.compile(r'^<a>(\w)', re.I), r"\1"),
]
120 | 
121 | 
def clean_html(t):
    """Apply every ``html_replace`` substitution to *t*, in order.

    Each (pattern, replacement) pair is run on the output of the previous one;
    the final string is returned with surrounding whitespace stripped.
    """
    cleaned = t
    for pattern, replacement in html_replace:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned.strip()
126 | 
# Separator between several titles: a comma or the word "et", followed by spaces.
re_clean_et = re.compile(r'(,|\s+et)\s+', re.I)

def cleanup(dic):
    """Normalise a parsed element before it is handed back to the caller.

    * A falsy *dic* yields ``None``.
    * A dict without an 'alineas' key is returned untouched.
    * Otherwise *dic* is updated in place: an article whose single alinea
      '001' starts with "(Supprimé)" gets statut "supprimé" and an empty
      alinea; an article whose statut starts with 'conforme' and that has no
      alinea gets the placeholder '(Non modifié)'.
    * If 'titre' lists several titles (separated by commas or "et"), a shallow
      copy of *dic* carrying only the LAST of those titles is returned;
      otherwise *dic* itself is returned.
    """
    # Clean empty articles with only "Supprimé" as text
    if not dic:
        return
    if 'alineas' not in dic:
        return dic

    alineas = dic['alineas']
    if len(alineas) == 1 and alineas['001'].startswith("(Supprimé)"):
        dic['statut'] = "supprimé"
        dic['alineas'] = {'001': ''}
    elif dic['statut'].startswith('conforme') and len(alineas) == 0:
        dic['alineas'] = {'001': '(Non modifié)'}

    titles = re_clean_et.sub(',', dic['titre']).split(',')
    if len(titles) <= 1:
        return dic

    # Only the copy made for the final title is ever returned.
    last_copy = dict(dic)
    last_copy['titre'] = titles[-1]
    return last_copy
147 | 
def save_text(txt):
    """Return *txt* after its final processing step.

    When *txt* already has a "done" key, that key is set to True and *txt* is
    returned as is. Otherwise the result of ``cleanup(txt)`` is returned (and
    no "done" key is added).
    """
    if "done" in txt:
        txt["done"] = True
        return txt
    return cleanup(txt)
153 | 
# Module-level helpers and patterns used by parse_bill. Unless a comment says
# otherwise, the exact role of a pattern is only visible at its call site.

# Map any falsy value to the empty string.
blank_none = lambda x: x if x else ""
# Any HTML tag.
re_cl_html = re.compile(r"<[^>]+>")
# Any tag whose name does not start with "t" (so table tags survive).
re_cl_html_except_tables = re.compile(r"</?[^t/][^>]*>", re.I)
# A <td> left open at the very end of the string.
re_fix_missing_table = re.compile(r'(<td>\W*)$', re.I)
# Strip non-table tags, close a dangling trailing <td>, trim spaces next to
# tags and repair a "<td><tr>" sequence.
cl_html_except_tables = lambda x: re_fix_missing_table.sub(r'\1</td></tr></tbody></table>', re_cl_html_except_tables.sub('', x)).strip().replace('> ', '>').replace(' <', '<').replace('<td><tr>', '<td></td></tr><tr>')
re_cl_par  = re.compile(r"[()]")
# "premier"/"unique" and a leading "Ier"/"1er": parse_bill replaces both with
# '1' when it normalises a section number.
re_cl_uno  = re.compile(r"(premie?r?|unique?)", re.I)
re_cl_sec_uno = re.compile(r"^[Ii1][eE][rR]?")
# Section heading: groups 1-3 come from section_titles, group 5 is the text
# after the section word (parse_bill reads groups 1, 3 and 5).
re_mat_sec = re.compile(r"%s(\s+(.+)e?r?)" % section_titles, re.I)
re_mat_n = re.compile(r"((pr..?)?limin|unique|premier|[IVX\d]+)", re.I)
# "Article(s) <number> (<status>)": group 1 the number part, group 2 an
# optional trailing parenthesised part.
re_mat_art = re.compile(r"articles?\s*([^(]*)(\([^)]*\))?$", re.I)
# Lines that switch parse_bill's reading state: start of the bill text
# (re_mat_ppl, re_mat_tco), start of the "exposé" part (re_mat_exp), end of the
# text (re_mat_end) and annexes (re_mat_ann).
re_mat_ppl = re.compile(r"(<b>)?pro.* loi", re.I)
re_mat_tco = re.compile(r"\s*<b>\s*(ANNEXE[^:]*:\s*|\d+\)\s+)?TEXTES?\s*(ADOPTÉS?\s*PAR|DE)\s*LA\s*COMMISSION.*</b>\s*$")
re_mat_exp = re.compile(r"(<b>)?expos[eéÉ]", re.I)
re_mat_end = re.compile(r"((<i>)?Délibéré en|(<i>)?NB[\s:<]+|(<b>)?RAPPORT ANNEX|Fait à .*, le|\s*©|\s*N.?B.?\s*:|(</?i>)*<a>[1*]</a>\s*(</?i>)*\(\)(</?i>)*|<i>\(1\)\s*Nota[\s:]+|<a>\*</a>\s*(<i>)?1)", re.I)
re_mat_ann = re.compile(r"\s*<b>\s*ANNEXES?[\s<]+")
# A line made only of dots/ellipses, optionally in italics.
re_mat_dots = re.compile(r"^(<i>)?[.…]+(</i>)?$")
# Trailing status marker (conforme / non modifié / supprimé / nouveau).
re_mat_st = re.compile(r"(<i>|\()+\s*(conform|non[\s\-]*modif|suppr|nouveau).{0,10}$", re.I)
# "(nouveau)" marker; parse_bill removes it from section numbers.
re_mat_new = re.compile(r"\s*\(\s*nouveau\s*\)\s*", re.I)
re_mat_texte = re.compile(r'\(texte (modifié|élaboré|d(u|e l))', re.I)
re_mat_single_char = re.compile(r'^\s*[LMN]\s*$')
re_clean_idx_spaces = re.compile(r'^([IVXLCDM0-9]+)\s*\.\s*')
re_clean_art_spaces = re.compile(r'^\s*("?)\s+')
re_clean_art_spaces2 = re.compile(r'\s+\.\s*-\s+')
re_clean_conf = re.compile(r"\((conforme|non[\s-]*modifi..?)s?\)", re.I)
re_clean_supr = re.compile(r'\((dispositions?\s*d..?clar..?es?\s*irrecevable.*article 4.*Constitution.*|(maintien de la )?suppr(ession|im..?s?)(\s*(conforme|maintenue|par la commission mixte paritaire))*)\)["\s]*$', re.I)
# Rejection / failure statements; parse_bill stops reading when one of these
# matches the tag-stripped line.
re_echec_hemi = re.compile(r"L('Assemblée nationale|e Sénat) (a rejeté|n'a pas adopté)[, ]+", re.I)
re_echec_hemi2 = re.compile(r"de loi a été rejetée par l('Assemblée nationale|e Sénat)\.$", re.I)
re_echec_com = re.compile(r" la commission .*(effet est d'entraîner le rejet|demande de rejeter|a rejeté|n'a pas adopté)[dleau\s]*(projet|proposition|texte)[.\s]", re.I)
re_echec_cmp = re.compile(r" (a conclu à l'échec de ses travaux|(ne|pas) .*parven(u[es]?|ir) à (élaborer )?un texte commun)", re.I)
# Documents that bundle several texts: a list of text numbers (re_rap_mult) and
# a source line giving one text number (re_src_mult, group 1).
re_rap_mult = re.compile(r'[\s<>/ai]*N[°\s]*\d+\s*(,|et)\s*[N°\s]*\d+', re.I)
re_src_mult = re.compile(r'^- L(?:A PROPOSITION|E PROJET) DE LOI n°\s*(\d+)\D')
# Applied in sequence by parse_bill to reduce such a list to "n,n,n".
re_clean_mult_1 = re.compile(r'\s*et\s*', re.I)
re_clean_mult_2 = re.compile(r'[^,\d]', re.I)
re_clean_footer_notes = re.compile(r"[\.\s]*\(*\d*\([\d\*]+[\)\d\*\.\s]*$")
# Bold "first article" / "first section" heading; parse_bill uses it to detect
# the start of the next bundled text.
re_sep_text = re.compile(r'\s*<b>\s*(article|%s)\s*(I|uniqu|pr..?limina|1|prem)[ier]*\s*</b>\s*$' % section_titles, re.I)
# Separator lines made only of spaces, "*" and "_"; parse_bill skips them.
re_stars = re.compile(r'^[\s*_]+$')
re_art_uni = re.compile(r'\s*article\s*unique\s*$', re.I)
# One or more capitals (accented ones included) or spaces. parse_bill calls
# .match(), so only the START of the line has to be upper-case.
re_all_caps = re.compile(r'[A-Z' + upcase_accents + r' ]+')
# Current section descriptor. NOTE: this is a module-level dict that parse_bill
# mutates ("type_section", "id"), so its state persists between calls.
section = {"type": "section", "id": ""}
194 | 
195 | def parse_bill(string, url):
196 |     section_id = ""
197 |     curtext = -1
198 |     srclst = []
199 |     article = None
200 |     read = art_num = ali_num = 0
201 |     indextext = -1
202 | 
203 |     definitif = re_definitif.search(string) is not None
204 |     soup = BeautifulSoup(string, "html5lib")
205 | 
206 |     texte = {
207 |         "type": "projet de loi",
208 |         "definitive": definitif,
209 |         "articles": [],
210 |         "url": url,
211 |         "expose": ""
212 |     }
213 |     expose = False
214 | 
215 |     if url:
216 |         url = re.sub(r"^.*/http", "http", url)
217 |         url = re.sub(r"%3A", ":", re.sub(r"%2F", "/", url))
218 |         # Generate Senat or AN ID from URL
219 |         if "legifrance.gouv.fr" in url:
220 |             m = re.search(r"cidTexte=(JORFTEXT\d+)(\D|$)", url, re.I)
221 |             texte["id"] = ORDER + m.group(1)
222 |         elif re.search(r"assemblee-?nationale", url, re.I):
223 |             m = re.search(r"/(\d+)/.+/(ta)?[\w\-]*(\d{4})[\.\-]", url, re.I)
224 |             numero = int(m.group(3))
225 |             texte["id"] = ORDER+"A" + m.group(1) + "-"
226 |             if m.group(2) is not None:
227 |                 texte["id"] += m.group(2)
228 |             texte["id"] += str(numero)
229 |         else:
230 |             m = re.search(r"(ta|l)?s?(\d\d)-(\d{1,3})\d?\.", url, re.I)
231 |             if m is None:
232 |                 m = re.search(r"/(-)?20(\d+)-\d+/(\d+).html", url, re.I)
233 |             numero = int(m.group(3))
234 |             texte["id"] = ORDER+"S" + m.group(2) + "-"
235 |             if m.group(1) is not None:
236 |                 texte["id"] += m.group(1)
237 |             texte["id"] += "%03d" % numero
238 | 
239 |     is_html = string.find('<html>') == 0 or string.find('<?xml version="1.0" encoding="UTF-8"?>') == 0
240 |     lines = soup.body.find_all('p') if is_html else string.split(u'\n')
241 | 
242 |     for line in lines:
243 |         line = clean_html(line.text if is_html else line)
244 | 
245 |         if re_stars.match(line):
246 |             continue
247 | 
248 |         match = re.compile(r'^N°\D+(\d+)$', re.MULTILINE).search(line)
249 |         if match:
250 |             texte['id'] = int(match.group(1))
251 | 
252 |         match = re.compile(r'^(.*) LÉGISLATURE$', re.MULTILINE).search(line)
253 |         if match:
254 |             texte['legislature'] = word_to_number(match.group(1))
255 | 
256 |         match = re.compile(r'Enregistré à la Présidence (du |de l\')(.*) le (\d+) (\w+) (\d{4})').search(line)
257 |         if match:
258 |             texte['date'] = match.group(5) + '-' + str(month_to_number(match.group(4))) + '-' + match.group(3)
259 |             texte['place'] = match.group(2).lower()
260 | 
261 |         if line == u'PROPOSITION DE LOI':
262 |             texte['type'] = duralex.tree.TYPE_LAW_PROPOSAL
263 |         elif line == u'PROJET DE LOI':
264 |             texte['type'] = duralex.tree.TYPE_LAW_PROJECT
265 | 
266 |         if 'description' not in texte and line in [u'PROPOSITION DE LOI', u'PROJET DE LOI']:
267 |             texte['description'] = line.lower()
268 |             read = 3
269 |             continue
270 |         if read == 3:
271 |             if real_lower(line).startswith(u'transmise par') or real_lower(line).startswith(u'présentée par'):
272 |                 read = 0
273 |             else:
274 |                 if re_all_caps.match(line):
275 |                     line = real_lower(line)
276 |                 line = line.replace(',', '')
277 |                 if line:
278 |                     texte['description'] += ' ' + line
279 |             continue
280 | 
281 |         if line == "<b>RAPPORT</b>" or line == "Mesdames, Messieurs,":
282 |             read = -1
283 |         if (srclst or indextext != -1) and re_sep_text.match(line):
284 |             curtext += 1
285 |             art_num = 0
286 |         srcl = re_src_mult.search(line)
287 |         cl_line = re_cl_html.sub("", line).strip()
288 |         if srcl and read < 1:
289 |             srclst.append(int(srcl.group(1)))
290 |             continue
291 |         elif re_rap_mult.match(line):
292 |             line = cl_line
293 |             line = re_clean_mult_1.sub(",", line)
294 |             line = re_clean_mult_2.sub("", line)
295 |             cl_line = re_cl_html.sub("", line).strip()
296 |             for n_t in line.split(','):
297 |                 indextext += 1
298 |                 if int(n_t) == numero:
299 |                     break
300 |         elif re_mat_ppl.match(line) or re_mat_tco.match(line):
301 |             read = 0
302 |             texte = save_text(texte)
303 |         elif re_mat_exp.match(line):
304 |             read = -1 # Deactivate description lecture
305 |             expose = True
306 |         elif re_echec_cmp.search(cl_line) or re_echec_com.search(cl_line) or re_echec_hemi.match(cl_line) or re_echec_hemi2.search(cl_line):
307 |             texte = save_text(texte)
308 |             cleanup({"type": "echec", "texte": cl_line})
309 |             break
310 |         elif read == -1 or (indextext != -1 and curtext != indextext):
311 |             continue
312 | 
313 |         # Identify section zones
314 |         m = re_mat_sec.match(line)
315 |         if m:
316 |             read = 1 # Activate titles lecture
317 |             section["type_section"] = real_lower(m.group(1))
318 |             section_typ = m.group(1).upper()[0]
319 |             if m.group(3) is not None:
320 |                 section_typ += "S"
321 | 
322 |             if " LIMINAIRE" in line:
323 |                 section_num = "L"
324 |             else:
325 |                 section_num = re_cl_uno.sub('1', re_cl_sec_uno.sub('1', re_cl_html.sub('', m.group(5).strip())).strip())
326 |                 section_num = re_clean_bister.sub(lambda m: m.group(1)+" "+real_lower(m.group(2)), section_num)
327 |                 section_num = re_mat_new.sub('', section_num).strip()
328 |                 m2 = re_mat_romans.match(section_num)
329 |                 if m2:
330 |                     rest = section_num.replace(m2.group(0), '')
331 |                     section_num = romans(m2.group(0))
332 |                     if rest: section_num = str(section_num) + rest
333 |             # Get parent section id to build current section id
334 |             section_par = re.sub(r""+section_typ+"[\dL].*$", "", section["id"])
335 |             section["id"] = section_par + section_typ + str(section_num)
336 | 
337 |         # Identify titles and new article zones
338 |         elif (not expose and re_mat_end.match(line)) or (read == 2 and re_mat_ann.match(line)):
339 |             break
340 |         elif re.match(r"(<i>)?<b>", line) or re_art_uni.match(line) or re.match(r"^Articles? ", line):
341 |             line = cl_line
342 |             # Read a new article
343 |             if re_mat_art.match(line):
344 |                 if article is not None:
345 |                     texte = save_text(texte)
346 |                     cleanup(article)
347 |                 read = 2 # Activate alineas lecture
348 |                 expose = False
349 |                 art_num += 1
350 |                 ali_num = 0
351 |                 article = {"type": "article", "order": art_num, "alineas": {}, "statut": "none"}
352 |                 texte['articles'].append(article)
353 |                 if srclst:
354 |                     article["source_text"] = srclst[curtext]
355 |                 m = re_mat_art.match(line)
356 |                 article["titre"] = re_cl_uno.sub("1er", re_cl_sec_uno.sub("1er", m.group(1).strip())).strip(" -'")
357 |                 if m.group(2) is not None:
358 |                     article["statut"] = re_cl_par.sub("", real_lower(m.group(2))).strip()
359 |                 if section["id"] != "":
360 |                     article["section"] = section["id"]
361 |             # Read a section's title
362 |             elif read == 1:
363 |                 texte = save_text(texte)
364 |                 section["titre"] = lower_but_first(line)
365 |                 if article is not None:
366 |                     cleanup(article)
367 |                     article = None
368 |                 cleanup(section)
369 |                 read = 0
370 | 
371 |         # Read articles with alineas
372 |         if read == 2 and not m:
373 |             # Find extra status information
374 |             if ali_num == 0 and re_mat_st.match(line):
375 |                 article["statut"] = re_cl_html.sub("", re_cl_par.sub("", real_lower(line)).strip())
376 |                 continue
377 |             if re_mat_dots.match(line):
378 |                 continue
379 |             if "<table>" in line:
380 |                 cl_line = cl_html_except_tables(line)
381 |             line = re_clean_art_spaces2.sub('. - ', re_clean_art_spaces.sub(r'\1', re_clean_idx_spaces.sub(r'\1. ', re_mat_new.sub(" ", cl_line).strip())))
382 |             # Clean low/upcase issues with BIS TER etc.
383 |             line = line.replace("oeUVRE", "OEUVRE")
384 |             line = clean_full_upcase(line)
385 |             line = re_clean_premier.sub(lambda m: (real_lower(m.group(0)) if m.group(1) else "")+m.group(3)+"er", line)
386 |             line = re_clean_bister.sub(lambda m: m.group(1)+" "+real_lower(m.group(2)), line)
387 |             # Clean different versions of same comment.
388 |             line = re_clean_supr.sub('(Supprimé)', line)
389 |             line = re_clean_conf.sub('(Non modifié)', line)
390 |             line = re_clean_coord.sub('', line)
391 |             line = re_clean_subsec_space.sub(r'\1\4 \5', line)
392 |             line = re_clean_subsec_space2.sub(r'\1 \2 \3\4', line)
393 |             line = re_clean_punc_space.sub(r'\1 \2', line)#.encode('utf-8')
394 |             line = re_clean_spaces.sub(' ', line)
395 |             line = re_mat_sec.sub(lambda x: lower_but_first(x.group(1))+x.group(4) if re_mat_n.match(x.group(4)) else x.group(0), line)
396 |             line = re_clean_footer_notes.sub(".", line)
397 |             # Clean comments (Texte du Sénat), (Texte de la Commission), ...
398 |             if ali_num == 0 and re_mat_texte.match(line):
399 |                 continue
400 |             line = re_mat_single_char.sub("", line)
401 |             line = line.strip()
402 |             if line:
403 |                 ali_num += 1
404 |                 # match alinea numbering in the form of "(ali_num) actual alinea content goes here..."
405 |                 m = re.compile(r"^\((\d)\) (.*)$", re.MULTILINE).match(line)
406 |                 if m:
407 |                     ali_num = int(m.group(1))
408 |                     line = m.group(2)
409 |                 article["alineas"]["%03d" % ali_num] = line
410 |         else:
411 |             #metas
412 |             continue
413 | 
414 |     # save_text(texte)
415 |     cleanup(texte)
416 | 
417 |     return texte
418 | 


--------------------------------------------------------------------------------