├── .github
    └── retriever-reader-pipeline.png
├── .gitignore
├── README.md
├── dpr
    ├── index.py
    ├── query.py
    └── requirements.txt
├── rocketqa
    ├── index.py
    ├── query.py
    └── requirements.txt
└── toy_data
    └── marco.tp.1k


/.github/retriever-reader-pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/example-odqa/a3c7c56dac886b8d9a2def68ffe71ec77a00f130/.github/retriever-reader-pipeline.png


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .cache/
2 | .idea/
3 | workspace_dpr/
4 | workspace_rocketqa/


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Build An Open Domain Question-Answering (ODQA) System with Jina
 2 | 
 3 | Check out the full story at [our blog site](https://jina.ai/blog/2021-11-29-odqa-part-1/).
 4 | 
 5 | ## Usage
 6 | 
 7 | In an ODQA system, a two-stage pipeline consisting of a `retriever` and a `reader` is a common choice. This example
 8 | offers two implementations:
 9 | 
10 | - Dense Passage Retrieval
11 | - RocketQA
12 | 
13 | ![retriever_reader_pipeline](.github/retriever-reader-pipeline.png)
14 | 
15 | For the `retriever` part, both implementations use dense-vector-based methods instead of term-based ones.
16 | In both cases, the `reader` implementation comes from DPR.
17 | 
18 | ### RocketQA
19 | 
20 | ```shell
21 | cd rocketqa
22 | python -m venv rocketqa-env
23 | source rocketqa-env/bin/activate
24 | pip install -r requirements.txt
25 | python index.py
26 | python query.py
27 | ```
28 | 
29 | <!--gif-->
30 | 
31 | ### Dense Passage Retrieval
32 | 
33 | ```shell
34 | cd dpr
35 | python -m venv dpr-env
36 | source dpr-env/bin/activate
37 | pip install -r requirements.txt
38 | python index.py
39 | python query.py
40 | ```
41 | 
42 | <!--gif-->
43 | 
44 | 


--------------------------------------------------------------------------------
/dpr/index.py:
--------------------------------------------------------------------------------
 1 | from jina import Document, Flow
 2 | 
 3 | 
 4 | def get_doc(fn):
 5 |     with open(fn, 'r') as fh:
 6 |         for idx, l in enumerate(fh):
 7 |             if idx >= 10:
 8 |                 break
 9 |             title, para = l.strip().split('\t')
10 |             doc = Document(text=para, tags={'title': title})
11 |             yield doc
12 | 
13 | 
14 | def main():
15 |     fn = '../toy_data/marco.tp.1k'
16 |     f = (Flow()
17 |          .add(uses='jinahub+docker://DPRTextEncoder',
18 |               volumes='.cache:/root/.cache/huggingface',
19 |               uses_with={
20 |                   'encoder_type': 'context',
21 |                   'traversal_paths': 'r',
22 |                   'pretrained_model_name_or_path': 'facebook/dpr-ctx_encoder-single-nq-base'})
23 |          .add(uses='jinahub://SimpleIndexer',
24 |               uses_metas={
25 |                   'workspace': 'workspace_dpr',
26 |                   'title_tag_key': 'title'}))
27 | 
28 |     with f:
29 |         f.post(on='/index', inputs=get_doc(fn))
30 | 
31 | 
32 | if __name__ == '__main__':
33 |     main()
34 | 


--------------------------------------------------------------------------------
/dpr/query.py:
--------------------------------------------------------------------------------
 1 | from jina import Flow, Document
 2 | 
 3 | 
 4 | def print_answers(resp):
 5 |     for d in resp.docs:
 6 |         for m in d.matches:
 7 |             score = m.scores['relevance_score'].value
 8 |             ans = m.text
 9 |             print(f'Answer (score: {score:.4f}): {ans}')
10 |             print('-'*20)
11 |         print('\n')
12 | 
13 | 
14 | if __name__ == '__main__':
15 |     f = (Flow()
16 |          .add(uses='jinahub+docker://DPRTextEncoder',
17 |               volumes='.cache:/root/.cache/huggingface',
18 |               uses_with={
19 |                   'encoder_type': 'question',
20 |                   'traversal_paths': 'r',
21 |                   'pretrained_model_name_or_path': 'facebook/dpr-question_encoder-single-nq-base'})
22 |          .add(uses='jinahub://SimpleIndexer',
23 |               uses_metas={'workspace': 'workspace_dpr'},
24 |               uses_with={'match_args': {'limit': 3}})
25 |          .add(uses='jinahub+docker://DPRReaderRanker/v0.3',
26 |               uses_with={'title_tag_key': 'title', 'num_spans_per_match': 1},
27 |               volumes='.cache:/root/.cache/huggingface'))
28 | 
29 |     with f:
30 |         while True:
31 |             q = input('Question?: ')
32 |             if not q:
33 |                 break
34 |             f.post(on='/search', inputs=Document(text=q), on_done=print_answers)
35 | 


--------------------------------------------------------------------------------
/dpr/requirements.txt:
--------------------------------------------------------------------------------
1 | jina==2.5.4
2 | scipy==1.6.1
3 | transformers==4.9.1
4 | torch==1.9.0
5 | 


--------------------------------------------------------------------------------
/rocketqa/index.py:
--------------------------------------------------------------------------------
 1 | from jina import Document, Flow
 2 | 
 3 | 
 4 | def get_doc(fn):
 5 |     with open(fn, 'r') as fh:
 6 |         for idx, l in enumerate(fh):
 7 |             if idx >= 10:
 8 |                 break
 9 |             title, para = l.strip().split('\t')
10 |             doc = Document(tags={'title': title, 'para': para})
11 |             yield doc
12 | 
13 | 
14 | def main():
15 |     fn = '../toy_data/marco.tp.1k'
16 |     f = (Flow()
17 |          .add(
18 |         uses='jinahub+docker://RocketQADualEncoder/latest',
19 |         volumes='.rocketqa:/root/.rocketqa',
20 |         uses_with={'use_cuda': False})
21 |          .add(
22 |         uses='jinahub://SimpleIndexer',
23 |         install_requirements=True,
24 |         uses_metas={'workspace': 'workspace_rocketqa'}))
25 | 
26 |     with f:
27 |         f.post(on='/index', inputs=get_doc(fn))
28 | 
29 | 
30 | if __name__ == '__main__':
31 |     main()
32 | 


--------------------------------------------------------------------------------
/rocketqa/query.py:
--------------------------------------------------------------------------------
 1 | from jina import Flow, Document
 2 | 
 3 | 
 4 | def print_answers(resp):
 5 |     for d in resp.docs:
 6 |         for m in d.matches:
 7 |             score = m.scores['relevance_score'].value
 8 |             ans = m.text
 9 |             title = m.tags['title']
10 |             para = m.tags['para']
11 |             print(f'Answer (score: {score:.4f}): {ans}')
12 |             print(f'Support: {title} [SEP] {para}')
13 |             print('-'*20)
14 |         print('\n')
15 | 
16 | 
17 | if __name__ == '__main__':
18 |     f = (Flow()
19 |          .add(uses='jinahub+docker://RocketQADualEncoder',
20 |               volumes='.rocketqa:/root/.rocketqa',
21 |               uses_with={'use_cuda': False})
22 |          .add(uses='jinahub://SimpleIndexer/v0.10',
23 |               uses_metas={'workspace': 'workspace_rocketqa'},
24 |               uses_with={'match_args': {'limit': 3}})
25 |          .add(uses='jinahub+docker://RocketQAReranker',
26 |               volumes='.rocketqa:/root/.rocketqa',
27 |               uses_with={'model': 'v1_marco_ce', 'use_cuda': False})
28 |          .add(uses='jinahub+docker://DPRReaderRanker/v0.3',
29 |               uses_with={'title_tag_key': 'title', 'num_spans_per_match': 1},
30 |               volumes='.cache:/root/.cache/huggingface'))
31 | 
32 |     with f:
33 |         while True:
34 |             q = input('Question?: ')
35 |             if not q:
36 |                 break
37 |             f.post(on='/search', inputs=Document(text=q), on_done=print_answers)
38 | 


--------------------------------------------------------------------------------
/rocketqa/requirements.txt:
--------------------------------------------------------------------------------
1 | paddlepaddle>=2.2.0
2 | rocketqa~=1.0.0
3 | jina==2.5.4
4 | scipy==1.6.1
5 | 


--------------------------------------------------------------------------------