├── Dockerfile
├── README.md
├── data
    ├── input
    │   ├── medium.txt
    │   └── medium_rules.pdf
    └── squad20
    │   └── answers.json
├── gunicorn.conf.py
├── main.py
├── requirements.txt
└── util-trainer.py


/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM continuumio/anaconda3
 2 | 
 3 | # Locales + curl/unzip. autoremove needs -y: a build has no TTY to answer its prompt.
 4 | RUN apt-get update \
 5 |  && apt-get install -y locales \
 6 |  && apt-get update \
 7 |  && dpkg-reconfigure -f noninteractive locales \
 8 |  && locale-gen C.UTF-8 \
 9 |  && /usr/sbin/update-locale LANG=C.UTF-8 \
10 |  && echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen \
11 |  && locale-gen \
12 |  && apt-get install -y curl unzip \
13 |  && apt-get clean \
14 |  && apt-get autoremove -y
15 | 
16 | 
17 | # Creating Application Source Code Directory
18 | RUN mkdir -p /usr/src/app
19 | 
20 | # Setting Home Directory for containers
21 | WORKDIR /usr/src/app
22 | 
23 | # System packages: refresh the index in the same layer as the install (no stale cache)
24 | RUN apt-get update && apt-get install -y poppler-utils
25 | 
26 | # Installing python dependencies (after apt, so editing requirements.txt keeps the apt layer cached)
27 | COPY requirements.txt /usr/src/app/
28 | RUN pip install --no-cache-dir -r requirements.txt
29 | 
30 | # Copying src code to Container
31 | COPY . /usr/src/app
32 | 
33 | # data/train_model is not listed in the repository tree, so create it before
34 | # changing permissions; chmod on a missing path fails and stops the build.
35 | # A single recursive chmod on /usr/src covers app/, data/ and every
36 | # sub-directory, so the per-directory chmod layers are not needed.
37 | RUN mkdir -p /usr/src/app/data/train_model \
38 |  && chmod -R 777 /usr/src
39 | 
40 | # Application Environment variables
41 | #ENV APP_ENV development
42 | ENV PORT=8777
43 | 
44 | # Exposing Ports
45 | EXPOSE $PORT
46 | 
47 | # Setting Persistent data
48 | VOLUME ["/app-data"]
49 | 
50 | # Shell form keeps $PORT expansion; exec replaces the shell so gunicorn gets SIGTERM directly
51 | CMD exec gunicorn -b :$PORT -c gunicorn.conf.py main:app
52 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # qna-api
 2 | A complete REST API for question answering over any unstructured document. The API is developed using the Haystack framework. The following features are supported:
 3 | 1) Upload documents to the Elasticsearch document store
 4 | 2) Question answering with pretrained and custom-trained models
 5 | 
 6 | # Installation
 7 | `docker build -t qna:v1 .`
 8 | 
 9 | `docker run --name qna_app -d -p 8777:8777 xxxxxxxxx`
10 | 
11 | #### Note: xxxxxxxxx is the image ID (or the `qna:v1` tag from the build step)
12 | 
13 | # Training
14 | ```python
15 | from haystack.reader.farm import FARMReader
16 | 
17 | #input directory of the labels answers.json file
18 | train_data = "/usr/src/app/data/squad20"
19 | # output directory of the model
20 | train_model = "/usr/src/app/data/train_model"
21 | 
22 | reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", use_gpu=False)
23 | 
24 | reader.train(
25 |     data_dir=train_data,
26 |     train_filename="answers.json",
27 |     n_epochs=10,
28 |     dev_split = 0,
29 |     save_dir=train_model)
30 | 
31 | print('Training successfully completed')
32 | ```
33 | # Testing
34 | ```python
35 | document_store = ElasticsearchDocumentStore(host=app.config["host"],
36 |                                                 port=app.config["port"],
37 |                                                 username=app.config["username"],
38 |                                                 password=app.config["password"],
39 |                                                 index='electrical')
40 | 
41 | retriever = DensePassageRetriever(document_store=document_store,
42 |                                   embedding_model="dpr-bert-base-nq",
43 |                                   do_lower_case=True, use_gpu=False)
44 | 
45 | # Farm reader using the trained model
46 | reader = FARMReader(model_name_or_path=app.config["train_model"] ,use_gpu=False)
47 | 
48 | finder = Finder(reader, retriever)
49 | 
50 | n = 5
51 | question="asked your query here?"
52 | prediction = finder.get_answers(question=question, top_k_retriever=10, top_k_reader=n)
53 | ```
54 | # Code Explanation
55 | A complete step-by-step explanation is provided [here](https://medium.com/analytics-vidhya/how-to-create-your-question-and-answering-flask-api-using-haystack-e97205a240d1)
56 | 
57 | 


--------------------------------------------------------------------------------
/data/input/medium.txt:
--------------------------------------------------------------------------------
  1 | Medium exists to share ideas and perspectives from the world’s most insightful writers, thinkers, and storytellers.
  2 | We welcome thoughtful and civil discussion from a broad spectrum of viewpoints. Nevertheless, to maintain a safe and welcoming environment for a wide range of people to engage in meaningful conversations, we prohibit certain conduct.
  3 | Each participant in our community is responsible for maintaining these standards.
  4 | In deciding whether someone has violated the rules, we will take into account things like newsworthiness, the context and nature of the posted information, the likelihood and severity of actual or potential harms, and applicable laws.
  5 | Violations of our rules may result in consequences such as account restrictions, limited distribution of your posts, and suspension of your account. We may limit the distribution of controversial or extreme content.
  6 | Threats of violence and incitement
  7 | We do not allow content or actions that threaten, encourage, or incite violence against anyone, directly or indirectly.
  8 | Hate speech
  9 | We do not allow content that promotes violence or hatred against people based on characteristics like race, ethnicity, national origin, religion, disability, disease, age, sexual orientation, gender, or gender identity.
 10 | We do not allow posts or accounts that glorify, celebrate, downplay, or trivialize violence, suffering, abuse, or deaths of individuals or groups. This includes the use of scientific or pseudoscientific claims to pathologize, dehumanize, or disempower others. We do not allow calls for intolerance, exclusion, or segregation based on protected characteristics, nor do we allow the glorification of groups which do any of the above.
 11 | We do not allow hateful text, images, symbols, or other content in your username, profile, or bio.
 12 | Harassment
 13 | Medium exists to share and discuss ideas. We don’t tolerate harassment, which includes:
 14 | Bullying, threatening, or shaming someone, or posting things likely to encourage others to do so
 15 | Using Medium features like responses, private notes, mentions, follows, story requests, or writer requests in a way intended to annoy or harass someone
 16 | Reviewing businesses or products in a gratuitously harmful or abusive manner
 17 | Privacy and Reputation
 18 | We do not allow the following:
 19 | Posting copies of private communications between private individuals without the explicit consent of all parties to the communication
 20 | Doxing, which includes not only private or obscure personal information but also the aggregation of publicly available information to target, shame, blackmail, harass, intimidate, threaten, or endanger
 21 | Confidential C
 22 | Posting intimate or explicit images taken or posted without the subject’s express consent
 23 | Content that violates others’ privacy, including sensitive or confidential information such as credit card numbers, social security numbers, non-public phone numbers, physical addresses, email addresses, or other non-public information
 24 | Restricted categories
 25 | We do not allow posts or accounts that engage in the following restricted categories of activity:
 26 | Promotion of harmful conspiracies
 27 | Facilitation of gambling or betting
 28 | Facilitation of buying or selling social media interactions, including off-platform
 29 | Facilitation of illegal sexual services
 30 | Facilitation of copyright violation
 31 | Related conduct
 32 | We do not allow posts or accounts that engage in on-platform, off-platform, or cross-platform campaigns of targeting, harassment, hate speech, violence, or disinformation. We may consider off-platform actions in assessing a Medium account, and restrict access or availability to that account.
 33 | Duplicate Content
 34 | We do not allow posting duplicate content, whether from a single account or across multiple accounts, either publicly or as an unlisted story.
 35 | Spam
 36 | We do not allow spam on Medium. All spam will be immediately removed from Medium, without notification. While it is hard to define this content with precision or completeness, here are some representative behaviors we look for that are characteristic of spam:
 37 | Posting content primarily to drive traffic to, or increase the search rankings of, an external site, product, or service
 38 | Scraping and reposting content from other sources for the primary purpose of generating revenue or other personal gains
 39 | Posting duplicate content, whether from a single account or across multiple accounts
 40 | Stories where the content is clipped with the sole purpose of linking to the rest of the article on a different website
 41 | Performing a disproportionately large number of interactions, particularly by automated means. This includes bulk, indiscriminate interactions, such as following of other accounts (follow spam) clapping, highlighting, or leaving notes
 42 | Repeatedly using responses or other interactions as a method of promotion
 43 | Participating in bounty campaigns or brigades to artificially inflate rankings for posts, accounts, businesses, or products
 44 | Use or re-use content templates with slight modifications across multiple posts and accounts
 45 | Confidential C
 46 | For each of these behaviors, when we talk about “content,” we mean not only posts but also any other feature that allows you to add your own text or media. When we talk about “interactions,” we mean any feature that allows one user to interact with another, or with a post.
 47 | Paid, automatic, bulk, or non-genuine interactions
 48 | Medium depends on various user behaviors — like follows and claps — to determine what content to feature and make the site work well for everyone. So, we don’t allow artificial behaviors that skew this system and as a result degrade or distort other users’ experiences. This includes:
 49 | Buying, selling, or trading in accounts or account interactions — including follows, claps, highlights, responses or other traffic
 50 | Using services, apps, or arrangements that offer you more follows, claps, or other interactions on your Medium account or content
 51 | Registering accounts, posting content, or interacting with users or content automatically, systematically, or programmatically
 52 | Copyright and trademark infringement
 53 | Respect the copyrights and trademarks of others. Unless you’re authorized to use someone else’s copyrighted work or trademark (either expressly or by legal exceptions and limitations like fair use), don’t do it. We respond to notices of alleged infringement as described in our Terms of Service, Copyright and DMCA Policy, and Trademark Policy.
 54 | Deceptive conduct
 55 | We do not allow deceptive conduct on Medium. This includes:
 56 | Posting content or impersonating a person or organization in a way likely to deceive people. Parody and satire are fine, but make clear that is what you’re doing. Our Username Policy has specific requirements for parody accounts
 57 | Using Medium for phishing or fraud. Don’t use tags, links, titles, or other metadata in a misleading way. Don’t link to or embed malicious or harmful code or software in your posts
 58 | Don’t use deception to generate revenue or traffic
 59 | If you have received compensation, free goods/services or anything of value in connection with the topic of a post, you must make this clear
 60 | Ads, Promotions, and Marketing
 61 | Third-party advertising and sponsorships are not allowed. You may not advertise or promote third-party products, services, or brands through Medium posts, publications, or letters. This includes images that indicate brand sponsorship in a post or letter, or as part of a publication name or logo.
 62 | Images functioning as third-party ads are not allowed. Inline images or embeds that link out and function as banner ads for third-party brands are not allowed.
 63 | You must disclose affiliate links or payment for a post. Affiliate links, such as link out to Amazon with your code, or any other link out where you will receive a commission or other value, are allowed in posts. But, you must disclose somewhere in the post that it includes affiliate links. If you have
 64 | Confidential C
 65 | received payment, goods or services, or something else of value in exchange for writing a post, you must still disclose this fact in writing within your post (as per FTC Rules and Guides).
 66 | Multi-level marketing content is not allowed.
 67 | Embedded Content
 68 | In ongoing compliance with the European GDPR, we are restricting the behavior of certain embedded content:
 69 | Embeds that directly collect data through form fields are not allowed. This includes embeds that facilitate the submission of email addresses and other personally identifying information through forms and the submission of credit card information. If you want to collect information from your readers, you will need to link out from your post on Medium to a form hosted elsewhere that makes it clear to a user that they are no longer in the Medium network
 70 | Cryptocurrency Accounts, Posts, and Publications
 71 | Posts and accounts that focus on cryptocurrencies which do not meet the following requirements may be considered spam and are subject to suspension, depending on context. Medium does not endorse or verify any coin, token, or similar announcement.
 72 | To prevent fraud and abuse, you must:
 73 | Include a link to your active project domain in your user account bio.
 74 | Use an email address from that domain as your verified Medium account email, and maintain that email account actively.
 75 | Link at least one consistently-branded social media account (Facebook or Twitter) to your Medium account. That social account should also link out prominently to the same domain as is included in your Medium user bio.
 76 | Include a prominent about page with up-to-date contact information on your project website.
 77 | You may not:
 78 | Use an anonymous email address which is not linked to your project domain. (e.g., gmail, protonmail, mail.ru, etc.)
 79 | Advertise or participate in bounty campaigns, pump and dumps, reviewing for reward, or other forms of brigading or inauthentic activity.
 80 | Include shortened URLs in your posts.
 81 | Graphic content
 82 | We do not allow pornographic images or videos. We do allow erotic writing and non-graphic erotic images.
 83 | We do not allow gratuitously graphic or disturbing media, even if it’s not pornographic.
 84 | Exploitation of minors
 85 | We do not allow content promoting the sexual or violent exploitation of minors, including the sexualization of fictional minors.
 86 | Confidential C
 87 | Promotion and glorification of self-harm
 88 | We do not allow activities that encourage, promote or glorify acts of self-harm, such as cutting, eating disorders like anorexia or bulimia, and suicide. If you encounter users contemplating or threatening self-harm, please report it to us via the form or email address linked below.
 89 | How to report a violation
 90 | If you find a post or account on Medium that violates these rules, please flag it. You can use this form to provide more detail or to report other conduct you believe violates our rules. Additionally, you can send us an email to trust@medium.com.
 91 | If you break the rules
 92 | We strive to be fair, but we reserve the right to suspend accounts or remove content, without notice, for any reason, particularly to protect our services, infrastructure, users, or community. If you attempt to evade suspension by creating new accounts or posts, we will suspend your new accounts and posts.
 93 | Notice
 94 | Upon investigating or disabling content associated with your account, we will notify you, unless we believe your account is automated or operating in bad faith, or that notifying you is likely to cause, maintain or exacerbate harm to someone.
 95 | Appeals
 96 | If you believe your content or account has been restricted or disabled in error, or believe there is relevant context we were not aware of in reaching our determination, you can write to us at yourfriends@medium.com. We will consider all good faith efforts to appeal.
 97 | Government Takedown Requests
 98 | If Medium receives a request from a government actor to restrict access to content associated with your account, we will notify you unless we are prohibited by law or believe doing so may endanger others. Where applicable, we will work to limit legally-ordered content restrictions to jurisdictions where we have a good faith belief that we are legally required to restrict the content. Medium submits to the Lumen database government requests to restrict access to content (redacted where appropriate to protect privacy or prevent harm to a person).
 99 | We may enforce, or not enforce, these policies at our sole discretion. These policies don’t create a duty or contractual obligation for us to act.
100 | We also may change these rules at any time. We track changes to our rules on Github so you can see how they evolve.
101 | Medium is committed to providing a transparent, open platform for expression and therefore supports the goals and spirit of The Santa Clara Principles on Transparency and Accountability in Content Moderation as a starting point for further discussion.
102 | Member Content Guidelines
103 | To be eligible for inclusion and potential monetization from member-only content, user posts must meet all Medium rules as well as the following content requirements. We’ve set these additional
104 | Confidential C
105 | guidelines for member-only stories to create a fair environment for writers in our Partner Program and a richer reading experience for members.
106 | You may not sell advertising in or on your member-only content.
107 | Member stories should not exist with the predominant purpose of driving traffic to an external website, business, or product. All included content must be relevant, substantial and informative.
108 | If you have received compensation, free goods/services, or anything of value in connection with the topic of a post, you must make this clear.
109 | Do not ask for claps or include other calls to action, including gifs of clapping, requests for donations, or other links or embeds for the purpose of capturing user information or soliciting money.
110 | Do not use content or images licensed as non-commercial in your Partner Program posts.
111 | If you violate these Member content rules
112 | Medium reserves the right to review posts or accounts at any time and stop payment for any post or to any account, and/or remove from the Member section content that violates these Member Content Guidelines, or Medium’s general rules. In deciding whether something has violated the Rules and Guidelines, we will take into account things like the context, newsworthiness, and nature of the posted information and applicable privacy laws. Repeated violations of the Rules and Requirements will result in permanent ineligibility to participate in the program.
113 | How to report a Member post violation
114 | If you find a member post or account on Medium that violates these rules, please flag it. You can use this form to provide more detail or to report other conduct you believe violates our rules. Additionally, you can send us an email to yourfriends@medium.com.
115 | If you have questions or feedback about these rules, let us know. You can contact us at trust@medium.com.
116 | Updated November 2019


--------------------------------------------------------------------------------
/data/input/medium_rules.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kohimax/qna-api/94284b898352e86255b63d235982fef89f4c8279/data/input/medium_rules.pdf


--------------------------------------------------------------------------------
/data/squad20/answers.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "data": [
 3 |     {
 4 |       "paragraphs": [
 5 |         {
 6 |           "qas": [
 7 |             {
 8 |               "question": "Is Third party advertising allow?",
 9 |               "id": 61727,
10 |               "answers": [
11 |                 {
12 |                   "answer_id": 66399,
13 |                   "document_id": 67444,
14 |                   "question_id": 61727,
15 |                   "text": "Third-party advertising and sponsorships are not allowed",
16 |                   "answer_start": 7120,
17 |                   "answer_category": null
18 |                 }
19 |               ],
20 |               "is_impossible": false
21 |             }
22 |           ],
23 |           "context": "Medium exists to share ideas and perspectives from the world’s most insightful writers, thinkers, and storytellers.\nWe welcome thoughtful and civil discussion from a broad spectrum of viewpoints. Nevertheless, to maintain a safe and welcoming environment for a wide range of people to engage in meaningful conversations, we prohibit certain conduct.\nEach participant in our community is responsible for maintaining these standards.\nIn deciding whether someone has violated the rules, we will take into account things like newsworthiness, the context and nature of the posted information, the likelihood and severity of actual or potential harms, and applicable laws.\nViolations of our rules may result in consequences such as account restrictions, limited distribution of your posts, and suspension of your account. We may limit the distribution of controversial or extreme content.\nThreats of violence and incitement\nWe do not allow content or actions that threaten, encourage, or incite violence against anyone, directly or indirectly.\nHate speech\nWe do not allow content that promotes violence or hatred against people based on characteristics like race, ethnicity, national origin, religion, disability, disease, age, sexual orientation, gender, or gender identity.\nWe do not allow posts or accounts that glorify, celebrate, downplay, or trivialize violence, suffering, abuse, or deaths of individuals or groups. This includes the use of scientific or pseudoscientific claims to pathologize, dehumanize, or disempower others. We do not allow calls for intolerance, exclusion, or segregation based on protected characteristics, nor do we allow the glorification of groups which do any of the above.\nWe do not allow hateful text, images, symbols, or other content in your username, profile, or bio.\nHarassment\nMedium exists to share and discuss ideas. 
We don’t tolerate harassment, which includes:\nBullying, threatening, or shaming someone, or posting things likely to encourage others to do so\nUsing Medium features like responses, private notes, mentions, follows, story requests, or writer requests in a way intended to annoy or harass someone\nReviewing businesses or products in a gratuitously harmful or abusive manner\nPrivacy and Reputation\nWe do not allow the following:\nPosting copies of private communications between private individuals without the explicit consent of all parties to the communication\nDoxing, which includes not only private or obscure personal information but also the aggregation of publicly available information to target, shame, blackmail, harass, intimidate, threaten, or endanger\nConfidential C\nPosting intimate or explicit images taken or posted without the subject’s express consent\nContent that violates others’ privacy, including sensitive or confidential information such as credit card numbers, social security numbers, non-public phone numbers, physical addresses, email addresses, or other non-public information\nRestricted categories\nWe do not allow posts or accounts that engage in the following restricted categories of activity:\nPromotion of harmful conspiracies\nFacilitation of gambling or betting\nFacilitation of buying or selling social media interactions, including off-platform\nFacilitation of illegal sexual services\nFacilitation of copyright violation\nRelated conduct\nWe do not allow posts or accounts that engage in on-platform, off-platform, or cross-platform campaigns of targeting, harassment, hate speech, violence, or disinformation. We may consider off-platform actions in assessing a Medium account, and restrict access or availability to that account.\nDuplicate Content\nWe do not allow posting duplicate content, whether from a single account or across multiple accounts, either publicly or as an unlisted story.\nSpam\nWe do not allow spam on Medium. 
All spam will be immediately removed from Medium, without notification. While it is hard to define this content with precision or completeness, here are some representative behaviors we look for that are characteristic of spam:\nPosting content primarily to drive traffic to, or increase the search rankings of, an external site, product, or service\nScraping and reposting content from other sources for the primary purpose of generating revenue or other personal gains\nPosting duplicate content, whether from a single account or across multiple accounts\nStories where the content is clipped with the sole purpose of linking to the rest of the article on a different website\nPerforming a disproportionately large number of interactions, particularly by automated means. This includes bulk, indiscriminate interactions, such as following of other accounts (follow spam) clapping, highlighting, or leaving notes\nRepeatedly using responses or other interactions as a method of promotion\nParticipating in bounty campaigns or brigades to artificially inflate rankings for posts, accounts, businesses, or products\nUse or re-use content templates with slight modifications across multiple posts and accounts\nConfidential C\nFor each of these behaviors, when we talk about “content,” we mean not only posts but also any other feature that allows you to add your own text or media. When we talk about “interactions,” we mean any feature that allows one user to interact with another, or with a post.\nPaid, automatic, bulk, or non-genuine interactions\nMedium depends on various user behaviors — like follows and claps — to determine what content to feature and make the site work well for everyone. So, we don’t allow artificial behaviors that skew this system and as a result degrade or distort other users’ experiences. 
This includes:\nBuying, selling, or trading in accounts or account interactions — including follows, claps, highlights, responses or other traffic\nUsing services, apps, or arrangements that offer you more follows, claps, or other interactions on your Medium account or content\nRegistering accounts, posting content, or interacting with users or content automatically, systematically, or programmatically\nCopyright and trademark infringement\nRespect the copyrights and trademarks of others. Unless you’re authorized to use someone else’s copyrighted work or trademark (either expressly or by legal exceptions and limitations like fair use), don’t do it. We respond to notices of alleged infringement as described in our Terms of Service, Copyright and DMCA Policy, and Trademark Policy.\nDeceptive conduct\nWe do not allow deceptive conduct on Medium. This includes:\nPosting content or impersonating a person or organization in a way likely to deceive people. Parody and satire are fine, but make clear that is what you’re doing. Our Username Policy has specific requirements for parody accounts\nUsing Medium for phishing or fraud. Don’t use tags, links, titles, or other metadata in a misleading way. Don’t link to or embed malicious or harmful code or software in your posts\nDon’t use deception to generate revenue or traffic\nIf you have received compensation, free goods/services or anything of value in connection with the topic of a post, you must make this clear\nAds, Promotions, and Marketing\nThird-party advertising and sponsorships are not allowed. You may not advertise or promote third-party products, services, or brands through Medium posts, publications, or letters. This includes images that indicate brand sponsorship in a post or letter, or as part of a publication name or logo.\nImages functioning as third-party ads are not allowed. 
Inline images or embeds that link out and function as banner ads for third-party brands are not allowed.\nYou must disclose affiliate links or payment for a post. Affiliate links, such as link out to Amazon with your code, or any other link out where you will receive a commission or other value, are allowed in posts. But, you must disclose somewhere in the post that it includes affiliate links. If you have\nConfidential C\nreceived payment, goods or services, or something else of value in exchange for writing a post, you must still disclose this fact in writing within your post (as per FTC Rules and Guides).\nMulti-level marketing content is not allowed.\nEmbedded Content\nIn ongoing compliance with the European GDPR, we are restricting the behavior of certain embedded content:\nEmbeds that directly collect data through form fields are not allowed. This includes embeds that facilitate the submission of email addresses and other personally identifying information through forms and the submission of credit card information. If you want to collect information from your readers, you will need to link out from your post on Medium to a form hosted elsewhere that makes it clear to a user that they are no longer in the Medium network\nCryptocurrency Accounts, Posts, and Publications\nPosts and accounts that focus on cryptocurrencies which do not meet the following requirements may be considered spam and are subject to suspension, depending on context. Medium does not endorse or verify any coin, token, or similar announcement.\nTo prevent fraud and abuse, you must:\nInclude a link to your active project domain in your user account bio.\nUse an email address from that domain as your verified Medium account email, and maintain that email account actively.\nLink at least one consistently-branded social media account (Facebook or Twitter) to your Medium account. 
That social account should also link out prominently to the same domain as is included in your Medium user bio.\nInclude a prominent about page with up-to-date contact information on your project website.\nYou may not:\nUse an anonymous email address which is not linked to your project domain. (e.g., gmail, protonmail, mail.ru, etc.)\nAdvertise or participate in bounty campaigns, pump and dumps, reviewing for reward, or other forms of brigading or inauthentic activity.\nInclude shortened URLs in your posts.\nGraphic content\nWe do not allow pornographic images or videos. We do allow erotic writing and non-graphic erotic images.\nWe do not allow gratuitously graphic or disturbing media, even if it’s not pornographic.\nExploitation of minors\nWe do not allow content promoting the sexual or violent exploitation of minors, including the sexualization of fictional minors.\nConfidential C\nPromotion and glorification of self-harm\nWe do not allow activities that encourage, promote or glorify acts of self-harm, such as cutting, eating disorders like anorexia or bulimia, and suicide. If you encounter users contemplating or threatening self-harm, please report it to us via the form or email address linked below.\nHow to report a violation\nIf you find a post or account on Medium that violates these rules, please flag it. You can use this form to provide more detail or to report other conduct you believe violates our rules. Additionally, you can send us an email to trust@medium.com.\nIf you break the rules\nWe strive to be fair, but we reserve the right to suspend accounts or remove content, without notice, for any reason, particularly to protect our services, infrastructure, users, or community. 
If you attempt to evade suspension by creating new accounts or posts, we will suspend your new accounts and posts.\nNotice\nUpon investigating or disabling content associated with your account, we will notify you, unless we believe your account is automated or operating in bad faith, or that notifying you is likely to cause, maintain or exacerbate harm to someone.\nAppeals\nIf you believe your content or account has been restricted or disabled in error, or believe there is relevant context we were not aware of in reaching our determination, you can write to us at yourfriends@medium.com. We will consider all good faith efforts to appeal.\nGovernment Takedown Requests\nIf Medium receives a request from a government actor to restrict access to content associated with your account, we will notify you unless we are prohibited by law or believe doing so may endanger others. Where applicable, we will work to limit legally-ordered content restrictions to jurisdictions where we have a good faith belief that we are legally required to restrict the content. Medium submits to the Lumen database government requests to restrict access to content (redacted where appropriate to protect privacy or prevent harm to a person).\nWe may enforce, or not enforce, these policies at our sole discretion. These policies don’t create a duty or contractual obligation for us to act.\nWe also may change these rules at any time. We track changes to our rules on Github so you can see how they evolve.\nMedium is committed to providing a transparent, open platform for expression and therefore supports the goals and spirit of The Santa Clara Principles on Transparency and Accountability in Content Moderation as a starting point for further discussion.\nMember Content Guidelines\nTo be eligible for inclusion and potential monetization from member-only content, user posts must meet all Medium rules as well as the following content requirements. 
We’ve set these additional\nConfidential C\nguidelines for member-only stories to create a fair environment for writers in our Partner Program and a richer reading experience for members.\nYou may not sell advertising in or on your member-only content.\nMember stories should not exist with the predominant purpose of driving traffic to an external website, business, or product. All included content must be relevant, substantial and informative.\nIf you have received compensation, free goods/services, or anything of value in connection with the topic of a post, you must make this clear.\nDo not ask for claps or include other calls to action, including gifs of clapping, requests for donations, or other links or embeds for the purpose of capturing user information or soliciting money.\nDo not use content or images licensed as non-commercial in your Partner Program posts.\nIf you violate these Member content rules\nMedium reserves the right to review posts or accounts at any time and stop payment for any post or to any account, and/or remove from the Member section content that violates these Member Content Guidelines, or Medium’s general rules. In deciding whether something has violated the Rules and Guidelines, we will take into account things like the context, newsworthiness, and nature of the posted information and applicable privacy laws. Repeated violations of the Rules and Requirements will result in permanent ineligibility to participate in the program.\nHow to report a Member post violation\nIf you find a member post or account on Medium that violates these rules, please flag it. You can use this form to provide more detail or to report other conduct you believe violates our rules. Additionally, you can send us an email to yourfriends@medium.com.\nIf you have questions or feedback about these rules, let us know. You can contact us at trust@medium.com.\nUpdated November 2019",
24 |           "document_id": 67444
25 |         }
26 |       ]
27 |     }
28 |   ]
29 | }


--------------------------------------------------------------------------------
/gunicorn.conf.py:
--------------------------------------------------------------------------------
 1 | # Gunicorn settings for this application; the Dockerfile CMD loads this file with "-c gunicorn.conf.py".
 2 | 
 3 | # Summary: threaded ('gthread') workers, a fixed number of worker processes, and a per-worker thread count.
 4 | 
 5 | 
 6 | import multiprocessing  # only referenced by the commented-out cpu_count() option below
 7 | 
 8 | # Use threaded workers. On Python 2 thread-based concurrency is provided via
 9 | # the 'futures' package. 'gevent' or other workers would be candidates, except
10 | # that the ndb library has its own concurrency model that conflicts with gevent
11 | # and possibly with similar approaches. NOTE(review): confirm ndb is still relevant here.
12 | worker_class = 'gthread'
13 | 
14 | # Number of worker processes. The commented-out cpu_count() line would use one
15 | # worker per CPU core; a smaller number reduces memory consumption, but also
16 | # reduces the app's ability to utilize all available CPU resources.
17 | #workers = multiprocessing.cpu_count()
18 | #workers = 8 # good
19 | workers = 2 # good
20 | 
21 | # Number of threads per worker. This dictates the
22 | # maximum number of requests handled concurrently by EACH worker.
23 | threads = 25
24 | 
25 | # "bind" is not set in this file: the Dockerfile's CMD directive passes it on
26 | # the gunicorn command line (-b :$PORT).
27 | 
28 | # Store the process ID of gunicorn.  Used for testing.
29 | pidfile = 'gunicorn_pid.txt'
30 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | # import packages
  2 | import json
  3 | import os
  4 | import logging
  5 | from flask_cors import CORS
  6 | from flask import Flask, request, jsonify
  7 | from haystack import Finder
  8 | from haystack.preprocessor.cleaning import clean_wiki_text
  9 | from haystack.preprocessor.utils import convert_files_to_dicts
 10 | from haystack.reader.farm import FARMReader
 11 | from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
 12 | from haystack.file_converter.pdf import PDFToTextConverter
 13 | from haystack.retriever.dense import DensePassageRetriever
 14 | from haystack.retriever.sparse import ElasticsearchRetriever
 15 | 
 16 | #application settings
 17 | app = Flask(__name__)
 18 | CORS(app)
 19 | 
 20 | # Application directory for inputs and training
 21 | app.config["input"] = "/usr/src/app/data/input"
 22 | app.config["train_model"] = "/usr/src/app/data/train_model"
 23 | app.config["squad_data"] = "/usr/src/app/data/squad20"
 24 | 
 25 | # ElasticSearch server host information
 26 | app.config["host"] = "0.0.0.0"
 27 | app.config["username"] = ""
 28 | app.config["password"] = ""
 29 | app.config["port"] = "9200"
 30 | 
 31 | @app.route('/')
 32 | def home():
 33 |     """Root endpoint: reply with a fixed plain-text banner saying the API is running."""
 34 |     return 'Hello QNA API is running'
 35 | 
 36 | #endpoint to update embedded method
 37 | @app.route('/set_embeded', methods=['POST'])
 38 | def set_embeded():
 39 |     """Return a friendly HTTP greeting."""
 40 |     index = request.form['index']
 41 |     document_store = ElasticsearchDocumentStore(host=app.config["host"],
 42 |                                                 port=app.config["port"],
 43 |                                                 username=app.config["username"],
 44 |                                                 password=app.config["password"],
 45 |                                                 index=index,embedding_field="embedding",
 46 |                                                 embedding_dim=768)
 47 |     retriever = DensePassageRetriever(document_store=document_store,
 48 |                                       embedding_model="dpr-bert-base-nq",
 49 |                                       do_lower_case=True, use_gpu=False)
 50 |     #Now update the retriever embedded to the elasticsearch document
 51 |     document_store.update_embeddings(retriever)
 52 |     return json.dumps({'status':'Susccess','message': 'Sucessfully embeded method updated in ElasticSearch Document', 'result': []})
 53 | 
 54 | @app.route('/update_document', methods=['POST'])
 55 | def update_document():
 56 |     """Return a the url of the index document."""
 57 |     if request.files:
 58 |         # index is the target document where queries need to sent.
 59 |         index = request.form['index']
 60 |         # uploaded document for target source
 61 |         doc = request.files["doc"]
 62 | 
 63 |         file_path = os.path.join(app.config["input"], doc.filename)
 64 | 
 65 |         # saving the file to the input directory
 66 |         doc.save(file_path)
 67 |         #initialization of the Haystack Elasticsearch document storage
 68 |         document_store = ElasticsearchDocumentStore(host=app.config["host"],
 69 |                                                     port=app.config["port"],
 70 |                                                     username=app.config["username"],
 71 |                                                     password=app.config["password"],
 72 |                                                     index=index)
 73 |         # convert the pdf files into dictionary and update to ElasticSearch Document
 74 |         dicts = convert_files_to_dicts(
 75 |             app.config["input"],
 76 |             clean_func=clean_wiki_text,
 77 |             split_paragraphs=False)
 78 |         document_store.write_documents(dicts)
 79 |         os.remove(file_path)
 80 |         return json.dumps(
 81 |             {'status':'Susccess','message':
 82 |                 'document available at http://'+ app.config["host"] +':'
 83 |                 + app.config["port"] +'/' + index + '/_search',
 84 |                 'result': []})
 85 |     else:
 86 |         return json.dumps({'status':'Failed','message': 'No file uploaded', 'result': []})
 87 | 
 88 | 
 89 | @app.route('/qna', methods=['POST'])
 90 | def qna():
 91 |     """Return the n answers."""
 92 | 
 93 |     question = request.form['question']
 94 |     # index is the target document where queries need to sent.
 95 |     index = request.form['index']
 96 | 
 97 |     # to select train or untrained model
 98 |     mode = request.form['mode']
 99 | 
100 |     #initialization of the Haystack Elasticsearch document storage
101 |     document_store = ElasticsearchDocumentStore(
102 |         host=app.config["host"],
103 |         username=app.config["username"],
104 |         password=app.config["password"],
105 |         index=index)
106 | 
107 |     if mode == 'trained':
108 |         # base on the search mode train_model
109 |         reader = FARMReader(model_name_or_path=app.config["train_model"] ,
110 |                             use_gpu=False)
111 |     else:
112 |         # base on the search mode pre_train
113 |         reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad",
114 |                             use_gpu=False)
115 | 
116 |     #initialization of ElasticRetriever
117 |     retriever = ElasticsearchRetriever(document_store= document_store)
118 |     # Finder sticks together reader and retriever
119 |     # in a pipeline to answer our actual questions.
120 |     finder = Finder(reader, retriever)
121 | 
122 |     # predict n answers
123 |     n = int(request.form['n'])
124 |     prediction = finder.get_answers(question=question, top_k_retriever=10, top_k_reader=n)
125 |     answer = []
126 |     for res in prediction['answers']:
127 |         answer.append(res['answer'])
128 | 
129 |     return json.dumps({'status':'success','message': 'Process succesfully', 'result': answer})
130 | 
131 | 
132 | @app.route('/qna_pretrain', methods=['POST'])
133 | def qna_pretrain():
134 |     """Return the n answers."""
135 | 
136 |     question = request.form['question']
137 |     # index is the target document where queries need to sent.
138 |     index = request.form['index']
139 | 
140 |     #initialization of the Haystack Elasticsearch document storage
141 |     document_store = ElasticsearchDocumentStore(
142 |         host=app.config["host"],
143 |         username=app.config["username"],
144 |         password=app.config["password"],
145 |         index=index)
146 | 
147 |     # using pretrain model
148 |     reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad",
149 |                             use_gpu=False)
150 | 
151 |     #initialization of ElasticRetriever
152 |     retriever = ElasticsearchRetriever(document_store= document_store)
153 |     # Finder sticks together reader and retriever
154 |     # in a pipeline to answer our actual questions.
155 |     finder = Finder(reader, retriever)
156 | 
157 |     # predict n answers
158 |     n = int(request.form['n'])
159 |     prediction = finder.get_answers(question=question, top_k_retriever=10, top_k_reader=n)
160 |     answer = []
161 |     for res in prediction['answers']:
162 |         answer.append(res['answer'])
163 | 
164 |     return json.dumps({'status':'success','message': 'Process succesfully', 'result': answer})
165 | 
166 | 
167 | @app.errorhandler(500)
168 | def server_error(e):
169 |     logging.exception('An error occurred during a request.')
170 |     return json.dumps({'status':'failed','message':
171 |         """An internal error occurred: <pre>{}</pre>See logs for full stacktrace.""".format(e),
172 |                        'result': []})
173 | 
174 | if __name__ == '__main__':
175 |     app.run(host='0.0.0.0', port=8777)
176 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Flask
2 | Flask-Cors
3 | gunicorn
4 | futures; python_version < "3"
5 | farm-haystack


--------------------------------------------------------------------------------
/util-trainer.py:
--------------------------------------------------------------------------------
 1 | from haystack.reader.farm import FARMReader
 2 | 
 3 | #input directory of the labels answers.json file
 4 | train_data = "/usr/src/app/data/squad20"
 5 | # output directory of the model
 6 | train_model = "/usr/src/app/data/train_model"
 7 | 
 8 | reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", use_gpu=False)
 9 | 
10 | reader.train(
11 |     data_dir=train_data,
12 |     train_filename="answers.json",
13 |     n_epochs=20,
14 |     dev_split = 0,
15 |     save_dir=train_model)
16 | 
17 | print('Training successfully completed')


--------------------------------------------------------------------------------