├── .gitignore ├── pyproject.toml ├── images ├── main.png └── similarity.png ├── requirements.txt ├── .github └── dependabot.yml ├── .env.dist ├── main.py ├── README.md ├── LICENSE ├── Templates ├── report.html └── index.html ├── similarity.py └── websearch.py /.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | __pycache__ 3 | .env 4 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff.lint] 2 | select = ["ALL"] 3 | -------------------------------------------------------------------------------- /images/main.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harirakul/Plagiarism-Detection/HEAD/images/main.png -------------------------------------------------------------------------------- /images/similarity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harirakul/Plagiarism-Detection/HEAD/images/similarity.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.13.3 2 | Flask==3.1.0 3 | nltk==3.9.1 4 | pandas==2.2.3 5 | requests==2.32.3 6 | loguru==0.7.3 7 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | directory: "/" 5 | schedule: 6 | interval: "daily" 7 | -------------------------------------------------------------------------------- /.env.dist: -------------------------------------------------------------------------------- 1 | # Search system settings 2 | 3 | # 
# https://www.google.com/search?q=
# https://ya.ru/search/?text=
# https://www.bing.com/search?q=
# https://duckduckgo.com/?q=
#
# SEARCH_LINK=https://ya.ru/search/?text=
# USER_AGENT=Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0
# COOKIE=
# BLACK_LIST=
#
#
# # Flask server settings
#
# DEBUG=True
# HOST=localhost
# PORT=5555
# --------------------------------------------------------------------------------
# /main.py:
# --------------------------------------------------------------------------------
"""Main module: starting app.

Defines the Flask application with two routes: the input form (``/``) and the
similarity report (``/report``).  When run as a script it loads settings from
``.env`` and starts the development server.
"""

import os

# NOTE: ``dotenv`` is provided by the ``python-dotenv`` package, which must be
# installed alongside the packages listed in requirements.txt.
from dotenv import load_dotenv
from flask import Flask, render_template, request

import similarity

app = Flask(__name__, template_folder="Templates")


@app.route("/", methods=["GET", "POST"])
def main_page() -> str:
    """Render and return the main page containing the text input form."""
    return render_template("index.html")


@app.route("/report", methods=["POST", "GET"])
def report_page() -> str:
    """Render and return the report page for the submitted text.

    The text is read from the ``text`` form field.  When the field is missing
    (for example a plain GET request) or blank, the input form is rendered
    instead of failing with a 400 error or running a search for nothing.

    Returns:
        The rendered report template followed by the HTML similarity table, or
        the rendered input form when no text was submitted.
    """
    text = request.form.get("text", "")
    if not text.strip():
        return render_template("index.html")

    matches = similarity.report(text)
    return render_template("report.html") + similarity.return_table(matches)


if __name__ == "__main__":
    # Loading consts from .env
    load_dotenv()

    # Default to "false" so a missing DEBUG variable does not crash on
    # ``None.lower()``.
    IS_DEBUG = os.getenv("DEBUG", "false").strip().lower() == "true"
    # HOST and PORT may be None; they are passed through to ``app.run`` as-is.
    HOST = os.getenv("HOST")
    PORT = os.getenv("PORT")

    # Starting flask app
    app.run(debug=IS_DEBUG, host=HOST, port=PORT)
# --------------------------------------------------------------------------------
# /README.md:
# --------------------------------------------------------------------------------
# # Plagiarism Detection with Python
#
# ![main page preview](images/main.png)
#
# ![similarity page preview](images/similarity.png)
#
# Finds percentage similarity between inputted text and relevant articles on the web.
#
# ## How to install?
10 | 11 | 1. Clone repository: 12 | 13 | ``` 14 | git clone https://github.com/harirakul/Plagiarism-Detection.git 15 | ``` 16 | 17 | 2. Create and activate venv: 18 | 19 | ``` 20 | python3 -m venv venv 21 | . venv/bin/activate 22 | ``` 23 | 24 | 3. Install requirements: 25 | 26 | ``` 27 | pip install -r requirements.txt 28 | ``` 29 | 30 | 4. Rename `.env.dist` to `.env`. 31 | 32 | > For best results, fill in `COOKIE` in `.env`. You can copy the value from your browser's developer tools. 33 | 34 | 5. Start web-app: 35 | 36 | ``` 37 | python3 main.py 38 | ``` 39 | 40 | ## Website Link: 41 | http://plagiarism-detection--hariambethkar.repl.co/ 42 | 43 | ## How To: 44 | - Click on the link to the website 45 | - Enter or paste your text in the input field 46 | - Click on `Generate Report` 47 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 harirakul 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Templates/report.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Similarity Report 8 | 51 | 52 | 53 | 54 |

Similarity Report

# (end of /Templates/report.html)
# --------------------------------------------------------------------------------
# /similarity.py:
# --------------------------------------------------------------------------------
"""Module for checking text.

Finds web pages related to a text and scores how similar each page is to it.
"""

from difflib import SequenceMatcher

import nltk
import pandas as pd
from loguru import logger

import websearch

nltk.download("stopwords")
nltk.download("punkt")
stop_words = set(nltk.corpus.stopwords.words("english"))


def purify_text(string: str) -> str:
    """Return ``string`` with English stop words removed.

    The text is tokenized with NLTK and the remaining tokens are joined with
    single spaces.  The stop-word check is case-insensitive so capitalized
    words such as "The" at the start of a sentence are removed as well.
    """
    words = nltk.word_tokenize(string)
    return " ".join(word for word in words if word.lower() not in stop_words)


def web_verify(string: str, results_per_sentence: int) -> list:
    """Search the web for the whole text and for each of its sentences.

    Args:
        string: Text to look up.
        results_per_sentence: Maximum number of URLs requested per query.

    Returns:
        The unique result URLs, in the order they were first found.
    """
    # The whole text is queried first, then every sentence.  dict.fromkeys
    # drops repeated queries (a single-sentence text would otherwise be
    # searched twice) while keeping the order.
    queries = dict.fromkeys([string, *nltk.sent_tokenize(string)])

    matching_sites = {}
    for query in queries:
        found = websearch.search(query=query, num=results_per_sentence)
        matching_sites.update(dict.fromkeys(found))

    return list(matching_sites)


def similarity(str1: str, str2: str) -> float:
    """Calculate the similarity of two strings as a percentage (0-100)."""
    return SequenceMatcher(None, str1, str2).ratio() * 100


def report(text: str) -> dict:
    """Build the similarity report for ``text``.

    Searches the web for the purified text (two results per query), downloads
    every matching page and compares its text with the original ``text``.
    Pages that cannot be downloaded are logged and skipped so that one
    unreachable site does not abort the whole report.

    Returns:
        A dict mapping URL to similarity percentage, ordered from the most to
        the least similar page.
    """
    matches = {}

    for site in web_verify(purify_text(text), 2):
        try:
            page_text = websearch.extract_text(site)
        except OSError as error:
            # requests' RequestException derives from IOError (OSError), so
            # this covers connection errors and timeouts.
            logger.warning("Could not fetch {}: {}", site, error)
            continue
        matches[site] = similarity(text, page_text)

    return dict(sorted(matches.items(), key=lambda item: item[1], reverse=True))


def return_table(dictionary: dict) -> str:
    """Return the report as an HTML table with a "Similarity (%)" column."""
    search_result = pd.DataFrame({"Similarity (%)": dictionary})
    return search_result.to_html()
# --------------------------------------------------------------------------------
# /websearch.py:
# --------------------------------------------------------------------------------
"""Search module.

Queries a web search engine for a text and downloads the text of result pages.
Settings are read from the environment: ``SEARCH_LINK``, ``USER_AGENT``,
``COOKIE`` and ``BLACK_LIST``.
"""

import os
import warnings
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
from loguru import logger

warnings.filterwarnings("ignore", module="bs4")

# Search link used when SEARCH_LINK is not set (same value as in .env.dist).
_DEFAULT_SEARCH_LINK = "https://ya.ru/search/?text="

# Service pages of the search engine itself; never reported as results.
_DEFAULT_BLACK_LIST = (
    # Yandex black list
    "https://passport.yandex.ru/",
    "https://yandexwebcache.net/",
    "https://yandex.ru/support/",
    "https://cloud.yandex.ru/",
    "https://yandex.ru/",
    "https://www.ya.ru",
)


def _user_black_list() -> list[str]:
    """Return the non-empty, comma-separated entries of ``BLACK_LIST``.

    Empty entries must be dropped: an empty string is a substring of every
    URL, so keeping it (as happens with ``BLACK_LIST=``) would blacklist all
    search results.
    """
    raw = os.getenv("BLACK_LIST") or ""
    return [entry.strip() for entry in raw.split(",") if entry.strip()]


def search(query: str, num: int) -> list[str]:
    """Run a web search and collect the result links.

    Args:
        query: Text to search for; it is URL-encoded before being appended to
            the search link.
        num: Maximum number of URLs to return.

    Returns:
        Up to ``num`` distinct absolute URLs found on the result page that do
        not contain any blacklisted entry.

    Raises:
        requests.RequestException: If the search request fails.
    """
    search_link = os.getenv("SEARCH_LINK") or _DEFAULT_SEARCH_LINK
    black_list = [*_DEFAULT_BLACK_LIST, *_user_black_list()]

    # Only send the headers that are actually configured.
    headers = {
        "user-agent": os.getenv("USER_AGENT"),
        "cookie": os.getenv("COOKIE"),
    }
    headers = {name: value for name, value in headers.items() if value}

    # Encode the query so characters such as "&" or "#" cannot cut it short.
    page = requests.get(
        f"{search_link}{quote_plus(query)}",
        headers=headers,
        timeout=20,
    )
    soup = BeautifulSoup(page.text, "html.parser")

    urls = []
    for link in soup.find_all("a"):
        href = str(link.get("href"))

        # Skip relative links and anchors without href, and do not let a
        # repeated link use up one of the ``num`` result slots.
        if not href.startswith("http") or href in urls:
            continue

        if any(black_url in href for black_url in black_list):
            logger.debug("Skipped blacklisted URL: {}", href)
            continue

        urls.append(href)
        logger.debug("URL: {}", href)

    return urls[:num]


def extract_text(url: str) -> str:
    """Download ``url`` and return the text content of the page.

    Raises:
        requests.RequestException: If the page cannot be downloaded.
    """
    page = requests.get(url, timeout=40)
    soup = BeautifulSoup(page.text, "html.parser")
    return soup.get_text()
# --------------------------------------------------------------------------------
# /Templates/index.html:
# --------------------------------------------------------------------------------
# Plagiarism Detector
55 |
56 |

Plagiarism Detector

57 | 58 |
59 |

Paste your text here:


60 | 61 |
62 | 63 |
64 | 65 |
66 | 67 |
68 |

You will be redirected once your report is generated.

69 |

Time to generate report depends on length of text.

70 |
71 |
72 | 73 | 74 | --------------------------------------------------------------------------------