├── .gitignore
├── pyproject.toml
├── images
    ├── main.png
    └── similarity.png
├── requirements.txt
├── .github
    └── dependabot.yml
├── .env.dist
├── main.py
├── README.md
├── LICENSE
├── Templates
    ├── report.html
    └── index.html
├── similarity.py
└── websearch.py


/.gitignore:
--------------------------------------------------------------------------------
1 | venv
2 | __pycache__
3 | .env
4 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.ruff.lint]
2 | select = ["ALL"]
3 | 


--------------------------------------------------------------------------------
/images/main.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harirakul/Plagiarism-Detection/HEAD/images/main.png


--------------------------------------------------------------------------------
/images/similarity.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harirakul/Plagiarism-Detection/HEAD/images/similarity.png


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.13.3
2 | Flask==3.1.0
3 | nltk==3.9.1
4 | pandas==2.2.3
5 | requests==2.32.3
6 | loguru==0.7.3
7 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 |   - package-ecosystem: "pip"
4 |     directory: "/"
5 |     schedule:
6 |       interval: "daily"
7 | 


--------------------------------------------------------------------------------
/.env.dist:
--------------------------------------------------------------------------------
 1 | # Search system settings
 2 | 
 3 | # https://www.google.com/search?q=
 4 | # https://ya.ru/search/?text=
 5 | # https://www.bing.com/search?q=
 6 | # https://duckduckgo.com/?q=
 7 | 
 8 | SEARCH_LINK=https://ya.ru/search/?text=
 9 | USER_AGENT=Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0
10 | COOKIE=
11 | BLACK_LIST=
12 | 
13 | 
14 | # Flask server settings
15 | 
16 | DEBUG=True
17 | HOST=localhost
18 | PORT=5555
19 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | """Main module: starting app."""
 2 | 
 3 | import os
 4 | 
 5 | from dotenv import load_dotenv
 6 | from flask import Flask, render_template, request
 7 | 
 8 | import similarity
 9 | 
10 | app = Flask(__name__, template_folder="Templates")
11 | 
12 | 
@app.route("/", methods=["GET", "POST"])
def main_page() -> str:
    """Serve the landing page containing the text-submission form."""
    landing_page = render_template("index.html")
    return landing_page
17 | 
18 | 
@app.route("/report", methods=["POST", "GET"])
def report_page() -> str:
    """Render the similarity report for the submitted text.

    The text is read from the ``text`` form field. When the field is missing
    (for example on a plain GET request) or contains only whitespace, the
    submission form is rendered instead of running any web search.

    Returns:
        The rendered ``report.html`` template followed by the HTML table
        produced by ``similarity.return_table``, or the rendered
        ``index.html`` form when there is nothing to check.
    """
    # ``request.form["text"]`` aborts with 400 Bad Request when the key is
    # absent, which is always the case for GET; use a default instead.
    text = request.form.get("text", "")
    if not text.strip():
        return render_template("index.html")

    table = similarity.return_table(similarity.report(text))
    return render_template("report.html") + table
26 | 
27 | 
if __name__ == "__main__":
    # Load settings from the .env file into the process environment.
    load_dotenv()

    # DEBUG may be absent from the environment; treat that as "off" instead
    # of crashing on ``None.lower()``.
    IS_DEBUG = os.getenv("DEBUG", "false").strip().lower() == "true"
    HOST = os.getenv("HOST")
    # Flask expects an integer port; None lets Flask pick its own default.
    _raw_port = os.getenv("PORT")
    PORT = int(_raw_port) if _raw_port else None

    # Start the Flask development server.
    app.run(debug=IS_DEBUG, host=HOST, port=PORT)
38 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Plagiarism Detection with Python
 2 | 
 3 | ![main page preview](images/main.png)
 4 | 
 5 | ![similarity page preview](images/similarity.png)
 6 | 
 7 | Finds percentage similarity between inputted text and relevant articles on the web.
 8 | 
 9 | ## How to install?
10 | 
11 | 1. Clone repository:
12 | 
13 | ```
14 | git clone https://github.com/harirakul/Plagiarism-Detection.git
15 | ```
16 | 
17 | 2. Create and activate venv:
18 | 
19 | ```
20 | python3 -m venv venv
21 | . venv/bin/activate
22 | ```
23 | 
24 | 3. Install requirements:
25 | 
26 | ```
27 | pip install -r requirements.txt
28 | ```
29 | 
30 | 4. Rename `.env.dist` to `.env`.
31 | 
32 | > For better results, fill in `COOKIE` in `.env`. You can copy the value from your browser's developer tools.
33 | 
34 | 5. Start the web app:
35 | 
36 | ```
37 | python3 main.py
38 | ```
39 | 
40 | ## Website Link:
41 | http://plagiarism-detection--hariambethkar.repl.co/
42 | 
43 | ## How To:
44 | - Click on the link to the website
45 | - Enter or paste your text in the input field
46 | - Click on `Generate Report`
47 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 harirakul
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Templates/report.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 | 
 4 | <head>
 5 |     <meta charset="UTF-8">
 6 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
 7 |     <title>Similarity Report</title>
 8 |     <style>
 9 |         body {
10 |         color: white;
11 |         background-color: #181616;
12 |         padding: 20px;
13 |         }
14 |         table {
15 |             width: 100%;
16 |             border-collapse: collapse;
17 |             margin: 20px 0;
18 |             font-family: Arial, sans-serif;
19 |             background-color: #2c2c2c;
20 |             color: #ffffff;
21 |             border-radius: 8px;
22 |             overflow: hidden;
23 |         }
24 |         th, td {
25 |             border: 1px solid #444444;
26 |             text-align: left;
27 |             padding: 12px;
28 |             max-width: 30ch;
29 |             overflow: hidden;
30 |             text-overflow: ellipsis;
31 |             white-space: nowrap;
32 |         }
33 |         th {
34 |             background-color: #3a3a3a;
35 |             font-weight: bold;
36 |         }
37 |         tr:nth-child(even) {
38 |             background-color: #3a3a3a;
39 |         }
40 |         tr:hover {
41 |             background-color: #555555;
42 |             transform: scale(1.02);
43 |             transition: background-color 0.3s, transform 0.3s;
44 |         }
45 |         caption {
46 |             font-size: 1.5em;
47 |             margin: 10px;
48 |             color: #ffffff;
49 |         }
50 |     </style>
51 | </head>
52 | 
53 | <body>
54 |     <h2>Similarity Report</h2>
55 | </body>
56 | 
57 | </html>


--------------------------------------------------------------------------------
/similarity.py:
--------------------------------------------------------------------------------
 1 | """Module for checking text."""
 2 | 
 3 | from difflib import SequenceMatcher
 4 | 
 5 | import nltk
 6 | import pandas as pd
 7 | 
 8 | import websearch
 9 | 
# Fetch the NLTK data used by this module at import time.
nltk.download("stopwords")
# Older NLTK releases load the Punkt tokenizer from "punkt"; NLTK >= 3.8.2
# (the project pins 3.9.1) loads it from "punkt_tab". Without the latter,
# word_tokenize / sent_tokenize raise LookupError, so request both.
nltk.download("punkt")
nltk.download("punkt_tab")
# English stop words as a set for fast membership tests in purify_text.
stop_words = set(nltk.corpus.stopwords.words("english"))
13 | 
14 | 
def purify_text(string: str) -> str:
    """Remove English stop words from *string*.

    The text is split with ``nltk.word_tokenize``; every token whose
    lowercase form is in the module-level ``stop_words`` set is dropped and
    the remaining tokens are joined with single spaces.
    """
    # The stop-word list is lowercase, so compare case-insensitively;
    # otherwise capitalised words such as "The" are never filtered out.
    return " ".join(
        word for word in nltk.word_tokenize(string) if word.lower() not in stop_words
    )
19 | 
20 | 
def web_verify(string: str, results_per_sentence: int) -> list[str]:
    """Collect URLs of pages that may contain the given text.

    A search is run for the whole text and for each sentence found by
    ``nltk.sent_tokenize``.

    Args:
        string: Text to look up.
        results_per_sentence: Maximum number of links requested from
            ``websearch.search`` for each query.

    Returns:
        The unique URLs returned by the searches, in first-seen order.
    """
    # A one-sentence text would otherwise be searched twice with the same
    # query; dict.fromkeys drops repeated queries while keeping their order.
    queries = dict.fromkeys([string, *nltk.sent_tokenize(string)])

    # A dict is used as an ordered set so the result order is deterministic.
    matching_sites: dict[str, None] = {}
    for query in queries:
        found = websearch.search(query=query, num=results_per_sentence)
        matching_sites.update(dict.fromkeys(found))

    return list(matching_sites)
32 | 
33 | 
def similarity(str1: str, str2: str) -> float:
    """Return how alike two strings are, as a percentage from 0 to 100."""
    matcher = SequenceMatcher(a=str1, b=str2)
    return matcher.ratio() * 100
37 | 
38 | 
def report(text: str) -> dict:
    """Build the similarity report for *text*.

    Candidate sites come from ``web_verify`` run on the stop-word-free text
    (two results per query). Each site's page text is compared with the
    original text, and the resulting mapping of URL to similarity percentage
    is ordered from most to least similar.
    """
    scores = {
        site: similarity(text, websearch.extract_text(site))
        for site in web_verify(purify_text(text), 2)
    }
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)
53 | 
54 | 
def return_table(dictionary: dict) -> str:
    """Render the report mapping as an HTML table.

    The keys of *dictionary* become the row labels and its values fill a
    single column titled ``Similarity (%)``.
    """
    columns = {"Similarity (%)": dictionary}
    return pd.DataFrame(data=columns).to_html()
59 | 


--------------------------------------------------------------------------------
/websearch.py:
--------------------------------------------------------------------------------
 1 | """Search module."""
 2 | 
 3 | import os
 4 | import warnings
 5 | 
 6 | import requests
 7 | from bs4 import BeautifulSoup
 8 | from loguru import logger
 9 | 
# Silence every warning whose originating module name matches "bs4".
warnings.filterwarnings(action="ignore", module="bs4")
11 | 
12 | 
def search(query: str, num: int) -> list[str]:
    """Query the configured search engine and collect result links.

    Settings are read from the environment: ``SEARCH_LINK`` (URL prefix the
    query is appended to), ``USER_AGENT`` and ``COOKIE`` (request headers)
    and ``BLACK_LIST`` (comma-separated URL fragments to reject).

    Args:
        query: Text to search for.
        num: Maximum number of links to return.

    Returns:
        Up to ``num`` links starting with ``http`` found on the result page,
        in page order, excluding any that contain a black-listed fragment.
    """
    from urllib.parse import quote_plus

    user_agent = os.getenv("USER_AGENT")
    search_link = os.getenv("SEARCH_LINK")
    cookie = os.getenv("COOKIE")

    # BLACK_LIST may be unset or empty. Empty entries must be discarded:
    # "" is a substring of every URL, so keeping it would black-list every
    # single result and the function would always return [].
    user_black_list = [
        entry.strip()
        for entry in (os.getenv("BLACK_LIST") or "").split(",")
        if entry.strip()
    ]

    black_list = [
        # Yandex black list
        "https://passport.yandex.ru/",
        "https://yandexwebcache.net/",
        "https://yandex.ru/support/",
        "https://cloud.yandex.ru/",
        "https://yandex.ru/",
        "https://www.ya.ru",
        *user_black_list,
    ]

    # Percent-encode the query so characters such as "&", "#" or "+" stay
    # part of the search text instead of altering the URL structure.
    url = f"{search_link}{quote_plus(query)}"

    # Only send the headers that are actually configured.
    headers = {
        name: value
        for name, value in (("user-agent", user_agent), ("cookie", cookie))
        if value
    }

    page = requests.get(url, headers=headers, timeout=20)
    soup = BeautifulSoup(page.text, "html.parser")

    urls = []
    for link in soup.find_all("a"):
        href = str(link.get("href"))

        # Skip relative links, anchors and <a> tags without an href.
        if not href.startswith("http"):
            continue

        if any(black_url in href for black_url in black_list):
            logger.error("URL: {}", href)
        else:
            urls.append(href)
            logger.debug("URL: {}", href)

    return urls[:num]
61 | 
62 | 
def extract_text(url: str) -> str:
    """Download *url* and return the text content of the parsed HTML.

    Returns:
        The text extracted by BeautifulSoup, or an empty string when the
        request fails (timeout, connection error, invalid URL, ...).
    """
    try:
        page = requests.get(url, timeout=40)
    except requests.RequestException as error:
        # One unreachable search result must not abort the whole report;
        # an empty text simply scores as 0 % similar.
        logger.warning("Could not fetch {}: {}", url, error)
        return ""

    return BeautifulSoup(page.text, "html.parser").get_text()
68 | 


--------------------------------------------------------------------------------
/Templates/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <title>Plagiarism Detector</title>
 4 | <style>
 5 |     body {
 6 |         color: white;
 7 |         background-color: #181616;
 8 |         padding: 20px;
 9 |     }
10 |     button {
11 |         width: 100%;
12 |         border: none;
13 |         color: white;
14 |         padding: 15px 32px;
15 |         text-align: center;
16 |         text-decoration: none;
17 |         display: inline-block;
18 |         font-size: 16px;
19 |         margin: 4px 2px;
20 |         cursor: pointer;
21 |         background-color: #3c82f9;
22 |         border-radius: 15px;
23 |     }
24 |     textarea {
25 |         color: white;
26 |         background-color: #1d1b1b;
27 |         min-width: 500px;
28 |         width: 100%;
29 |         min-height: 300px;
30 |         height: 40%;
31 |         font-size: 11px;
32 |         resize: none;
33 |         border-radius: 15px;
34 |     }
35 |     .center {
36 |         background-color:#1d1b1b;
37 |         display: flex;
38 |         justify-content: center;
39 |         align-items: center;
40 |         flex-direction: column;
41 |         margin: 0 auto;
42 |         max-width: 800px;
43 |         padding: 20px;
44 |         box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
45 |         border-radius: 8px;
46 |     }
47 |     @media (max-width: 600px) {
48 |         .center {
49 |             width: 90%;
50 |         }
51 |     }
52 | </style>
53 | <body>
54 |     <br>
55 |     <div class="center">
56 |         <h2>Plagiarism Detector</h2>
57 | 
58 |         <form action="/report" method="POST">
59 |         <p><label for="text">Paste your text here:</label></p><br>
60 |         <textarea id="text" name="text"></textarea>
61 |         <br>
62 |         <button type="Submit" value="Send">Generate Report</button>
63 |         </form> 
64 | 
65 |         <br>
66 | 
67 |         <div class="center">
68 |         <p>You will be redirected once your report is generated.</p>
69 |         <p>Time to generate report depends on length of text.</p>
70 |         </div>
71 |     </div>
72 | </body>
73 | </html>
74 | 


--------------------------------------------------------------------------------