├── util ├── __init__.py ├── constants │ ├── __init__.py │ └── error_codes.py ├── error_handlers │ ├── __init__.py │ └── exceptions │ │ ├── __init__.py │ │ └── exceptions.py ├── logger.py ├── singleton.py ├── commons.py ├── injector.py └── response.py ├── service ├── __init__.py ├── base.py ├── plag_dao.py └── plag_detector.py ├── tests ├── __init__.py └── plagiarism_detection_tests.py ├── controller ├── __init__.py ├── base.py ├── plag_detection.py └── document.py ├── output ├── add_doc.png ├── detect.png ├── get_docs.png └── Plag.postman_collection.json ├── mysql_connector.py ├── model ├── __init__.py ├── document.py └── base.py ├── routes.py ├── requirements.txt ├── settings.py ├── .gitignore ├── app.py └── README.md /util/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /util/constants/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /util/error_handlers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /util/error_handlers/exceptions/__init__.py: -------------------------------------------------------------------------------- 1 | from .exceptions import * -------------------------------------------------------------------------------- /service/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = "Suyash Soni" 2 | __email__ = "suyash.soni248@gmail.com" -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = "Suyash Soni" 2 | __email__ = "suyash.soni248@gmail.com" 
-------------------------------------------------------------------------------- /controller/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = "Suyash Soni" 2 | __email__ = "suyash.soni248@gmail.com" -------------------------------------------------------------------------------- /output/add_doc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suyash248/plagiarism_detection/HEAD/output/add_doc.png -------------------------------------------------------------------------------- /output/detect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suyash248/plagiarism_detection/HEAD/output/detect.png -------------------------------------------------------------------------------- /output/get_docs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/suyash248/plagiarism_detection/HEAD/output/get_docs.png -------------------------------------------------------------------------------- /mysql_connector.py: -------------------------------------------------------------------------------- 1 | __author__ = "Suyash Soni" 2 | __email__ = "suyash.soni248@gmail.com" 3 | 4 | from flask_sqlalchemy import SQLAlchemy 5 | 6 | db = SQLAlchemy() -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = "Suyash Soni" 2 | __email__ = "suyash.soni248@gmail.com" 3 | 4 | from model.document import Document 5 | 6 | # Import and register all the models here. 
__author__ = "Suyash Soni"
__email__ = "suyash.soni248@gmail.com"

import logging

# Application-wide logger; other modules import `Logger` from here.
Logger = logging.getLogger('Plagiarism-Detection')

# Names of loggers whose currently attached handlers are copied onto `Logger`.
# The trailing comma matters: without it the parentheses are plain grouping, the value is a
# string, and the loop below walks it character by character (creating loggers 's', 'q', ...).
logger_names = ('sqlalchemy.engine.base.Engine',)
for logger_name in logger_names:
    _logger = logging.getLogger(logger_name)
    for handler in _logger.handlers:
        Logger.addHandler(handler)
class SingletonMeta(type):
    """
    Metaclass implementing the **Singleton design pattern**.

    The first call to a class that uses this metaclass builds the instance; every later call
    returns that same object, and the arguments of those later calls are ignored.

    Usage: Can be used in following ways while defining a class -

    * `class SomeClass(BaseClass, metaclass=SingletonMeta):`
    * `class SomeClass(metaclass=SingletonMeta):`
    """

    # Single registry shared by every class created through this metaclass, keyed by class.
    _instances = {}

    def __call__(cls, *args, **kwargs):
        registry = cls._instances
        if cls in registry:
            return registry[cls]
        registry[cls] = super().__call__(*args, **kwargs)
        return registry[cls]
class Document(BaseModel):
    """
    Model for a stored document, mapped to the `documents` table.

    `content` and `title` are mandatory; `description` and `author` are optional.
    """
    __tablename__ = 'documents'

    content = db.Column('content', db.Text(), nullable=False)
    title = db.Column('title', db.String(200), nullable=False)
    description = db.Column('description', db.Text(), nullable=True)
    author = db.Column('author', db.String(200), nullable=True)

    # Table metadata: (title, is_deleted) is both unique and indexed.
    __table_args__ = (
        db.UniqueConstraint('title', 'is_deleted'),
        db.Index(BaseModel.create_index(__tablename__, 'title', 'is_deleted'), 'title', 'is_deleted'),
    )

    def __str__(self):
        return self.title

    def __repr__(self):
        # JSON rendering of the same fields that `to_dict` exposes.
        serialized = self.to_dict()
        return json.dumps(serialized)

    def to_dict(self, *args, **kwargs):
        """Returns the document's fields as a plain dict; extra arguments are ignored."""
        exported = ('id', 'author', 'title', 'description', 'content')
        return {field: getattr(self, field) for field in exported}
class BaseService(object, metaclass=SingletonMeta):
    """
    Base class for every service. Because the metaclass is `SingletonMeta`, each concrete
    service class is a **Singleton** by default.

    A child service that overrides `__init__` should still call the parent's `__init__`,
    since that is where the `db` handle is attached to the instance.
    example::

        Inside child service ->
        def __init__(self, arg1, arg2, kw1=12, kw2=23):
            super(ChildService, self).__init__(arg1, arg2, kw1=kw1, kw2=kw2)
            self.arg1 = arg1
            self.arg2 = arg2
            self.kw1 = kw1
            self.kw2 = kw2

    Instances are normally obtained through the `instance` factory below.
    """
    def __init__(self, *args, **kwargs):
        # Positional/keyword arguments are accepted for the benefit of child classes and are
        # not used here. The import is deferred to construction time.
        import mysql_connector
        self.db = mysql_connector.db

    @classmethod
    def instance(cls, *args, **kwargs):
        """
        Factory method: builds (or, for a singleton, fetches) the concrete service `cls`.
        """
        service = cls(*args, **kwargs)
        return service
def get_fully_qualified_classname(cls=None, obj=None):
    """
    Returns `fully-qualified-name` of the class represented by **cls** or **obj**.

    Builtin classes are returned bare (e.g. `int`); every other class is returned as
    `module.ClassName`.

    :param cls: Class whose name is wanted; only consulted when **obj** is not given.
    :param obj: Instance whose class name is wanted; takes precedence over **cls**.
    :return: The fully qualified class name, or `None` when neither argument is given.
    """
    # Compare against None explicitly: falsy instances (0, '', [], empty dicts, ...) are
    # perfectly valid inputs and must not be mistaken for "argument not supplied".
    if obj is not None:
        target = obj.__class__
    elif cls is not None:
        target = cls
    else:
        return None

    module = target.__module__
    # `str.__module__` is the builtins module name; classes living there need no prefix.
    if module is None or module == str.__module__:
        return target.__name__
    return module + '.' + target.__name__
ExceptionBuilder(BadRequest).error(HttpErrorCode.REQUIRED_FIELD, 'text').throw() 24 | most_similar_doc_info: Dict = self.plag_detector.compute_similarity(input_doc) 25 | 26 | most_similar_doc = most_similar_doc_info['doc'] 27 | similarity_score = most_similar_doc_info['similarity_score'] 28 | similarity_percentage = round(similarity_score * 100, 2) 29 | 30 | message = "Input text is {}% similar to the doc `{}` with similarity score of {}".format( 31 | similarity_percentage, most_similar_doc.title, similarity_score 32 | ) 33 | 34 | res_data = { 35 | 'similarity_score': similarity_score, 36 | 'similarity_percentage': similarity_percentage, 37 | 'doc': most_similar_doc.to_dict() 38 | } 39 | 40 | return Response(status_code=200, message=message, data=res_data) 41 | -------------------------------------------------------------------------------- /controller/document.py: -------------------------------------------------------------------------------- 1 | __author__ = "Suyash Soni" 2 | __email__ = "suyash.soni248@gmail.com" 3 | 4 | from flask import request 5 | from util.response import intercept, Response 6 | from controller.base import BaseController 7 | from util.injector import inject 8 | from service.plag_dao import PlagiarismDAO 9 | from util.constants.error_codes import HttpErrorCode 10 | from util.error_handlers.exceptions import ExceptionBuilder, BadRequest 11 | 12 | class Document(BaseController): 13 | plag_dao: PlagiarismDAO = inject(PlagiarismDAO) 14 | 15 | @intercept() 16 | def post(self, *args, **kwargs): 17 | """Adds a new document to repo""" 18 | 19 | data = request.get_json(force=True) 20 | 21 | content = data.get('content', '') 22 | title = data.get('title', '') 23 | description = data.get('description', '') 24 | author = data.get('author', '') 25 | 26 | if content and title: 27 | doc = self.plag_dao.create_doc(content, title, description=description, author=author) 28 | else: 29 | ExceptionBuilder(BadRequest).error(HttpErrorCode.REQUIRED_FIELD, 
class Injectable(object):
    """
    Deferred injection: remembers a class (or its fully qualified name) together with the
    constructor arguments, and resolves it to an instance each time `.inject` is read.

    Usage::
        * service_obj = Injectable('my_package.my_services.MyService', "arg1", "some_arg", kw1="val1").inject
        * service_obj = Injectable(MyService, "arg1", "some_arg", kw1="val1").inject
    """
    def __init__(self, injectable_class, *args, **kwargs):
        self.injectable_class = injectable_class
        self.args = args
        self.kwargs = kwargs

    @property
    def inject(self):
        # Inside this body `inject` is the module-level function below, not this property.
        return inject(self.injectable_class, *self.args, **self.kwargs)


def inject(injectable_class, *args, **kwargs):
    """
    Injects the **injectable_class** inside another class by returning an instance of it.

    :param injectable_class: Either fully qualified name of **injectable_class**. i.e. `my_module.some_pkg.SomeService`
        or the class itself. The class must expose an `instance` factory.
    :param args: Arguments forwarded to the `instance` factory of **injectable_class**.
    :param kwargs: Keyword args forwarded to the `instance` factory of **injectable_class**.
    :return: Whatever `injectable_class.instance(*args, **kwargs)` returns.
    :raises AttributeError: when the resolved class has no `instance` attribute.

    Usage::
        * service_obj = inject('my_package.my_services.MyService', "arg1", "some_arg", kw1="val1")
        * service_obj = inject(MyService, "arg1", "some_arg", kw1="val1")
    """
    # isinstance (rather than an exact type comparison) also accepts str subclasses.
    if isinstance(injectable_class, str):
        # Resolved lazily, only when a dotted name is actually passed.
        from util.commons import load_class
        injectable_class = load_class(injectable_class)
    return injectable_class.instance(*args, **kwargs)
class PlagiarismDAO(BaseService):
    """Data-access service for `Document` records."""

    def yield_docs(self, page=1, per_page=10):
        """
        Generator that yields one `get_docs` result per page, starting at `page`.

        The number of pages is derived from the `count` reported by the first fetch.

        :param page: First page to fetch, defaults to 1
        :param per_page: Number of records per page, defaults to 10
        :return: Generator of `{"data": [...], "count": int}` dicts
        """
        docs = self.get_docs(page=page, per_page=per_page)
        last_page = ceil(docs['count'] / per_page)

        yield docs
        for _page in range(page + 1, last_page + 1):
            yield self.get_docs(page=_page, per_page=per_page)

    def get_docs(self, page=1, per_page=10, all=False):
        """
        Fetches documents that are not soft-deleted, newest first.

        :param page: Current page, defaults to 1
        :param per_page: Number of records per page, defaults to 10
        :param all: When truthy, pagination is skipped and every matching document is returned
        :return: Dict with `data` (list of documents) and `count` (total matching documents,
            regardless of pagination)
        """
        start, stop = per_page * (page - 1), per_page * page
        query = {'is_deleted': 0}

        doc_queryset = Document.query.filter_by(**query)
        count = doc_queryset.count()
        doc_queryset = doc_queryset.order_by(Document.created_date.desc())
        docs = doc_queryset.all() if all else doc_queryset.slice(start, stop).all()

        return {
            "data": docs,
            "count": count
        }

    def create_doc(self, content, title, description='', author=''):
        """
        Creates a document and commits it.

        :param content: Document body.
        :param title: Document title.
        :param description: Optional description.
        :param author: Optional author name.
        :return: The newly created `Document`.
        :raises Exception: whatever the commit raised, re-raised after the session is rolled back.
        """
        doc = Document(content=content, title=title, description=description, author=author)
        session = self.db.session
        session.add(doc)
        try:
            session.commit()
        except Exception:
            # SQLAlchemy requires a rollback after a failed flush/commit before the session
            # can be used again; the error itself is still propagated to the caller.
            session.rollback()
            raise
        return doc
def initialize_sqlalchemy():
    """
    Initializes MySQL database connection.

    Builds the SQLAlchemy URI from `MYSQL_DB_CONFIG['URI_CONFIG']`, copies the pool size and
    the debug-driven flags into the app config, then binds `db` to `app`.
    """
    mysql_config = config['MYSQL_DB_CONFIG']

    config['SQLALCHEMY_DATABASE_URI'] = 'mysql+pymysql://{username}:{password}@{host}:{port}/{database}'.format(
        **mysql_config['URI_CONFIG']
    )

    pool_size = mysql_config['MYSQL_CONNECTION_POOL_SIZE']
    config['MYSQL_CONNECTION_POOL_SIZE'] = pool_size
    # NOTE(review): Flask-SQLAlchemy 2.x only reads SQLALCHEMY_* keys, so the key above never
    # reaches the engine; SQLALCHEMY_POOL_SIZE is the one it consumes. The value may be a
    # string when it comes from the environment, hence the int() conversion.
    config['SQLALCHEMY_POOL_SIZE'] = int(pool_size)

    config['SQLALCHEMY_TRACK_MODIFICATIONS'] = config['DEBUG']
    config['SQLALCHEMY_ECHO'] = config['DEBUG']
    config['SQLALCHEMY_RECORD_QUERIES'] = config['DEBUG']
    db.init_app(app)

    # For creating the tables(via models) for the first time.
    # import model
    # app.app_context().push()
    # db.create_all()
@staticmethod
def tokenize_and_stem(doc):
    """
    Lower-cases the document, drops punctuation, splits it into tokens and stems them.

    :param doc: Raw document text.
    :return: Result of `PlagiarismDetector.stem_tokens` for the document's tokens.
    """
    # Translation table that deletes every character listed in `string.punctuation`.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    normalized = doc.lower().translate(strip_punctuation)
    words = nltk.word_tokenize(normalized)
    return PlagiarismDetector.stem_tokens(words)
def compute_similarity(self, input_doc) -> Dict:
    """
    Finds the stored document that is most similar to `input_doc`.

    Every document of every page yielded by `self.plag_dao.yield_docs()` is scored with
    `self.cosine_similarity`; the first document reaching the highest score wins.

    :param input_doc: Text to compare against the stored documents.
    :return: Dict with `similarity_score` and `doc`; `-1` and `None` when nothing scored
        above `-1` (e.g. no documents were yielded).
    """
    best_score, best_doc = -1, None

    # Pages are consumed lazily, one at a time, in the order the DAO yields them.
    candidates = (doc for page in self.plag_dao.yield_docs() for doc in page['data'])
    for candidate in candidates:
        score = self.cosine_similarity(candidate.content, input_doc)
        if score > best_score:
            best_score, best_doc = score, candidate

    return dict(similarity_score=best_score, doc=best_doc)
def test_1_validate_doc_title(self):
    # Part 1: an empty JSON body carries neither `title` nor `content`, so the documents
    # endpoint is expected to answer 400.
    url = '{}{}'.format(self.host, '/api/v1/plagiarism/documents')
    response = requests.post(url, headers={'Content-Type': 'application/json'}, data=json.dumps({}))
    self.assertEqual(response.status_code, 400, msg='title and content are required!')

    # Part 2: `str(uuid4()) * 6` is 216 characters long, which exceeds the 200-character
    # limit declared for `title` and `author` on the Document model.
    url = '{}{}'.format(self.host, '/api/v1/plagiarism/documents')
    payload = {
        'title': str(uuid4()) * 6,
        'author': str(uuid4()) * 6,
        'content': str(uuid4()),
        'description': str(uuid4())

    }
    response = requests.post(url, headers={'Content-Type': 'application/json'}, data=json.dumps(payload))
    # NOTE(review): `assertRaises` called with only an exception class returns a context
    # manager and checks nothing, so this line can never fail. The request also goes over
    # HTTP, so a server-side SQLAlchemyError would not be raised in this process anyway.
    # An assertion on `response.status_code` is presumably what was intended - confirm the
    # status the server returns for an over-long title before changing it.
    self.assertRaises(SQLAlchemyError)
similarity score of {}.'.format(sim_score)) 59 | 60 | if __name__ == '__main__': 61 | unittest.main() 62 | # python -m unittest discover -s 'tests' -p '*.py' -------------------------------------------------------------------------------- /output/Plag.postman_collection.json: -------------------------------------------------------------------------------- 1 | { 2 | "info": { 3 | "name": "Plag", 4 | "_postman_id": "86bfb603-6947-a37c-5bc7-de811405b62f", 5 | "description": "", 6 | "schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json" 7 | }, 8 | "item": [ 9 | { 10 | "name": "Add doc", 11 | "request": { 12 | "method": "POST", 13 | "header": [ 14 | { 15 | "key": "Content-Type", 16 | "value": "application/json" 17 | } 18 | ], 19 | "body": { 20 | "mode": "raw", 21 | "raw": "{\n\t\"content\": \"Sachin Ramesh Tendulkar is a former Indian international cricketer and a former captain of the Indian national team, regarded as one of the greatest batsmen of all time. He is the highest run scorer of all time in International cricket.\",\n\t\"title\": \"Sachin Tendulkar\",\n\t\"author\": \"James neshley\",\n\t\"description\": \"About the legacy of the great Sachin Tendulkar\"\n}" 22 | }, 23 | "url": { 24 | "raw": "http://0.0.0.0:5000/api/v1/plagiarism/documents", 25 | "protocol": "http", 26 | "host": [ 27 | "0", 28 | "0", 29 | "0", 30 | "0" 31 | ], 32 | "port": "5000", 33 | "path": [ 34 | "api", 35 | "v1", 36 | "plagiarism", 37 | "documents" 38 | ] 39 | }, 40 | "description": "Adding a doc" 41 | }, 42 | "response": [] 43 | }, 44 | { 45 | "name": "Detect plag", 46 | "request": { 47 | "method": "POST", 48 | "header": [ 49 | { 50 | "key": "Content-Type", 51 | "value": "application/json" 52 | } 53 | ], 54 | "body": { 55 | "mode": "raw", 56 | "raw": "{\n\t\"text\": \"white cockteil\"\n}" 57 | }, 58 | "url": { 59 | "raw": "http://0.0.0.0:5000/api/v1/plagiarism/detect", 60 | "protocol": "http", 61 | "host": [ 62 | "0", 63 | "0", 64 | "0", 65 | "0" 66 | ], 67 | 
"port": "5000", 68 | "path": [ 69 | "api", 70 | "v1", 71 | "plagiarism", 72 | "detect" 73 | ] 74 | }, 75 | "description": "" 76 | }, 77 | "response": [] 78 | }, 79 | { 80 | "name": "Get all docs", 81 | "request": { 82 | "method": "GET", 83 | "header": [ 84 | { 85 | "key": "Content-Type", 86 | "value": "application/json" 87 | } 88 | ], 89 | "body": { 90 | "mode": "raw", 91 | "raw": "{\n\t\"text\": \"cockteil bird\"\n}" 92 | }, 93 | "url": { 94 | "raw": "http://0.0.0.0:5000/api/v1/plagiarism/documents?page=1&per_page=10", 95 | "protocol": "http", 96 | "host": [ 97 | "0", 98 | "0", 99 | "0", 100 | "0" 101 | ], 102 | "port": "5000", 103 | "path": [ 104 | "api", 105 | "v1", 106 | "plagiarism", 107 | "documents" 108 | ], 109 | "query": [ 110 | { 111 | "key": "page", 112 | "value": "1", 113 | "equals": true 114 | }, 115 | { 116 | "key": "per_page", 117 | "value": "10", 118 | "equals": true 119 | } 120 | ] 121 | }, 122 | "description": "" 123 | }, 124 | "response": [] 125 | } 126 | ] 127 | } -------------------------------------------------------------------------------- /util/response.py: -------------------------------------------------------------------------------- 1 | __author__ = "Suyash Soni" 2 | __email__ = "suyash.soni248@gmail.com" 3 | 4 | import time 5 | from flask_restful import fields, marshal 6 | from flask import request 7 | from util.error_handlers.exceptions import BaseException, Error 8 | from util.constants.error_codes import ErrorCode 9 | from flask import Response as FlaskResponse 10 | from pymysql import MySQLError 11 | from sqlalchemy.exc import SQLAlchemyError 12 | from util.logger import Logger 13 | 14 | class Response(object): 15 | def __init__(self, success=True, data=None, errors=(), message="", status_code=200, headers=None, mimetype=None): 16 | self.success = success 17 | self.data = data 18 | self.errors = errors 19 | self.message = message 20 | 21 | # These fields won't be marshaled and hence won't be part of `data` in underlying response. 
22 | self.status_code = status_code 23 | self.headers = headers 24 | self.mimetype = mimetype 25 | 26 | def response_builder(response): 27 | """ 28 | Marshals & transforms `Response` object to the response format supported by Flask-Restful, and returns it. 29 | 30 | :param response: instance of `Response` class as returned by controller's action method. 31 | 32 | :return: `response data`, `status code`, `response headers`; a `FlaskResponse` is returned instead when `response.data` is truthy and is not a list, set, dict, str, tuple, int, float or bool 33 | """ 34 | # Actual response structure in the form of JSON. 35 | marshal_fields = { 36 | "success": fields.Boolean, 37 | "message": fields.String, 38 | "data": fields.Raw, 39 | "errors": fields.Raw 40 | } 41 | 42 | if response.data and not isinstance(response.data, (list, set, dict, str, tuple, int, float, bool)): 43 | return FlaskResponse(response.data, mimetype=response.mimetype) # NOTE(review): this branch does not pass on response.status_code or response.headers 44 | 45 | marshaled_res = marshal(response, marshal_fields) 46 | headers = dict(response.headers) if response.headers else None 47 | return marshaled_res, response.status_code, headers 48 | 49 | 50 | def intercept(profiler=True): 51 | """ 52 | Request-Response interceptor which handles response format. 53 | 54 | :param profiler: When truthy, logs the time taken by the underlying request. 55 | """ 56 | def _intercept_decorator(func): 57 | def wrapper(controller, *args, **kwargs): # NOTE(review): no functools.wraps, so the wrapper does not keep func's __name__/__doc__ 58 | try: 59 | 60 | start = time.time() 61 | response_obj = func(controller, *args, **kwargs) 62 | end = time.time() 63 | if profiler: 64 | api_name = "{}: {}".format(func.__name__.upper(), request.url_rule) 65 | Logger.info("{} took {} sec(s)".format(api_name, end-start)) 66 | 67 | except BaseException as be: # the project's BaseException imported from util.error_handlers.exceptions, not the builtin 68 | # Custom exceptions are handled here. 
69 | import traceback 70 | print(traceback.format_exc()) # NOTE(review): the traceback is printed to stdout instead of going through Logger 71 | if not isinstance(be.errors, (list, tuple)): 72 | be.errors = [be.errors] # normalise a single error into a list 73 | response_obj = Response(success=False, status_code=be.status_code, message=be.message, 74 | errors=be.errors) 75 | except (MySQLError, SQLAlchemyError) as dbe: 76 | # Exceptions raised by the MySQL driver (MySQLError) or SQLAlchemy are handled here. 77 | import traceback 78 | print(traceback.format_exc()) 79 | 80 | message = 'Database error occurred.' 81 | response_obj = Response(success=False, status_code=500, message=message, 82 | errors=[Error(ErrorCode.DB_ERROR, 'database_error', message=str(dbe)).to_dict]) 83 | except Exception as e: 84 | # Any uncaught exception will be handled here. 85 | import traceback 86 | print (traceback.format_exc()) 87 | response_obj = Response(success=False, status_code=500, 88 | errors=[Error(ErrorCode.NON_STANDARD_ERROR, message=str(e)).to_dict]) 89 | 90 | return response_builder(response_obj) # every path, success or failure, is rendered by response_builder 91 | return wrapper 92 | 93 | return _intercept_decorator -------------------------------------------------------------------------------- /util/error_handlers/exceptions/exceptions.py: -------------------------------------------------------------------------------- 1 | __author__ = "Suyash Soni" 2 | __email__ = "suyash.soni248@gmail.com" 3 | 4 | import itertools 5 | from settings import config 6 | from util.constants.error_codes import ErrorCode 7 | 8 | class Error(object): 9 | """ 10 | Every time an error needs to be thrown, an instance of this class must be used to represent it. 
11 | """ 12 | def __init__(self, error_constant, *fields, message=None): 13 | self.error_constant = error_constant or ErrorCode.NON_STANDARD_ERROR 14 | self.fields = tuple(fields) 15 | self.message = message 16 | 17 | @property 18 | def to_dict(self): 19 | self.fields = tuple(itertools.chain(*map(lambda field: field.split(config['FIELDS_SEPARATOR'])[1::] if config['FIELDS_SEPARATOR'] in field else field.split(config['FIELDS_SEPARATOR']), self.fields))) # a field containing the separator loses its first segment, other fields are kept whole. NOTE(review): reading this property reassigns self.fields 20 | err_dict = dict(error_constant = self.error_constant) 21 | if self.fields: err_dict['fields'] = self.fields 22 | if self.message: err_dict['message'] = self.message 23 | return err_dict 24 | 25 | class BaseException(Exception): # NOTE(review): shadows Python's builtin BaseException wherever this name is imported 26 | """ 27 | Base class for custom exceptions. 28 | Subclasses should provide `status_code`, `message` and `errors` properties. 29 | """ 30 | status_code = 500 31 | message = 'A server error occurred.' 32 | errors = [] 33 | 34 | def __init__(self, message=None, status_code=500, errors=()): 35 | self.status_code = status_code 36 | self.errors = errors 37 | if message: self.message = message # a falsy message keeps the class-level default 38 | 39 | def __str__(self): 40 | return "Error({}): {}".format(self.status_code, self.message) 41 | 42 | class DatabaseException(BaseException): 43 | message = 'Error occurred while performing DB operation.' 44 | def __init__(self, message=None, status_code=400, errors=()): 45 | super(DatabaseException, self).__init__(message=message, status_code=400, errors=errors) # NOTE(review): the status_code argument is ignored - 400 is always passed on, so SqlAlchemyException's status_code has no effect either 46 | 47 | class SqlAlchemyException(DatabaseException): 48 | message = 'Error occurred while performing an operation on RDBMS.' 49 | def __init__(self, message=None, status_code=400, errors=()): 50 | # from thirdparty import db 51 | # db.session.rollback() 52 | message = message or self.message 53 | super(SqlAlchemyException, self).__init__(message=message, status_code=status_code, errors=errors) 54 | 55 | class NotFound(BaseException): 56 | message = 'Resource not found.' 
57 | def __init__(self, message=None, errors=()): 58 | message = message or self.message 59 | super(NotFound, self).__init__(message=message, status_code=404, errors=errors) 60 | 61 | class BadRequest(BaseException): 62 | message = 'Bad request.' 63 | def __init__(self, message=None, errors=()): 64 | message = message or self.message 65 | super(BadRequest, self).__init__(message=message, status_code=400, errors=errors) 66 | 67 | class NotAuthenticated(BaseException): 68 | message = 'Authentication required.' 69 | def __init__(self, message=None, errors=()): 70 | message = message or self.message 71 | super(NotAuthenticated, self).__init__(message=message, status_code=401, errors=errors) 72 | 73 | class Unauthrorized(BaseException): # NOTE(review): class name is misspelled ('Unauthorized'); left as-is because renaming changes the importable name 74 | message = 'You are not authorized to perform this action.' 75 | def __init__(self, message=None, errors=()): 76 | message = message or self.message 77 | super(Unauthrorized, self).__init__(message=message, status_code=403, errors=errors) 78 | 79 | class ExceptionBuilder(object): # Chainable builder: error()/message()/status_code() return self, throw() raises exc_cls 80 | def __init__(self, exc_cls=BaseException): 81 | self._exc_cls = exc_cls 82 | self._errors = [] 83 | self._message = '' 84 | self._code = None 85 | 86 | def error(self, error_constant, *fields, message=None): 87 | self._errors.append(Error(error_constant, *fields, message=message).to_dict) # `to_dict` is a property, so the dict itself is stored 88 | return self 89 | 90 | def message(self, msg): 91 | self._message = msg 92 | return self 93 | 94 | def status_code(self, code): 95 | self._code = code 96 | return self 97 | 98 | def throw(self): 99 | if self._code: # NOTE(review): NotFound/BadRequest/NotAuthenticated/Unauthrorized accept no status_code argument, so setting a code for them makes the call below fail with TypeError 100 | raise self._exc_cls(errors=self._errors, message=self._message, status_code=self._code) 101 | else: 102 | raise self._exc_cls(errors=self._errors, message=self._message) 103 | 104 | 105 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Plagiarism Detection 2 | 3 | 1. 
Plagiarism detection using TF-IDF and cosine similarity. 4 | 2. Input text will be matched against all the documents present in the DB (`document` table) to get the maximum similarity score. 5 | 6 | ### Requirements 7 | Python 3.x, pip3, MySQL 8 | 9 | ### How to run? 10 | 11 | 1. Move to ```{project-dir}```, create a virtual environment and then activate it as 12 | 13 | ```sh 14 | $ cd {project-dir} 15 | $ virtualenv .environment 16 | $ source .environment/bin/activate 17 | ``` 18 | 19 | 2. Edit the configuration in ```settings.py```, i.e. provide configuration/settings related to DB and other constants. 20 | 21 | > If you are using PyCharm then environment variables can be specified under `run configuration`. 22 | 23 | 3. Add project to ```PYTHONPATH``` as 24 | 25 | ```sh 26 | $ export PYTHONPATH="$PYTHONPATH:." # . corresponds to current directory(project-dir) 27 | ``` 28 | 29 | 4. Under ```{project-dir}``` install requirements/dependencies as 30 | 31 | ```sh 32 | $ pip3 install -r requirements.txt 33 | ``` 34 | 35 | 5. Then run test cases (the API tests send requests to `http://0.0.0.0:5000`, so the server from step 6 must already be running) as - 36 | 37 | ```sh 38 | $ python -m unittest discover -s 'tests' -p '*.py' 39 | ``` 40 | 41 | 6. Run server as - 42 | ```sh 43 | $ python app.py 44 | ``` 45 | > Now you can access the application by visiting ```{protocol}://{host}:{port}```. For localhost it is ```http://localhost:5000```. 46 | 47 | 48 | ### Applications & Endpoints 49 | 50 | There are the following three APIs - 51 | 52 | #### 1. Adding a new document - 53 | 54 | > POST ```{host}:{port}/api/v1/plagiarism/documents```. 55 | 56 | *Request body* 57 | 58 | ```javascript 59 | { 60 | "content": "Sachin Ramesh Tendulkar is a former Indian international cricketer and a former captain of the Indian national team, regarded as one of the greatest batsmen of all time. 
He is the highest run scorer of all time in International cricket.", 61 | "title": "Sachin Tendulkar", 62 | "author": "James neshley", 63 | "description": "About the legacy of the great Sachin Tendulkar" 64 | } 65 | ``` 66 | 67 | *Response* 68 | 69 | ```javascript 70 | { 71 | "success": true, 72 | "message": "Document added successfully!", 73 | "data": null, 74 | "errors": [] 75 | } 76 | ``` 77 | 78 | #### 2. Detecting plagiarism - 79 | 80 | > POST ```{host}:{port}/api/v1/plagiarism/detect```. 81 | 82 | *Request body* 83 | 84 | ```javascript 85 | { 86 | "text": "Sachin Tendulkar is the great cricketer." 87 | } 88 | ``` 89 | 90 | *Response* 91 | 92 | ```javascript 93 | { 94 | "success": true, 95 | "message": "Input text is 25.5% similar to the doc `Sachin Tendulkar` with similarity score of 0.25499620385104793", 96 | "data": { 97 | "similarity_score": 0.25499620385104793, 98 | "similarity_percentage": 25.5, 99 | "doc": { 100 | "id": "4855f11b-78d2-4e08-a070-169965cb6c11", 101 | "author": "James neshley", 102 | "title": "Sachin Tendulkar", 103 | "description": "About the legacy of the great Sachin Tendulkar", 104 | "content": "Sachin Ramesh Tendulkar is a former Indian international cricketer and a former captain of the Indian national team, regarded as one of the greatest batsmen of all time. He is the highest run scorer of all time in International cricket." 105 | } 106 | }, 107 | "errors": [] 108 | } 109 | ``` 110 | 111 | #### 3. Fetching all documents - 112 | 113 | > GET ```{host}:{port}/api/v1/plagiarism/documents?page=1&per_page=10```. 
114 | 115 | *Response* 116 | 117 | ```javascript 118 | { 119 | "success": true, 120 | "message": "", 121 | "data": { 122 | "data": [ 123 | { 124 | "id": "4855f11b-78d2-4e08-a070-169965cb6c11", 125 | "author": "James neshley", 126 | "title": "Sachin Tendulkar", 127 | "description": "About the legacy of the great Sachin Tendulkar", 128 | "content": "Sachin Ramesh Tendulkar is a former Indian international cricketer and a former captain of the Indian national team, regarded as one of the greatest batsmen of all time. He is the highest run scorer of all time in International cricket." 129 | }, 130 | { 131 | "id": "e7b4e65b-1ff0-4f1c-98b5-fc6ca5f9cda3", 132 | "author": "test_author_aa45d441-0a57-45b7-b995-c804620ef427", 133 | "title": "test_title_f0eba63b-2e10-47c6-b869-fbb91cb6c385", 134 | "description": "test_description_2283a1a6-da0e-45e7-82a5-eb380f778739", 135 | "content": "test_content_de8fedf2-170d-4f91-a1b7-c7345cddac46" 136 | }, 137 | {......}, 138 | {......} 139 | ], 140 | "count": 72 141 | } 142 | } 143 | ``` 144 | 145 | ### Links - 146 | - [Postman API dump](https://github.com/suyash248/plagiarism_detection/blob/master/output/Plag.postman_collection.json) 147 | - [Screenshots](https://github.com/suyash248/plagiarism_detection/blob/master/output) 148 | 149 | ### TODO - 150 | 1. Use a WSGI server like Gunicorn. 151 | 2. Centralized logging. 152 | --------------------------------------------------------------------------------