├── .gitignore ├── LICENSE ├── README.md ├── backend ├── .env.example ├── Dockerfile ├── api │ ├── __init__.py │ ├── models.py │ ├── routes.py │ └── swagger.py ├── main.py ├── requirements.txt └── utils │ ├── cloudinary_utils.py │ ├── db.py │ ├── download.py │ ├── extraction.py │ ├── prompts.py │ └── search.py └── web ├── .env.example ├── app ├── annotation │ └── page.tsx ├── globals.css ├── layout.tsx ├── lib │ └── utils.ts ├── page.tsx └── template.tsx ├── components.json ├── components ├── PageScrollBar.tsx ├── PaperContent.tsx ├── ResourceDisplay.tsx ├── UrlForm.tsx └── magicui │ ├── line-shadow-text.tsx │ └── shimmer-button.tsx ├── eslint.config.mjs ├── hooks └── extraction.tsx ├── lib └── utils.ts ├── next-env.d.ts ├── next.config.ts ├── package.json ├── postcss.config.mjs ├── public ├── book.svg ├── file.svg ├── globe.svg ├── next.svg ├── vercel.svg └── window.svg ├── tailwind.config.ts └── tsconfig.json /.gitignore: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | node_modules/ 3 | /.pnp 4 | .pnp.js 5 | .yarn/install-state.gz 6 | 7 | # Testing 8 | /coverage 9 | 10 | # Next.js 11 | /.next/ 12 | /out/ 13 | .next 14 | 15 | # Production 16 | /build 17 | 18 | # Environment Variables 19 | .env 20 | .env*.local 21 | .env.development.local 22 | .env.test.local 23 | .env.production.local 24 | 25 | # Debug 26 | npm-debug.log* 27 | yarn-debug.log* 28 | yarn-error.log* 29 | 30 | # Vercel 31 | .vercel 32 | 33 | # TypeScript 34 | *.tsbuildinfo 35 | next-env.d.ts 36 | 37 | # Python 38 | __pycache__/ 39 | *.py[cod] 40 | *$py.class 41 | *.so 42 | .Python 43 | .venv/ 44 | venv/ 45 | ENV/ 46 | env/ 47 | .env 48 | 49 | # Distribution / packaging 50 | .Python 51 | build/ 52 | develop-eggs/ 53 | dist/ 54 | downloads/ 55 | eggs/ 56 | .eggs/ 57 | lib64/ 58 | parts/ 59 | sdist/ 60 | var/ 61 | wheels/ 62 | *.egg-info/ 63 | .installed.cfg 64 | *.egg 65 | 66 | # PyInstaller 67 | *.manifest 68 | *.spec 69 | 70 | # Installer logs 71 
| pip-log.txt 72 | pip-delete-this-directory.txt 73 | 74 | # Unit test / coverage reports 75 | htmlcov/ 76 | .tox/ 77 | .coverage 78 | .coverage.* 79 | .cache 80 | nosetests.xml 81 | coverage.xml 82 | *.cover 83 | .hypothesis/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | *.ipynb 88 | 89 | # VS Code 90 | .vscode/ 91 | 92 | # PyCharm 93 | .idea/ 94 | 95 | # macOS 96 | .DS_Store 97 | .AppleDouble 98 | .LSOverride 99 | 100 | # Ruff 101 | .ruff_cache/ 102 | 103 | # Docker 104 | *.env 105 | .dockerignore 106 | 107 | # Package Lock 108 | /web/package-lock.json -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Dev Khant 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SmartRead 2 | 3 | SmartRead is an AI-powered tool designed to automatically annotate technical PDFs, providing key insights and important highlights. Additionally, it offers related articles and videos to enhance understanding. 4 | 5 | 6 | https://github.com/user-attachments/assets/3644033c-8953-4d22-8d32-697b60b28afe 7 | 8 | 9 | ## Features 10 | 11 | - **Smart Annotation**: View key insights and important highlights from the PDF 12 | - **Related Resources**: Get related articles and videos on selected technical highlights for improved understanding 13 | - **Technical PDFs**: Works with any technical PDF, making technical reading easier to understand 14 | - **Download Annotated PDF**: Save a copy of the annotated original PDF to keep highlights 15 | 16 | ## Technology Stack 17 | 18 | - **Frontend**: Next.js with TypeScript 19 | - **Backend**: FastAPI (Python 3.12) 20 | - **Database**: MongoDB 21 | - **AI Models**: Mistral AI and Groq 22 | - **Storage**: Cloudinary 23 | - **Containerization**: Docker (backend only) 24 | 25 | ## Getting Started 26 | 27 | ### Prerequisites 28 | 29 | - Git 30 | - Node.js 18+ (for frontend) 31 | - Docker (for backend) 32 | - Python 3.12 (for local backend development) 33 | 34 | ### Installation 35 | 36 | ```bash 37 | # Clone the repository 38 | git clone https://github.com/Dev-Khant/smartread.git 39 | cd smartread 40 | 41 | # Set up environment variables 42 | cp backend/.env.example backend/.env 43 | cp web/.env.example web/.env.local 44 | ``` 45 | 46 | ### Environment Setup 47 | 48 | #### Backend Variables (.env) 49 | ```plaintext 50 | PORT=8000 # API port 51 | HOST=0.0.0.0 # API host 52 | ENVIRONMENT=development # development/production 53 | MONGODB_URL=mongodb://... 
# MongoDB connection URL 54 | MISTRAL_API_KEY= # Mistral AI API key 55 | GROQ_API_KEY= # Groq API key 56 | CLOUDINARY_CLOUD_NAME= # Cloudinary cloud name 57 | CLOUDINARY_API_KEY= # Cloudinary API key 58 | CLOUDINARY_API_SECRET= # Cloudinary API secret 59 | ``` 60 | 61 | #### Frontend Variables (.env.local) 62 | ```plaintext 63 | NEXT_PUBLIC_BACKEND_API_URL=http://localhost:8000 # Backend API URL 64 | ``` 65 | 66 | ### Running the Application 67 | 68 | #### Frontend (Next.js) 69 | 70 | ```bash 71 | # Navigate to frontend directory 72 | cd web 73 | 74 | # Install dependencies 75 | npm install 76 | 77 | # Start development server 78 | npm run dev 79 | ``` 80 | 81 | The frontend will be available at http://localhost:3000 82 | 83 | #### Backend (FastAPI) 84 | 85 | Using Docker: 86 | ```bash 87 | cd backend 88 | docker build -t smartread-backend . 89 | docker run -p 8000:8000 --env-file .env smartread-backend 90 | ``` 91 | 92 | Or for local development: 93 | ```bash 94 | cd backend 95 | python -m venv .venv 96 | source .venv/bin/activate # On Windows: .venv\Scripts\activate 97 | pip install -r requirements.txt 98 | uvicorn main:app --reload --host 0.0.0.0 --port 8000 99 | ``` 100 | 101 | ## Development 102 | 103 | The application is built with: 104 | - Next.js and TypeScript for the frontend 105 | - FastAPI (Python 3.12) for the backend 106 | - MongoDB for data storage 107 | - Mistral and Groq AI models for AI features 108 | - Cloudinary for media management 109 | 110 | ## Contributing 111 | 112 | Contributions are welcome! Please feel free to submit a Pull Request. 113 | 114 | ## License 115 | 116 | This project is open source and available under the MIT License. 
117 | 118 | ## Learn More 119 | 120 | - [Next.js Documentation](https://nextjs.org/docs) 121 | - [FastAPI Documentation](https://fastapi.tiangolo.com/) 122 | - [MongoDB Installation Documentation](https://www.mongodb.com/docs/manual/installation/) 123 | - [Cloudinary Documentation](https://cloudinary.com/documentation) 124 | - [Mistral AI OCR Documentation](https://docs.mistral.ai/capabilities/document/) 125 | - [Groq Documentation](https://console.groq.com/docs/overview) 126 | -------------------------------------------------------------------------------- /backend/.env.example: -------------------------------------------------------------------------------- 1 | PORT=8000 2 | HOST=0.0.0.0 3 | ENVIRONMENT=development 4 | 5 | MISTRAL_API_KEY=MISTRAL_API_KEY 6 | GROQ_API_KEY=GROQ_API_KEY 7 | 8 | MONGODB_URL=mongodb://localhost:27017/ 9 | 10 | CLOUDINARY_CLOUD_NAME=CLOUDINARY_CLOUD_NAME 11 | CLOUDINARY_API_KEY=CLOUDINARY_API_KEY 12 | CLOUDINARY_API_SECRET=CLOUDINARY_API_SECRET 13 | -------------------------------------------------------------------------------- /backend/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.12-slim 2 | 3 | WORKDIR /app 4 | 5 | # Install curl for healthcheck 6 | RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/* 7 | 8 | # Copy requirements first to leverage Docker cache 9 | COPY requirements.txt . 10 | 11 | # Install dependencies 12 | RUN pip install --no-cache-dir -r requirements.txt 13 | 14 | # Copy the rest of the application 15 | COPY . . 
class URLRequest(BaseModel):
    # Request payload accepted by the extraction endpoint: the URL of the
    # document to process and the page the client wants back (defaults to 1).
    url: str
    page_number: int = 1

    # Pydantic v2 reads model configuration from ``model_config``; the nested
    # ``class Config`` form is deprecated there and emits a warning on import.
    # The example below is surfaced in the generated OpenAPI schema.
    model_config = {
        "json_schema_extra": {
            "example": {"url": "https://example.com/image.jpg", "page_number": 1}
        }
    }
-------------------------------------------------------------------------------- 1 | import os 2 | import uuid 3 | import tempfile 4 | import asyncio 5 | import logging 6 | from tqdm import tqdm 7 | from copy import deepcopy 8 | 9 | from fastapi.responses import JSONResponse 10 | from fastapi import APIRouter, HTTPException, BackgroundTasks 11 | from urllib.parse import urlparse 12 | from .models import ( 13 | URLRequest, 14 | HealthCheck, 15 | ErrorResponse, 16 | APIResponse, 17 | Page, 18 | Dimensions, 19 | Images, 20 | DownloadPDFRequest, 21 | ) 22 | from utils.extraction import extract_data, extract_highlights, format_to_html 23 | from utils.search import prepare_resources 24 | from utils.db import store_page, get_page, check_page_exists, get_highlights 25 | from utils.cloudinary_utils import init_cloudinary, upload_to_cloudinary 26 | from utils.download import download_and_highlight_pdf 27 | 28 | 29 | # Configure logging 30 | logging.basicConfig( 31 | level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" 32 | ) 33 | logger = logging.getLogger(__name__) 34 | 35 | router = APIRouter() 36 | 37 | # Initialize Cloudinary 38 | init_cloudinary() 39 | 40 | 41 | @router.get( 42 | "/", 43 | response_model=HealthCheck, 44 | tags=["Health"], 45 | summary="Health Check", 46 | description="Check if the API is running", 47 | ) 48 | async def root(): 49 | return HealthCheck(status="ok", message="Welcome to SmartRead API") 50 | 51 | 52 | async def process_single_page(page, url: str, total_pages: int) -> None: 53 | """Process a single page and store it in the database""" 54 | page_number = page.index + 1 55 | 56 | # Skip if page already exists 57 | if check_page_exists(url, page_number): 58 | return 59 | 60 | try: 61 | highlights = extract_highlights(page.markdown) 62 | html, highlight_mapping = format_to_html(page.markdown, highlights) 63 | resources = prepare_resources(highlight_mapping) 64 | 65 | page_images = [] 66 | for image in page.images: 67 
def process_remaining_pages(ocr_response, url, total_pages):
    """Background task: process every page after the first, one at a time.

    Args:
        ocr_response: OCR result whose ``pages`` sequence is iterated; the
            first entry is skipped because the request handler already
            processed it.
        url: Source document URL, forwarded to ``process_single_page``.
        total_pages: Total page count, forwarded to ``process_single_page``.

    Any exception escaping a page aborts the remaining pages and is logged
    rather than raised, since nothing awaits a background task's result.
    """
    try:
        for page in tqdm(ocr_response.pages[1:], desc="Processing remaining pages"):
            # asyncio.run creates a fresh event loop and always closes it.
            # The previous code created a new loop per page and never closed
            # any of them, leaking one loop (and its selector) per page.
            asyncio.run(process_single_page(page, url, total_pages))
    except Exception as e:
        logger.exception(f"Error in background task: {str(e)}")
using Mistral OCR to extract text", 125 | ) 126 | async def extract_from_url(request: URLRequest, background_tasks: BackgroundTasks): 127 | try: 128 | # Check if page already exists 129 | existing_page, total_pages = get_page(request.url, request.page_number) 130 | 131 | if existing_page: 132 | return { 133 | "status": "success", 134 | "message": "Retrieved from cache", 135 | "data": { 136 | "total_pages": total_pages, 137 | "page": existing_page["page_data"], 138 | }, 139 | } 140 | 141 | try: 142 | result = urlparse(request.url) 143 | if not all([result.scheme, result.netloc]): 144 | raise ValueError("Invalid URL") 145 | except Exception: 146 | raise HTTPException(status_code=400, detail="Invalid URL provided") 147 | 148 | ocr_response = extract_data(request.url) 149 | if not ocr_response or not ocr_response.pages: 150 | raise HTTPException( 151 | status_code=500, detail="No response received from Mistral API" 152 | ) 153 | 154 | # Process first page immediately if it's the requested page 155 | first_page = ocr_response.pages[0] 156 | first_page_number = first_page.index + 1 157 | 158 | if request.page_number == first_page_number: 159 | highlights = extract_highlights(first_page.markdown) 160 | html, highlight_mapping = format_to_html(first_page.markdown, highlights) 161 | resources = prepare_resources(highlight_mapping) 162 | 163 | page_images = [] 164 | for image in first_page.images: 165 | image_uuid = str(uuid.uuid4()) 166 | cloudinary_img_url = upload_to_cloudinary( 167 | image.image_base64, 168 | f"url_{request.url}_page_{first_page.index}_image_{image_uuid}", 169 | ) 170 | page_images.append( 171 | Images( 172 | id=image.id, 173 | top_left_x=image.top_left_x, 174 | top_left_y=image.top_left_y, 175 | bottom_right_x=image.bottom_right_x, 176 | bottom_right_y=image.bottom_right_y, 177 | image_url=cloudinary_img_url, 178 | ) 179 | ) 180 | 181 | page_obj = Page( 182 | index=first_page_number, 183 | content=f"""{html}""", 184 | 
@router.post(
    "/pdf/download",
    response_model=dict,
    responses={
        200: {"description": "Successfully downloaded the PDF"},
        400: {"model": ErrorResponse, "description": "Failed to download PDF"},
        500: {"model": ErrorResponse, "description": "Internal server error"},
    },
    tags=["PDF"],
    summary="Download PDF",
    description="Download a PDF from a given URL",
)
async def download_pdf(request: DownloadPDFRequest):
    """Download a PDF, highlight its stored sentences, and upload the result.

    Args:
        request: Body carrying ``pdf_url``, the URL of the PDF to annotate.

    Returns:
        A dict with ``status``, ``message`` and ``data.pdf_url`` (the URL of
        the uploaded, highlighted PDF).

    Raises:
        HTTPException: 400 when the PDF could not be downloaded/highlighted,
            500 when the upload fails or any unexpected error occurs.
    """
    try:
        highlights = get_highlights(request.pdf_url)
        result = download_and_highlight_pdf(request.pdf_url, highlights)

        # The helper returns a bare ``False`` on failure and a
        # ``(True, filename, path)`` tuple on success, so unpacking it
        # unconditionally would raise TypeError on every failure.
        if not isinstance(result, tuple) or not result[0]:
            raise HTTPException(status_code=400, detail="Failed to download PDF")
        _, original_filename, highlighted_pdf_path = result

        # Public id is the file name without its final extension.
        pdf_url = upload_to_cloudinary(
            highlighted_pdf_path,
            "_".join(original_filename.split(".")[:-1]),
            type="pdf",
        )
        # upload_to_cloudinary signals failure by returning None; do not
        # report success with an empty URL.
        if pdf_url is None:
            raise HTTPException(
                status_code=500, detail="Failed to upload highlighted PDF"
            )

        return {
            "status": "success",
            "message": "PDF Ready",
            "data": {"pdf_url": pdf_url},
        }
    except HTTPException:
        # Keep the intended status code (e.g. 400) instead of letting the
        # generic handler below rewrap it as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
if __name__ == "__main__":
    # Fall back to the values from .env.example when HOST/PORT are not set:
    # int(None) would raise TypeError and str(None) would bind to the literal
    # host name "None".
    uvicorn.run(
        "main:app",
        host=os.getenv("HOST", "0.0.0.0"),
        port=int(os.getenv("PORT", "8000")),
        reload=True,  # Enable auto-reload
        reload_dirs=["api"],  # Watch the api directory for changes
        workers=1,  # Use single worker for development
        log_level="debug",  # More detailed logging
    )
def upload_to_cloudinary(
    file: str, public_id: str, type: str = "image"
) -> Optional[str]:
    """
    Upload a file to Cloudinary (folder ``smartread``) and return its URL.

    Args:
        file: What to upload. When ``type`` is ``"image"`` and the value does
            not already contain a ``data:image`` prefix, it is treated as raw
            base64 and wrapped in a PNG data URI. For any other ``type`` the
            value is handed to the uploader unchanged.
        public_id: Cloudinary public id for the asset. Uploads use
            ``overwrite=False``.
        type: Kind of payload; only ``"image"`` triggers the data-URI
            wrapping. (The name shadows the builtin but is part of the
            keyword interface callers already use.)

    Returns:
        The ``secure_url`` reported by Cloudinary, or ``None`` if the upload
        raised for any reason.
    """
    import logging

    try:
        if type == "image" and "data:image" not in file:
            file = f"data:image/png;base64,{file}"

        upload_result = cloudinary.uploader.upload(
            file=file, public_id=public_id, folder="smartread", overwrite=False
        )
        return upload_result["secure_url"]
    except Exception as e:
        # Best-effort by design: callers treat None as "upload failed".
        logging.getLogger(__name__).error(f"Error uploading {type}: {str(e)}")
        return None
def get_page(url: str, page_number: int):
    """
    Retrieve one stored page and the document's total page count.

    Args:
        url: Source document URL; its base64 encoding is the document id.
        page_number: Page number to look up.

    Returns:
        ``(record, total_pages)``. ``record`` is the stored MongoDB document
        with ``page_data["content"]`` decoded from base64 back to HTML, or
        ``None`` when the page is not stored. ``total_pages`` is the value
        saved alongside the page, or ``0`` when the page is not stored.
    """
    collection = db.pages

    document_id = base64.b64encode(url.encode()).decode()
    record = collection.find_one(
        {"document_id": document_id, "page_number": page_number}
    )

    if record is None:
        return None, 0

    if "page_data" in record and "content" in record["page_data"]:
        record["page_data"]["content"] = base64.b64decode(
            record["page_data"]["content"]
        ).decode()

    # The count is stored with every page by store_page; the previous code
    # returned a hard-coded 15 regardless of the document.
    return record, record.get("total_pages", 0)
17 | 18 | :param url: The URL of the PDF file to be downloaded 19 | :param highlights: Dictionary of highlights for each page 20 | :return: True if successful, False otherwise 21 | """ 22 | try: 23 | response = requests.get(url, stream=True, timeout=10) 24 | 25 | if response.status_code == 200: 26 | parsed_url = urlparse(url) 27 | original_filename = os.path.basename(parsed_url.path) 28 | if not original_filename.lower().endswith(".pdf"): 29 | original_filename += ".pdf" 30 | 31 | highlighted_filename = f"highlighted_{original_filename}" 32 | 33 | filepath = os.path.join(os.getcwd(), original_filename) 34 | with open(filepath, "wb") as pdf_file: 35 | for chunk in response.iter_content(chunk_size=1024): 36 | if chunk: 37 | pdf_file.write(chunk) 38 | 39 | # Highlight the PDF 40 | doc = fitz.open(filepath) 41 | 42 | for page_num, sentences_to_highlight in highlights.items(): 43 | if page_num < 0 or page_num >= len(doc): 44 | logger.error(f"Invalid page number. PDF has {len(doc)} pages.") 45 | return False 46 | 47 | page = doc[page_num] 48 | text_instances = [] 49 | for sentence in sentences_to_highlight: 50 | text_instances.extend(page.search_for(sentence)) 51 | 52 | for inst in text_instances: 53 | highlight = page.add_highlight_annot(inst) 54 | highlight.update() 55 | 56 | highlighted_filepath = os.path.join(os.getcwd(), highlighted_filename) 57 | doc.save(highlighted_filepath, garbage=4, deflate=True) 58 | logger.info( 59 | f"Found and highlighted {len(text_instances)} matches on page {page_num + 1}" 60 | ) 61 | 62 | doc.close() 63 | return True, original_filename, highlighted_filepath 64 | 65 | else: 66 | logger.error( 67 | f"Failed to download file. 
def extract_data(url: str):
    """
    Run Mistral OCR on the document at the given URL.

    Args:
        url (str): The URL of the document to process.

    Returns:
        The response object returned by the Mistral OCR client, requested
        with base64 image data included.
    """
    document = {"type": "document_url", "document_url": url}
    return MISTRAL_CLIENT.ocr.process(
        model="mistral-ocr-latest",
        document=document,
        include_image_base64=True,
    )
def extract_searchable_sentences(content: str):
    """
    Ask Groq for searchable sentences derived from the given text.

    Args:
        content (str): The text to send as the user message.

    Returns:
        The message content of the first completion choice.
    """
    # System prompt sets the task; the caller's text is the user turn.
    messages = [
        {"role": "system", "content": SEARCHABLE_SENTENCES_PROMPT},
        {"role": "user", "content": content},
    ]
    completion = GROQ_CLIENT.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=messages,
        temperature=0.0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

,

,

,

,