├── pranaam
│   ├── py.typed
│   ├── __init__.py
│   ├── logging.py
│   ├── base.py
│   ├── pranaam.py
│   ├── utils.py
│   └── naam.py
├── docs
│   ├── _static
│   │   └── .gitkeep
│   ├── requirements.txt
│   ├── make.bat
│   ├── Makefile
│   ├── index.md
│   ├── installation.md
│   ├── quickstart.md
│   ├── api.md
│   ├── conf.py
│   ├── contributing.md
│   └── examples.md
├── streamlit
│   ├── requirements.txt
│   ├── run_app.py
│   └── streamlit_app.py
├── .github
│   ├── ISSUE_TEMPLATE.md
│   └── workflows
│       ├── adjacent_repo_recommender.yaml
│       ├── docs.yml
│       ├── ci.yml
│       └── python-publish.yml
├── tests
│   ├── __init__.py
│   ├── conftest.py
│   ├── test_logging.py
│   ├── test_base.py
│   ├── test_cli.py
│   ├── test_e2e.py
│   ├── test_naam.py
│   └── test_utils.py
├── Citation.cff
├── .pre-commit-config.yaml
├── LICENSE
├── Makefile
├── examples
│   ├── basic_usage.py
│   ├── pandas_integration.py
│   ├── csv_processor.py
│   ├── README.md
│   └── performance_demo.py
├── .gitignore
├── README.md
├── pyproject.toml
├── model_training
│   └── 01_uncompress_data.ipynb
└── scripts
    └── migrate_models.py

/pranaam/py.typed:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/docs/_static/.gitkeep:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/streamlit/requirements.txt:
--------------------------------------------------------------------------------
1 | # This file is deprecated - use pyproject.toml instead
2 | # Install with: pip install -e .[streamlit]
3 | 
4 | pranaam[streamlit]
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | # Documentation build requirements
2 | sphinx>=5.0.0
3 | sphinx-rtd-theme>=1.0.0
4 | myst-parser>=0.18.0
5 | tomli>=2.0.0;python_version<"3.11"
6 | 
7 | # For live documentation serving (optional)
8 | sphinx-autobuild>=2021.3.14
--------------------------------------------------------------------------------
/pranaam/__init__.py:
--------------------------------------------------------------------------------
1 | """Pranaam - Religion prediction from names.
2 | 
3 | A Python package for predicting religion from names using machine learning
4 | models trained on Bihar Land Records data.
5 | """
6 | 
7 | from .naam import Naam
8 | from .pranaam import pred_rel
9 | 
10 | __version__ = "0.0.2"
11 | __all__ = ["pred_rel", "Naam"]
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | Please briefly describe your problem and what output you expect.
2 | 
3 | Please include a [minimal reprex](https://github.com/jennybc/reprex#what-is-a-reprex). The goal of a reprex is to make it as easy as possible for me to recreate your problem so that I can fix it.
4 | 
5 | Delete these instructions once you have read them.
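6 | 
7 | For example, a minimal reprex for this package might look like this (the
8 | names below are purely illustrative):
9 | 
10 | ``` python
11 | import pranaam
12 | 
13 | # A self-contained snippet a maintainer can run to reproduce the issue.
14 | result = pranaam.pred_rel(["Shah Rukh Khan"], lang="eng")
15 | print(result)
16 | ```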
17 | 
18 | ---
19 | 
20 | Brief description of the problem
21 | 
22 | ``` python
23 | # insert reprex here
24 | ```
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from collections.abc import Callable, Generator
3 | from contextlib import contextmanager
4 | from io import StringIO
5 | from typing import Any
6 | 
7 | 
8 | @contextmanager
9 | def capture(
10 |     command: Callable[..., Any], *args: Any, **kwargs: Any
11 | ) -> Generator[str, None, None]:
12 |     out, sys.stdout = sys.stdout, StringIO()
13 |     try:
14 |         command(*args, **kwargs)
15 |         sys.stdout.seek(0)
16 |         yield sys.stdout.read()
17 |     finally:
18 |         # Restore stdout even if the command (or the with-block) raises.
19 |         sys.stdout = out
--------------------------------------------------------------------------------
/.github/workflows/adjacent_repo_recommender.yaml:
--------------------------------------------------------------------------------
1 | name: Find Adjacent Repositories
2 | 
3 | on:
4 |   schedule:
5 |     - cron: '0 5 * * 0'  # Every Sunday at 5am UTC
6 |   workflow_dispatch:
7 | 
8 | jobs:
9 |   recommend-repos:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |       - name: Checkout repository
13 |         uses: actions/checkout@v4
14 | 
15 |       - name: Adjacent Repositories Recommender
16 |         uses: gojiplus/adjacent@v1.3
17 |         with:
18 |           token: ${{ secrets.GITHUB_TOKEN }}  # ✅ Pass the required token
19 | 
20 |       - name: Commit and push changes
21 |         run: |
22 |           git config --global user.name "github-actions"
23 |           git config --global user.email "actions@github.com"
24 |           git add README.md
25 |           git commit -m "Update adjacent repositories [automated]" || echo "No changes to commit"
26 |           git push
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 | 
3 | pushd %~dp0
4 | 
5 | REM Command file for Sphinx documentation
6 | 
7 | if "%SPHINXBUILD%" == "" (
8 | 	set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.https://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
--------------------------------------------------------------------------------
/Citation.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | message: "If you use this software, please cite it as below."
3 | authors: 4 | - family-names: "Chintalapati" 5 | given-names: "Rajashekar" 6 | email: "rajshekar.ch@gmail.com" 7 | - family-names: "Dar" 8 | given-names: "Aaditya" 9 | - family-names: "Sood" 10 | given-names: "Gaurav" 11 | email: "gsood07@gmail.com" 12 | title: "pranaam: Predict religion and caste based on name" 13 | version: 0.2.0 14 | date-released: 2024-12-03 15 | url: "https://github.com/appeler/pranaam" 16 | repository-code: "https://github.com/appeler/pranaam" 17 | repository-artifact: "https://pypi.org/project/pranaam/" 18 | abstract: "Predict religion and caste based on name using machine learning models for Hindi and English names" 19 | keywords: 20 | - predict 21 | - religion 22 | - name 23 | - hindi 24 | - english 25 | - machine-learning 26 | - deep-learning 27 | - nlp 28 | - name-classification 29 | - tensorflow 30 | license: MIT -------------------------------------------------------------------------------- /pranaam/logging.py: -------------------------------------------------------------------------------- 1 | """Logging configuration for pranaam package.""" 2 | 3 | import logging 4 | 5 | from rich.logging import RichHandler 6 | 7 | 8 | def get_logger(name: str | None = None) -> logging.Logger: 9 | """Get a configured logger instance. 10 | 11 | Args: 12 | name: Logger name, defaults to 'pranaam' 13 | 14 | Returns: 15 | Configured logger instance 16 | """ 17 | logger_name = name or "pranaam" 18 | logger = logging.getLogger(logger_name) 19 | 20 | # Only configure if no handlers exist (avoid duplicate configuration) 21 | if not logger.handlers: 22 | handler = RichHandler( 23 | show_time=True, show_path=False, rich_tracebacks=True, markup=True 24 | ) 25 | formatter = logging.Formatter("%(name)s - %(message)s") 26 | handler.setFormatter(formatter) 27 | logger.addHandler(handler) 28 | logger.setLevel(logging.INFO) 29 | 30 | return logger 31 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | rev: v0.8.7 4 | hooks: 5 | - id: ruff 6 | args: [--fix] 7 | - id: ruff-format 8 | 9 | - repo: https://github.com/pre-commit/mirrors-mypy 10 | rev: v1.12.1 11 | hooks: 12 | - id: mypy 13 | additional_dependencies: 14 | - types-requests 15 | - pandas-stubs 16 | args: [--ignore-missing-imports] 17 | 18 | - repo: https://github.com/pre-commit/pre-commit-hooks 19 | rev: v5.0.0 20 | hooks: 21 | - id: trailing-whitespace 22 | - id: end-of-file-fixer 23 | - id: check-yaml 24 | - id: check-toml 25 | - id: check-merge-conflict 26 | - id: check-case-conflict 27 | - id: check-docstring-first 28 | - id: debug-statements 29 | - id: check-added-large-files 30 | 31 | - repo: https://github.com/pycqa/bandit 32 | rev: 1.8.0 33 | hooks: 34 | - id: bandit 35 | args: [-r, ., -ll] 36 | exclude: ^tests/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 appeler 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the 
Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx-build using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | 22 | # Custom targets for convenience 23 | clean: 24 | @$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 25 | 26 | html: 27 | @$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 28 | 29 | livehtml: 30 | sphinx-autobuild "$(SOURCEDIR)" "$(BUILDDIR)/html" $(SPHINXOPTS) $(O) 31 | 32 | linkcheck: 33 | @$(SPHINXBUILD) -M linkcheck "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 34 | 35 | # GitHub Pages build 36 | github: 37 | @$(SPHINXBUILD) -b html "$(SOURCEDIR)" "$(BUILDDIR)/html" $(SPHINXOPTS) $(O) 38 | @echo 39 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: help clean install test test-cov lint format type-check docs docs-serve build upload dev-install 2 | 3 | help: ## Show this help message 4 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}' 5 | 6 | clean: ## Clean build artifacts 7 | rm -rf build/ 8 | rm -rf dist/ 9 | rm -rf *.egg-info/ 10 | rm -rf .pytest_cache/ 11 | rm -rf .coverage 12 | rm -rf htmlcov/ 13 | rm -rf docs/_build/ 14 | find . -type d -name __pycache__ -delete 15 | find . -type f -name "*.pyc" -delete 16 | 17 | install: ## Install package 18 | uv pip install -e . 19 | 20 | dev-install: ## Install package with development dependencies 21 | uv sync --dev 22 | pre-commit install 23 | 24 | test: ## Run tests 25 | pytest 26 | 27 | test-cov: ## Run tests with coverage 28 | pytest --cov=pranaam --cov-report=html --cov-report=term 29 | 30 | lint: ## Run linter 31 | ruff check . 32 | 33 | format: ## Format code 34 | ruff format . 35 | ruff check --fix . 
36 | 37 | type-check: ## Run type checker 38 | mypy pranaam/ 39 | 40 | docs: ## Build documentation 41 | cd docs && make clean && make html 42 | 43 | docs-serve: ## Serve documentation locally 44 | cd docs/_build/html && python -m http.server 8000 45 | 46 | build: ## Build package 47 | uv build 48 | 49 | upload: ## Upload to PyPI 50 | uv publish 51 | 52 | ci: lint type-check test ## Run CI checks -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Deploy Documentation 2 | 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | workflow_dispatch: 9 | 10 | permissions: 11 | contents: read 12 | pages: write 13 | id-token: write 14 | 15 | concurrency: 16 | group: "pages" 17 | cancel-in-progress: false 18 | 19 | jobs: 20 | build: 21 | runs-on: ubuntu-latest 22 | steps: 23 | - uses: actions/checkout@v4 24 | 25 | - name: Set up Python 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: "3.11" 29 | 30 | - name: Install uv 31 | uses: astral-sh/setup-uv@v7.1.6 32 | 33 | - name: Install dependencies 34 | run: | 35 | uv sync --extra docs 36 | 37 | - name: Build documentation 38 | run: | 39 | uv run sphinx-build -M clean docs docs/_build 40 | uv run sphinx-build -M html docs docs/_build 41 | 42 | - name: Upload artifact 43 | uses: actions/upload-pages-artifact@v3 44 | with: 45 | path: ./docs/_build/html 46 | 47 | deploy: 48 | if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/master') 49 | environment: 50 | name: github-pages 51 | url: ${{ steps.deployment.outputs.page_url }} 52 | runs-on: ubuntu-latest 53 | needs: build 54 | steps: 55 | - name: Deploy to GitHub Pages 56 | id: deployment 57 | uses: actions/deploy-pages@v4 -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Welcome to pranaam's documentation! 2 | 3 | **pranaam** is a Python package for predicting religion from names using machine learning models trained on Bihar Land Records data. The package supports both Hindi and English names and provides high accuracy predictions. 4 | 5 | :::{toctree} 6 | :maxdepth: 2 7 | :caption: Contents: 8 | 9 | installation 10 | quickstart 11 | api 12 | examples 13 | contributing 14 | ::: 15 | 16 | ## Overview 17 | 18 | Pranaam uses machine learning models trained on 4M unique records from Bihar Land Records data to predict religion (currently Muslim/not-Muslim) from names. 
The package supports: 19 | 20 | * **High Accuracy**: 98% accuracy on unseen names for both Hindi and English 21 | * **Multiple Languages**: Support for Hindi and English names 22 | * **Easy to Use**: Simple API with pandas DataFrame output 23 | * **Pre-trained Models**: Models are automatically downloaded and cached 24 | 25 | ## Quick Example 26 | 27 | ```python 28 | from pranaam import pred_rel 29 | 30 | # English names 31 | names = ["Shah Rukh Khan", "Amitabh Bachchan"] 32 | result = pred_rel(names) 33 | print(result) 34 | 35 | # Hindi names 36 | hindi_names = ["शाहरुख खान", "अमिताभ बच्चन"] 37 | result = pred_rel(hindi_names, lang="hin") 38 | print(result) 39 | ``` 40 | 41 | ## Installation 42 | 43 | Install pranaam using pip: 44 | 45 | ```bash 46 | pip install pranaam 47 | ``` 48 | 49 | For development: 50 | 51 | ```bash 52 | pip install -e .[dev] 53 | ``` 54 | 55 | ## Indices and tables 56 | 57 | * {ref}`genindex` 58 | * {ref}`modindex` 59 | * {ref}`search` -------------------------------------------------------------------------------- /streamlit/run_app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Simple script to run the Streamlit app locally. 4 | Usage: python run_app.py 5 | """ 6 | 7 | import os 8 | import subprocess 9 | import sys 10 | 11 | 12 | def main(): 13 | """Run the Streamlit app.""" 14 | # Get the directory containing this script 15 | script_dir = os.path.dirname(os.path.abspath(__file__)) 16 | app_path = os.path.join(script_dir, "streamlit_app.py") 17 | 18 | print("🚀 Starting Pranaam Streamlit App...") 19 | print(f"📁 App location: {app_path}") 20 | print("🌐 App will be available at: http://localhost:8501") 21 | print("🛑 Press Ctrl+C to stop the app") 22 | print("-" * 50) 23 | 24 | try: 25 | # Run streamlit 26 | subprocess.run( 27 | [ 28 | sys.executable, 29 | "-m", 30 | "streamlit", 31 | "run", 32 | app_path, 33 | "--server.headless", 34 | "false", 35 | "--server.enableCORS", 36 | "false", 37 | "--server.enableXsrfProtection", 38 | "false", 39 | ], 40 | check=True, 41 | ) 42 | except KeyboardInterrupt: 43 | print("\n🛑 App stopped by user") 44 | except FileNotFoundError: 45 | print("❌ Error: streamlit not found. Install with: pip install streamlit") 46 | sys.exit(1) 47 | except subprocess.CalledProcessError as e: 48 | print(f"❌ Error running streamlit: {e}") 49 | sys.exit(1) 50 | 51 | 52 | if __name__ == "__main__": 53 | main() 54 | -------------------------------------------------------------------------------- /pranaam/base.py: -------------------------------------------------------------------------------- 1 | from importlib.resources import files 2 | from pathlib import Path 3 | 4 | from .logging import get_logger 5 | from .utils import REPO_BASE_URL, download_file 6 | 7 | logger = get_logger() 8 | 9 | 10 | class Base: 11 | """Base class for model data management and loading.""" 12 | 13 | MODELFN: str | None = None 14 | 15 | @classmethod 16 | def load_model_data(cls, file_name: str, latest: bool = False) -> Path | None: 17 | """Load model data, downloading if necessary. 
18 | 
19 |         Args:
20 |             file_name: Name of the model file to load
21 |             latest: Whether to force download of latest version
22 | 
23 |         Returns:
24 |             Path to the model directory, or None if loading failed
25 |         """
26 |         model_path: Path | None = None
27 |         if cls.MODELFN:
28 |             # Use modern importlib.resources instead of deprecated pkg_resources
29 |             package_dir = files(__package__)
30 |             model_dir = Path(str(package_dir)) / cls.MODELFN
31 |             model_dir.mkdir(exist_ok=True)
32 | 
33 |             target_file = model_dir / file_name
34 |             if not target_file.exists() or latest:
35 |                 logger.debug(
36 |                     f"Downloading model data from the server (this is done only the first time) ({model_dir})..."
37 |                 )
38 |                 if not download_file(REPO_BASE_URL, str(model_dir), file_name):
39 |                     # Honor the documented contract: return None when loading fails.
40 |                     logger.error("Cannot download model data file")
41 |                     return None
42 |             else:
43 |                 logger.debug(f"Using model data from {model_dir}...")
44 |             model_path = model_dir
45 | 
46 |         return model_path
--------------------------------------------------------------------------------
/pranaam/pranaam.py:
--------------------------------------------------------------------------------
1 | """Entry point module for pranaam CLI."""
2 | 
3 | import argparse
4 | import sys
5 | 
6 | from .logging import get_logger
7 | from .naam import Naam
8 | 
9 | logger = get_logger()
10 | 
11 | # Export main prediction function
12 | pred_rel = Naam.pred_rel
13 | 
14 | 
15 | def main(argv: list[str] | None = None) -> int:
16 |     """Main CLI entry point for religion prediction.
17 | 
18 |     Args:
19 |         argv: Command line arguments, defaults to sys.argv[1:]
20 | 
21 |     Returns:
22 |         Exit code (0 for success, non-zero for error)
23 |     """
24 |     if argv is None:
25 |         argv = sys.argv[1:]
26 | 
27 |     parser = argparse.ArgumentParser(
28 |         description="Predict religion based on name",
29 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
30 |     )
31 |     parser.add_argument(
32 |         "--input", required=True, help="Name to analyze (single name as string)"
33 |     )
34 |     parser.add_argument(
35 |         "--lang", default="eng", choices=["eng", "hin"], help="Language of input name"
36 |     )
37 |     parser.add_argument(
38 |         "--latest", action="store_true", help="Download latest model version"
39 |     )
40 | 
41 |     try:
42 |         args = parser.parse_args(argv)
43 |         result = pred_rel(args.input, lang=args.lang, latest=args.latest)
44 |         print(result.to_string(index=False))
45 |         return 0
46 | 
47 |     except SystemExit:
48 |         # Re-raise SystemExit for help and argument errors
49 |         raise
50 |     except Exception as e:
51 |         error_message = f"Error: {e}"
52 |         logger.error(error_message)
53 |         print(error_message, file=sys.stderr)
54 |         return 1
55 | 
56 | 
57 | if __name__ == "__main__":
58 |     sys.exit(main())
--------------------------------------------------------------------------------
/examples/basic_usage.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Basic Usage Examples for Pranaam
4 | 
5 | This script demonstrates the most common usage patterns for the pranaam package.
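
A hypothetical doctest-style sanity check (marked SKIP because it would
download the models on first run; column names follow the package README):

    >>> import pranaam  # doctest: +SKIP
    >>> out = pranaam.pred_rel("Shah Rukh Khan")  # doctest: +SKIP
    >>> list(out.columns)  # doctest: +SKIP
    ['name', 'pred_label', 'pred_prob_muslim']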
6 | """ 7 | 8 | import pranaam 9 | 10 | 11 | def single_name_prediction(): 12 | """Predict religion for a single name.""" 13 | print("🔮 Single Name Prediction") 14 | print("=" * 40) 15 | 16 | # English name 17 | result = pranaam.pred_rel("Shah Rukh Khan", lang="eng") 18 | print("English name:") 19 | print(result) 20 | print() 21 | 22 | # Hindi name 23 | result = pranaam.pred_rel("शाहरुख खान", lang="hin") 24 | print("Hindi name:") 25 | print(result) 26 | print() 27 | 28 | 29 | def multiple_names_prediction(): 30 | """Predict religion for multiple names.""" 31 | print("📝 Multiple Names Prediction") 32 | print("=" * 40) 33 | 34 | # List of English names 35 | names = ["Shah Rukh Khan", "Amitabh Bachchan", "Salman Khan", "Akshay Kumar"] 36 | 37 | result = pranaam.pred_rel(names, lang="eng") 38 | print("Batch prediction results:") 39 | print(result) 40 | print() 41 | 42 | 43 | def mixed_examples(): 44 | """Show predictions for mixed cultural names.""" 45 | print("🌍 Mixed Cultural Names") 46 | print("=" * 40) 47 | 48 | diverse_names = [ 49 | "Mohammed Ali", 50 | "Priya Sharma", 51 | "Fatima Khan", 52 | "Raj Patel", 53 | "John Smith", 54 | ] 55 | 56 | result = pranaam.pred_rel(diverse_names, lang="eng") 57 | 58 | print("Name | Prediction | Confidence") 59 | print("-" * 45) 60 | for _, row in result.iterrows(): 61 | print( 62 | f"{row['name']:<18} | {row['pred_label']:<10} | {row['pred_prob_muslim']:>6.1f}%" 63 | ) 64 | print() 65 | 66 | 67 | if __name__ == "__main__": 68 | print("🔥 Pranaam Basic Usage Examples") 69 | print("=" * 50) 70 | print( 71 | "This script shows simple usage patterns for religion prediction from names.\n" 72 | ) 73 | 74 | single_name_prediction() 75 | multiple_names_prediction() 76 | mixed_examples() 77 | 78 | print("✅ All examples completed!") 79 | print("Next steps: Check out pandas_integration.py for data processing examples.") 80 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | lit/ 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | 132 | 133 | .vscode 134 | .DS_Store 135 | /pranaam/model/eng_and_hindi_models_v2 136 | /pranaam/model 137 | -------------------------------------------------------------------------------- /docs/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ## Requirements 4 | 5 | * Python 3.10 or 3.11 (TensorFlow 2.14.1 compatibility requirement) 6 | * TensorFlow 2.14.1 (automatically installed) 7 | 8 | :::{note} 9 | Python 3.12+ is not currently supported due to TensorFlow availability constraints. 10 | ::: 11 | 12 | ## Standard Installation 13 | 14 | We strongly recommend installing pranaam inside a Python virtual environment. (see [venv documentation](https://docs.python.org/3/library/venv.html#creating-virtual-environments)) 15 | 16 | Install pranaam using pip: 17 | 18 | ```bash 19 | pip install pranaam 20 | ``` 21 | 22 | This installs TensorFlow 2.14.1, which is known to work correctly with the models. 23 | 24 | ## Installation Options 25 | 26 | For development work: 27 | 28 | ```bash 29 | pip install -e .[dev] 30 | ``` 31 | 32 | For testing: 33 | 34 | ```bash 35 | pip install -e .[test] 36 | ``` 37 | 38 | For documentation building: 39 | 40 | ```bash 41 | pip install -e .[docs] 42 | ``` 43 | 44 | For all optional dependencies: 45 | 46 | ```bash 47 | pip install -e .[all] 48 | ``` 49 | 50 | ## TensorFlow Compatibility 51 | 52 | The package requires TensorFlow 2.14.1 with Keras 2.14.0 for model compatibility. 
If you encounter compatibility issues: 53 | 54 | ```bash 55 | pip install 'pranaam[tensorflow-compat]' 56 | ``` 57 | 58 | ## Model Downloads 59 | 60 | Models are automatically downloaded from Harvard Dataverse (306MB) and cached locally on first use. Ensure you have: 61 | 62 | * Stable internet connection 63 | * At least 500MB free disk space 64 | * Unrestricted access to dataverse.harvard.edu 65 | 66 | ## Verification 67 | 68 | Test your installation: 69 | 70 | ```python 71 | import pranaam 72 | result = pranaam.pred_rel("Shah Rukh Khan") 73 | print(result) 74 | ``` 75 | 76 | If successful, you should see a pandas DataFrame with prediction results. 77 | 78 | ## Troubleshooting 79 | 80 | ### Common Issues 81 | 82 | **TensorFlow/Keras Compatibility Errors** 83 | 84 | Error: `"Keras 3 only supports V3 .keras files and legacy H5 format files"` 85 | 86 | Solution: Install with `pip install 'pranaam[tensorflow-compat]'` 87 | 88 | **Model Download Issues** 89 | 90 | Error: Network timeouts or download failures 91 | 92 | Solution: Check internet connection, models are large (306MB) 93 | 94 | **Import Errors** 95 | 96 | Error: `pkg_resources` deprecation warnings 97 | 98 | Solution: Already fixed in v0.1.0 (uses `importlib.resources`) 99 | 100 | For additional help, please check our [GitHub Issues](https://github.com/appeler/pranaam/issues). -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ main, develop ] 6 | pull_request: 7 | branches: [ main, develop ] 8 | 9 | jobs: 10 | test: 11 | name: Test Python ${{ matrix.python-version }} on ${{ matrix.os }} 12 | runs-on: ${{ matrix.os }} 13 | strategy: 14 | matrix: 15 | os: [ubuntu-latest, macos-latest] 16 | python-version: ["3.11", "3.12"] 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | 26 | - name: Install uv 27 | uses: astral-sh/setup-uv@v7.1.6 28 | 29 | - name: Install dependencies 30 | run: | 31 | uv sync --group dev --group test 32 | 33 | - name: Verify installation 34 | run: | 35 | uv run python -c "import pranaam; print('[OK] Pranaam installation successful')" 36 | uv run python -c "import tensorflow as tf; print(f'[OK] TensorFlow {tf.__version__} loaded')" 37 | 38 | - name: Check dependency issues 39 | run: | 40 | uv run deptry . 
41 | 42 | - name: Lint and format with ruff 43 | run: | 44 | uv run ruff check 45 | uv run ruff format --check 46 | 47 | - name: Type check with mypy 48 | run: | 49 | uv run mypy pranaam/ 50 | 51 | - name: Run unit tests 52 | run: | 53 | uv run pytest -m "not integration" --tb=short -v --maxfail=5 54 | 55 | - name: Run integration tests (main branch only) 56 | if: github.event_name == 'push' && github.ref == 'refs/heads/main' 57 | run: | 58 | uv run pytest -m "integration" -v --tb=short 59 | timeout-minutes: 15 60 | env: 61 | PYTEST_TIMEOUT: 900 62 | 63 | build: 64 | name: Build and Validate Package 65 | runs-on: ubuntu-latest 66 | needs: test 67 | 68 | steps: 69 | - uses: actions/checkout@v4 70 | 71 | - name: Set up Python 72 | uses: actions/setup-python@v5 73 | with: 74 | python-version: "3.11" 75 | 76 | - name: Install uv 77 | uses: astral-sh/setup-uv@v7.1.6 78 | 79 | - name: Build package 80 | run: | 81 | uv build 82 | 83 | - name: Install validation tools 84 | run: | 85 | uv tool install twine 86 | 87 | - name: Validate package 88 | run: | 89 | uv tool run twine check dist/* --strict 90 | 91 | - name: Upload build artifacts 92 | uses: actions/upload-artifact@v4 93 | with: 94 | name: dist 95 | path: dist/ 96 | retention-days: 7 -------------------------------------------------------------------------------- /docs/quickstart.md: -------------------------------------------------------------------------------- 1 | # Quick Start 2 | 3 | This guide will get you up and running with pranaam in just a few minutes. 4 | 5 | ## Basic Usage 6 | 7 | The main function in pranaam is `pred_rel`, which predicts religion based on names. 8 | 9 | ### Single Name Prediction 10 | 11 | ```python 12 | import pranaam 13 | 14 | # Predict for a single name 15 | result = pranaam.pred_rel("Shah Rukh Khan") 16 | print(result) 17 | ``` 18 | 19 | Output: 20 | 21 | ```text 22 | name pred_label pred_prob_muslim 23 | 0 Shah Rukh Khan muslim 73.0 24 | ``` 25 | 26 | ### Multiple Names (English) 27 | 28 | ```python 29 | import pranaam 30 | 31 | # List of English names 32 | names = ["Shah Rukh Khan", "Amitabh Bachchan", "Abdul Kalam"] 33 | result = pranaam.pred_rel(names, lang="eng") 34 | print(result) 35 | ``` 36 | 37 | Output: 38 | 39 | ```text 40 | name pred_label pred_prob_muslim 41 | 0 Shah Rukh Khan muslim 73.0 42 | 1 Amitabh Bachchan not-muslim 27.0 43 | 2 Abdul Kalam muslim 85.5 44 | ``` 45 | 46 | ### Hindi Names 47 | 48 | ```python 49 | import pranaam 50 | 51 | # Hindi names 52 | hindi_names = ["शाहरुख खान", "अमिताभ बच्चन"] 53 | result = pranaam.pred_rel(hindi_names, lang="hin") 54 | print(result) 55 | ``` 56 | 57 | Output: 58 | 59 | ```text 60 | name pred_label pred_prob_muslim 61 | 0 शाहरुख खान muslim 73.0 62 | 1 अमिताभ बच्चन not-muslim 27.0 63 | ``` 64 | 65 | ### Working with Pandas 66 | 67 | ```python 68 | import pandas as pd 69 | import pranaam 70 | 71 | # Create a DataFrame with names 72 | df = pd.DataFrame({ 73 | 'names': ['Shah Rukh Khan', 'Amitabh Bachchan', 'A.P.J. 
Abdul Kalam'], 74 | 'profession': ['Actor', 'Actor', 'Scientist'] 75 | }) 76 | 77 | # Predict religion for the names column 78 | predictions = pranaam.pred_rel(df['names'], lang="eng") 79 | 80 | # Merge with original data 81 | result = pd.concat([df, predictions[['pred_label', 'pred_prob_muslim']]], axis=1) 82 | print(result) 83 | ``` 84 | 85 | ## Command Line Interface 86 | 87 | You can also use pranaam from the command line: 88 | 89 | ```bash 90 | # Single name prediction 91 | predict_religion --input "Shah Rukh Khan" --lang eng 92 | 93 | # Hindi name prediction 94 | predict_religion --input "शाहरुख खान" --lang hin 95 | ``` 96 | 97 | ## Understanding the Output 98 | 99 | The function returns a pandas DataFrame with these columns: 100 | 101 | * **name**: The input name 102 | * **pred_label**: Predicted religion ('muslim' or 'not-muslim') 103 | * **pred_prob_muslim**: Probability score (0-100) that the person is Muslim 104 | 105 | ## Accuracy and Limitations 106 | 107 | * **High Accuracy**: 98% accuracy on unseen names for both Hindi and English models 108 | * **Binary Classification**: Currently predicts Muslim vs. not-Muslim only 109 | * **Training Data**: Based on Bihar Land Records (4M+ unique records) 110 | * **Context**: Nearly 95% of India's population are Hindu or Muslim 111 | 112 | ## Next Steps 113 | 114 | * Check out the {doc}`api` for detailed function documentation 115 | * See {doc}`examples` for more advanced usage patterns 116 | * Learn about the training data in our [notebooks](https://github.com/appeler/pranaam/tree/main/pranaam/notebooks) -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Test configuration and fixtures for pranaam tests.""" 2 | 3 | import os 4 | import tempfile 5 | from collections.abc import Generator 6 | from typing import Any 7 | from unittest.mock import Mock 8 | 9 | import numpy as np 10 | import pytest 11 | 12 | 13 | @pytest.fixture 14 | def sample_english_names() -> list[str]: 15 | """Sample English names for testing.""" 16 | return ["Shah Rukh Khan", "Amitabh Bachchan", "Rajesh Khanna", "Mohammed Ali"] 17 | 18 | 19 | @pytest.fixture 20 | def sample_hindi_names() -> list[str]: 21 | """Sample Hindi names for testing.""" 22 | return ["शाहरुख खान", "अमिताभ बच्चन", "राजेश खन्ना", "मोहम्मद अली"] 23 | 24 | 25 | @pytest.fixture 26 | def expected_predictions() -> dict[str, dict[str, Any]]: 27 | """Expected predictions for sample names.""" 28 | return { 29 | "Shah Rukh Khan": {"label": "muslim", "prob_range": (60, 90)}, 30 | "Amitabh Bachchan": {"label": "not-muslim", "prob_range": (10, 40)}, 31 | "Rajesh Khanna": {"label": "not-muslim", "prob_range": (10, 40)}, 32 | "Mohammed Ali": {"label": "muslim", "prob_range": (70, 95)}, 33 | } 34 | 35 | 36 | @pytest.fixture 37 | def mock_tensorflow_model() -> Mock: 38 | """Mock TensorFlow model for testing.""" 39 | model = Mock() 40 | 41 | # Mock prediction results - returns probabilities for [not-muslim, muslim] 42 | def mock_predict(names: Any, verbose: int = 0) -> Any: 43 | results = [] 44 | for name in names: 45 | if any( 46 | muslim_name in str(name).lower() 47 | for muslim_name in ["shah", "khan", "mohammed", "ali"] 48 | ): 49 | # Higher probability for muslim class 50 | results.append([0.2, 0.8]) 51 | else: 52 | # Higher probability for not-muslim class 53 | results.append([0.8, 0.2]) 54 | return np.array(results) 55 | 56 | model.predict = mock_predict 57 | return model 58 
| 59 | 60 | @pytest.fixture 61 | def temp_model_dir() -> Generator[str, None, None]: 62 | """Temporary directory for model files.""" 63 | with tempfile.TemporaryDirectory() as temp_dir: 64 | # Create model directory structure 65 | model_path = os.path.join(temp_dir, "eng_and_hindi_models_v1") 66 | os.makedirs(os.path.join(model_path, "eng_model")) 67 | os.makedirs(os.path.join(model_path, "hin_model")) 68 | yield temp_dir 69 | 70 | 71 | @pytest.fixture 72 | def mock_requests_get() -> Mock: 73 | """Mock requests.get for download testing.""" 74 | mock_response = Mock() 75 | mock_response.headers = {"Content-Length": "1000"} 76 | mock_response.iter_content.return_value = [b"test data chunk"] 77 | mock_response.raise_for_status.return_value = None 78 | return mock_response 79 | 80 | 81 | @pytest.fixture(autouse=True) 82 | def reset_naam_class() -> Generator[None, None, None]: 83 | """Reset Naam class state between tests.""" 84 | from pranaam.naam import Naam 85 | 86 | # Store original values 87 | original_weights_loaded = Naam.weights_loaded 88 | original_model = Naam.model 89 | original_cur_lang = Naam.cur_lang 90 | 91 | yield 92 | 93 | # Reset to original values 94 | Naam.weights_loaded = original_weights_loaded 95 | Naam.model = original_model 96 | Naam.cur_lang = original_cur_lang 97 | 98 | 99 | @pytest.fixture 100 | def caplog_debug(caplog: pytest.LogCaptureFixture) -> pytest.LogCaptureFixture: 101 | """Capture log messages at DEBUG level.""" 102 | import logging 103 | 104 | caplog.set_level(logging.DEBUG) 105 | return caplog 106 | -------------------------------------------------------------------------------- /docs/api.md: -------------------------------------------------------------------------------- 1 | # API Reference 2 | 3 | This page contains the complete API documentation for pranaam. 4 | 5 | ## Main Functions 6 | 7 | :::{automodule} pranaam.pranaam 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | ::: 12 | 13 | ## Core Classes 14 | 15 | ### Naam Class 16 | 17 | :::{automodule} pranaam.naam 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | ::: 22 | 23 | ### Base Class 24 | 25 | :::{automodule} pranaam.base 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | ::: 30 | 31 | ## Utility Functions 32 | 33 | :::{automodule} pranaam.utils 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | ::: 38 | 39 | ## Logging Configuration 40 | 41 | :::{automodule} pranaam.logging 42 | :members: 43 | :undoc-members: 44 | :show-inheritance: 45 | ::: 46 | 47 | ## Function Parameters 48 | 49 | ### pred_rel 50 | 51 | :::{py:function} pred_rel(input, lang="eng", latest=False) 52 | 53 | Predict religion (Muslim/not-Muslim) from names. 
54 | 55 | :param input: Name(s) to predict religion for 56 | :type input: str, list of str, or pandas.Series 57 | :param lang: Language of the input names ("eng" for English, "hin" for Hindi) 58 | :type lang: str, optional 59 | :param latest: Whether to download the latest model if available 60 | :type latest: bool, optional 61 | :returns: DataFrame with columns ['name', 'pred_label', 'pred_prob_muslim'] 62 | :rtype: pandas.DataFrame 63 | :raises ValueError: If invalid language is specified 64 | :raises FileNotFoundError: If model files cannot be found or downloaded 65 | 66 | **Examples:** 67 | 68 | ```python 69 | # Single name 70 | result = pred_rel("Shah Rukh Khan") 71 | 72 | # Multiple names 73 | result = pred_rel(["Shah Rukh Khan", "Amitabh Bachchan"], lang="eng") 74 | 75 | # Hindi names 76 | result = pred_rel(["शाहरुख खान"], lang="hin") 77 | 78 | # Pandas Series 79 | import pandas as pd 80 | df = pd.DataFrame({"names": ["Shah Rukh Khan", "Amitabh Bachchan"]}) 81 | result = pred_rel(df["names"]) 82 | ``` 83 | ::: 84 | 85 | ### Return Values 86 | 87 | The `pred_rel` function returns a pandas DataFrame with the following structure: 88 | 89 | | Column | Type | Description | 90 | |--------|------|-------------| 91 | | name | str | Original input name | 92 | | pred_label | str | Predicted religion ('muslim' or 'not-muslim') | 93 | | pred_prob_muslim | float | Probability score (0-100) that person is Muslim | 94 | 95 | ## Model Information 96 | 97 | The package uses two TensorFlow models: 98 | 99 | * **English Model**: Trained on transliterated names from Hindi to English 100 | * **Hindi Model**: Trained on original Hindi names from Bihar Land Records 101 | 102 | Both models: 103 | 104 | * Use SavedModel format (TensorFlow 2.14.1 compatible) 105 | * Achieve 98% out-of-sample accuracy 106 | * Are automatically downloaded and cached (306MB total) 107 | * Use character-level and n-gram features 108 | 109 | ## Exception Handling 110 | 111 | Common exceptions that may be raised: 112 | 113 | :::{py:exception} ValueError 114 | 115 | Raised when invalid parameters are provided (e.g., unsupported language). 116 | ::: 117 | 118 | :::{py:exception} FileNotFoundError 119 | 120 | Raised when model files cannot be found or downloaded. 121 | ::: 122 | 123 | :::{py:exception} ImportError 124 | 125 | Raised when required dependencies are missing. 126 | ::: 127 | 128 | ## Type Hints 129 | 130 | The package includes comprehensive type annotations for all public functions: 131 | 132 | ```python 133 | from typing import Union, List 134 | import pandas as pd 135 | 136 | def pred_rel( 137 | input: Union[str, List[str], pd.Series], 138 | lang: str = "eng", 139 | latest: bool = False 140 | ) -> pd.DataFrame: 141 | ... 
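# Hypothetical calls matching this signature (illustrative only; `df` is an
# assumed pandas DataFrame with a "names" column):
#   pred_rel("Shah Rukh Khan")                  # single string
#   pred_rel(["शाहरुख खान"], lang="hin")         # list input, Hindi model
#   pred_rel(df["names"], latest=True)          # pandas Series, refresh models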
142 | ``` 143 | 144 | ## Constants 145 | 146 | :::{py:data} SUPPORTED_LANGUAGES 147 | 148 | List of supported language codes: ['eng', 'hin'] 149 | ::: 150 | 151 | :::{py:data} MODEL_URLS 152 | 153 | Dictionary mapping model names to their download URLs 154 | ::: 155 | 156 | :::{py:data} DEFAULT_CACHE_DIR 157 | 158 | Default directory for caching downloaded models 159 | ::: -------------------------------------------------------------------------------- /pranaam/utils.py: -------------------------------------------------------------------------------- 1 | """Utilities for downloading and extracting model files.""" 2 | 3 | import os 4 | import tarfile 5 | from pathlib import Path 6 | from typing import Final 7 | 8 | import requests 9 | from tqdm.auto import tqdm 10 | 11 | from .logging import get_logger 12 | 13 | logger = get_logger() 14 | 15 | REPO_BASE_URL: Final[str] = ( 16 | os.environ.get("PRANAAM_MODEL_URL") 17 | or "https://dataverse.harvard.edu/api/access/datafile/13228210" 18 | ) 19 | 20 | 21 | def download_file(url: str, target: str, file_name: str) -> bool: 22 | """Download and extract a model file from the given URL. 23 | 24 | Args: 25 | url: Base URL (not currently used, uses REPO_BASE_URL instead) 26 | target: Target directory for extraction 27 | file_name: Name of the file to download 28 | 29 | Returns: 30 | True if download and extraction successful, False otherwise 31 | """ 32 | target_path = Path(target) 33 | file_path = target_path / f"{file_name}.tar.gz" 34 | try: 35 | logger.info("Downloading models from dataverse...") 36 | 37 | with ( 38 | requests.Session() as session, 39 | tqdm( 40 | unit="iB", 41 | unit_scale=True, 42 | desc=file_name, 43 | ascii=True, 44 | colour="cyan", 45 | ) as pbar, 46 | file_path.open("wb") as file_handle, 47 | ): 48 | response = session.get( 49 | REPO_BASE_URL, stream=True, allow_redirects=True, timeout=120 50 | ) 51 | response.raise_for_status() 52 | content_length = response.headers.get("Content-Length") 53 | total_size = int(content_length) if content_length else None 54 | pbar.total = total_size 55 | 56 | for chunk in response.iter_content(chunk_size=1024**2): 57 | if chunk: # filter out keep-alive chunks 58 | size = file_handle.write(chunk) 59 | pbar.update(size) 60 | # Check if file was downloaded successfully 61 | if not file_path.exists(): 62 | logger.error(f"Downloaded file not found at {file_path}") 63 | return False 64 | 65 | logger.info(f"Downloaded file size: {file_path.stat().st_size} bytes") 66 | 67 | # Extract tar file with safety checks 68 | _safe_extract_tar(file_path, target_path) 69 | # Clean up downloaded tar file 70 | file_path.unlink() 71 | logger.info("Finished downloading models") 72 | return True 73 | except requests.exceptions.RequestException as e: 74 | logger.error(f"Network error downloading models: {e}") 75 | return False 76 | except (tarfile.TarError, OSError) as e: 77 | logger.error(f"File extraction error: {e}") 78 | return False 79 | except Exception as e: 80 | logger.error(f"Unexpected error downloading models: {e}") 81 | return False 82 | 83 | 84 | def _safe_extract_tar(tar_path: Path, extract_to: Path) -> None: 85 | """Safely extract tar file preventing path traversal attacks. 
86 | 87 | Args: 88 | tar_path: Path to the tar file 89 | extract_to: Directory to extract to 90 | 91 | Raises: 92 | SecurityError: If path traversal attempt detected 93 | tarfile.TarError: If tar file is corrupted 94 | """ 95 | 96 | def is_within_directory(directory: Path, target: Path) -> bool: 97 | abs_directory = directory.resolve() 98 | abs_target = target.resolve() 99 | try: 100 | abs_target.relative_to(abs_directory) 101 | return True 102 | except ValueError: 103 | return False 104 | 105 | with tarfile.open(tar_path, "r:gz") as tar_file: 106 | for member in tar_file.getmembers(): 107 | member_path = extract_to / member.name 108 | if not is_within_directory(extract_to, member_path): 109 | raise SecurityError( 110 | f"Attempted path traversal in tar file: {member.name}" 111 | ) 112 | 113 | tar_file.extractall(extract_to) 114 | 115 | 116 | class SecurityError(Exception): 117 | """Raised when a security violation is detected.""" 118 | 119 | pass 120 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | 13 | import os 14 | import sys 15 | 16 | sys.path.insert(0, os.path.abspath("..")) 17 | 18 | # Read project metadata from pyproject.toml 19 | import tomllib 20 | 21 | with open("../pyproject.toml", "rb") as f: 22 | pyproject_data = tomllib.load(f) 23 | 24 | project_info = pyproject_data["project"] 25 | 26 | # -- Project information ----------------------------------------------------- 27 | 28 | project = project_info["name"] 29 | author = ", ".join([a["name"] for a in project_info["authors"]]) 30 | copyright = f"2022-2025, {author}" 31 | 32 | # The full version, including alpha/beta/rc tags 33 | release = project_info["version"] 34 | version = project_info["version"] 35 | 36 | 37 | # -- General configuration --------------------------------------------------- 38 | 39 | # Add any Sphinx extension module names here, as strings. They can be 40 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 41 | # ones. 42 | extensions = [ 43 | "sphinx.ext.autodoc", 44 | "sphinx_autodoc_typehints", 45 | "sphinx.ext.viewcode", 46 | "sphinx.ext.napoleon", 47 | "sphinx.ext.intersphinx", 48 | "sphinx.ext.githubpages", 49 | "sphinx_copybutton", 50 | "myst_parser", 51 | ] 52 | 53 | # Add any paths that contain templates here, relative to this directory. 54 | templates_path = ["_templates"] 55 | 56 | # List of patterns, relative to source directory, that match files and 57 | # directories to ignore when looking for source files. 58 | # This pattern also affects html_static_path and html_extra_path. 59 | exclude_patterns: list[str] = [] 60 | 61 | 62 | # -- Options for HTML output ------------------------------------------------- 63 | 64 | # The theme to use for HTML and HTML Help pages. See the documentation for 65 | # a list of builtin themes. 
66 | #
67 | html_theme = "furo"
68 | 
69 | # Add any paths that contain custom static files (such as style sheets) here,
70 | # relative to this directory. They are copied after the builtin static files,
71 | # so a file named "default.css" will overwrite the builtin "default.css".
72 | html_static_path = ["_static"]
73 | 
74 | # Napoleon settings
75 | napoleon_google_docstring = True
76 | napoleon_numpy_docstring = True
77 | napoleon_include_init_with_doc = False
78 | napoleon_include_private_with_doc = False
79 | 
80 | # Autodoc settings
81 | autodoc_default_options = {
82 |     "members": True,
83 |     "member-order": "bysource",
84 |     "special-members": "__init__",
85 |     "undoc-members": True,
86 |     "exclude-members": "__weakref__",
87 | }
88 | 
89 | # Intersphinx mapping
90 | intersphinx_mapping = {
91 |     "python": ("https://docs.python.org/3/", None),
92 |     "pandas": ("https://pandas.pydata.org/docs/", None),
93 |     "numpy": ("https://numpy.org/doc/stable/", None),
94 | }
95 | 
96 | # Furo theme options
97 | html_theme_options = {
98 |     "source_repository": "https://github.com/appeler/pranaam",
99 |     "source_branch": "main",
100 |     "source_directory": "docs/",
101 |     "sidebar_hide_name": False,
102 |     "navigation_with_keys": True,
103 |     "top_of_page_button": "edit",
104 | }
105 | 
106 | html_title = f"{project} {release}"
107 | html_logo = None
108 | html_favicon = None
109 | 
110 | # Type hints settings
111 | typehints_defaults = "comma"
112 | typehints_use_signature = True
113 | typehints_use_signature_return = True
114 | autodoc_typehints_description_target = "documented"
115 | 
116 | # MyST settings
117 | source_suffix = {
118 |     ".rst": "restructuredtext",
119 |     ".md": "markdown",
120 | }
121 | 
122 | myst_enable_extensions = [
123 |     "colon_fence",
124 |     "deflist",
125 |     "dollarmath",
126 |     "fieldlist",
127 |     "html_admonition",
128 |     "html_image",
129 |     "replacements",
130 |     "smartquotes",
131 |     "strikethrough",
132 |     "substitution",
133 |     "tasklist",
134 | ]
135 | 
136 | # Copy button configuration
137 | copybutton_prompt_text = r">>> |\.\.\. |\$ "
138 | copybutton_prompt_is_regexp = True
139 | copybutton_line_continuation_character = "\\"
140 | copybutton_here_doc_delimiter = "EOF"
141 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # pranaam: predict religion from name
2 | 
3 | [![ci](https://github.com/appeler/pranaam/actions/workflows/ci.yml/badge.svg)](https://github.com/appeler/pranaam/actions/workflows/ci.yml)
4 | [![image](https://img.shields.io/pypi/v/pranaam.svg)](https://pypi.python.org/pypi/pranaam)
5 | [![Documentation](https://img.shields.io/badge/docs-GitHub%20Pages-blue)](https://appeler.github.io/pranaam/)
6 | [![image](https://static.pepy.tech/badge/pranaam)](https://pepy.tech/project/pranaam)
7 | 
8 | Pranaam uses the Bihar Land Records data, plot-level land records (N =
9 | 41.87 million plots or 12.13 million individuals/accounts across 35,626
10 | villages), to build machine learning models that predict religion and
11 | caste from the name. Our final dataset has around 4M unique records. To
12 | learn how the data were transformed and how the underlying models were
13 | built, check the
14 | [notebooks](https://github.com/appeler/pranaam/tree/main/).
15 | 
16 | The first function we are releasing with the package is
17 | `pred_rel`, which predicts religion based on the name
18 | (currently only `muslim` or `not`).
(For
19 | context, nearly 95% of India's population are Hindu or Muslim, with
20 | Sikhs, Buddhists, Christians, and other groups making up the rest.) The
21 | OOS accuracy assessed on unseen names is nearly 98% for both
22 | [Hindi](https://github.com/appeler/pranaam_dev/blob/main/05_train_hindi.ipynb)
23 | and
24 | [English](https://github.com/appeler/pranaam_dev/blob/main/04_train_english.ipynb)
25 | models.
26 | 
27 | Our training data is in Hindi. To build models that classify names
28 | provided in English, we used the
29 | [indicate](https://github.com/in-rolls/indicate) package to
30 | transliterate our training data to English.
31 | 
32 | We are releasing this software in the hope that it enables activists and
33 | researchers to:
34 | 
35 | 1) Highlight biases
36 | 2) Fight biases
37 | 3) Prevent biases (regress out some of these biases in models built on
38 | natural language corpora with person names).
39 | 
40 | ## Install
41 | 
42 | We strongly recommend installing pranaam inside a Python virtual environment (see the [venv documentation](https://docs.python.org/3/library/venv.html#creating-virtual-environments)).
43 | 
44 | ### Standard Installation
45 | 
46 | ```bash
47 | pip install pranaam
48 | ```
49 | 
50 | This installs TensorFlow 2.14.1, which is known to work correctly with the models.
51 | 
52 | ### Requirements
53 | 
54 | - Python 3.10 or 3.11 (TensorFlow 2.14.1 compatibility requirement)
55 | - TensorFlow 2.14.1 (automatically installed)
56 | 
57 | > **Note**: This package requires TensorFlow 2.14.1 with Keras 2.14.0 for model compatibility. Python 3.12+ is not currently supported due to TensorFlow availability constraints.
58 | 
59 | ## General API
60 | 
61 | 1. pranaam.pred_rel takes a list of Hindi/English names and predicts
62 | whether the person is Muslim or not.
63 | 
64 | ## Examples
65 | 
66 | Using names in English:
67 | 
68 |     from pranaam import pranaam
69 |     names = ["Shah Rukh Khan", "Amitabh Bachchan"]
70 |     result = pranaam.pred_rel(names)
71 |     print(result)
72 | 
73 | Output:
74 | 
75 |     name pred_label pred_prob_muslim
76 |     0 Shah Rukh Khan muslim 73.0
77 |     1 Amitabh Bachchan not-muslim 27.0
78 | 
79 | Using names in Hindi:
80 | 
81 |     from pranaam import pranaam
82 |     names = ["शाहरुख खान", "अमिताभ बच्चन"]
83 |     result = pranaam.pred_rel(names, lang="hin")
84 |     print(result)
85 | 
86 | Output:
87 | 
88 |     name pred_label pred_prob_muslim
89 |     0 शाहरुख खान muslim 73.0
90 |     1 अमिताभ बच्चन not-muslim 27.0
91 | 
92 | ## Functions
93 | 
94 | We expose one function, which takes Hindi/English text (a name) and
95 | predicts religion.
96 | 
97 | - **pranaam.pred_rel(input)**
98 |   - What it does:
99 |     - predicts religion based on Hindi/English text (name)
100 |   - Output
101 |     - Returns a pandas DataFrame with the name and label (muslim/not-muslim)
102 | 
103 | ## Authors
104 | 
105 | Rajashekar Chintalapati, Aaditya Dar, and Gaurav Sood
106 | 
107 | 
108 | ## 🔗 Adjacent Repositories
109 | 
110 | - [appeler/naampy](https://github.com/appeler/naampy) — Infer Sociodemographic Characteristics from Names Using Indian Electoral Rolls
111 | - [appeler/parsernaam](https://github.com/appeler/parsernaam) — AI name parsing. Predict first or last name using a DL model.
112 | - [appeler/namesexdata](https://github.com/appeler/namesexdata) — Data on international first names and sex of people with that name 113 | - [appeler/graphic_names](https://github.com/appeler/graphic_names) — Infer the gender of person with a particular first name using Google image search and Clarifai 114 | - [appeler/ethnicolr2](https://github.com/appeler/ethnicolr2) — Ethnicolr implementation with new models in pytorch 115 | ## Contributor Code of Conduct 116 | 117 | The project welcomes contributions from everyone! It depends on it. To 118 | maintain this welcoming atmosphere and to collaborate in a fun and 119 | productive way, we expect contributors to the project to abide by the 120 | [Contributor Code of 121 | Conduct](http://contributor-covenant.org/version/1/0/0/). 122 | 123 | ## License 124 | 125 | The package is released under the [MIT 126 | License](https://opensource.org/licenses/MIT). 127 | -------------------------------------------------------------------------------- /examples/pandas_integration.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Pandas Integration Examples for Pranaam 4 | 5 | This script demonstrates how to use pranaam with pandas DataFrames for real-world data processing. 6 | """ 7 | 8 | import pandas as pd 9 | 10 | import pranaam 11 | 12 | 13 | def create_sample_data(): 14 | """Create sample data for demonstration.""" 15 | return pd.DataFrame( 16 | { 17 | "employee_id": [1001, 1002, 1003, 1004, 1005, 1006], 18 | "name": [ 19 | "Shah Rukh Khan", 20 | "Priya Sharma", 21 | "Mohammed Ali", 22 | "Raj Patel", 23 | "Fatima Khan", 24 | "Amitabh Bachchan", 25 | ], 26 | "department": [ 27 | "Engineering", 28 | "Marketing", 29 | "Finance", 30 | "HR", 31 | "Engineering", 32 | "Management", 33 | ], 34 | "salary": [75000, 65000, 70000, 60000, 80000, 120000], 35 | } 36 | ) 37 | 38 | 39 | def basic_dataframe_processing(): 40 | """Show basic DataFrame processing with pranaam.""" 41 | print("📊 Basic DataFrame Processing") 42 | print("=" * 50) 43 | 44 | # Create sample data 45 | df = create_sample_data() 46 | print("Original data:") 47 | print(df) 48 | print() 49 | 50 | # Get predictions for the name column 51 | predictions = pranaam.pred_rel(df["name"], lang="eng") 52 | print("Predictions:") 53 | print(predictions) 54 | print() 55 | 56 | # Merge predictions back to original DataFrame 57 | # Note: pranaam returns name, pred_label, pred_prob_muslim 58 | df_with_predictions = df.merge( 59 | predictions[["name", "pred_label", "pred_prob_muslim"]], on="name", how="left" 60 | ) 61 | 62 | print("Combined data with predictions:") 63 | print(df_with_predictions) 64 | print() 65 | 66 | 67 | def analysis_examples(): 68 | """Show analysis examples using the predictions.""" 69 | print("📈 Analysis Examples") 70 | print("=" * 50) 71 | 72 | df = create_sample_data() 73 | predictions = pranaam.pred_rel(df["name"], lang="eng") 74 | df_combined = df.merge( 75 | predictions[["name", "pred_label", "pred_prob_muslim"]], on="name" 76 | ) 77 | 78 | # Basic statistics 79 | print("Religion distribution:") 80 | print(df_combined["pred_label"].value_counts()) 81 | print() 82 | 83 | # Average salary by predicted religion 84 | print("Average salary by predicted religion:") 85 | salary_by_religion = df_combined.groupby("pred_label")["salary"].agg( 86 | ["mean", "count"] 87 | ) 88 | print(salary_by_religion) 89 | print() 90 | 91 | # Department distribution by predicted religion 92 | print("Department distribution by 
predicted religion:") 93 | dept_religion = pd.crosstab(df_combined["department"], df_combined["pred_label"]) 94 | print(dept_religion) 95 | print() 96 | 97 | 98 | def confidence_filtering(): 99 | """Show how to work with prediction confidence scores.""" 100 | print("🎯 Confidence-Based Filtering") 101 | print("=" * 50) 102 | 103 | df = create_sample_data() 104 | predictions = pranaam.pred_rel(df["name"], lang="eng") 105 | df_combined = df.merge( 106 | predictions[["name", "pred_label", "pred_prob_muslim"]], on="name" 107 | ) 108 | 109 | # Show confidence distribution 110 | print("Confidence distribution:") 111 | print("Name | Prediction | Confidence") 112 | print("-" * 50) 113 | for _, row in df_combined.iterrows(): 114 | confidence = max(row["pred_prob_muslim"], 100 - row["pred_prob_muslim"]) 115 | print(f"{row['name']:<18} | {row['pred_label']:<10} | {confidence:>6.1f}%") 116 | print() 117 | 118 | # Filter high-confidence predictions (>90%) 119 | high_confidence = df_combined[ 120 | (df_combined["pred_prob_muslim"] > 90) | (df_combined["pred_prob_muslim"] < 10) 121 | ] 122 | print("High-confidence predictions (>90%):") 123 | print(high_confidence[["name", "pred_label", "pred_prob_muslim"]]) 124 | print() 125 | 126 | 127 | def save_results(): 128 | """Show how to save results to different formats.""" 129 | print("💾 Saving Results") 130 | print("=" * 50) 131 | 132 | df = create_sample_data() 133 | predictions = pranaam.pred_rel(df["name"], lang="eng") 134 | df_combined = df.merge( 135 | predictions[["name", "pred_label", "pred_prob_muslim"]], on="name" 136 | ) 137 | 138 | # Save to CSV 139 | output_file = "employee_predictions.csv" 140 | df_combined.to_csv(output_file, index=False) 141 | print(f"✅ Results saved to {output_file}") 142 | 143 | # Show what was saved 144 | print("Saved data preview:") 145 | print(df_combined.head()) 146 | 147 | # Clean up 148 | import os 149 | 150 | os.remove(output_file) 151 | print(f"🧹 Cleaned up {output_file}") 152 | 153 | 154 | if __name__ == "__main__": 155 | print("🐼 Pranaam + Pandas Integration Examples") 156 | print("=" * 60) 157 | print("This script shows how to integrate pranaam with pandas for data analysis.\n") 158 | 159 | basic_dataframe_processing() 160 | analysis_examples() 161 | confidence_filtering() 162 | save_results() 163 | 164 | print("✅ All pandas integration examples completed!") 165 | print("Next steps: Check out csv_processor.py for command-line data processing.") 166 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["uv_build>=0.9.17,<0.10.0"] 3 | build-backend = "uv_build" 4 | 5 | [project] 6 | name = "pranaam" 7 | version = "0.4.0" 8 | description = "Predict religion and caste based on name" 9 | readme = "README.md" 10 | requires-python = ">=3.11,<3.13" 11 | license = {text = "MIT"} 12 | authors = [ 13 | { name = "Rajashekar Chintalapati", email = "rajshekar.ch@gmail.com" }, 14 | { name = "Aaditya Dar" }, 15 | { name = "Gaurav Sood", email = "gsood07@gmail.com" } 16 | ] 17 | classifiers = [ 18 | "Development Status :: 5 - Production/Stable", 19 | "Intended Audience :: Developers", 20 | "Intended Audience :: Science/Research", 21 | "License :: OSI Approved :: MIT License", 22 | "Operating System :: OS Independent", 23 | "Programming Language :: Python :: 3.11", 24 | "Programming Language :: Python :: 3.12", 25 | "Topic :: Scientific/Engineering :: Artificial 
Intelligence", 26 | "Topic :: Scientific/Engineering :: Information Analysis", 27 | "Topic :: Software Development :: Libraries :: Python Modules", 28 | "Topic :: Text Processing :: Linguistic", 29 | "Natural Language :: English", 30 | "Natural Language :: Hindi" 31 | ] 32 | keywords = ["predict", "religion", "name", "hindi", "english", "machine-learning", "deep-learning", "nlp", "name-classification", "tensorflow"] 33 | dependencies = [ 34 | "tensorflow>=2.20.0", # TF 2.20+ without gcs-filesystem dependency 35 | "tf-keras>=2.20.0", # Keras 2 compatibility layer 36 | "tqdm>=4.64.0", 37 | "pandas>=1.5.0", 38 | "numpy>=1.21.0", 39 | "requests>=2.25.0", # Used in utils.py 40 | "rich>=13.0.0", # Rich console output and logging 41 | ] 42 | 43 | [project.optional-dependencies] 44 | docs = [ 45 | "sphinx>=7.0", 46 | "furo>=2024.1.29", 47 | "sphinx-autodoc-typehints>=1.25", 48 | "sphinx-copybutton>=0.5", 49 | "myst-parser>=2.0", 50 | ] 51 | streamlit = [ 52 | "streamlit>=1.20.0", 53 | "scikit-learn>=1.0.0,<2.0.0" 54 | ] 55 | 56 | [dependency-groups] 57 | dev = [ 58 | "pytest>=7.0", 59 | "pytest-cov>=4.0", 60 | "pytest-xdist>=3.0", 61 | "ruff>=0.7.0", 62 | "mypy>=1.0", 63 | "pre-commit>=3.0", 64 | "pandas-stubs>=2.0.0", 65 | "types-requests", 66 | "types-tqdm", 67 | "build>=1.3.0", 68 | "twine>=6.2.0", 69 | "deptry>=0.12.0", 70 | ] 71 | test = [ 72 | "pytest>=7.0", 73 | "pytest-cov>=4.0", 74 | "pytest-xdist>=3.0", 75 | ] 76 | 77 | [project.urls] 78 | "Homepage" = "https://github.com/appeler/pranaam" 79 | "Repository" = "https://github.com/appeler/pranaam" 80 | "Bug Tracker" = "https://github.com/appeler/pranaam/issues" 81 | 82 | [project.scripts] 83 | predict_religion = "pranaam.predict:main" 84 | 85 | [tool.uv.build-backend] 86 | module-name = "pranaam" 87 | module-root = "" 88 | 89 | [tool.pytest.ini_options] 90 | python_files = "test_*.py" 91 | testpaths = ["tests"] 92 | addopts = [ 93 | "-v", 94 | "--tb=short", 95 | "--strict-markers", 96 | "--disable-warnings" 97 | ] 98 | markers = [ 99 | "slow: marks tests as slow (deselect with '-m \"not slow\"')", 100 | "integration: marks tests as integration tests (requires model download)", 101 | "unit: marks tests as unit tests", 102 | "e2e: marks tests as end-to-end tests with real models" 103 | ] 104 | # Coverage options (enabled in CI) 105 | # addopts = "--cov=pranaam --cov-report=html --cov-report=term-missing" 106 | 107 | 108 | 109 | [tool.mypy] 110 | python_version = "3.11" 111 | warn_return_any = true 112 | warn_unused_configs = true 113 | disallow_untyped_defs = true 114 | disallow_incomplete_defs = true 115 | check_untyped_defs = true 116 | disallow_untyped_decorators = true 117 | no_implicit_optional = true 118 | warn_redundant_casts = true 119 | warn_unused_ignores = true 120 | warn_no_return = true 121 | warn_unreachable = true 122 | strict_equality = true 123 | show_error_codes = true 124 | 125 | [[tool.mypy.overrides]] 126 | module = ["tensorflow.*", "tf_keras.*", "gdown.*"] 127 | ignore_missing_imports = true 128 | 129 | [tool.coverage.run] 130 | source = ["pranaam"] 131 | omit = ["*/tests/*", "*/test_*.py"] 132 | 133 | [tool.coverage.report] 134 | exclude_lines = [ 135 | "pragma: no cover", 136 | "def __repr__", 137 | "raise AssertionError", 138 | "raise NotImplementedError", 139 | ] 140 | 141 | [tool.ruff] 142 | target-version = "py311" 143 | line-length = 88 144 | exclude = [ 145 | "model_training/", 146 | "scripts/", 147 | ] 148 | 149 | [tool.ruff.lint] 150 | select = [ 151 | "E", # pycodestyle errors 152 | "W", # pycodestyle warnings 
153 | "F", # pyflakes 154 | "I", # isort 155 | "B", # flake8-bugbear 156 | "C4", # flake8-comprehensions 157 | "UP", # pyupgrade 158 | ] 159 | ignore = [ 160 | "E501", # line too long (handled by formatter) 161 | "B008", # do not perform function calls in argument defaults 162 | "C901", # too complex 163 | ] 164 | 165 | [tool.ruff.format] 166 | quote-style = "double" 167 | indent-style = "space" 168 | skip-magic-trailing-comma = false 169 | line-ending = "auto" 170 | 171 | [tool.ruff.lint.per-file-ignores] 172 | "__init__.py" = ["F401"] # imported but unused 173 | "tests/*" = ["S101"] # use of assert 174 | "*.ipynb" = ["E402", "E722", "UP031", "B905", "F821", "F811", "W293", "F841"] # ignore notebook linting issues 175 | 176 | [tool.ruff.lint.isort] 177 | known-first-party = ["pranaam"] 178 | 179 | [tool.deptry] 180 | ignore_notebooks = true 181 | 182 | [tool.deptry.per_rule_ignores] 183 | DEP002 = ["sphinx", "furo", "sphinx-autodoc-typehints", "sphinx-copybutton", "myst-parser", "scikit-learn"] 184 | 185 | -------------------------------------------------------------------------------- /examples/csv_processor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | CSV Processor - Command Line Utility for Pranaam 4 | 5 | A practical command-line tool that reads a CSV file with names, 6 | adds religion predictions, and saves the results. 7 | 8 | Usage: 9 | python csv_processor.py input.csv output.csv --name-column "name" --language eng 10 | """ 11 | 12 | import argparse 13 | import sys 14 | from pathlib import Path 15 | 16 | import pandas as pd 17 | 18 | import pranaam 19 | 20 | 21 | def create_sample_csv(filename: str = "sample_names.csv") -> None: 22 | """Create a sample CSV file for testing.""" 23 | sample_data = pd.DataFrame( 24 | { 25 | "id": [1, 2, 3, 4, 5, 6], 26 | "full_name": [ 27 | "Shah Rukh Khan", 28 | "Priya Sharma", 29 | "Mohammed Ali", 30 | "Raj Patel", 31 | "Fatima Khan", 32 | "John Smith", 33 | ], 34 | "department": ["Engineering", "Marketing", "Finance", "HR", "Sales", "IT"], 35 | "city": ["Mumbai", "Delhi", "Bangalore", "Chennai", "Pune", "Hyderabad"], 36 | } 37 | ) 38 | sample_data.to_csv(filename, index=False) 39 | print(f"📝 Created sample file: {filename}") 40 | 41 | 42 | def process_csv( 43 | input_file: str, output_file: str, name_column: str, language: str 44 | ) -> None: 45 | """Process CSV file and add religion predictions.""" 46 | 47 | # Validate input file 48 | if not Path(input_file).exists(): 49 | print(f"❌ Error: Input file '{input_file}' not found") 50 | sys.exit(1) 51 | 52 | try: 53 | # Read CSV 54 | print(f"📖 Reading {input_file}...") 55 | df = pd.read_csv(input_file) 56 | print(f" Found {len(df)} rows") 57 | 58 | # Validate name column 59 | if name_column not in df.columns: 60 | print(f"❌ Error: Column '{name_column}' not found in CSV") 61 | print(f" Available columns: {list(df.columns)}") 62 | sys.exit(1) 63 | 64 | # Check for missing names 65 | missing_names = df[name_column].isna().sum() 66 | if missing_names > 0: 67 | print(f"⚠️ Warning: {missing_names} rows have missing names") 68 | df = df.dropna(subset=[name_column]) 69 | print(f" Processing {len(df)} rows with valid names") 70 | 71 | # Get predictions 72 | print(f"🔮 Getting predictions for {len(df)} names (language: {language})...") 73 | predictions = pranaam.pred_rel(df[name_column], lang=language) 74 | 75 | # Merge predictions back to original data 76 | # Rename columns to avoid conflicts 77 | predictions = predictions.rename( 
78 | columns={ 79 | "name": name_column, 80 | "pred_label": f"{name_column}_religion", 81 | "pred_prob_muslim": f"{name_column}_confidence_muslim", 82 | } 83 | ) 84 | 85 | df_with_predictions = df.merge(predictions, on=name_column, how="left") 86 | 87 | # Save results 88 | print(f"💾 Saving results to {output_file}...") 89 | df_with_predictions.to_csv(output_file, index=False) 90 | 91 | # Show summary 92 | print("\n📊 Processing Summary:") 93 | print(f" Input rows: {len(df)}") 94 | print(f" Output rows: {len(df_with_predictions)}") 95 | 96 | religion_counts = df_with_predictions[f"{name_column}_religion"].value_counts() 97 | print(f" Predictions: {dict(religion_counts)}") 98 | 99 | # Show confidence distribution 100 | conf_col = f"{name_column}_confidence_muslim" 101 | high_conf_muslim = (df_with_predictions[conf_col] > 90).sum() 102 | high_conf_not_muslim = (df_with_predictions[conf_col] < 10).sum() 103 | print( 104 | f" High confidence (>90%): {high_conf_muslim + high_conf_not_muslim} predictions" 105 | ) 106 | 107 | print(f"\n✅ Successfully processed {input_file} → {output_file}") 108 | 109 | except Exception as e: 110 | print(f"❌ Error processing file: {str(e)}") 111 | sys.exit(1) 112 | 113 | 114 | def main(): 115 | """Main command-line interface.""" 116 | parser = argparse.ArgumentParser( 117 | description="Add religion predictions to CSV files using pranaam", 118 | formatter_class=argparse.RawDescriptionHelpFormatter, 119 | epilog=""" 120 | Examples: 121 | python csv_processor.py data.csv results.csv --name-column "full_name" 122 | python csv_processor.py data.csv results.csv --name-column "employee_name" --language hin 123 | python csv_processor.py --create-sample # Create sample file for testing 124 | """, 125 | ) 126 | 127 | parser.add_argument("input_file", nargs="?", help="Input CSV file path") 128 | parser.add_argument("output_file", nargs="?", help="Output CSV file path") 129 | parser.add_argument( 130 | "--name-column", 131 | default="name", 132 | help="Name of the column containing names (default: 'name')", 133 | ) 134 | parser.add_argument( 135 | "--language", 136 | choices=["eng", "hin"], 137 | default="eng", 138 | help="Language for predictions: eng (English) or hin (Hindi) (default: eng)", 139 | ) 140 | parser.add_argument( 141 | "--create-sample", 142 | action="store_true", 143 | help="Create a sample CSV file for testing", 144 | ) 145 | 146 | args = parser.parse_args() 147 | 148 | # Handle sample creation 149 | if args.create_sample: 150 | create_sample_csv() 151 | print( 152 | "You can now test with: python csv_processor.py sample_names.csv results.csv --name-column 'full_name'" 153 | ) 154 | return 155 | 156 | # Validate required arguments 157 | if not args.input_file or not args.output_file: 158 | print("❌ Error: Both input_file and output_file are required") 159 | print("Use --help for usage information or --create-sample to create test data") 160 | sys.exit(1) 161 | 162 | # Process the CSV 163 | process_csv(args.input_file, args.output_file, args.name_column, args.language) 164 | 165 | 166 | if __name__ == "__main__": 167 | main() 168 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Pranaam Examples 2 | 3 | This directory contains practical examples showing how to use the pranaam package for religion prediction from names. 
4 | 5 | ## Overview 6 | 7 | | Example | Description | Complexity | Use Case | 8 | |---------|-------------|------------|----------| 9 | | [`basic_usage.py`](basic_usage.py) | Simple usage patterns | Beginner | Learning the API | 10 | | [`pandas_integration.py`](pandas_integration.py) | DataFrame processing | Intermediate | Data analysis | 11 | | [`csv_processor.py`](csv_processor.py) | Command-line utility | Intermediate | Batch processing | 12 | | [`performance_demo.py`](performance_demo.py) | Performance analysis | Advanced | Optimization | 13 | 14 | ## Quick Start 15 | 16 | ```bash 17 | # Install pranaam 18 | pip install pranaam 19 | 20 | # Run basic examples 21 | python examples/basic_usage.py 22 | python examples/pandas_integration.py 23 | python examples/performance_demo.py 24 | 25 | # Try the CSV processor 26 | python examples/csv_processor.py --create-sample 27 | python examples/csv_processor.py sample_names.csv results.csv --name-column "full_name" 28 | ``` 29 | 30 | ## Example Details 31 | 32 | ### 1. Basic Usage (`basic_usage.py`) 33 | **What it shows:** 34 | - Single name prediction 35 | - Multiple name prediction 36 | - Working with English and Hindi names 37 | - Understanding output format 38 | 39 | **Run it:** 40 | ```bash 41 | python examples/basic_usage.py 42 | ``` 43 | 44 | **Expected output:** 45 | ``` 46 | 🔮 Single Name Prediction 47 | English name: 48 | name pred_label pred_prob_muslim 49 | 0 Shah Rukh Khan muslim 71.0 50 | 51 | 📝 Multiple Names Prediction 52 | Batch prediction results: 53 | name pred_label pred_prob_muslim 54 | 0 Shah Rukh Khan muslim 71.0 55 | 1 Amitabh Bachchan not-muslim 15.2 56 | ``` 57 | 58 | ### 2. Pandas Integration (`pandas_integration.py`) 59 | **What it shows:** 60 | - Loading data from pandas DataFrame 61 | - Merging predictions with existing data 62 | - Data analysis with predictions 63 | - Confidence-based filtering 64 | - Saving results to CSV 65 | 66 | **Run it:** 67 | ```bash 68 | python examples/pandas_integration.py 69 | ``` 70 | 71 | **Key concepts:** 72 | - Use `df["name"]` as input to `pranaam.pred_rel()` 73 | - Merge results back using `pd.merge()` on name column 74 | - Filter by confidence levels for quality control 75 | - Analyze demographics with prediction results 76 | 77 | ### 3. CSV Processor (`csv_processor.py`) 78 | **What it shows:** 79 | - Command-line interface for batch processing 80 | - Reading/writing CSV files 81 | - Error handling and validation 82 | - Progress reporting 83 | 84 | **Run it:** 85 | ```bash 86 | # Create sample data 87 | python examples/csv_processor.py --create-sample 88 | 89 | # Process the sample file 90 | python examples/csv_processor.py sample_names.csv results.csv --name-column "full_name" 91 | 92 | # For Hindi names 93 | python examples/csv_processor.py data.csv results.csv --name-column "name" --language hin 94 | ``` 95 | 96 | **Features:** 97 | - Handles missing names gracefully 98 | - Validates input files and columns 99 | - Shows processing statistics 100 | - Configurable name column and language 101 | 102 | ### 4. 
Performance Demo (`performance_demo.py`) 103 | **What it shows:** 104 | - Batch processing performance 105 | - Model caching behavior 106 | - Language switching costs 107 | - Memory usage patterns 108 | - Real-world benchmarks 109 | 110 | **Run it:** 111 | ```bash 112 | python examples/performance_demo.py 113 | ``` 114 | 115 | **Performance insights:** 116 | - First prediction: 3-5 seconds (includes model loading) 117 | - Cached predictions: 100-500+ names/second 118 | - Batch processing is much more efficient than individual calls 119 | - Language switching requires model reload (~2-3 seconds) 120 | 121 | ## Common Patterns 122 | 123 | ### Basic Prediction 124 | ```python 125 | import pranaam 126 | 127 | # Single name 128 | result = pranaam.pred_rel("Shah Rukh Khan", lang="eng") 129 | 130 | # Multiple names 131 | names = ["Shah Rukh Khan", "Priya Sharma"] 132 | result = pranaam.pred_rel(names, lang="eng") 133 | ``` 134 | 135 | ### DataFrame Integration 136 | ```python 137 | import pandas as pd 138 | import pranaam 139 | 140 | # Load your data 141 | df = pd.read_csv("your_data.csv") 142 | 143 | # Get predictions 144 | predictions = pranaam.pred_rel(df["name_column"], lang="eng") 145 | 146 | # Merge back to original data 147 | df_with_predictions = df.merge(predictions, left_on="name_column", right_on="name") 148 | ``` 149 | 150 | ### Confidence Filtering 151 | ```python 152 | # Keep only high-confidence predictions 153 | high_conf = df_with_predictions[ 154 | (df_with_predictions["pred_prob_muslim"] > 90) | # High confidence Muslim 155 | (df_with_predictions["pred_prob_muslim"] < 10) # High confidence Not Muslim 156 | ] 157 | ``` 158 | 159 | ## Performance Tips 160 | 161 | 1. **Batch Processing**: Always process multiple names at once rather than individual calls 162 | 2. **Model Caching**: The model stays loaded between calls - reuse the same process 163 | 3. **Language Switching**: Minimize switches between English and Hindi models 164 | 4. **Memory Management**: Process very large datasets in chunks of 1000-5000 names 165 | 5. **First Call**: The first prediction takes longer due to model loading 166 | 167 | ## Supported Languages 168 | 169 | - **English (`eng`)**: Names transliterated to English script 170 | - **Hindi (`hin`)**: Names in Devanagari script 171 | 172 | ## Output Format 173 | 174 | All examples return pandas DataFrames with: 175 | - `name`: The input name 176 | - `pred_label`: Predicted religion ("muslim" or "not-muslim") 177 | - `pred_prob_muslim`: Probability of being Muslim (0-100) 178 | 179 | ## Error Handling 180 | 181 | The examples demonstrate proper error handling for: 182 | - Missing input files 183 | - Invalid column names 184 | - Empty or malformed names 185 | - Network issues during model download 186 | - Memory constraints with large batches 187 | 188 | ## Next Steps 189 | 190 | After running these examples: 191 | 1. Integrate pranaam into your existing data pipeline 192 | 2. Adapt the CSV processor for your specific file formats 193 | 3. Use the performance insights to optimize your batch sizes 194 | 4. Implement confidence thresholds appropriate for your use case 195 | 196 | For more information, see the [main documentation](https://appeler.github.io/pranaam/). 
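## Chunked Processing (sketch)

The Performance Tips above recommend processing very large datasets in chunks of 1000-5000 names. A minimal sketch of that pattern (the chunk size and the `full_name` column are illustrative assumptions, not fixed by the API):

```python
import pandas as pd

import pranaam


def predict_in_chunks(names: pd.Series, chunk_size: int = 2000) -> pd.DataFrame:
    """Run pranaam.pred_rel over a long Series in fixed-size chunks."""
    parts = []
    for start in range(0, len(names), chunk_size):
        chunk = names.iloc[start : start + chunk_size]
        parts.append(pranaam.pred_rel(chunk, lang="eng"))
    return pd.concat(parts, ignore_index=True)


# Usage (hypothetical column name):
# predictions = predict_in_chunks(df["full_name"])
```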
-------------------------------------------------------------------------------- /streamlit/streamlit_app.py: -------------------------------------------------------------------------------- 1 | import base64 2 | 3 | import pandas as pd 4 | 5 | import pranaam 6 | import streamlit as st 7 | 8 | 9 | def download_file(df): 10 | """Create download link for DataFrame as CSV.""" 11 | csv = df.to_csv(index=False) 12 | b64 = base64.b64encode(csv.encode()).decode() 13 | href = f'<a href="data:file/csv;base64,{b64}" download="predictions.csv">Download results</a>'  # base64 data-URI link (anchor tag restored; filename is arbitrary) 14 | st.markdown(href, unsafe_allow_html=True) 15 | 16 | 17 | def app(): 18 | # Set app title 19 | st.title("🔮 pranaam: predict religion based on name") 20 | 21 | # Add sidebar info 22 | with st.sidebar: 23 | st.header("About") 24 | st.write( 25 | "Pranaam uses Bihar Land Records data (4M+ records) to predict religion from names using ML models." 26 | ) 27 | st.write("**Accuracy**: ~98% on out-of-sample data") 28 | st.write("[GitHub Repository](https://github.com/appeler/pranaam)") 29 | st.write("[Documentation](https://pranaam.readthedocs.io/)") 30 | 31 | # Description 32 | st.write( 33 | """ 34 | This app predicts whether a name is **Muslim** or **not-Muslim** based on machine learning models 35 | trained on Bihar Land Records data covering 35,626+ villages and 4M+ unique records. 36 | """ 37 | ) 38 | 39 | # Input methods 40 | input_method = st.radio( 41 | "Choose input method:", ["Enter names manually", "Upload CSV file"] 42 | ) 43 | 44 | if input_method == "Enter names manually": 45 | # Manual input 46 | st.subheader("Enter Names") 47 | 48 | # Language selection 49 | lang = st.selectbox( 50 | "Select language:", 51 | ["eng", "hin"], 52 | format_func=lambda x: "English" if x == "eng" else "Hindi", 53 | ) 54 | 55 | # Name input 56 | if lang == "eng": 57 | example = "Shah Rukh Khan, Amitabh Bachchan, Salman Khan" 58 | names_input = st.text_area( 59 | "Enter names (one per line or comma-separated):", 60 | placeholder=example, 61 | height=100, 62 | ) 63 | else: 64 | example = "शाहरुख खान, अमिताभ बच्चन" 65 | names_input = st.text_area( 66 | "Enter names in Hindi (one per line or comma-separated):", 67 | placeholder=example, 68 | height=100, 69 | ) 70 | 71 | if st.button("Predict Religion"): 72 | if names_input.strip(): 73 | # Parse names 74 | if "\n" in names_input: 75 | names = [ 76 | name.strip() for name in names_input.split("\n") if name.strip() 77 | ] 78 | else: 79 | names = [ 80 | name.strip() for name in names_input.split(",") if name.strip() 81 | ] 82 | 83 | with st.spinner("Making predictions..."): 84 | try: 85 | result = pranaam.pred_rel(names, lang=lang) 86 | 87 | st.subheader("Results") 88 | st.dataframe(result, use_container_width=True) 89 | 90 | # Summary 91 | muslim_count = (result["pred_label"] == "muslim").sum() 92 | total_count = len(result) 93 | st.write( 94 | f"**Summary**: {muslim_count} Muslim, {total_count - muslim_count} non-Muslim out of {total_count} names" 95 | ) 96 | 97 | # Download button 98 | download_file(result) 99 | 100 | except Exception as e: 101 | st.error(f"Error making predictions: {str(e)}") 102 | else: 103 | st.warning("Please enter at least one name.") 104 | 105 | else: 106 | # CSV Upload 107 | st.subheader("Upload CSV File") 108 | 109 | uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"]) 110 | 111 | if uploaded_file is not None: 112 | try: 113 | df = pd.read_csv(uploaded_file) 114 | st.write("**Data loaded successfully!**") 115 | st.write(f"Shape: {df.shape[0]} rows, {df.shape[1]} columns") 116 | 117 | # Preview data 118 | with st.expander("Preview 
data"): 119 | st.dataframe(df.head(), use_container_width=True) 120 | 121 | # Column selection 122 | name_col = st.selectbox("Select column containing names:", df.columns) 123 | lang = st.selectbox( 124 | "Select language:", 125 | ["eng", "hin"], 126 | format_func=lambda x: "English" if x == "eng" else "Hindi", 127 | ) 128 | 129 | if st.button("Predict Religion for All Names"): 130 | with st.spinner("Processing names..."): 131 | try: 132 | names_list = df[name_col].dropna().astype(str).tolist() 133 | result = pranaam.pred_rel(names_list, lang=lang) 134 | 135 | # Merge with original data 136 | result_df = df.copy() 137 | result_df = result_df.merge( 138 | result, left_on=name_col, right_on="name", how="left" 139 | ) 140 | 141 | st.subheader("Results") 142 | st.dataframe(result_df, use_container_width=True) 143 | 144 | # Summary 145 | muslim_count = (result_df["pred_label"] == "muslim").sum() 146 | total_count = len(result_df) 147 | st.write( 148 | f"**Summary**: {muslim_count} Muslim, {total_count - muslim_count} non-Muslim out of {total_count} names" 149 | ) 150 | 151 | # Download 152 | download_file(result_df) 153 | 154 | except Exception as e: 155 | st.error(f"Error processing file: {str(e)}") 156 | 157 | except Exception as e: 158 | st.error(f"Error loading file: {str(e)}") 159 | else: 160 | st.info("Please upload a CSV file to continue.") 161 | 162 | # Footer 163 | st.markdown("---") 164 | st.markdown( 165 | """ 166 | **Note**: This tool is for research and educational purposes. The predictions are based on statistical patterns 167 | and should not be used for discriminatory purposes. 168 | """ 169 | ) 170 | 171 | 172 | # Run the app 173 | if __name__ == "__main__": 174 | app() 175 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # Release and Publish Workflow 2 | # 3 | # This consolidated workflow handles both GitHub release creation and PyPI publishing 4 | # using PyPI's trusted publisher program for secure, token-free publishing. 5 | # 6 | # Triggers: 7 | # - Version tags (v*): Creates release + publishes to PyPI 8 | # - GitHub releases: Publishes to PyPI only 9 | # - Manual trigger: Allows TestPyPI or PyPI publishing 10 | # 11 | # For trusted publishing setup: 12 | # 1. Go to https://pypi.org/manage/account/publishing/ 13 | # 2. Add GitHub as trusted publisher: appeler/pranaam 14 | # 3. Workflow filename: python-publish.yml 15 | # 4. Environment name: pypi 16 | 17 | name: Release and Publish 18 | 19 | on: 20 | # Automatic release creation and PyPI publishing on version tags 21 | push: 22 | tags: 23 | - 'v*' # v1.0.0, v0.1.0, etc. 
24 | 25 | # PyPI publishing on manual releases (no release creation needed) 26 | release: 27 | types: [published] 28 | 29 | # Manual trigger for emergency publishing or testing 30 | workflow_dispatch: 31 | inputs: 32 | environment: 33 | description: 'Target environment' 34 | required: true 35 | default: 'testpypi' 36 | type: choice 37 | options: 38 | - 'testpypi' 39 | - 'pypi' 40 | create_release: 41 | description: 'Create GitHub release (only for manual PyPI publishing)' 42 | required: false 43 | default: false 44 | type: boolean 45 | 46 | permissions: 47 | contents: write # Required for release creation 48 | id-token: write # Required for PyPI trusted publishing 49 | 50 | jobs: 51 | create-release: 52 | name: Create GitHub Release 53 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') 54 | runs-on: ubuntu-latest 55 | 56 | steps: 57 | - name: Checkout repository 58 | uses: actions/checkout@v4 59 | with: 60 | fetch-depth: 0 61 | 62 | - name: Sync CITATION.cff 63 | uses: gojiplus/citation-sync@v1 64 | with: 65 | commit: true 66 | 67 | - name: Extract version from tag 68 | id: version 69 | run: | 70 | VERSION=${GITHUB_REF#refs/tags/v} 71 | echo "version=$VERSION" >> $GITHUB_OUTPUT 72 | echo "tag=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT 73 | 74 | - name: Validate version consistency 75 | run: | 76 | PYPROJECT_VERSION=$(grep -E '^version = ' pyproject.toml | sed 's/version = "//' | sed 's/"//') 77 | if [ "$PYPROJECT_VERSION" != "${{ steps.version.outputs.version }}" ]; then 78 | echo "❌ Version mismatch:" 79 | echo " Tag version: ${{ steps.version.outputs.version }}" 80 | echo " pyproject.toml version: $PYPROJECT_VERSION" 81 | echo "Please ensure the tag version matches pyproject.toml" 82 | exit 1 83 | fi 84 | echo "✅ Version consistency validated" 85 | 86 | - name: Generate release notes 87 | run: | 88 | cat > release_notes.md << EOF 89 | ## Release ${{ steps.version.outputs.version }} 90 | 91 | ### Installation 92 | 93 | \`\`\`bash 94 | pip install pranaam==${{ steps.version.outputs.version }} 95 | \`\`\` 96 | 97 | To include the optional Streamlit app dependencies: 98 | \`\`\`bash 99 | pip install 'pranaam[streamlit]==${{ steps.version.outputs.version }}' 100 | \`\`\` 101 | 102 | ### Documentation 103 | 104 | 📖 [Documentation](https://appeler.github.io/pranaam/) 105 | 106 | ### What's Changed 107 | 108 | This release includes the latest updates and improvements to pranaam. 109 | See the [full changelog](https://github.com/appeler/pranaam/compare/$(git describe --tags --abbrev=0 HEAD^)...${{ steps.version.outputs.tag }}) for details. 
110 | EOF 111 | 112 | - name: Create Release 113 | uses: softprops/action-gh-release@v2 114 | with: 115 | name: Release ${{ steps.version.outputs.version }} 116 | body_path: release_notes.md 117 | draft: false 118 | prerelease: ${{ contains(steps.version.outputs.version, '-') }} 119 | generate_release_notes: true 120 | 121 | build: 122 | name: Build Python package 123 | runs-on: ubuntu-latest 124 | 125 | steps: 126 | - name: Checkout repository 127 | uses: actions/checkout@v4 128 | with: 129 | fetch-depth: 0 130 | 131 | - name: Set up Python 132 | uses: actions/setup-python@v5 133 | with: 134 | python-version: '3.11' 135 | 136 | - name: Install uv 137 | uses: astral-sh/setup-uv@v7.1.6 138 | 139 | - name: Build package 140 | run: uv build 141 | 142 | - name: Install validation tools 143 | run: | 144 | uv tool install twine 145 | 146 | - name: Verify build 147 | run: | 148 | uv tool run twine check dist/* 149 | ls -la dist/ 150 | 151 | - name: Upload build artifacts 152 | uses: actions/upload-artifact@v4 153 | with: 154 | name: dist 155 | path: dist/ 156 | 157 | publish-testpypi: 158 | name: Publish to TestPyPI 159 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.environment == 'testpypi' 160 | needs: build 161 | runs-on: ubuntu-latest 162 | environment: 163 | name: testpypi 164 | url: https://test.pypi.org/p/pranaam 165 | permissions: 166 | id-token: write 167 | 168 | steps: 169 | - name: Download build artifacts 170 | uses: actions/download-artifact@v4 171 | with: 172 | name: dist 173 | path: dist/ 174 | 175 | - name: Publish to TestPyPI 176 | uses: pypa/gh-action-pypi-publish@release/v1 177 | with: 178 | repository-url: https://test.pypi.org/legacy/ 179 | 180 | publish-pypi: 181 | name: Publish to PyPI 182 | if: | 183 | (github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')) || 184 | (github.event_name == 'release' && github.event.action == 'published') || 185 | (github.event_name == 'workflow_dispatch' && github.event.inputs.environment == 'pypi') 186 | needs: build 187 | runs-on: ubuntu-latest 188 | environment: 189 | name: pypi 190 | url: https://pypi.org/p/pranaam 191 | permissions: 192 | id-token: write 193 | 194 | steps: 195 | - name: Download build artifacts 196 | uses: actions/download-artifact@v4 197 | with: 198 | name: dist 199 | path: dist/ 200 | 201 | - name: Publish to PyPI 202 | uses: pypa/gh-action-pypi-publish@release/v1 -------------------------------------------------------------------------------- /pranaam/naam.py: -------------------------------------------------------------------------------- 1 | """Main prediction module for religion classification.""" 2 | 3 | from pathlib import Path 4 | from typing import Final, Literal 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import tensorflow as tf 9 | 10 | from .base import Base 11 | from .logging import get_logger 12 | 13 | logger = get_logger() 14 | 15 | 16 | def is_english(text: str) -> bool: 17 | """Check if text contains only ASCII characters (English). 
18 | 19 | Args: 20 | text: Input text to check 21 | 22 | Returns: 23 | True if text is ASCII-only (English), False otherwise 24 | """ 25 | try: 26 | text.encode(encoding="utf-8").decode("ascii") 27 | return True 28 | except UnicodeDecodeError: 29 | return False 30 | 31 | 32 | class Naam(Base): 33 | """Main class for religion prediction from names.""" 34 | 35 | MODELFN: str = "model" 36 | weights_loaded: bool = False 37 | model: tf.keras.Model | None = None 38 | model_path: Path | None = None 39 | classes: Final[list[str]] = ["not-muslim", "muslim"] 40 | cur_lang: str = "eng" 41 | model_name: Final[str] = "eng_and_hindi_models_v2" 42 | 43 | @classmethod 44 | def pred_rel( 45 | cls, 46 | names: str | list[str] | pd.Series, 47 | lang: Literal["eng", "hin"] = "eng", 48 | latest: bool = False, 49 | ) -> pd.DataFrame: 50 | """Predict religion based on name(s). 51 | 52 | Args: 53 | names: Single name string, list of names, or pandas Series of names 54 | lang: Language of input ('eng' for English, 'hin' for Hindi) 55 | latest: Whether to download latest model version 56 | 57 | Returns: 58 | DataFrame with columns: name, pred_label, pred_prob_muslim 59 | 60 | Raises: 61 | ValueError: If invalid language specified 62 | RuntimeError: If model loading fails or TensorFlow not available 63 | """ 64 | if lang not in ["eng", "hin"]: 65 | raise ValueError(f"Unsupported language: {lang}. Use 'eng' or 'hin'") 66 | 67 | # Convert single string or pandas Series to list for consistent processing 68 | if isinstance(names, str): 69 | name_list = [names] 70 | elif isinstance(names, pd.Series): 71 | name_list = names.tolist() 72 | else: 73 | name_list = list(names) 74 | 75 | if not name_list: 76 | raise ValueError("Input names list cannot be empty") 77 | 78 | # Validate that no names are empty or contain only whitespace 79 | for i, name in enumerate(name_list): 80 | if not name or not name.strip(): 81 | raise ValueError( 82 | f"Name at index {i} is empty or contains only whitespace" 83 | ) 84 | 85 | # Load model if not loaded or language changed 86 | if not cls.weights_loaded or cls.cur_lang != lang: 87 | cls._load_model(lang, latest) 88 | 89 | # Make predictions 90 | try: 91 | if cls.model is None: 92 | raise RuntimeError("Model not loaded properly") 93 | 94 | results = cls.model.predict(name_list, verbose=0) 95 | predictions = tf.argmax(results, axis=1) 96 | probabilities = tf.nn.softmax(results) 97 | 98 | # Extract results 99 | labels = [cls.classes[pred] for pred in predictions.numpy()] 100 | muslim_probs = [ 101 | float(np.around(prob[1] * 100)) for prob in probabilities.numpy() 102 | ] 103 | 104 | return pd.DataFrame( 105 | { 106 | "name": name_list, 107 | "pred_label": labels, 108 | "pred_prob_muslim": muslim_probs, 109 | } 110 | ) 111 | 112 | except Exception as e: 113 | logger.error(f"Prediction failed: {e}") 114 | raise RuntimeError(f"Prediction failed: {e}") from e 115 | 116 | @classmethod 117 | def _load_model(cls, lang: Literal["eng", "hin"], latest: bool = False) -> None: 118 | """Load the appropriate model for the specified language. 
119 | 120 | Args: 121 | lang: Language code ('eng' or 'hin') 122 | latest: Whether to download latest model version 123 | 124 | Raises: 125 | RuntimeError: If model loading fails 126 | """ 127 | try: 128 | cls.model_path = cls.load_model_data(cls.model_name, latest) 129 | if cls.model_path is None: 130 | raise RuntimeError("Failed to load model data") 131 | 132 | model_filename = f"{lang}_model.keras" 133 | model_full_path = cls.model_path / cls.model_name / model_filename 134 | 135 | logger.info(f"Loading {lang} model from {model_full_path}") 136 | 137 | # Load Keras 3 compatible model using tf-keras 138 | cls.model = cls._load_model_with_compatibility(str(model_full_path), lang) 139 | cls.weights_loaded = True 140 | cls.cur_lang = lang 141 | 142 | except Exception as e: 143 | logger.error(f"Failed to load {lang} model: {e}") 144 | raise RuntimeError(f"Failed to load {lang} model: {e}") from e 145 | 146 | @classmethod 147 | def _load_model_with_compatibility( 148 | cls, model_path: str, lang: Literal["eng", "hin"] 149 | ) -> tf.keras.Model: 150 | """Load Keras 3 compatible model using tf-keras for compatibility. 151 | 152 | Args: 153 | model_path: Path to .keras model file 154 | lang: Language code for error messages 155 | 156 | Returns: 157 | Loaded TensorFlow model 158 | 159 | Raises: 160 | RuntimeError: If model loading fails 161 | """ 162 | # Fix Windows encoding for model assets with Unicode content 163 | import sys 164 | 165 | if sys.platform == "win32": 166 | import os 167 | 168 | os.environ.setdefault("PYTHONIOENCODING", "utf-8") 169 | 170 | try: 171 | # Use tf-keras for loading the migrated Keras 3 models 172 | import tf_keras as keras 173 | 174 | logger.info(f"Loading {lang} model with tf-keras compatibility layer") 175 | return keras.models.load_model(model_path) 176 | except ImportError: 177 | logger.info( 178 | f"tf-keras not available, trying standard Keras for {lang} model" 179 | ) 180 | try: 181 | return tf.keras.models.load_model(model_path) 182 | except Exception as e: 183 | raise RuntimeError( 184 | f"Standard Keras loading failed for {lang} model: {e}" 185 | ) from e 186 | except Exception as e: 187 | logger.error(f"tf-keras loading failed for {lang} model: {e}") 188 | raise RuntimeError(f"Model loading failed for {lang} model: {e}") from e 189 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | We welcome contributions to pranaam! This guide will help you get started. 4 | 5 | ## Code of Conduct 6 | 7 | The project welcomes contributions from everyone! It depends on it. To maintain this welcoming atmosphere and to collaborate in a fun and productive way, we expect contributors to the project to abide by the [Contributor Code of Conduct](http://contributor-covenant.org/version/1/0/0/). 8 | 9 | ## Getting Started 10 | 11 | ### Development Setup 12 | 13 | 1. Fork the repository on GitHub 14 | 2. Clone your fork locally: 15 | 16 | ```bash 17 | git clone https://github.com/yourusername/pranaam.git 18 | cd pranaam 19 | ``` 20 | 21 | 3. Create a virtual environment: 22 | 23 | ```bash 24 | python -m venv venv 25 | source venv/bin/activate # On Windows: venv\Scripts\activate 26 | ``` 27 | 28 | 4. Install development dependencies: 29 | 30 | ```bash 31 | pip install -e .[dev,test,docs] 32 | ``` 33 | 34 | 5. 
Install pre-commit hooks: 35 | 36 | ```bash 37 | pre-commit install 38 | ``` 39 | 40 | ## Development Workflow 41 | 42 | ### Code Quality Standards 43 | 44 | We maintain high code quality standards: 45 | 46 | * **Type Safety**: 100% MyPy compliance required 47 | * **Code Formatting**: Ruff formatting (Black-compatible, line length 88) 48 | * **Testing**: Comprehensive test coverage 49 | * **Documentation**: All public APIs must be documented 50 | 51 | ### Running Tests 52 | 53 | Run the full test suite: 54 | 55 | ```bash 56 | pytest 57 | ``` 58 | 59 | Run specific test categories: 60 | 61 | ```bash 62 | pytest -m unit # Unit tests only 63 | pytest -m integration # Integration tests only 64 | pytest -m "not slow" # Skip slow tests 65 | ``` 66 | 67 | Run with coverage: 68 | 69 | ```bash 70 | pytest --cov=pranaam --cov-report=html 71 | ``` 72 | 73 | ### Code Quality Checks 74 | 75 | Format code with Ruff: 76 | 77 | ```bash 78 | ruff format pranaam/ 79 | ``` 80 | 81 | Type check with MyPy: 82 | 83 | ```bash 84 | mypy pranaam/ 85 | ``` 86 | 87 | Both commands must pass without errors before submitting a PR. 88 | 89 | ## Types of Contributions 90 | 91 | ### Bug Reports 92 | 93 | When reporting bugs, please include: 94 | 95 | * Python version and operating system 96 | * TensorFlow version 97 | * Complete error traceback 98 | * Minimal code example to reproduce the issue 99 | * Expected vs. actual behavior 100 | 101 | ### Feature Requests 102 | 103 | Before submitting feature requests: 104 | 105 | * Check existing issues and discussions 106 | * Provide clear use case and rationale 107 | * Consider implementation complexity 108 | * Discuss API design implications 109 | 110 | ### Code Contributions 111 | 112 | Areas where we welcome contributions: 113 | 114 | * **New Language Support**: Adding support for additional Indian languages 115 | * **Model Improvements**: Better accuracy or efficiency 116 | * **Performance Optimizations**: Faster prediction times 117 | * **Documentation**: Improved examples and guides 118 | * **Testing**: Additional test cases and edge cases 119 | * **Bug Fixes**: Resolving reported issues 120 | 121 | ## Submission Guidelines 122 | 123 | ### Pull Request Process 124 | 125 | 1. Create a feature branch: 126 | 127 | ```bash 128 | git checkout -b feature/your-feature-name 129 | ``` 130 | 131 | 2. Make your changes following our coding standards 132 | 3. Add or update tests as needed 133 | 4. Update documentation if applicable 134 | 5. Ensure all tests pass: 135 | 136 | ```bash 137 | pytest 138 | ruff format --check pranaam/ 139 | mypy pranaam/ 140 | ``` 141 | 142 | 6. Commit your changes with clear commit messages 143 | 7. Push to your fork and submit a pull request 144 | 145 | ### Pull Request Requirements 146 | 147 | Your PR must: 148 | 149 | * Pass all CI checks (tests, linting, type checking) 150 | * Include appropriate tests for new functionality 151 | * Update documentation for API changes 152 | * Follow semantic versioning principles 153 | * Include a clear description of changes 154 | 155 | ## Code Style Guidelines 156 | 157 | ### Python Code Style 158 | 159 | * Follow PEP 8 with Ruff formatting (Black-compatible) 160 | * Use type hints for all function signatures 161 | * Write docstrings for all public functions and classes 162 | * Maximum line length: 88 characters 163 | * Use meaningful variable and function names 164 | 165 | ### Documentation Style 166 | 167 | * Use Markdown (.md) format 168 | * Include code examples for new features 169 | * Write clear, concise explanations 170 | * Update API documentation for code changes 171 | 172 | ### Testing Guidelines 173 | 174 | * Write unit tests for all new functions 175 | * Include integration tests for complex features 176 | * Test edge cases and error conditions 177 | * Mock external dependencies appropriately 178 | * Aim for high test coverage (>90%) 179 | 
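For instance, a minimal unit test in this spirit, exercising only the input validation in `pred_rel` (no model download is needed, because validation runs before any model is loaded):

```python
import pytest

import pranaam


def test_pred_rel_rejects_bad_input() -> None:
    """Input validation should fail fast, before any model is loaded."""
    with pytest.raises(ValueError):
        pranaam.pred_rel([])  # empty input list
    with pytest.raises(ValueError):
        pranaam.pred_rel(["   "])  # whitespace-only name
```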
180 | ## Project Structure 181 | 182 | ### Understanding the Codebase 183 | 184 | ```text 185 | pranaam/ 186 | ├── __init__.py # Package initialization 187 | ├── naam.py # Core Naam class with pred_rel method 188 | ├── base.py # Base class for model data management 189 | ├── utils.py # Utility functions 190 | ├── logging.py # Centralized logging configuration 191 | ├── pranaam.py # CLI entry point and function exports 192 | └── tests/ # Comprehensive test suite 193 | ├── conftest.py # pytest fixtures 194 | ├── test_naam.py # Core functionality tests 195 | ├── test_integration.py # End-to-end integration tests 196 | └── ... # Additional test modules 197 | ``` 198 | 199 | ### Key Components 200 | 201 | * **naam.py**: Core prediction logic and model loading 202 | * **base.py**: Model data management using importlib.resources 203 | * **utils.py**: Helper functions for data processing 204 | * **logging.py**: Centralized logging configuration 205 | * **tests/**: Comprehensive test suite with 75+ tests 206 | 207 | ## Release Process 208 | 209 | ### Version Management 210 | 211 | We follow semantic versioning (MAJOR.MINOR.PATCH): 212 | 213 | * **MAJOR**: Breaking API changes 214 | * **MINOR**: New features, backward compatible 215 | * **PATCH**: Bug fixes, backward compatible 216 | 217 | ### Release Checklist 218 | 219 | Before releasing a new version: 220 | 221 | 1. Update version in `pyproject.toml` 222 | 2. Update `CLAUDE.md` with changes and test status 223 | 3. Run full test suite: `pytest` (must be 75/75 passing) 224 | 4. Check formatting: `ruff format --check pranaam/` 225 | 5. Type check: `mypy pranaam/` (must pass with zero errors) 226 | 6. Build package: `python -m build` 227 | 7. Validate: `python -m twine check dist/*` 228 | 8. Test in clean environment 229 | 9. 
Verify CI passes on GitHub Actions 230 | 231 | ## Communication 232 | 233 | ### Getting Help 234 | 235 | * **GitHub Issues**: Bug reports and feature requests 236 | * **GitHub Discussions**: General questions and ideas 237 | * **Documentation**: Check our comprehensive docs first 238 | 239 | ### Maintainer Response 240 | 241 | We aim to: 242 | 243 | * Acknowledge issues within 48 hours 244 | * Review pull requests within 1 week 245 | * Provide constructive feedback 246 | * Maintain respectful, professional communication 247 | 248 | ## Recognition 249 | 250 | Contributors are recognized in: 251 | 252 | * Release notes for significant contributions 253 | * `AUTHORS.md` file (if we create one) 254 | * GitHub contributors page 255 | 256 | Thank you for contributing to pranaam! Your efforts help make this tool better for researchers, activists, and developers working to understand and address bias in AI systems. -------------------------------------------------------------------------------- /model_training/01_uncompress_data.ipynb: -------------------------------------------------------------------------------- 1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"uncompress_data.ipynb","provenance":[],"collapsed_sections":[],"mount_file_id":"1p3X_00mbIDIhqM4LU419wIYyowW-zIOB","authorship_tag":"ABX9TyNK2jDcG/MQdVQOtWtBatEr"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"8qQMG94kiefz","executionInfo":{"status":"ok","timestamp":1641743651971,"user_tz":480,"elapsed":168,"user":{"displayName":"Rajashekar Chintalapati","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhvaKS4oQw2sBCyEe0cxq9rGBNCg4w2UoWOgVaD660=s64","userId":"03596288833202137831"}},"outputId":"d6015f6a-99ce-459c-9b97-d8755f7e0332"},"outputs":[{"output_type":"stream","name":"stdout","text":["/content/drive/MyDrive/Colab/islampy\n"]}],"source":["%cd /content/drive/MyDrive/Colab/islampy/"]},{"cell_type":"code","source":["%ls"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"cQ8fB05ZjB4G","executionInfo":{"status":"ok","timestamp":1641743660162,"user_tz":480,"elapsed":160,"user":{"displayName":"Rajashekar Chintalapati","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhvaKS4oQw2sBCyEe0cxq9rGBNCg4w2UoWOgVaD660=s64","userId":"03596288833202137831"}},"outputId":"e5498bec-4a8b-4fb0-ff02-5d94213fb0cd"},"execution_count":2,"outputs":[{"output_type":"stream","name":"stdout","text":["\u001b[0m\u001b[01;34mdata\u001b[0m/ islampy.ipynb\n"]}]},{"cell_type":"code","source":["%ls -ltr data/bihar_lr_caste_muslim.7z"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"gs14jzh7jEIb","executionInfo":{"status":"ok","timestamp":1641743698534,"user_tz":480,"elapsed":169,"user":{"displayName":"Rajashekar Chintalapati","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhvaKS4oQw2sBCyEe0cxq9rGBNCg4w2UoWOgVaD660=s64","userId":"03596288833202137831"}},"outputId":"6d655800-7971-4844-b6e4-5706887ed67b"},"execution_count":3,"outputs":[{"output_type":"stream","name":"stdout","text":["-rw------- 1 root root 36073692 Jan 9 15:50 data/bihar_lr_caste_muslim.7z\n"]}]},{"cell_type":"code","source":["%cd data/"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"7eZ9vUx-jObm","executionInfo":{"status":"ok","timestamp":1641743791001,"user_tz":480,"elapsed":150,"user":{"displayName":"Rajashekar 
Chintalapati","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhvaKS4oQw2sBCyEe0cxq9rGBNCg4w2UoWOgVaD660=s64","userId":"03596288833202137831"}},"outputId":"ae53bf99-2d6b-4b98-962b-769aae98ad45"},"execution_count":5,"outputs":[{"output_type":"stream","name":"stdout","text":["/content/drive/My Drive/Colab/islampy/data\n"]}]},{"cell_type":"code","source":["!7za e bihar_lr_caste_muslim.7z"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"W9opSBd_jjqU","executionInfo":{"status":"ok","timestamp":1641743823164,"user_tz":480,"elapsed":5626,"user":{"displayName":"Rajashekar Chintalapati","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhvaKS4oQw2sBCyEe0cxq9rGBNCg4w2UoWOgVaD660=s64","userId":"03596288833202137831"}},"outputId":"00f846cd-5cb2-400d-f671-91991ea873e0"},"execution_count":6,"outputs":[{"output_type":"stream","name":"stdout","text":["\n","7-Zip (a) [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21\n","p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.20GHz (406F0),ASM,AES-NI)\n","\n","Scanning the drive for archives:\n","1 file, 36073692 bytes (35 MiB)\n","\n","Extracting archive: bihar_lr_caste_muslim.7z\n","--\n","Path = bihar_lr_caste_muslim.7z\n","Type = 7z\n","Physical Size = 36073692\n","Headers Size = 154\n","Method = LZMA2:24\n","Solid = -\n","Blocks = 1\n","\n","Everything is Ok\n","\n","Size: 260041734\n","Compressed: 36073692\n"]}]},{"cell_type":"code","source":["%ls"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"jzOT-UGcjsP8","executionInfo":{"status":"ok","timestamp":1641743835806,"user_tz":480,"elapsed":325,"user":{"displayName":"Rajashekar Chintalapati","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhvaKS4oQw2sBCyEe0cxq9rGBNCg4w2UoWOgVaD660=s64","userId":"03596288833202137831"}},"outputId":"27943c1c-f2f7-49ca-d9d9-b7ff8e2fee7c"},"execution_count":7,"outputs":[{"output_type":"stream","name":"stdout","text":["bihar_lr_caste_muslim.7z bihar_lr_caste_muslim.csv\n"]}]},{"cell_type":"code","source":[""],"metadata":{"id":"Ply96Wlqju1z"},"execution_count":null,"outputs":[]}]} -------------------------------------------------------------------------------- /examples/performance_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Performance and Caching Demo for Pranaam 4 | 5 | This script demonstrates the performance characteristics and caching behavior 6 | of pranaam, including batch processing and language switching. 
7 | """ 8 | 9 | import time 10 | 11 | import pranaam 12 | from pranaam.naam import Naam 13 | 14 | 15 | def reset_model_state(): 16 | """Reset model state for clean timing.""" 17 | Naam.model = None 18 | Naam.weights_loaded = False 19 | Naam.cur_lang = None 20 | 21 | 22 | def time_function(func, *args, **kwargs): 23 | """Time a function call and return result and elapsed time.""" 24 | start = time.time() 25 | result = func(*args, **kwargs) 26 | elapsed = time.time() - start 27 | return result, elapsed 28 | 29 | 30 | def batch_size_performance(): 31 | """Test performance across different batch sizes.""" 32 | print("⚡ Batch Size Performance Analysis") 33 | print("=" * 50) 34 | 35 | # Generate test names 36 | base_names = [ 37 | "Shah Rukh Khan", 38 | "Amitabh Bachchan", 39 | "Salman Khan", 40 | "Priya Sharma", 41 | "Mohammed Ali", 42 | "Raj Patel", 43 | ] 44 | 45 | batch_sizes = [1, 5, 10, 25, 50, 100] 46 | results = [] 47 | 48 | for batch_size in batch_sizes: 49 | # Create batch by repeating/cycling base names 50 | test_names = (base_names * ((batch_size // len(base_names)) + 1))[:batch_size] 51 | 52 | # Reset state for clean timing 53 | reset_model_state() 54 | 55 | # Time the prediction 56 | _, elapsed = time_function(pranaam.pred_rel, test_names, lang="eng") 57 | 58 | names_per_sec = batch_size / elapsed 59 | ms_per_name = (elapsed * 1000) / batch_size 60 | 61 | results.append( 62 | { 63 | "batch_size": batch_size, 64 | "total_time": elapsed, 65 | "names_per_sec": names_per_sec, 66 | "ms_per_name": ms_per_name, 67 | } 68 | ) 69 | 70 | print( 71 | f"Batch {batch_size:>3}: {elapsed:>6.2f}s total, {names_per_sec:>6.1f} names/sec, {ms_per_name:>6.1f}ms/name" 72 | ) 73 | 74 | # Show efficiency gains 75 | print("\nBatch Processing Efficiency:") 76 | single_ms = results[0]["ms_per_name"] 77 | for r in results[1:]: 78 | speedup = single_ms / r["ms_per_name"] 79 | print( 80 | f"Batch {r['batch_size']:>3}: {speedup:>4.1f}x faster than single predictions" 81 | ) 82 | print() 83 | 84 | 85 | def model_caching_behavior(): 86 | """Demonstrate model caching and reload behavior.""" 87 | print("💾 Model Caching and Reload Behavior") 88 | print("=" * 50) 89 | 90 | test_name = "Shah Rukh Khan" 91 | 92 | # First prediction - includes model loading 93 | reset_model_state() 94 | print("First prediction (includes model download/loading):") 95 | _, elapsed1 = time_function(pranaam.pred_rel, test_name, lang="eng") 96 | print(f" Time: {elapsed1:.2f}s") 97 | print(f" Model loaded: {Naam.weights_loaded}") 98 | print(f" Current language: {Naam.cur_lang}") 99 | 100 | # Second prediction - should use cached model 101 | print("\nSecond prediction (cached model):") 102 | _, elapsed2 = time_function(pranaam.pred_rel, test_name, lang="eng") 103 | print(f" Time: {elapsed2:.2f}s") 104 | print(f" Speedup: {elapsed1 / elapsed2:.1f}x faster") 105 | 106 | # Third prediction with different name - still cached 107 | print("\nThird prediction with different name (still cached):") 108 | _, elapsed3 = time_function(pranaam.pred_rel, "Amitabh Bachchan", lang="eng") 109 | print(f" Time: {elapsed3:.2f}s") 110 | print(f" Similar performance: {abs(elapsed3 - elapsed2) < 0.1}") 111 | print() 112 | 113 | 114 | def language_switching_performance(): 115 | """Test performance when switching between languages.""" 116 | print("🔄 Language Switching Performance") 117 | print("=" * 50) 118 | 119 | english_name = "Shah Rukh Khan" 120 | hindi_name = "शाहरुख खान" 121 | 122 | # Start with English 123 | reset_model_state() 124 | print("Initial English 
prediction:") 125 | _, elapsed_eng1 = time_function(pranaam.pred_rel, english_name, lang="eng") 126 | print(f" Time: {elapsed_eng1:.2f}s (includes model loading)") 127 | print(f" Current language: {Naam.cur_lang}") 128 | 129 | # Switch to Hindi - requires model reload 130 | print("\nSwitch to Hindi (requires model reload):") 131 | _, elapsed_hin = time_function(pranaam.pred_rel, hindi_name, lang="hin") 132 | print(f" Time: {elapsed_hin:.2f}s") 133 | print(f" Current language: {Naam.cur_lang}") 134 | 135 | # Switch back to English - requires reload again 136 | print("\nSwitch back to English (requires reload):") 137 | _, elapsed_eng2 = time_function(pranaam.pred_rel, english_name, lang="eng") 138 | print(f" Time: {elapsed_eng2:.2f}s") 139 | print(f" Current language: {Naam.cur_lang}") 140 | 141 | # Second English prediction - should be fast 142 | print("\nSecond English prediction (cached):") 143 | _, elapsed_eng3 = time_function(pranaam.pred_rel, english_name, lang="eng") 144 | print(f" Time: {elapsed_eng3:.2f}s") 145 | print(f" Speedup vs reload: {elapsed_eng2 / elapsed_eng3:.1f}x") 146 | print() 147 | 148 | 149 | def memory_usage_demo(): 150 | """Show memory considerations for large batches.""" 151 | print("🧠 Memory Usage Considerations") 152 | print("=" * 50) 153 | 154 | # Test with increasingly large batches 155 | base_names = ["Shah Rukh Khan", "Priya Sharma", "Mohammed Ali"] 156 | 157 | for size in [100, 500, 1000]: 158 | test_names = (base_names * (size // len(base_names) + 1))[:size] 159 | 160 | print(f"Processing {size} names...") 161 | _, elapsed = time_function(pranaam.pred_rel, test_names, lang="eng") 162 | 163 | rate = size / elapsed 164 | print(f" Time: {elapsed:.2f}s ({rate:.0f} names/sec)") 165 | 166 | print("\n💡 Memory Tips:") 167 | print(" • Process in chunks of 1000-5000 names for optimal memory usage") 168 | print(" • Model stays loaded between predictions (uses ~500MB RAM)") 169 | print(" • Language switching reloads model but previous model is freed") 170 | print() 171 | 172 | 173 | def practical_benchmarks(): 174 | """Show practical real-world performance benchmarks.""" 175 | print("📊 Practical Performance Benchmarks") 176 | print("=" * 50) 177 | 178 | # Typical use cases 179 | use_cases = [ 180 | ("Single name lookup", 1), 181 | ("Small batch (department)", 25), 182 | ("Medium batch (company)", 500), 183 | ("Large batch (survey)", 5000), 184 | ] 185 | 186 | base_names = [ 187 | "Shah Rukh Khan", 188 | "Amitabh Bachchan", 189 | "Priya Sharma", 190 | "Mohammed Ali", 191 | "Raj Patel", 192 | "Fatima Khan", 193 | ] 194 | 195 | print("Use Case | Size | Time | Rate") 196 | print("-" * 55) 197 | 198 | for use_case, size in use_cases: 199 | test_names = (base_names * (size // len(base_names) + 1))[:size] 200 | 201 | # Reset for fair timing 202 | reset_model_state() 203 | _, elapsed = time_function(pranaam.pred_rel, test_names, lang="eng") 204 | 205 | rate = size / elapsed 206 | print(f"{use_case:<25} | {size:>4} | {elapsed:>6.2f}s | {rate:>6.0f}/sec") 207 | 208 | print("\n🎯 Performance Summary:") 209 | print(" • Cold start (first prediction): ~3-5 seconds") 210 | print(" • Warm predictions: 100-500+ names/second") 211 | print(" • Model loading is one-time cost per language") 212 | print(" • Batch processing is highly efficient") 213 | 214 | 215 | if __name__ == "__main__": 216 | print("🚀 Pranaam Performance and Caching Demo") 217 | print("=" * 60) 218 | print("This script analyzes performance characteristics of the pranaam package.\n") 219 | 220 | batch_size_performance() 
221 | model_caching_behavior() 222 | language_switching_performance() 223 | memory_usage_demo() 224 | practical_benchmarks() 225 | 226 | print("✅ Performance analysis completed!") 227 | print( 228 | "Use these insights to optimize your pranaam usage for your specific use case." 229 | ) 230 | -------------------------------------------------------------------------------- /tests/test_logging.py: -------------------------------------------------------------------------------- 1 | """Tests for logging module.""" 2 | 3 | import logging 4 | from unittest.mock import Mock, patch 5 | 6 | import pytest 7 | from rich.logging import RichHandler 8 | 9 | from pranaam.logging import get_logger 10 | 11 | 12 | class TestGetLogger: 13 | """Test get_logger function.""" 14 | 15 | def test_default_logger_name(self) -> None: 16 | """Test getting logger with default name.""" 17 | logger = get_logger() 18 | assert logger.name == "pranaam" 19 | assert isinstance(logger, logging.Logger) 20 | 21 | def test_custom_logger_name(self) -> None: 22 | """Test getting logger with custom name.""" 23 | logger = get_logger("custom_name") 24 | assert logger.name == "custom_name" 25 | assert isinstance(logger, logging.Logger) 26 | 27 | def test_logger_configuration(self) -> None: 28 | """Test that logger is properly configured.""" 29 | with patch("logging.getLogger") as mock_get_logger: 30 | mock_logger = mock_get_logger.return_value 31 | mock_logger.handlers = [] # No existing handlers 32 | 33 | get_logger("test") 34 | 35 | # Should add handler and set level 36 | mock_logger.addHandler.assert_called_once() 37 | mock_logger.setLevel.assert_called_once_with(logging.INFO) 38 | 39 | def test_no_duplicate_handlers(self) -> None: 40 | """Test that handlers are not duplicated on multiple calls.""" 41 | with patch("logging.getLogger") as mock_get_logger: 42 | mock_logger = mock_get_logger.return_value 43 | mock_logger.handlers = [logging.StreamHandler()] # Already has handler 44 | 45 | get_logger("test") 46 | 47 | # Should not add another handler 48 | mock_logger.addHandler.assert_not_called() 49 | mock_logger.setLevel.assert_not_called() 50 | 51 | def test_handler_formatter(self) -> None: 52 | """Test that handler has proper formatter.""" 53 | # Clear any existing loggers to ensure clean test 54 | logger_name = "test_formatter_logger" 55 | if logger_name in logging.Logger.manager.loggerDict: 56 | del logging.Logger.manager.loggerDict[logger_name] 57 | 58 | logger = get_logger(logger_name) 59 | 60 | # Should have at least one handler 61 | assert len(logger.handlers) >= 1 62 | 63 | # Get the first handler (should be our RichHandler) 64 | handler = logger.handlers[0] 65 | assert isinstance(handler, RichHandler) 66 | 67 | # Check formatter 68 | formatter = handler.formatter 69 | assert formatter is not None 70 | 71 | # Test formatter format string 72 | format_str = formatter._fmt 73 | assert format_str is not None 74 | # The actual format is simplified: "%(name)s - %(message)s" 75 | expected_components = [ 76 | "%(name)s", 77 | "%(message)s", 78 | ] 79 | for component in expected_components: 80 | assert component in format_str 81 | 82 | def test_logger_level(self) -> None: 83 | """Test that logger is set to INFO level.""" 84 | logger_name = "test_level_logger" 85 | if logger_name in logging.Logger.manager.loggerDict: 86 | del logging.Logger.manager.loggerDict[logger_name] 87 | 88 | logger = get_logger(logger_name) 89 | assert logger.level == logging.INFO 90 | 91 | def test_same_logger_instance(self) -> None: 92 | """Test that same 
logger name returns same instance.""" 93 | logger1 = get_logger("same_name") 94 | logger2 = get_logger("same_name") 95 | 96 | assert logger1 is logger2 97 | 98 | def test_different_logger_instances(self) -> None: 99 | """Test that different logger names return different instances.""" 100 | logger1 = get_logger("name1") 101 | logger2 = get_logger("name2") 102 | 103 | assert logger1 is not logger2 104 | assert logger1.name != logger2.name 105 | 106 | 107 | class TestLoggerFunctionality: 108 | """Test actual logging functionality.""" 109 | 110 | def test_logger_can_log_messages(self, caplog: pytest.LogCaptureFixture) -> None: 111 | """Test that logger can actually log messages.""" 112 | logger = get_logger("test_logging") 113 | 114 | with caplog.at_level(logging.INFO): 115 | logger.info("Test info message") 116 | logger.warning("Test warning message") 117 | logger.error("Test error message") 118 | 119 | # Check that messages were logged 120 | assert "Test info message" in caplog.text 121 | assert "Test warning message" in caplog.text 122 | assert "Test error message" in caplog.text 123 | 124 | def test_logger_debug_level_filtering( 125 | self, caplog: pytest.LogCaptureFixture 126 | ) -> None: 127 | """Test that DEBUG messages are filtered out by default.""" 128 | logger = get_logger("test_debug") 129 | 130 | # Test at INFO level (default) - debug should be filtered 131 | with caplog.at_level(logging.INFO): 132 | logger.debug("Debug message") 133 | logger.info("Info message") 134 | 135 | # Debug should NOT appear at INFO level 136 | assert "Debug message" not in caplog.text 137 | assert "Info message" in caplog.text 138 | 139 | # Clear and test at DEBUG level - debug should appear 140 | caplog.clear() 141 | with caplog.at_level(logging.DEBUG): 142 | # Ensure logger accepts DEBUG level 143 | logger.setLevel(logging.DEBUG) 144 | logger.debug("Debug message") 145 | logger.info("Info message") 146 | 147 | # Both should appear at DEBUG level 148 | assert "Debug message" in caplog.text 149 | assert "Info message" in caplog.text 150 | 151 | def test_logger_formatting_output(self, caplog: pytest.LogCaptureFixture) -> None: 152 | """Test that log messages are properly formatted.""" 153 | logger = get_logger("test_format") 154 | 155 | with caplog.at_level(logging.INFO): 156 | logger.info("Test message") 157 | 158 | # Check format contains expected components 159 | log_output = caplog.text 160 | assert "test_format" in log_output # Logger name 161 | assert "INFO" in log_output # Log level 162 | assert "Test message" in log_output # Actual message 163 | # Note: timestamp format may vary, so we don't test exact format 164 | 165 | 166 | class TestLoggerEdgeCases: 167 | """Test edge cases and error conditions.""" 168 | 169 | def test_logger_with_empty_name(self) -> None: 170 | """Test logger with empty string name.""" 171 | logger = get_logger("") 172 | # Empty string gets converted to "pranaam" by our function 173 | assert logger.name == "pranaam" 174 | assert isinstance(logger, logging.Logger) 175 | 176 | def test_logger_with_none_name(self) -> None: 177 | """Test logger with None name (should use default).""" 178 | logger = get_logger(None) 179 | assert logger.name == "pranaam" 180 | 181 | def test_logger_with_special_characters(self) -> None: 182 | """Test logger name with special characters.""" 183 | special_name = "test.logger-with_special123" 184 | logger = get_logger(special_name) 185 | assert logger.name == special_name 186 | 187 | @patch("pranaam.logging.RichHandler") 188 | def 
test_handler_creation_error(self, mock_handler: Mock) -> None: 189 | """Test handling of handler creation errors.""" 190 | mock_handler.side_effect = Exception("Handler creation failed") 191 | 192 | # Should raise exception since we don't handle this error 193 | with pytest.raises(Exception, match="Handler creation failed"): 194 | get_logger("test_handler_error") 195 | 196 | def test_multiple_calls_same_name(self) -> None: 197 | """Test multiple calls with same name don't create duplicate config.""" 198 | logger_name = "test_multiple_calls" 199 | 200 | # Clear existing logger 201 | if logger_name in logging.Logger.manager.loggerDict: 202 | del logging.Logger.manager.loggerDict[logger_name] 203 | 204 | logger1 = get_logger(logger_name) 205 | initial_handlers = len(logger1.handlers) 206 | 207 | logger2 = get_logger(logger_name) 208 | final_handlers = len(logger2.handlers) 209 | 210 | # Should not add more handlers 211 | assert initial_handlers == final_handlers 212 | assert logger1 is logger2 213 | -------------------------------------------------------------------------------- /docs/examples.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | This page contains practical examples of using pranaam in various scenarios. 4 | 5 | ## Basic Examples 6 | 7 | ### Single Name Prediction 8 | 9 | ```python 10 | import pranaam 11 | 12 | # Basic prediction 13 | result = pranaam.pred_rel("Abdul Kalam") 14 | print(result) 15 | 16 | # Output: 17 | # name pred_label pred_prob_muslim 18 | # 0 Abdul Kalam muslim 85.5 19 | ``` 20 | 21 | ### Multiple Names 22 | 23 | ```python 24 | import pranaam 25 | 26 | # List of names 27 | bollywood_actors = [ 28 | "Shah Rukh Khan", 29 | "Amitabh Bachchan", 30 | "Salman Khan", 31 | "Aamir Khan", 32 | "Akshay Kumar" 33 | ] 34 | 35 | results = pranaam.pred_rel(bollywood_actors, lang="eng") 36 | print(results) 37 | ``` 38 | 39 | ## Data Analysis Examples 40 | 41 | ### Working with CSV Files 42 | 43 | ```python 44 | import pandas as pd 45 | import pranaam 46 | 47 | # Read names from CSV 48 | df = pd.read_csv('names.csv') 49 | 50 | # Predict religion for name column 51 | predictions = pranaam.pred_rel(df['name'], lang="eng") 52 | 53 | # Merge predictions with original data 54 | df_with_predictions = pd.concat([df, predictions[['pred_label', 'pred_prob_muslim']]], axis=1) 55 | 56 | # Save results 57 | df_with_predictions.to_csv('results.csv', index=False) 58 | ``` 59 | 60 | ### Batch Processing 61 | 62 | ```python 63 | import pranaam 64 | import pandas as pd 65 | from typing import List 66 | 67 | def process_large_dataset(names: List[str], batch_size: int = 1000) -> pd.DataFrame: 68 | """Process large datasets in batches to manage memory.""" 69 | all_results = [] 70 | 71 | for i in range(0, len(names), batch_size): 72 | batch = names[i:i+batch_size] 73 | batch_results = pranaam.pred_rel(batch, lang="eng") 74 | all_results.append(batch_results) 75 | print(f"Processed batch {i//batch_size + 1}/{(len(names)-1)//batch_size + 1}") 76 | 77 | return pd.concat(all_results, ignore_index=True) 78 | 79 | # Usage 80 | large_name_list = ["Name" + str(i) for i in range(10000)] # Your actual names 81 | results = process_large_dataset(large_name_list) 82 | ``` 83 | 84 | ### Statistical Analysis 85 | 86 | ```python 87 | import pranaam 88 | import pandas as pd 89 | import matplotlib.pyplot as plt 90 | 91 | # Sample dataset 92 | politicians = [ 93 | "Narendra Modi", "Rahul Gandhi", "Mamata Banerjee", 94 | "Arvind Kejriwal", "Yogi Adityanath", 
"Akhilesh Yadav", 95 | "Mayawati", "Sharad Pawar", "Uddhav Thackeray" 96 | ] 97 | 98 | # Get predictions 99 | results = pranaam.pred_rel(politicians) 100 | 101 | # Statistical summary 102 | print("Religion Distribution:") 103 | print(results['pred_label'].value_counts()) 104 | 105 | print("\nAverage Confidence Scores:") 106 | print(results.groupby('pred_label')['pred_prob_muslim'].mean()) 107 | 108 | # Visualization 109 | plt.figure(figsize=(10, 6)) 110 | plt.hist(results['pred_prob_muslim'], bins=20, alpha=0.7) 111 | plt.xlabel('Muslim Probability Score') 112 | plt.ylabel('Count') 113 | plt.title('Distribution of Prediction Confidence Scores') 114 | plt.show() 115 | ``` 116 | 117 | ## Hindi Language Examples 118 | 119 | ### Devanagari Script 120 | 121 | ```python 122 | import pranaam 123 | 124 | # Hindi names in Devanagari script 125 | hindi_names = [ 126 | "शाहरुख खान", # Shah Rukh Khan 127 | "अमिताभ बच्चन", # Amitabh Bachchan 128 | "आमिर खान", # Aamir Khan 129 | "अक्षय कुमार", # Akshay Kumar 130 | "सलमान खान" # Salman Khan 131 | ] 132 | 133 | results = pranaam.pred_rel(hindi_names, lang="hin") 134 | print(results) 135 | ``` 136 | 137 | ### Mixed Script Dataset 138 | 139 | ```python 140 | import pranaam 141 | import pandas as pd 142 | 143 | # Dataset with mixed English and Hindi names 144 | mixed_data = pd.DataFrame({ 145 | 'name': ["Shah Rukh Khan", "शाहरुख खान", "Amitabh Bachchan", "अमिताभ बच्चन"], 146 | 'script': ['English', 'Hindi', 'English', 'Hindi'] 147 | }) 148 | 149 | # Process by script type 150 | english_results = pranaam.pred_rel( 151 | mixed_data[mixed_data['script'] == 'English']['name'], 152 | lang="eng" 153 | ) 154 | 155 | hindi_results = pranaam.pred_rel( 156 | mixed_data[mixed_data['script'] == 'Hindi']['name'], 157 | lang="hin" 158 | ) 159 | 160 | # Combine results 161 | all_results = pd.concat([english_results, hindi_results], ignore_index=True) 162 | ``` 163 | 164 | ## Advanced Usage 165 | 166 | ### Custom Confidence Thresholds 167 | 168 | ```python 169 | import pranaam 170 | import pandas as pd 171 | 172 | # Get predictions with custom confidence analysis 173 | names = ["Mohammed Ali", "John Smith", "Ahmad Khan", "David Wilson"] 174 | results = pranaam.pred_rel(names) 175 | 176 | # Add confidence categories 177 | def categorize_confidence(prob): 178 | if prob >= 80: 179 | return "High" 180 | elif prob >= 60: 181 | return "Medium" 182 | else: 183 | return "Low" 184 | 185 | results['confidence_level'] = results['pred_prob_muslim'].apply(categorize_confidence) 186 | 187 | # Filter high-confidence predictions only 188 | high_confidence = results[results['confidence_level'] == 'High'] 189 | print("High confidence predictions:") 190 | print(high_confidence) 191 | ``` 192 | 193 | ### Error Handling 194 | 195 | ```python 196 | import pranaam 197 | import pandas as pd 198 | 199 | def safe_predict(names, lang="eng"): 200 | """Predict with error handling.""" 201 | try: 202 | results = pranaam.pred_rel(names, lang=lang) 203 | return results 204 | except ValueError as e: 205 | print(f"ValueError: {e}") 206 | return pd.DataFrame() 207 | except FileNotFoundError as e: 208 | print(f"Model file error: {e}") 209 | return pd.DataFrame() 210 | except Exception as e: 211 | print(f"Unexpected error: {e}") 212 | return pd.DataFrame() 213 | 214 | # Usage 215 | problematic_input = ["", None, "Valid Name"] # Contains invalid entries 216 | results = safe_predict([name for name in problematic_input if name]) 217 | ``` 218 | 219 | ### Performance Optimization 220 | 221 | ```python 222 | import 
219 | ### Performance Optimization
220 | 
221 | ```python
222 | import pranaam
223 | import time
224 | 
225 | # Preload models for better performance in production
226 | # First call loads the model into memory
227 | _ = pranaam.pred_rel("Test Name")
228 | 
229 | # Subsequent calls are faster
230 | large_dataset = ["Name" + str(i) for i in range(1000)]
231 | 
232 | start_time = time.time()
233 | results = pranaam.pred_rel(large_dataset)
234 | end_time = time.time()
235 | 
236 | print(f"Processed {len(large_dataset)} names in {end_time - start_time:.2f} seconds")
237 | print(f"Average: {(end_time - start_time)/len(large_dataset)*1000:.2f} ms per name")
238 | ```
239 | 
240 | ## Integration Examples
241 | 
242 | ### Flask Web Application
243 | 
244 | ```python
245 | from flask import Flask, request, jsonify
246 | import pranaam
247 | 
248 | app = Flask(__name__)
249 | 
250 | @app.route('/predict', methods=['POST'])
251 | def predict_religion():
252 |     data = request.json
253 |     names = data.get('names', [])
254 |     lang = data.get('lang', 'eng')
255 | 
256 |     try:
257 |         results = pranaam.pred_rel(names, lang=lang)
258 |         return jsonify(results.to_dict('records'))
259 |     except Exception as e:
260 |         return jsonify({'error': str(e)}), 400
261 | 
262 | if __name__ == '__main__':
263 |     app.run(debug=True)
264 | ```
265 | 
266 | ### Command Line Scripts
267 | 
268 | ```bash
269 | # Using the built-in CLI (supported flags: --input, --lang, --latest)
270 | predict_religion --input "Shah Rukh Khan" --lang eng
271 | 
272 | # Batch processing from a file, one name per line
273 | while read -r name; do predict_religion --input "$name" --lang eng; done < names.txt
274 | ```
275 | 
276 | ## Real-world Use Cases
277 | 
278 | ### Research Applications
279 | 
280 | ```python
281 | import pranaam
282 | import pandas as pd
283 | 
284 | # Academic research on name-based demographics
285 | def analyze_author_demographics(author_names):
286 |     """Analyze religious demographics of academic authors."""
287 |     results = pranaam.pred_rel(author_names)
288 | 
289 |     # Calculate statistics
290 |     total_authors = len(results)
291 |     muslim_authors = len(results[results['pred_label'] == 'muslim'])
292 | 
293 |     demographics = {
294 |         'total_authors': total_authors,
295 |         'muslim_percentage': (muslim_authors / total_authors) * 100,
296 |         'non_muslim_percentage': ((total_authors - muslim_authors) / total_authors) * 100
297 |     }
298 | 
299 |     return demographics, results
300 | 
301 | # Example usage
302 | ieee_authors = ["A. Sharma", "M. Ahmed", "R. Singh", "S. 
Ali"] 303 | stats, detailed_results = analyze_author_demographics(ieee_authors) 304 | ``` 305 | 306 | ### Business Intelligence 307 | 308 | ```python 309 | import pranaam 310 | import pandas as pd 311 | 312 | # Customer base analysis 313 | def analyze_customer_base(customer_df): 314 | """Analyze customer demographics for business insights.""" 315 | # Predict religion from customer names 316 | predictions = pranaam.pred_rel(customer_df['customer_name']) 317 | 318 | # Merge with customer data 319 | analysis_df = pd.concat([customer_df, predictions[['pred_label', 'pred_prob_muslim']]], axis=1) 320 | 321 | # Business insights 322 | demographic_summary = analysis_df.groupby('pred_label').agg({ 323 | 'purchase_amount': ['mean', 'sum', 'count'], 324 | 'pred_prob_muslim': 'mean' 325 | }).round(2) 326 | 327 | return analysis_df, demographic_summary 328 | 329 | # Example usage 330 | customer_data = pd.DataFrame({ 331 | 'customer_name': ['Ahmed Khan', 'Priya Sharma', 'Mohammad Ali'], 332 | 'purchase_amount': [1500, 2000, 1200] 333 | }) 334 | 335 | results, summary = analyze_customer_base(customer_data) 336 | ``` -------------------------------------------------------------------------------- /tests/test_base.py: -------------------------------------------------------------------------------- 1 | """Tests for base module.""" 2 | 3 | from pathlib import Path 4 | from unittest.mock import MagicMock, patch 5 | 6 | from pranaam.base import Base 7 | 8 | 9 | class TestBase: 10 | """Test Base class functionality.""" 11 | 12 | def test_base_class_attributes(self) -> None: 13 | """Test Base class has expected attributes.""" 14 | assert hasattr(Base, "MODELFN") 15 | assert hasattr(Base, "load_model_data") 16 | assert Base.MODELFN is None 17 | 18 | def test_load_model_data_no_modelfn(self) -> None: 19 | """Test load_model_data when MODELFN is None.""" 20 | 21 | # Create a test class without MODELFN set 22 | class TestClass(Base): 23 | MODELFN = None 24 | 25 | result = TestClass.load_model_data("test_file") 26 | assert result is None 27 | 28 | @patch("pranaam.base.files") 29 | @patch("pranaam.base.download_file") 30 | @patch("pathlib.Path.exists") 31 | @patch("pathlib.Path.mkdir") 32 | def test_load_model_data_success( 33 | self, 34 | mock_mkdir: MagicMock, 35 | mock_exists: MagicMock, 36 | mock_download: MagicMock, 37 | mock_files: MagicMock, 38 | ) -> None: 39 | """Test successful model data loading.""" 40 | # Setup mocks - make files() return a string that can be used as a path 41 | mock_files.return_value = "/fake/package" 42 | 43 | # File doesn't exist (so download gets called) 44 | mock_exists.return_value = False 45 | mock_download.return_value = True 46 | 47 | # Create test class 48 | class TestClass(Base): 49 | MODELFN = "model" 50 | 51 | result = TestClass.load_model_data("test_model", latest=False) 52 | 53 | assert result == Path("/fake/package/model") 54 | mock_mkdir.assert_called_once_with(exist_ok=True) 55 | mock_download.assert_called_once() 56 | 57 | @patch("pranaam.base.files") 58 | @patch("pranaam.base.download_file") 59 | @patch("pathlib.Path.exists") 60 | @patch("pathlib.Path.mkdir") 61 | def test_load_model_data_file_exists_no_latest( 62 | self, 63 | mock_mkdir: MagicMock, 64 | mock_exists: MagicMock, 65 | mock_download: MagicMock, 66 | mock_files: MagicMock, 67 | ) -> None: 68 | """Test model loading when file exists and latest=False.""" 69 | # Setup mocks 70 | mock_files.return_value = "/fake/package" 71 | 72 | # File exists 73 | mock_exists.return_value = True 74 | 75 | class 
TestClass(Base): 76 | MODELFN = "model" 77 | 78 | result = TestClass.load_model_data("test_model", latest=False) 79 | 80 | assert result == Path("/fake/package/model") 81 | # Should not download since file exists and latest=False 82 | mock_download.assert_not_called() 83 | 84 | @patch("pranaam.base.files") 85 | @patch("pranaam.base.download_file") 86 | @patch("pathlib.Path.exists") 87 | @patch("pathlib.Path.mkdir") 88 | def test_load_model_data_force_latest( 89 | self, 90 | mock_mkdir: MagicMock, 91 | mock_exists: MagicMock, 92 | mock_download: MagicMock, 93 | mock_files: MagicMock, 94 | ) -> None: 95 | """Test model loading with latest=True forces redownload.""" 96 | # Setup mocks 97 | mock_files.return_value = "/fake/package" 98 | 99 | mock_exists.return_value = True # File exists 100 | mock_download.return_value = True 101 | 102 | class TestClass(Base): 103 | MODELFN = "model" 104 | 105 | result = TestClass.load_model_data("test_model", latest=True) 106 | 107 | assert result == Path("/fake/package/model") 108 | # Should download even though file exists because latest=True 109 | mock_download.assert_called_once() 110 | 111 | @patch("pranaam.base.files") 112 | @patch("pranaam.base.download_file") 113 | @patch("pathlib.Path.exists") 114 | @patch("pathlib.Path.mkdir") 115 | def test_load_model_data_download_failure( 116 | self, 117 | mock_mkdir: MagicMock, 118 | mock_exists: MagicMock, 119 | mock_download: MagicMock, 120 | mock_files: MagicMock, 121 | ) -> None: 122 | """Test handling of download failure.""" 123 | # Setup mocks 124 | mock_files.return_value = "/fake/package" 125 | 126 | # File doesn't exist 127 | mock_exists.return_value = False 128 | mock_download.return_value = False # Download fails 129 | 130 | class TestClass(Base): 131 | MODELFN = "model" 132 | 133 | result = TestClass.load_model_data("test_model") 134 | 135 | # Should still return path even if download fails 136 | assert result == Path("/fake/package/model") 137 | 138 | @patch("pranaam.base.files") 139 | @patch("pathlib.Path.exists") 140 | @patch("pathlib.Path.mkdir") 141 | def test_load_model_data_creates_directory( 142 | self, mock_mkdir: MagicMock, mock_exists: MagicMock, mock_files: MagicMock 143 | ) -> None: 144 | """Test that model directory is created if it doesn't exist.""" 145 | # Setup mocks 146 | mock_files.return_value = "/fake/package" 147 | 148 | mock_exists.return_value = False # File doesn't exist 149 | 150 | class TestClass(Base): 151 | MODELFN = "model" 152 | 153 | with patch("pranaam.base.download_file", return_value=True): 154 | result = TestClass.load_model_data("test_model") 155 | 156 | mock_mkdir.assert_called_once_with(exist_ok=True) 157 | assert result == Path("/fake/package/model") 158 | 159 | 160 | class TestBaseInheritance: 161 | """Test Base class inheritance patterns.""" 162 | 163 | def test_subclass_can_override_modelfn(self) -> None: 164 | """Test that subclasses can override MODELFN.""" 165 | 166 | class CustomBase(Base): 167 | MODELFN = "custom_model" 168 | 169 | assert CustomBase.MODELFN == "custom_model" 170 | assert Base.MODELFN is None # Original unchanged 171 | 172 | def test_multiple_subclasses_independent(self) -> None: 173 | """Test that multiple subclasses have independent MODELFN values.""" 174 | 175 | class BaseA(Base): 176 | MODELFN = "model_a" 177 | 178 | class BaseB(Base): 179 | MODELFN = "model_b" 180 | 181 | assert BaseA.MODELFN == "model_a" 182 | assert BaseB.MODELFN == "model_b" 183 | assert Base.MODELFN is None 184 | 185 | 186 | class TestBaseLogging: 187 | """Test 
logging in Base class.""" 188 | 189 | @patch("pranaam.base.logger") 190 | @patch("pranaam.base.files") 191 | @patch("pranaam.base.download_file") 192 | @patch("pathlib.Path.exists") 193 | @patch("pathlib.Path.mkdir") 194 | def test_debug_logging_download( 195 | self, 196 | mock_mkdir: MagicMock, 197 | mock_exists: MagicMock, 198 | mock_download: MagicMock, 199 | mock_files: MagicMock, 200 | mock_logger: MagicMock, 201 | ) -> None: 202 | """Test debug logging during download.""" 203 | # Setup mocks 204 | mock_files.return_value = "/fake/package" 205 | 206 | # File doesn't exist 207 | mock_exists.return_value = False 208 | mock_download.return_value = True 209 | 210 | class TestClass(Base): 211 | MODELFN = "model" 212 | 213 | TestClass.load_model_data("test_model") 214 | 215 | # Should log download message 216 | mock_logger.debug.assert_called() 217 | debug_calls = [call[0][0] for call in mock_logger.debug.call_args_list] 218 | assert any("Downloading model data" in call for call in debug_calls) 219 | 220 | @patch("pranaam.base.logger") 221 | @patch("pranaam.base.files") 222 | @patch("pathlib.Path.exists") 223 | @patch("pathlib.Path.mkdir") 224 | def test_debug_logging_existing_model( 225 | self, 226 | mock_mkdir: MagicMock, 227 | mock_exists: MagicMock, 228 | mock_files: MagicMock, 229 | mock_logger: MagicMock, 230 | ) -> None: 231 | """Test debug logging when using existing model.""" 232 | # Setup mocks 233 | mock_files.return_value = "/fake/package" 234 | 235 | mock_exists.return_value = True # File exists 236 | 237 | class TestClass(Base): 238 | MODELFN = "model" 239 | 240 | TestClass.load_model_data("test_model") 241 | 242 | # Should log using existing model message 243 | mock_logger.debug.assert_called() 244 | debug_calls = [call[0][0] for call in mock_logger.debug.call_args_list] 245 | assert any("Using model data from" in call for call in debug_calls) 246 | 247 | @patch("pranaam.base.logger") 248 | @patch("pranaam.base.files") 249 | @patch("pranaam.base.download_file") 250 | @patch("pathlib.Path.exists") 251 | @patch("pathlib.Path.mkdir") 252 | def test_error_logging_download_failure( 253 | self, 254 | mock_mkdir: MagicMock, 255 | mock_exists: MagicMock, 256 | mock_download: MagicMock, 257 | mock_files: MagicMock, 258 | mock_logger: MagicMock, 259 | ) -> None: 260 | """Test error logging when download fails.""" 261 | # Setup mocks 262 | mock_files.return_value = "/fake/package" 263 | 264 | # File doesn't exist 265 | mock_exists.return_value = False 266 | mock_download.return_value = False # Download fails 267 | 268 | class TestClass(Base): 269 | MODELFN = "model" 270 | 271 | TestClass.load_model_data("test_model") 272 | 273 | # Should log error message 274 | mock_logger.error.assert_called_once_with( 275 | "ERROR: Cannot download model data file" 276 | ) 277 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | """Tests for CLI functionality.""" 2 | 3 | from io import StringIO 4 | from unittest.mock import Mock, patch 5 | 6 | import pandas as pd 7 | import pytest 8 | 9 | from pranaam.pranaam import main 10 | 11 | 12 | class TestCLIMain: 13 | """Test main CLI function.""" 14 | 15 | def test_help_option(self) -> None: 16 | """Test --help option displays help and exits.""" 17 | with pytest.raises(SystemExit) as exc_info: 18 | main(["--help"]) 19 | 20 | assert exc_info.value.code == 0 21 | 22 | def test_missing_required_argument(self) -> None: 23 | """Test 
that missing --input argument returns error.""" 24 | with pytest.raises(SystemExit) as exc_info: 25 | main([]) 26 | assert exc_info.value.code == 2 27 | 28 | @patch("pranaam.pranaam.pred_rel") 29 | def test_successful_prediction(self, mock_pred_rel: Mock) -> None: 30 | """Test successful prediction with valid arguments.""" 31 | # Setup mock return value 32 | mock_result = pd.DataFrame( 33 | { 34 | "name": ["Test Name"], 35 | "pred_label": ["muslim"], 36 | "pred_prob_muslim": [75.0], 37 | } 38 | ) 39 | mock_pred_rel.return_value = mock_result 40 | 41 | with patch("sys.stdout", new_callable=StringIO) as mock_stdout: 42 | result = main(["--input", "Test Name"]) 43 | 44 | assert result == 0 45 | mock_pred_rel.assert_called_once_with("Test Name", lang="eng", latest=False) 46 | 47 | # Check output contains expected data 48 | output = mock_stdout.getvalue() 49 | assert "Test Name" in output 50 | assert "muslim" in output 51 | assert "75.0" in output 52 | 53 | @patch("pranaam.pranaam.pred_rel") 54 | def test_hindi_language_option(self, mock_pred_rel: Mock) -> None: 55 | """Test Hindi language option.""" 56 | mock_result = pd.DataFrame( 57 | { 58 | "name": ["टेस्ट नाम"], 59 | "pred_label": ["not-muslim"], 60 | "pred_prob_muslim": [25.0], 61 | } 62 | ) 63 | mock_pred_rel.return_value = mock_result 64 | 65 | result = main(["--input", "टेस्ट नाम", "--lang", "hin"]) 66 | 67 | assert result == 0 68 | mock_pred_rel.assert_called_once_with("टेस्ट नाम", lang="hin", latest=False) 69 | 70 | @patch("pranaam.pranaam.pred_rel") 71 | def test_latest_option(self, mock_pred_rel: Mock) -> None: 72 | """Test --latest option.""" 73 | mock_result = pd.DataFrame( 74 | { 75 | "name": ["Test Name"], 76 | "pred_label": ["muslim"], 77 | "pred_prob_muslim": [80.0], 78 | } 79 | ) 80 | mock_pred_rel.return_value = mock_result 81 | 82 | result = main(["--input", "Test Name", "--latest"]) 83 | 84 | assert result == 0 85 | mock_pred_rel.assert_called_once_with("Test Name", lang="eng", latest=True) 86 | 87 | @patch("pranaam.pranaam.pred_rel") 88 | def test_all_options_combined(self, mock_pred_rel: Mock) -> None: 89 | """Test all options used together.""" 90 | mock_result = pd.DataFrame( 91 | {"name": ["हिंदी नाम"], "pred_label": ["muslim"], "pred_prob_muslim": [65.0]} 92 | ) 93 | mock_pred_rel.return_value = mock_result 94 | 95 | result = main(["--input", "हिंदी नाम", "--lang", "hin", "--latest"]) 96 | 97 | assert result == 0 98 | mock_pred_rel.assert_called_once_with("हिंदी नाम", lang="hin", latest=True) 99 | 100 | def test_invalid_language(self) -> None: 101 | """Test invalid language option.""" 102 | with pytest.raises(SystemExit) as exc_info: 103 | main(["--input", "Test Name", "--lang", "invalid"]) 104 | assert exc_info.value.code == 2 105 | 106 | @patch("pranaam.pranaam.pred_rel") 107 | def test_prediction_error_handling(self, mock_pred_rel: Mock) -> None: 108 | """Test handling of prediction errors.""" 109 | mock_pred_rel.side_effect = Exception("Prediction failed") 110 | 111 | with patch("sys.stderr", new_callable=StringIO) as mock_stderr: 112 | result = main(["--input", "Test Name"]) 113 | 114 | assert result == 1 115 | error_output = mock_stderr.getvalue() 116 | assert "Error: Prediction failed" in error_output 117 | 118 | def test_default_arguments(self) -> None: 119 | """Test default argument values.""" 120 | # This test verifies the argument parser setup 121 | import argparse 122 | 123 | # Create parser same way as in main function 124 | parser = argparse.ArgumentParser( 125 | description="Predict religion based on 
name", 126 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 127 | ) 128 | parser.add_argument( 129 | "--input", required=True, help="Name to analyze (single name as string)" 130 | ) 131 | parser.add_argument( 132 | "--lang", 133 | default="eng", 134 | choices=["eng", "hin"], 135 | help="Language of input name", 136 | ) 137 | parser.add_argument( 138 | "--latest", action="store_true", help="Download latest model version" 139 | ) 140 | 141 | # Test default parsing 142 | args = parser.parse_args(["--input", "Test"]) 143 | assert args.lang == "eng" 144 | assert args.latest is False 145 | assert args.input == "Test" 146 | 147 | 148 | class TestCLIIntegration: 149 | """Integration tests for CLI.""" 150 | 151 | def test_cli_with_none_argv(self) -> None: 152 | """Test CLI function when argv is None.""" 153 | # Should use sys.argv[1:] by default 154 | with patch("sys.argv", ["script_name", "--input", "Test Name"]): 155 | with patch("pranaam.pranaam.pred_rel") as mock_pred_rel: 156 | mock_result = pd.DataFrame( 157 | { 158 | "name": ["Test Name"], 159 | "pred_label": ["muslim"], 160 | "pred_prob_muslim": [75.0], 161 | } 162 | ) 163 | mock_pred_rel.return_value = mock_result 164 | 165 | result = main(None) # argv=None should use sys.argv[1:] 166 | 167 | assert result == 0 168 | mock_pred_rel.assert_called_once() 169 | 170 | @patch("pranaam.pranaam.pred_rel") 171 | def test_output_formatting(self, mock_pred_rel: Mock) -> None: 172 | """Test that output is formatted properly.""" 173 | mock_result = pd.DataFrame( 174 | { 175 | "name": ["Name One", "Name Two"], 176 | "pred_label": ["muslim", "not-muslim"], 177 | "pred_prob_muslim": [75.0, 25.0], 178 | } 179 | ) 180 | mock_pred_rel.return_value = mock_result 181 | 182 | with patch("sys.stdout", new_callable=StringIO) as mock_stdout: 183 | main(["--input", "Test Names"]) 184 | 185 | output = mock_stdout.getvalue() 186 | 187 | # Should contain column headers and data 188 | assert "name" in output 189 | assert "pred_label" in output 190 | assert "pred_prob_muslim" in output 191 | assert "Name One" in output 192 | assert "Name Two" in output 193 | assert "muslim" in output 194 | assert "not-muslim" in output 195 | 196 | 197 | class TestPredRelFunction: 198 | """Test the pred_rel function exposed at module level.""" 199 | 200 | def test_pred_rel_is_naam_pred_rel(self) -> None: 201 | """Test that pred_rel is the same as Naam.pred_rel.""" 202 | from pranaam.naam import Naam 203 | from pranaam.pranaam import pred_rel as module_pred_rel 204 | 205 | assert module_pred_rel == Naam.pred_rel 206 | 207 | 208 | class TestCLIArgumentValidation: 209 | """Test CLI argument validation.""" 210 | 211 | def test_required_input_argument(self) -> None: 212 | """Test that input argument is required.""" 213 | with pytest.raises(SystemExit) as exc_info: 214 | main(["--lang", "eng"]) # Missing --input 215 | assert exc_info.value.code == 2 216 | 217 | def test_input_argument_accepts_any_string(self) -> None: 218 | """Test that input accepts various string types.""" 219 | test_inputs = [ 220 | "Simple Name", 221 | "Name with numbers 123", 222 | "Name-with-hyphens", 223 | "Name.with.dots", 224 | "नाम हिंदी में", 225 | "Mixed English हिंदी Name", 226 | ] 227 | 228 | for test_input in test_inputs: 229 | with patch("pranaam.pranaam.pred_rel") as mock_pred_rel: 230 | mock_result = pd.DataFrame( 231 | { 232 | "name": [test_input], 233 | "pred_label": ["muslim"], 234 | "pred_prob_muslim": [50.0], 235 | } 236 | ) 237 | mock_pred_rel.return_value = mock_result 238 | 239 | result = 
main(["--input", test_input]) 240 | assert result == 0 241 | mock_pred_rel.assert_called_once_with( 242 | test_input, lang="eng", latest=False 243 | ) 244 | 245 | def test_language_choices(self) -> None: 246 | """Test that only valid language choices are accepted.""" 247 | valid_langs = ["eng", "hin"] 248 | invalid_langs = ["en", "hi", "english", "hindi", "spanish", ""] 249 | 250 | # Valid languages should work 251 | for lang in valid_langs: 252 | with patch("pranaam.pranaam.pred_rel") as mock_pred_rel: 253 | mock_result = pd.DataFrame( 254 | { 255 | "name": ["Test"], 256 | "pred_label": ["muslim"], 257 | "pred_prob_muslim": [50.0], 258 | } 259 | ) 260 | mock_pred_rel.return_value = mock_result 261 | 262 | result = main(["--input", "Test", "--lang", lang]) 263 | assert result == 0 264 | 265 | # Invalid languages should fail 266 | for lang in invalid_langs: 267 | with pytest.raises(SystemExit) as exc_info: 268 | main(["--input", "Test", "--lang", lang]) 269 | assert exc_info.value.code == 2 270 | 271 | 272 | # Removed TestCLIErrorHandling class - was causing KeyboardInterrupt issues in CI 273 | -------------------------------------------------------------------------------- /tests/test_e2e.py: -------------------------------------------------------------------------------- 1 | """ 2 | End-to-end integration tests with real models and predictions. 3 | These tests actually download models and run real predictions. 4 | """ 5 | 6 | from collections.abc import Generator 7 | 8 | import pandas as pd 9 | import pytest 10 | 11 | import pranaam 12 | from pranaam.naam import Naam 13 | 14 | 15 | class TestRealModelDownloadAndPrediction: 16 | """Test real model download and prediction functionality.""" 17 | 18 | @pytest.fixture(autouse=True) 19 | def setup_clean_environment(self) -> Generator[None, None, None]: 20 | """Ensure clean model state for each test.""" 21 | # Reset class state 22 | Naam.model = None 23 | Naam.weights_loaded = False 24 | Naam.cur_lang = None # type: ignore 25 | yield 26 | # Cleanup after test 27 | Naam.model = None 28 | Naam.weights_loaded = False 29 | Naam.cur_lang = None # type: ignore 30 | 31 | @pytest.mark.integration 32 | def test_real_english_predictions(self) -> None: 33 | """Test real predictions with English names using actual models.""" 34 | # Real Bollywood actor names with expected patterns 35 | test_names = [ 36 | "Shah Rukh Khan", # Expected: Muslim 37 | "Salman Khan", # Expected: Muslim 38 | "Aamir Khan", # Expected: Muslim 39 | "Saif Ali Khan", # Expected: Muslim 40 | "Amitabh Bachchan", # Expected: Not Muslim 41 | "Akshay Kumar", # Expected: Not Muslim 42 | "Hrithik Roshan", # Expected: Not Muslim 43 | ] 44 | 45 | # This will download models if not cached 46 | result = pranaam.pred_rel(test_names, lang="eng") 47 | 48 | # Verify DataFrame structure 49 | assert isinstance(result, pd.DataFrame) 50 | assert list(result.columns) == ["name", "pred_label", "pred_prob_muslim"] 51 | assert len(result) == len(test_names) 52 | 53 | # Verify all names are present 54 | assert set(result["name"]) == set(test_names) 55 | 56 | # Verify prediction labels are valid 57 | valid_labels = {"muslim", "not-muslim"} 58 | assert all(label in valid_labels for label in result["pred_label"]) 59 | 60 | # Verify probabilities are reasonable (0-100) 61 | assert all(0 <= prob <= 100 for prob in result["pred_prob_muslim"]) 62 | 63 | # Print actual results for inspection 64 | print("\n🎬 REAL ENGLISH PREDICTIONS:") 65 | for _, row in result.iterrows(): 66 | print( 67 | f" {row['name']} → 
{row['pred_label']} ({row['pred_prob_muslim']:.1f}%)" 68 | ) 69 | 70 | # Verify expected patterns (these are actual predictions, not mocks) 71 | khan_results = result[result["name"].str.contains("Khan")] 72 | muslim_khans = khan_results[khan_results["pred_label"] == "muslim"] 73 | 74 | # Should predict most Khans as Muslim (this is what the model should do) 75 | assert len(muslim_khans) >= 3, ( 76 | f"Expected at least 3 Khans predicted as Muslim, got {len(muslim_khans)}" 77 | ) 78 | 79 | @pytest.mark.integration 80 | def test_real_hindi_predictions(self) -> None: 81 | """Test real predictions with Hindi names using actual models.""" 82 | # Real Hindi names in Devanagari 83 | test_names = [ 84 | "शाहरुख खान", # Shah Rukh Khan 85 | "सलमान खान", # Salman Khan 86 | "अमिताभ बच्चन", # Amitabh Bachchan 87 | "अक्षय कुमार", # Akshay Kumar 88 | ] 89 | 90 | # This will download Hindi models if not cached 91 | result = pranaam.pred_rel(test_names, lang="hin") 92 | 93 | # Verify DataFrame structure 94 | assert isinstance(result, pd.DataFrame) 95 | assert list(result.columns) == ["name", "pred_label", "pred_prob_muslim"] 96 | assert len(result) == len(test_names) 97 | 98 | # Verify all names are present 99 | assert set(result["name"]) == set(test_names) 100 | 101 | # Verify prediction labels are valid 102 | valid_labels = {"muslim", "not-muslim"} 103 | assert all(label in valid_labels for label in result["pred_label"]) 104 | 105 | # Verify probabilities are reasonable 106 | assert all(0 <= prob <= 100 for prob in result["pred_prob_muslim"]) 107 | 108 | # Print actual results for inspection 109 | print("\n🇮🇳 REAL HINDI PREDICTIONS:") 110 | for _, row in result.iterrows(): 111 | print( 112 | f" {row['name']} → {row['pred_label']} ({row['pred_prob_muslim']:.1f}%)" 113 | ) 114 | 115 | @pytest.mark.integration 116 | def test_model_caching_behavior(self) -> None: 117 | """Test that models are properly cached after first download.""" 118 | # First prediction - should trigger download 119 | result1 = pranaam.pred_rel("Shah Rukh Khan", lang="eng") 120 | assert Naam.weights_loaded is True 121 | assert Naam.cur_lang == "eng" 122 | 123 | # Second prediction - should use cached model 124 | result2 = pranaam.pred_rel("Amitabh Bachchan", lang="eng") 125 | assert Naam.weights_loaded is True 126 | assert Naam.cur_lang == "eng" 127 | 128 | # Results should be consistent 129 | assert result1.columns.tolist() == result2.columns.tolist() 130 | 131 | print("\n💾 MODEL CACHING VERIFIED") 132 | 133 | @pytest.mark.integration 134 | def test_language_switching(self) -> None: 135 | """Test switching between English and Hindi models.""" 136 | # Start with English 137 | eng_result = pranaam.pred_rel("Shah Rukh Khan", lang="eng") 138 | assert Naam.cur_lang == "eng" 139 | 140 | # Switch to Hindi - should reload model 141 | hin_result = pranaam.pred_rel("शाहरुख खान", lang="hin") 142 | assert Naam.cur_lang == "hin" 143 | 144 | # Switch back to English - should reload model again 145 | eng_result2 = pranaam.pred_rel("Salman Khan", lang="eng") 146 | assert Naam.cur_lang == "eng" 147 | 148 | # All results should have same structure 149 | for result in [eng_result, hin_result, eng_result2]: 150 | assert list(result.columns) == ["name", "pred_label", "pred_prob_muslim"] 151 | 152 | print("\n🔄 LANGUAGE SWITCHING VERIFIED") 153 | 154 | @pytest.mark.integration 155 | def test_pandas_series_integration(self) -> None: 156 | """Test real pandas Series input integration.""" 157 | # Create DataFrame with real names 158 | df = pd.DataFrame( 159 | { 
160 | "actor_name": ["Shah Rukh Khan", "Amitabh Bachchan", "Salman Khan"], 161 | "movie_count": [50, 100, 45], 162 | } 163 | ) 164 | 165 | # Use pandas Series as input 166 | result = pranaam.pred_rel(df["actor_name"], lang="eng") 167 | 168 | # Verify integration 169 | assert len(result) == len(df) 170 | assert all(name in df["actor_name"].values for name in result["name"]) 171 | 172 | # Create combined result 173 | combined = pd.concat([df, result[["pred_label", "pred_prob_muslim"]]], axis=1) 174 | 175 | print("\n📊 PANDAS INTEGRATION:") 176 | print(combined.to_string(index=False)) 177 | 178 | # Verify combined structure 179 | expected_cols = ["actor_name", "movie_count", "pred_label", "pred_prob_muslim"] 180 | assert list(combined.columns) == expected_cols 181 | 182 | @pytest.mark.integration 183 | def test_batch_processing_performance(self) -> None: 184 | """Test batch processing with real models.""" 185 | import time 186 | 187 | # Large batch of real names 188 | names = [ 189 | "Shah Rukh Khan", 190 | "Amitabh Bachchan", 191 | "Salman Khan", 192 | "Aamir Khan", 193 | "Akshay Kumar", 194 | "Hrithik Roshan", 195 | "Ranbir Kapoor", 196 | "Saif Ali Khan", 197 | "Ajay Devgan", 198 | "John Abraham", 199 | "Arjun Kapoor", 200 | "Varun Dhawan", 201 | ] 202 | 203 | start_time = time.time() 204 | result = pranaam.pred_rel(names, lang="eng") 205 | end_time = time.time() 206 | 207 | processing_time = end_time - start_time 208 | avg_time_per_name = processing_time / len(names) * 1000 # ms 209 | 210 | # Verify all names processed 211 | assert len(result) == len(names) 212 | assert set(result["name"]) == set(names) 213 | 214 | print("\n⚡ PERFORMANCE METRICS:") 215 | print(f" Total time: {processing_time:.2f}s") 216 | print(f" Avg per name: {avg_time_per_name:.1f}ms") 217 | print(f" Names/second: {len(names) / processing_time:.1f}") 218 | 219 | # Performance should be reasonable (adjust based on actual performance) 220 | assert processing_time < 10.0, ( 221 | f"Batch processing too slow: {processing_time:.2f}s" 222 | ) 223 | 224 | @pytest.mark.integration 225 | def test_error_handling_real_scenarios(self) -> None: 226 | """Test error handling in real scenarios.""" 227 | # Test with empty input 228 | with pytest.raises(ValueError): 229 | pranaam.pred_rel("", lang="eng") 230 | 231 | # Test with invalid language 232 | with pytest.raises(ValueError): 233 | pranaam.pred_rel("Test Name", lang="invalid") # type: ignore 234 | 235 | # Test with None input 236 | with pytest.raises((ValueError, TypeError)): 237 | pranaam.pred_rel(None, lang="eng") # type: ignore 238 | 239 | print("\n🛡️ ERROR HANDLING VERIFIED") 240 | 241 | 242 | class TestRealWorldScenarios: 243 | """Test real-world usage scenarios.""" 244 | 245 | @pytest.mark.integration 246 | def test_mixed_cultural_names(self) -> None: 247 | """Test predictions on mixed cultural background names.""" 248 | mixed_names = [ 249 | "John Smith", # Western 250 | "Mohammed Ali", # Arabic/Muslim 251 | "Priya Sharma", # Hindu 252 | "David Johnson", # Western 253 | "Fatima Khan", # Muslim 254 | "Raj Patel", # Hindu 255 | ] 256 | 257 | result = pranaam.pred_rel(mixed_names, lang="eng") 258 | 259 | print("\n🌍 MIXED CULTURAL PREDICTIONS:") 260 | for _, row in result.iterrows(): 261 | print( 262 | f" {row['name']} → {row['pred_label']} ({row['pred_prob_muslim']:.1f}%)" 263 | ) 264 | 265 | # Verify structure 266 | assert len(result) == len(mixed_names) 267 | assert all(0 <= prob <= 100 for prob in result["pred_prob_muslim"]) 268 | 269 | @pytest.mark.integration 270 | def 
test_edge_case_names(self) -> None: 271 | """Test edge cases with real models.""" 272 | edge_cases = [ 273 | "A", # Single character 274 | "Mohammad", # Common Muslim name variant 275 | "Krishna", # Hindu deity name 276 | "Ali Khan", # Short Muslim name 277 | "Ram Singh", # Hindu name 278 | ] 279 | 280 | result = pranaam.pred_rel(edge_cases, lang="eng") 281 | 282 | print("\n🔍 EDGE CASE PREDICTIONS:") 283 | for _, row in result.iterrows(): 284 | print( 285 | f" {row['name']} → {row['pred_label']} ({row['pred_prob_muslim']:.1f}%)" 286 | ) 287 | 288 | assert len(result) == len(edge_cases) 289 | 290 | 291 | # Marker for running only E2E tests 292 | pytestmark = pytest.mark.integration 293 | -------------------------------------------------------------------------------- /tests/test_naam.py: -------------------------------------------------------------------------------- 1 | """Comprehensive tests for naam module.""" 2 | 3 | from pathlib import Path 4 | from unittest.mock import Mock, patch 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import pytest 9 | 10 | from pranaam.naam import Naam, is_english 11 | 12 | 13 | class TestIsEnglish: 14 | """Test the is_english utility function.""" 15 | 16 | def test_english_text(self) -> None: 17 | """Test detection of English text.""" 18 | assert is_english("Hello World") is True 19 | assert is_english("Shah Rukh Khan") is True 20 | assert is_english("123 ABC") is True 21 | 22 | def test_hindi_text(self) -> None: 23 | """Test detection of Hindi text.""" 24 | assert is_english("शाहरुख खान") is False 25 | assert is_english("अमिताभ बच्चन") is False 26 | assert is_english("हैलो वर्ल्ड") is False 27 | 28 | def test_mixed_text(self) -> None: 29 | """Test mixed text (contains non-ASCII).""" 30 | assert is_english("Hello शाहरुख") is False 31 | assert is_english("Khan खान") is False 32 | 33 | def test_empty_string(self) -> None: 34 | """Test empty string.""" 35 | assert is_english("") is True # Empty string is ASCII 36 | 37 | def test_special_characters(self) -> None: 38 | """Test special characters.""" 39 | assert is_english("Hello! 
@#$%") is True 40 | assert is_english("Test\nLine") is True 41 | 42 | 43 | class TestNaamValidation: 44 | """Test input validation for Naam class.""" 45 | 46 | # Removed validation tests that call Naam.pred_rel directly - could trigger model loading 47 | 48 | 49 | class TestNaamInputHandling: 50 | """Test input handling and conversion in Naam class.""" 51 | 52 | @patch.object(Naam, "_load_model") 53 | @patch.object(Naam, "model") 54 | def test_single_string_input(self, mock_model: Mock, mock_load_model: Mock) -> None: 55 | """Test handling of single string input.""" 56 | # Setup mock model 57 | mock_model.predict.return_value = np.array([[0.2, 0.8]]) 58 | Naam.weights_loaded = True 59 | Naam.cur_lang = "eng" 60 | 61 | result = Naam.pred_rel("Test Name") 62 | 63 | assert isinstance(result, pd.DataFrame) 64 | assert len(result) == 1 65 | assert result.iloc[0]["name"] == "Test Name" 66 | mock_model.predict.assert_called_once_with(["Test Name"], verbose=0) 67 | 68 | @patch.object(Naam, "_load_model") 69 | @patch.object(Naam, "model") 70 | def test_list_input(self, mock_model: Mock, mock_load_model: Mock) -> None: 71 | """Test handling of list input.""" 72 | mock_model.predict.return_value = np.array([[0.2, 0.8], [0.7, 0.3]]) 73 | Naam.weights_loaded = True 74 | Naam.cur_lang = "eng" 75 | 76 | names = ["Name One", "Name Two"] 77 | result = Naam.pred_rel(names) 78 | 79 | assert isinstance(result, pd.DataFrame) 80 | assert len(result) == 2 81 | assert list(result["name"]) == names 82 | 83 | @patch.object(Naam, "_load_model") 84 | @patch.object(Naam, "model") 85 | def test_pandas_series_input(self, mock_model: Mock, mock_load_model: Mock) -> None: 86 | """Test handling of pandas Series input.""" 87 | mock_model.predict.return_value = np.array([[0.2, 0.8]]) 88 | Naam.weights_loaded = True 89 | Naam.cur_lang = "eng" 90 | 91 | names = pd.Series(["Test Name"]) 92 | result = Naam.pred_rel(names) 93 | 94 | assert isinstance(result, pd.DataFrame) 95 | assert len(result) == 1 96 | 97 | 98 | class TestNaamPredictions: 99 | """Test prediction functionality.""" 100 | 101 | @patch.object(Naam, "_load_model") 102 | def test_prediction_output_structure(self, mock_load_model: Mock) -> None: 103 | """Test that predictions return correct DataFrame structure.""" 104 | # Mock the model 105 | mock_model = Mock() 106 | mock_model.predict.return_value = np.array([[0.3, 0.7], [0.8, 0.2]]) 107 | Naam.model = mock_model 108 | Naam.weights_loaded = True 109 | Naam.cur_lang = "eng" 110 | 111 | names = ["Name One", "Name Two"] 112 | result = Naam.pred_rel(names) 113 | 114 | # Check DataFrame structure 115 | assert isinstance(result, pd.DataFrame) 116 | assert list(result.columns) == ["name", "pred_label", "pred_prob_muslim"] 117 | assert len(result) == 2 118 | 119 | # Check data types 120 | assert result["name"].dtype == object 121 | assert result["pred_label"].dtype == object 122 | assert pd.api.types.is_numeric_dtype(result["pred_prob_muslim"]) 123 | 124 | @patch.object(Naam, "_load_model") 125 | def test_prediction_labels(self, mock_load_model: Mock) -> None: 126 | """Test that predictions use correct labels.""" 127 | mock_model = Mock() 128 | mock_model.predict.return_value = np.array([[0.3, 0.7], [0.8, 0.2]]) 129 | Naam.model = mock_model 130 | Naam.weights_loaded = True 131 | Naam.cur_lang = "eng" 132 | 133 | result = Naam.pred_rel(["Name One", "Name Two"]) 134 | 135 | # First prediction: higher muslim probability -> "muslim" 136 | # Second prediction: higher not-muslim probability -> "not-muslim" 137 | assert 
result.iloc[0]["pred_label"] == "muslim" 138 | assert result.iloc[1]["pred_label"] == "not-muslim" 139 | 140 | @patch.object(Naam, "_load_model") 141 | def test_prediction_probabilities(self, mock_load_model: Mock) -> None: 142 | """Test probability calculations.""" 143 | mock_model = Mock() 144 | # Mock raw logits that will become [0.3, 0.7] after softmax 145 | # Using logits that softmax to approximately [0.4, 0.6] (60% muslim) 146 | mock_model.predict.return_value = np.array([[0.0, 0.405]]) 147 | Naam.model = mock_model 148 | Naam.weights_loaded = True 149 | Naam.cur_lang = "eng" 150 | 151 | result = Naam.pred_rel(["Test Name"]) 152 | 153 | # Probability should be rounded percentage of muslim class after softmax 154 | # softmax([0.0, 0.405]) ≈ [0.4, 0.6] -> 60% muslim 155 | expected_prob = 60.0 156 | assert result.iloc[0]["pred_prob_muslim"] == expected_prob 157 | 158 | 159 | class TestNaamModelLoading: 160 | """Test model loading functionality.""" 161 | 162 | @patch("tf_keras.models.load_model") 163 | @patch.object(Naam, "load_model_data") 164 | def test_model_loading_english( 165 | self, mock_load_data: Mock, mock_load_model: Mock 166 | ) -> None: 167 | """Test loading English model.""" 168 | mock_load_data.return_value = Path("/fake/path") 169 | mock_model = Mock() 170 | mock_load_model.return_value = mock_model 171 | 172 | # Reset class state 173 | Naam.weights_loaded = False 174 | Naam.model = None 175 | 176 | Naam._load_model("eng") 177 | 178 | # Check that correct model path was used 179 | fake_path = Path("/fake/path") 180 | expected_path = str(fake_path / "eng_and_hindi_models_v2" / "eng_model.keras") 181 | mock_load_model.assert_called_once_with(expected_path) 182 | assert Naam.model == mock_model 183 | assert Naam.weights_loaded is True 184 | assert Naam.cur_lang == "eng" 185 | 186 | @patch("tf_keras.models.load_model") 187 | @patch.object(Naam, "load_model_data") 188 | def test_model_loading_hindi( 189 | self, mock_load_data: Mock, mock_load_model: Mock 190 | ) -> None: 191 | """Test loading Hindi model.""" 192 | mock_load_data.return_value = Path("/fake/path") 193 | mock_model = Mock() 194 | mock_load_model.return_value = mock_model 195 | 196 | Naam.weights_loaded = False 197 | Naam.model = None 198 | 199 | Naam._load_model("hin") 200 | 201 | fake_path = Path("/fake/path") 202 | expected_path = str(fake_path / "eng_and_hindi_models_v2" / "hin_model.keras") 203 | mock_load_model.assert_called_once_with(expected_path) 204 | assert Naam.cur_lang == "hin" 205 | 206 | @patch.object(Naam, "load_model_data") 207 | def test_model_loading_failure_no_data(self, mock_load_data: Mock) -> None: 208 | """Test model loading failure when data loading fails.""" 209 | mock_load_data.return_value = None 210 | 211 | with pytest.raises(RuntimeError, match="Failed to load model data"): 212 | Naam._load_model("eng") 213 | 214 | @patch("tf_keras.models.load_model") 215 | @patch.object(Naam, "load_model_data") 216 | def test_model_loading_failure_model_error( 217 | self, mock_load_data: Mock, mock_load_model: Mock 218 | ) -> None: 219 | """Test model loading failure when TensorFlow fails.""" 220 | mock_load_data.return_value = Path("/fake/path") 221 | mock_load_model.side_effect = Exception("TensorFlow error") 222 | 223 | with pytest.raises(RuntimeError, match="Failed to load eng model"): 224 | Naam._load_model("eng") 225 | 226 | 227 | class TestNaamLanguageHandling: 228 | """Test language-specific functionality.""" 229 | 230 | @patch.object(Naam, "_load_model") 231 | @patch.object(Naam, "model") 
class TestNaamLanguageHandling:
    """Test language-specific functionality."""

    @patch.object(Naam, "_load_model")
    @patch.object(Naam, "model")
    def test_language_change_triggers_reload(
        self, mock_model: Mock, mock_load_model: Mock
    ) -> None:
        """Test that changing language triggers model reload."""
        mock_model.predict.return_value = np.array([[0.5, 0.5]])

        # Set initial state
        Naam.weights_loaded = True
        Naam.cur_lang = "eng"

        # Call with different language
        Naam.pred_rel(["Test"], lang="hin")

        # Should trigger model reload
        mock_load_model.assert_called_once_with("hin", False)

    @patch.object(Naam, "_load_model")
    @patch.object(Naam, "model")
    def test_same_language_no_reload(
        self, mock_model: Mock, mock_load_model: Mock
    ) -> None:
        """Test that the same language doesn't trigger a reload."""
        mock_model.predict.return_value = np.array([[0.5, 0.5]])

        # Set state
        Naam.weights_loaded = True
        Naam.cur_lang = "eng"

        # Call with same language
        Naam.pred_rel(["Test"], lang="eng")

        # Should not trigger model reload
        mock_load_model.assert_not_called()


class TestNaamErrorHandling:
    """Test error handling in predictions."""

    @patch.object(Naam, "_load_model")
    def test_prediction_with_no_model(self, mock_load_model: Mock) -> None:
        """Test that prediction fails when the model is None."""
        Naam.model = None
        Naam.weights_loaded = True

        with pytest.raises(RuntimeError, match="Model not loaded properly"):
            Naam.pred_rel(["Test"])

    @patch.object(Naam, "_load_model")
    def test_prediction_tensorflow_error(self, mock_load_model: Mock) -> None:
        """Test handling of TensorFlow prediction errors."""
        mock_model = Mock()
        mock_model.predict.side_effect = Exception("TensorFlow error")
        Naam.model = mock_model
        Naam.weights_loaded = True

        with pytest.raises(RuntimeError, match="Prediction failed"):
            Naam.pred_rel(["Test"])


class TestNaamIntegration:
    """Integration tests (require actual model download - marked as slow)."""

    # Removed real prediction tests - they were trying to download models and do actual predictions
--------------------------------------------------------------------------------
/scripts/migrate_models.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
"""
Model Migration Script: Keras 2 SavedModel → Keras 3 Compatible Format

This script converts existing Keras 2 SavedModel format models to a Keras 3
compatible "weights + architecture" format to enable TensorFlow 2.16+
compatibility.

Usage:
    # Step 1: Export from Keras 2 environment (TensorFlow 2.15)
    python migrate_models.py export

    # Step 2: Import to Keras 3 environment (TensorFlow 2.16+)
    python migrate_models.py import

Requirements:
    - Step 1 requires TensorFlow 2.15 (Keras 2)
    - Step 2 requires TensorFlow 2.16+ (Keras 3)
"""
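
# For orientation, the two steps read and write the following layout (paths
# taken from the code below; the v1 SavedModel directories are assumed to
# already exist locally):
#
#   pranaam/model/eng_and_hindi_models_v1/{eng_model,hin_model}   <- step 1 input
#   model_migration/{eng,hin}_model/
#       model.keras           complete model re-saved by tf-keras
#       model_config.json     architecture from model.get_config()
#       model_metadata.json   shapes, layer count, TF/Keras versions
#       test_prediction.json  reference output used to validate step 2
#   pranaam/model/eng_and_hindi_models_v2/{lang}_model.keras (+ .h5 backup)
#                                                                 <- step 2 output
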
import json
import sys
from pathlib import Path

import tensorflow as tf


class ModelMigrator:
    """Handles migration between Keras 2 and Keras 3 model formats."""

    def __init__(self):
        # The script lives in scripts/, so the repo root is two levels up.
        repo_root = Path(__file__).parent.parent
        self.model_base_path = repo_root / "pranaam" / "model"
        self.migration_output = repo_root / "model_migration"
        self.migration_output.mkdir(exist_ok=True)

        # Model paths
        self.models = {
            "eng": self.model_base_path / "eng_and_hindi_models_v1" / "eng_model",
            "hin": self.model_base_path / "eng_and_hindi_models_v1" / "hin_model",
        }

    def export_from_keras2(self) -> None:
        """Export weights and architecture from Keras 2 SavedModel format."""
        print("🔄 Exporting models from Keras 2 format...")
        print(f"TensorFlow version: {tf.__version__}")

        # Use tf-keras for Keras 2 compatibility
        try:
            import tf_keras as keras

            print("✅ Using tf-keras for Keras 2 compatibility")
            print(f"tf-keras version: {keras.__version__}")
        except ImportError as e:
            print(f"❌ tf-keras not available: {e}")
            print("Please install tf-keras: pip install tf-keras")
            return

        for lang, model_path in self.models.items():
            print(f"\n📁 Processing {lang} model from {model_path}")

            if not model_path.exists():
                print(f"❌ Model not found at {model_path}")
                continue

            try:
                # Load the SavedModel using tf-keras (Keras 2 compatibility)
                print("🔄 Loading SavedModel with tf-keras...")
                model = keras.models.load_model(str(model_path))
                print(f"✅ Successfully loaded {lang} model")

                # Print model info
                print(f"   Model type: {type(model).__name__}")
                print(
                    f"   Input shape: {model.input_shape if hasattr(model, 'input_shape') else 'Unknown'}"
                )
                print(
                    f"   Output shape: {model.output_shape if hasattr(model, 'output_shape') else 'Unknown'}"
                )

                # Create language-specific output directory
                lang_output = self.migration_output / f"{lang}_model"
                lang_output.mkdir(exist_ok=True)

                # Try to save in Keras 3 format first, then fall back to TF weights if needed
                try:
                    # Save complete model in Keras 3 format
                    keras_model_path = lang_output / "model.keras"
                    model.save(str(keras_model_path))
                    print(f"✅ Complete model saved to {keras_model_path}")
                except Exception as e:
                    print(f"⚠️ Keras format save failed: {e}")
                    # Fallback to TF weights format
                    weights_path = lang_output / "model_weights"
                    model.save_weights(str(weights_path), save_format="tf")
                    print(f"✅ Weights saved to {weights_path} (tf format)")

                # Export model configuration
                config = model.get_config()
                config_path = lang_output / "model_config.json"
                with open(config_path, "w") as f:
                    json.dump(config, f, indent=2)
                print(f"✅ Config saved to {config_path}")
if hasattr(model, "output_shape") else None 111 | ), 112 | "layer_count": len(model.layers) if hasattr(model, "layers") else 0, 113 | "tensorflow_version": tf.__version__, 114 | "keras_version": ( 115 | tf.keras.__version__ 116 | if hasattr(tf.keras, "__version__") 117 | else None 118 | ), 119 | } 120 | 121 | # Try to get input/output names 122 | try: 123 | metadata["input_names"] = [inp.name for inp in model.inputs] 124 | metadata["output_names"] = [out.name for out in model.outputs] 125 | except AttributeError: 126 | pass 127 | 128 | metadata_path = lang_output / "model_metadata.json" 129 | with open(metadata_path, "w") as f: 130 | json.dump(metadata, f, indent=2) 131 | print(f"✅ Metadata saved to {metadata_path}") 132 | 133 | # Test a prediction to validate the model works 134 | if lang == "eng": 135 | test_input = ["Shah Rukh Khan"] 136 | else: 137 | test_input = ["शाहरुख खान"] 138 | 139 | try: 140 | result = model.predict(test_input, verbose=0) 141 | print(f"✅ Test prediction successful: {result.shape}") 142 | 143 | # Save test prediction for validation 144 | test_data = {"input": test_input, "output": result.tolist()} 145 | test_path = lang_output / "test_prediction.json" 146 | with open(test_path, "w") as f: 147 | json.dump(test_data, f, indent=2) 148 | print(f"✅ Test prediction saved to {test_path}") 149 | 150 | except Exception as e: 151 | print(f"⚠️ Test prediction failed: {e}") 152 | 153 | except Exception as e: 154 | print(f"❌ Failed to process {lang} model: {e}") 155 | continue 156 | 157 | print("\n✅ Export completed! Check the model_migration directory.") 158 | print("Next step: Run 'python migrate_models.py import' with TensorFlow 2.16+") 159 | 160 | def import_to_keras3(self) -> None: 161 | """Import weights and recreate models in Keras 3 format.""" 162 | print("🔄 Importing models to Keras 3 format...") 163 | print(f"TensorFlow version: {tf.__version__}") 164 | 165 | # Verify we're in Keras 3 environment 166 | try: 167 | import tensorflow.keras.utils as utils 168 | 169 | if not hasattr(utils, "legacy"): 170 | print("⚠️ Detected Keras 2 environment - this step needs Keras 3!") 171 | print("Please run with TensorFlow 2.16+ or upgrade TensorFlow") 172 | sys.exit(1) 173 | except ImportError: 174 | pass 175 | 176 | # Create new model directory for Keras 3 models 177 | new_model_dir = self.model_base_path / "eng_and_hindi_models_v2" 178 | new_model_dir.mkdir(exist_ok=True) 179 | 180 | for lang in ["eng", "hin"]: 181 | print(f"\n📁 Processing {lang} model migration to Keras 3") 182 | 183 | lang_input = self.migration_output / f"{lang}_model" 184 | if not lang_input.exists(): 185 | print(f"❌ Migration data not found at {lang_input}") 186 | print( 187 | "Run 'python migrate_models.py export' first with TensorFlow 2.15" 188 | ) 189 | continue 190 | 191 | try: 192 | # Load metadata 193 | metadata_path = lang_input / "model_metadata.json" 194 | with open(metadata_path) as f: 195 | metadata = json.load(f) 196 | print(f"✅ Loaded metadata: {metadata['model_type']}") 197 | 198 | # Load configuration (for metadata purposes) 199 | config_path = lang_input / "model_config.json" 200 | with open(config_path) as f: 201 | _ = json.load(f) # Load but don't use since we load .keras directly 202 | print("✅ Loaded model configuration") 203 | 204 | # Load the Keras 3 format model directly (tf-keras saved it for us!) 
                keras_model_path = lang_input / "model.keras"
                if keras_model_path.exists():
                    print("🔄 Loading Keras 3 format model...")
                    model = tf.keras.models.load_model(str(keras_model_path))
                    print("✅ Loaded Keras 3 model directly")
                else:
                    print(f"❌ Keras 3 model not found at {keras_model_path}")
                    continue

                # Test the recreated model
                test_path = lang_input / "test_prediction.json"
                if test_path.exists():
                    with open(test_path) as f:
                        test_data = json.load(f)

                    result = model.predict(test_data["input"], verbose=0)
                    original_output = test_data["output"]

                    # Compare predictions (allowing for small numerical differences)
                    import numpy as np

                    if np.allclose(result, original_output, rtol=1e-5):
                        print("✅ Model validation successful - predictions match!")
                    else:
                        print("⚠️ Predictions differ slightly (this might be normal)")
                        print(f"   Original: {original_output}")
                        print(f"   New: {result.tolist()}")

                # Save in Keras 3 native format
                keras3_model_path = new_model_dir / f"{lang}_model.keras"
                model.save(keras3_model_path)
                print(f"✅ Saved Keras 3 model to {keras3_model_path}")

                # Also save in HDF5 format as a backup
                h5_model_path = new_model_dir / f"{lang}_model.h5"
                model.save(h5_model_path)
                print(f"✅ Saved H5 backup to {h5_model_path}")

            except Exception as e:
                print(f"❌ Failed to migrate {lang} model: {e}")
                import traceback

                traceback.print_exc()
                continue

        print("\n✅ Import completed!")
        print(f"New models saved in: {new_model_dir}")
        print("Next step: Update pranaam code to use the new model format")


def main():
    if len(sys.argv) != 2 or sys.argv[1] not in ["export", "import"]:
        print("Usage: python migrate_models.py [export|import]")
        print("  export: Export from Keras 2 SavedModel (requires TF 2.15)")
        print("  import: Import to Keras 3 format (requires TF 2.16+)")
        sys.exit(1)

    migrator = ModelMigrator()

    if sys.argv[1] == "export":
        migrator.export_from_keras2()
    else:
        migrator.import_to_keras3()


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
"""Tests for utils module."""

import os
import tarfile
import tempfile
from pathlib import Path
from unittest.mock import MagicMock, Mock, mock_open, patch

import pytest
import requests

from pranaam.utils import REPO_BASE_URL, _safe_extract_tar, download_file


class TestDownloadFile:
    """Test download_file function."""

    @patch("pranaam.utils.requests.Session")
    @patch("pranaam.utils._safe_extract_tar")
    @patch("pranaam.utils.Path")
    @patch("pranaam.utils.tqdm")
    def test_successful_download(
        self,
        mock_tqdm: Mock,
        mock_path: Mock,
        mock_extract: Mock,
        mock_session: Mock,
    ) -> None:
        """Test successful file download and extraction."""
        # Setup mocks
        mock_response = Mock()
        mock_response.headers = {"Content-Length": "1000"}
        mock_response.iter_content.return_value = [b"chunk1", b"chunk2"]
        mock_response.raise_for_status.return_value = None

        mock_session_instance = Mock()
        mock_session_instance.get.return_value = mock_response
        mock_session.return_value.__enter__.return_value = mock_session_instance

        # Mock tqdm context manager
        mock_tqdm.return_value.__enter__.return_value.update = Mock()

        # Mock Path operations
        mock_file_path = MagicMock()
        mock_target_path = MagicMock()
        mock_path.side_effect = [mock_target_path, mock_file_path]
        mock_target_path.__truediv__.return_value = mock_file_path
        mock_file_path.open.return_value.__enter__.return_value = Mock()
        mock_file_path.open.return_value.__exit__.return_value = None
        mock_file_path.unlink.return_value = None

        # Mock the extract function to not do anything
        mock_extract.return_value = None

        # Call function
        result = download_file("http://test.com", "/tmp/target", "test_file")

        # Verify
        assert result is True
        mock_session_instance.get.assert_called_once_with(
            REPO_BASE_URL, stream=True, allow_redirects=True, timeout=120
        )

    @patch("pranaam.utils.requests.Session")
    def test_network_error(self, mock_session: Mock) -> None:
        """Test handling of network errors."""
        mock_session_instance = Mock()
        mock_session_instance.get.side_effect = requests.exceptions.ConnectionError(
            "Network error"
        )
        mock_session.return_value.__enter__.return_value = mock_session_instance

        result = download_file("http://test.com", "/tmp/target", "test_file")

        assert result is False

    @patch("pranaam.utils.requests.Session")
    def test_http_error(self, mock_session: Mock) -> None:
        """Test handling of HTTP errors."""
        mock_response = Mock()
        mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError(
            "404 Not Found"
        )

        mock_session_instance = Mock()
        mock_session_instance.get.return_value = mock_response
        mock_session.return_value.__enter__.return_value = mock_session_instance

        result = download_file("http://test.com", "/tmp/target", "test_file")

        assert result is False

    @patch("pranaam.utils.requests.Session")
    @patch("pranaam.utils._safe_extract_tar")
    @patch("builtins.open", new_callable=mock_open)
    def test_extraction_error(
        self, mock_file: Mock, mock_extract: Mock, mock_session: Mock
    ) -> None:
        """Test handling of extraction errors."""
        # Setup successful download but failed extraction
        mock_response = Mock()
        mock_response.headers = {"Content-Length": "1000"}
        mock_response.iter_content.return_value = [b"chunk"]
        mock_response.raise_for_status.return_value = None

        mock_session_instance = Mock()
        mock_session_instance.get.return_value = mock_response
        mock_session.return_value.__enter__.return_value = mock_session_instance

        mock_extract.side_effect = tarfile.TarError("Corrupted tar file")

        result = download_file("http://test.com", "/tmp/target", "test_file")

        assert result is False

    @patch("pranaam.utils.requests.Session")
    @patch("pranaam.utils._safe_extract_tar")
    @patch("pranaam.utils.Path")
    @patch("pranaam.utils.tqdm")
    def test_no_content_length(
        self, mock_tqdm: Mock, mock_path: Mock, mock_extract: Mock, mock_session: Mock
    ) -> None:
        """Test handling when the Content-Length header is missing."""
        mock_response = Mock()
        mock_response.headers = {}  # No Content-Length
        mock_response.iter_content.return_value = [b"chunk"]
        mock_response.raise_for_status.return_value = None

        mock_session_instance = Mock()
        mock_session_instance.get.return_value = mock_response
        mock_session.return_value.__enter__.return_value = mock_session_instance

        # Mock tqdm context manager
        mock_tqdm.return_value.__enter__.return_value.update = Mock()

        # Mock Path operations
        mock_file_path = MagicMock()
        mock_target_path = MagicMock()
        mock_path.side_effect = [mock_target_path, mock_file_path]
        mock_target_path.__truediv__.return_value = mock_file_path
        mock_file_path.open.return_value.__enter__.return_value = Mock()
        mock_file_path.open.return_value.__exit__.return_value = None
        mock_file_path.unlink.return_value = None

        # Mock the extract function to not do anything
        mock_extract.return_value = None

        result = download_file("http://test.com", "/tmp/target", "test_file")

        assert result is True
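
# For context: the tests above pin down download_file's observable contract --
# stream the response in chunks, tolerate a missing Content-Length header,
# extract the archive, and return False on network/HTTP/tar errors. A minimal
# sketch of that pattern (illustrative only; pranaam's real implementation
# differs in details such as logging and the tqdm progress bar):
def _download_sketch(url: str, dest: Path) -> bool:
    try:
        with requests.Session() as session:
            resp = session.get(url, stream=True, allow_redirects=True, timeout=120)
            resp.raise_for_status()
            with dest.open("wb") as fh:
                for chunk in resp.iter_content(chunk_size=8192):
                    fh.write(chunk)  # stream to disk without buffering the whole body
        _safe_extract_tar(dest, dest.parent)
        return True
    except (requests.RequestException, tarfile.TarError):
        return False
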
[b"chunk"] 127 | mock_response.raise_for_status.return_value = None 128 | 129 | mock_session_instance = Mock() 130 | mock_session_instance.get.return_value = mock_response 131 | mock_session.return_value.__enter__.return_value = mock_session_instance 132 | 133 | # Mock tqdm context manager 134 | mock_tqdm.return_value.__enter__.return_value.update = Mock() 135 | 136 | # Mock Path operations 137 | mock_file_path = MagicMock() 138 | mock_target_path = MagicMock() 139 | mock_path.side_effect = [mock_target_path, mock_file_path] 140 | mock_target_path.__truediv__.return_value = mock_file_path 141 | mock_file_path.open.return_value.__enter__.return_value = Mock() 142 | mock_file_path.open.return_value.__exit__.return_value = None 143 | mock_file_path.unlink.return_value = None 144 | 145 | # Mock the extract function to not do anything 146 | mock_extract.return_value = None 147 | 148 | result = download_file("http://test.com", "/tmp/target", "test_file") 149 | 150 | assert result is True 151 | 152 | 153 | class TestSafeExtractTar: 154 | """Test _safe_extract_tar function.""" 155 | 156 | def test_safe_extraction(self) -> None: 157 | """Test safe extraction of tar file.""" 158 | with tempfile.TemporaryDirectory() as temp_dir: 159 | # Create a test tar file 160 | tar_path = os.path.join(temp_dir, "test.tar.gz") 161 | test_file_path = os.path.join(temp_dir, "test.txt") 162 | 163 | # Create test content 164 | with open(test_file_path, "w") as f: 165 | f.write("test content") 166 | 167 | # Create tar file 168 | with tarfile.open(tar_path, "w:gz") as tar: 169 | tar.add(test_file_path, arcname="test.txt") 170 | 171 | # Remove original file 172 | os.remove(test_file_path) 173 | 174 | # Extract using our function 175 | extract_dir = os.path.join(temp_dir, "extracted") 176 | os.makedirs(extract_dir) 177 | 178 | _safe_extract_tar(Path(tar_path), Path(extract_dir)) 179 | 180 | # Verify extraction 181 | extracted_file = os.path.join(extract_dir, "test.txt") 182 | assert os.path.exists(extracted_file) 183 | 184 | with open(extracted_file) as f: 185 | assert f.read() == "test content" 186 | 187 | def test_path_traversal_prevention(self) -> None: 188 | """Test prevention of path traversal attacks.""" 189 | with tempfile.TemporaryDirectory() as temp_dir: 190 | tar_path = os.path.join(temp_dir, "malicious.tar.gz") 191 | 192 | # Create a tar file with path traversal attempt 193 | with tarfile.open(tar_path, "w:gz") as tar: 194 | # Create a TarInfo object with malicious path 195 | info = tarfile.TarInfo(name="../../../malicious.txt") 196 | content = b"malicious content" 197 | info.size = len(content) 198 | # Create a temporary file with the actual content 199 | import io 200 | 201 | tar.addfile(info, fileobj=io.BytesIO(content)) 202 | 203 | extract_dir = os.path.join(temp_dir, "extracted") 204 | os.makedirs(extract_dir) 205 | 206 | # Should raise exception 207 | with pytest.raises(Exception, match="Attempted path traversal"): 208 | _safe_extract_tar(Path(tar_path), Path(extract_dir)) 209 | 210 | def test_corrupted_tar_file(self) -> None: 211 | """Test handling of corrupted tar files.""" 212 | with tempfile.TemporaryDirectory() as temp_dir: 213 | # Create a corrupted file (not a valid tar) 214 | tar_path = os.path.join(temp_dir, "corrupted.tar.gz") 215 | with open(tar_path, "wb") as f: 216 | f.write(b"not a tar file") 217 | 218 | extract_dir = os.path.join(temp_dir, "extracted") 219 | os.makedirs(extract_dir) 220 | 221 | with pytest.raises(tarfile.TarError): 222 | _safe_extract_tar(Path(tar_path), Path(extract_dir)) 
class TestConstants:
    """Test module constants."""

    def test_repo_base_url_default(self) -> None:
        """Test default repository base URL."""
        # Should have default Harvard Dataverse URL
        assert "dataverse.harvard.edu" in REPO_BASE_URL

    @patch.dict(os.environ, {"PRANAAM_MODEL_URL": "http://custom.url/model"})
    def test_repo_base_url_custom(self) -> None:
        """Test custom repository URL from environment."""
        # Need to reimport to pick up the environment variable
        import importlib

        import pranaam.utils

        importlib.reload(pranaam.utils)

        assert pranaam.utils.REPO_BASE_URL == "http://custom.url/model"


class TestErrorLogging:
    """Test error logging in utils functions."""

    @patch("pranaam.utils.logger")
    @patch("pranaam.utils.requests.Session")
    @patch("pranaam.utils.tqdm")
    def test_network_error_logging(
        self, mock_tqdm: Mock, mock_session: Mock, mock_logger: Mock
    ) -> None:
        """Test that network errors are logged properly."""
        mock_session_instance = Mock()
        mock_session_instance.get.side_effect = requests.exceptions.ConnectionError(
            "Network error"
        )
        mock_session.return_value.__enter__.return_value = mock_session_instance
        mock_tqdm.return_value.__enter__.return_value = Mock()

        with tempfile.TemporaryDirectory() as temp_dir:
            download_file("http://test.com", temp_dir, "test")

        mock_logger.error.assert_called()
        args, kwargs = mock_logger.error.call_args
        assert "Network error downloading models" in args[0]

    @patch("pranaam.utils.logger")
    @patch("pranaam.utils.requests.Session")
    @patch("pranaam.utils._safe_extract_tar")
    @patch("builtins.open", new_callable=mock_open)
    @patch("pranaam.utils.tqdm")
    def test_extraction_error_logging(
        self,
        mock_tqdm: Mock,
        mock_file: Mock,
        mock_extract: Mock,
        mock_session: Mock,
        mock_logger: Mock,
    ) -> None:
        """Test that extraction errors are logged properly."""
        # Setup successful download
        mock_response = Mock()
        mock_response.headers = {"Content-Length": "1000"}
        mock_response.iter_content.return_value = [b"chunk"]
        mock_response.raise_for_status.return_value = None

        mock_session_instance = Mock()
        mock_session_instance.get.return_value = mock_response
        mock_session.return_value.__enter__.return_value = mock_session_instance

        # Mock tqdm context manager
        mock_tqdm.return_value.__enter__.return_value.update = Mock()

        # Setup extraction failure
        mock_extract.side_effect = tarfile.TarError("Extraction failed")

        download_file("http://test.com", "/tmp", "test")

        mock_logger.error.assert_called()
        args, kwargs = mock_logger.error.call_args
        assert "File extraction error" in args[0]
--------------------------------------------------------------------------------