├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── build-and-test.yml │ ├── codeql.yml │ ├── deploy-website.yml │ └── release.yml ├── .gitignore ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── LICENSE-DOCUMENTATION ├── README.md ├── balance ├── __init__.py ├── adjustment.py ├── balancedf_class.py ├── cli.py ├── datasets │ ├── __init__.py │ ├── sim_data_cbps.R │ └── sim_data_cbps.csv ├── sample_class.py ├── stats_and_plots │ ├── __init__.py │ ├── general_stats.py │ ├── weighted_comparisons_plots.py │ ├── weighted_comparisons_stats.py │ ├── weighted_stats.py │ └── weights_stats.py ├── testutil.py ├── typing.py ├── util.py └── weighting_methods │ ├── __init__.py │ ├── adjust_null.py │ ├── cbps.py │ ├── ipw.py │ ├── poststratify.py │ └── rake.py ├── scripts └── make_docs.sh ├── setup.py ├── sphinx ├── Makefile ├── _static │ └── css │ │ └── custom.css ├── _templates │ └── autosummary │ │ ├── class.rst │ │ └── module.rst ├── balance.rst ├── conf.py ├── index.rst └── make.bat ├── tests ├── test_adjust_null.py ├── test_adjustment.py ├── test_balancedf.py ├── test_cbps.py ├── test_cli.py ├── test_datasets.py ├── test_ipw.py ├── test_logging.py ├── test_poststratify.py ├── test_rake.py ├── test_sample.py ├── test_stats_and_plots.py ├── test_testutil.py ├── test_util.py └── test_weighted_comparisons_plots.py ├── tutorials ├── balance_quickstart.ipynb ├── balance_quickstart_cbps.ipynb ├── balance_quickstart_rake.ipynb ├── balance_transformations_and_formulas.ipynb └── comparing_cbps_in_r_vs_python_using_sim_data.ipynb └── website ├── .npmrc ├── README.md ├── babel.config.js ├── blog └── 2023 │ └── 01 │ └── 09 │ ├── bringing-balance-to-your-data.md │ └── sample_vs_target_bar_chart.webp ├── docs ├── api_reference │ └── index.md ├── docs │ ├── contributing.md │ ├── general_framework │ │ ├── adjusting_sample_to_population.md │ │ ├── evaluation_of_results.md │ │ ├── general_framework.md │ │ └── 
pre_adjustment_diagnostics.md │ ├── img │ │ ├── fig_01_qqplot_income_before.png │ │ ├── fig_02_barplot_age_before.png │ │ ├── fig_03_barplot_gender_before.png │ │ ├── fig_04_qqplot_income_after.png │ │ ├── fig_05_barplot_age_after.png │ │ ├── fig_06_barplot_gender_after.png │ │ ├── fig_07_seaborn_after.png │ │ ├── fig_08_weights_kde.png │ │ ├── fig_09_seaborn_outcome_kde_after.png │ │ ├── total_survey_error_flow_v02.png │ │ ├── total_survey_error_flow_v02.svg │ │ ├── total_survey_error_image.png │ │ └── total_survey_error_image.svg │ ├── overview.md │ └── statistical_methods │ │ ├── cbps.md │ │ ├── index.md │ │ ├── ipw.md │ │ ├── poststratify.md │ │ └── rake.md └── tutorials │ ├── balance_transformations_and_formulas.mdx │ ├── comparing_cbps_in_r_vs_python_using_sim_data.mdx │ ├── index.mdx │ ├── quickstart.mdx │ ├── quickstart_cbps.mdx │ └── quickstart_rake.mdx ├── docusaurus.config.js ├── package.json ├── sidebars.js ├── src ├── components │ ├── HTMLLoader.js │ ├── HomepageFeatures.js │ └── HomepageFeatures.module.css ├── css │ └── custom.css └── pages │ ├── index.js │ ├── index.module.css │ └── markdown-page.md ├── static ├── .nojekyll ├── CNAME ├── docs │ ├── Balancing_biased_data_samples_with_the_balance_Python_package_-_ISA_conference_2023-06-01.pdf │ └── Balancing_biased_data_samples_with_the_balance_Python_package_-_ISA_conference_2023-06-01.pptx └── img │ ├── balance_logo │ ├── AI │ │ └── balance_Logo_FINAL.ai │ ├── PNG │ │ ├── Horizontal │ │ │ ├── balance_Logo_Horizontal_Black_RGB.png │ │ │ ├── balance_Logo_Horizontal_FullColor_RGB.png │ │ │ └── balance_Logo_Horizontal_White_RGB.png │ │ ├── Icon │ │ │ ├── balance_Logo_Icon_Black_RGB.png │ │ │ ├── balance_Logo_Icon_FullColor_RGB.png │ │ │ └── balance_Logo_Icon_White_RGB.png │ │ └── Vertical │ │ │ ├── balance_Logo_Vertical_Black_RGB.png │ │ │ ├── balance_Logo_Vertical_FullColor_RGB.png │ │ │ └── balance_Logo_Vertical_White_RGB.png │ ├── SVG │ │ ├── Horizontal │ │ │ ├── balance_Logo_Horizontal_Black_RGB.svg 
│ │ │ ├── balance_Logo_Horizontal_FullColor_RGB.svg │ │ │ └── balance_Logo_Horizontal_White_RGB.svg │ │ ├── Icon │ │ │ ├── balance_Logo_Icon_Black_RGB.svg │ │ │ ├── balance_Logo_Icon_FullColor_RGB.svg │ │ │ └── balance_Logo_Icon_White_RGB.svg │ │ └── Vertical │ │ │ ├── balance_Logo_Vertical_Black_RGB.svg │ │ │ ├── balance_Logo_Vertical_FullColor_RGB.svg │ │ │ └── balance_Logo_Vertical_White_RGB.svg │ ├── balance_Social__1280x720.png │ ├── balance_Social__1920x1080.png │ ├── icon.png │ ├── icon.svg │ ├── vertical.svg │ └── vertical_white.svg │ ├── balance_logo_hex_stickers │ ├── balance_hex_fill.svg │ ├── balance_hex_fill__3x.png │ ├── balance_hex_outline.svg │ └── balance_hex_outline__3x.png │ ├── docusaurus.png │ ├── favicon.ico │ ├── fontawesome │ ├── code.svg │ ├── layer-group.svg │ └── users.svg │ ├── logo.svg │ ├── meta_opensource_logo_negative.svg │ └── tutorial │ ├── docsVersionDropdown.png │ └── localeDropdown.png └── yarn.lock /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | 3 | # E501: Line too long (80 chars) 4 | # W503: Line break before binary operator 5 | # E203: Whitespace before ':' 6 | ignore = E501, W503, E203 7 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a bug report to help us improve 4 | title: '[BUG] ' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | # Describe the bug 11 | A clear and concise description of what the bug is, what you expected to happen versus what you got. You can add here a paste of any errors or warnings you got. 
12 | 13 | # Session information 14 | Please run paste here the output of running the following in your notebook/terminal: 15 | 16 | ```python 17 | # Sessions info 18 | import session_info 19 | session_info.show(html=False, dependencies=True) 20 | ``` 21 | 22 | # Screenshots 23 | If applicable, add screenshots to help explain your problem. 24 | 25 | # Reproducible example 26 | Please provide us with (any that apply): 27 | 1. **Code**: code we can run to reproduce the issue (in terminal or python notebook) 28 | 2. **Reference**: If the issue is in a tutorial, please provide the link to it, and the exact place in which the code fails. 29 | 30 | # Additional context 31 | Add any other context about the problem here that might help us solve it. 32 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[FEATURE] " 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | # Is your feature request related to a problem? Please describe. 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | # Describe the solution you'd like 14 | A clear and concise description of what you want to happen. 15 | 16 | # Describe alternatives you've considered 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | # Additional context 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/workflows/build-and-test.yml: -------------------------------------------------------------------------------- 1 | name: Build and Test Workflow 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | schedule: 9 | # midnight EST 10 | - cron: '0 5 * * *' 11 | # allow this to be scheduled manually in addition to cron 12 | workflow_dispatch: 13 | 14 | jobs: 15 | tests: 16 | runs-on: ubuntu-latest 17 | strategy: 18 | matrix: 19 | python-version: ['3.9', '3.10', '3.11'] 20 | fail-fast: false 21 | steps: 22 | - name: Checkout 23 | uses: actions/checkout@v2 24 | - name: Set up Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v2 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | - name: Install Pkg + Dependencies 29 | run: | 30 | python -m pip install .[dev] 31 | - name: Test with pytest 32 | run: | 33 | python -m pytest -ra 34 | - name: Build wheels pkg 35 | run: | 36 | python setup.py bdist_wheel 37 | 38 | lint: 39 | runs-on: ubuntu-latest 40 | steps: 41 | - name: Checkout 42 | uses: actions/checkout@v2 43 | - name: Set up Python 44 | uses: actions/setup-python@v2 45 | with: 46 | python-version: 3.9 47 | - name: Install dependencies 48 | run: | 49 | pip install flake8 50 | - name: Flake8 51 | run: | 52 | flake8 53 | 54 | test-deploy-website: 55 | name: Test website build 56 | runs-on: ubuntu-latest 57 | steps: 58 | - uses: actions/checkout@v2 59 | - uses: actions/setup-node@v3 60 | with: 61 | node-version: 18 62 | cache: yarn 63 | cache-dependency-path: "./website/yarn.lock" 64 | - name: Set up Python 65 | uses: actions/setup-python@v2 66 | with: 67 | python-version: 3.9 68 | - name: Install Pkg + Dependencies 69 | run: | 70 | python -m pip install .[dev] 71 | - name: Build website 72 | run: bash ./scripts/make_docs.sh -n 73 | -------------------------------------------------------------------------------- 
/.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ "main" ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ "main" ] 20 | schedule: 21 | - cron: '26 6 * * 5' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'javascript', 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Use only 'java' to analyze code written in Java, Kotlin or both 38 | # Use only 'javascript' to analyze code written in JavaScript, TypeScript or both 39 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 40 | 41 | steps: 42 | - name: Checkout repository 43 | uses: actions/checkout@v3 44 | 45 | # Initializes the CodeQL tools for scanning. 46 | - name: Initialize CodeQL 47 | uses: github/codeql-action/init@v2 48 | with: 49 | languages: ${{ matrix.language }} 50 | # If you wish to specify custom queries, you can do so here or in a config file. 51 | # By default, queries listed here will override any specified in a config file. 
52 | # Prefix the list here with "+" to use these queries and those in the config file. 53 | 54 | # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 55 | # queries: security-extended,security-and-quality 56 | 57 | 58 | # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). 59 | # If this step fails, then you should remove it and run the build manually (see below) 60 | - name: Autobuild 61 | uses: github/codeql-action/autobuild@v2 62 | 63 | # ℹ️ Command-line programs to run using the OS shell. 64 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 65 | 66 | # If the Autobuild fails above, remove it and uncomment the following three lines. 67 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 
68 | 69 | # - run: | 70 | # echo "Run, Build Application using script" 71 | # ./location_of_script_within_repo/buildscript.sh 72 | 73 | - name: Perform CodeQL Analysis 74 | uses: github/codeql-action/analyze@v2 75 | with: 76 | category: "/language:${{matrix.language}}" 77 | -------------------------------------------------------------------------------- /.github/workflows/deploy-website.yml: -------------------------------------------------------------------------------- 1 | name: Deploy to GitHub Pages 2 | on: 3 | push: 4 | branches: 5 | - main 6 | jobs: 7 | deploy: 8 | name: Deploy to GitHub Pages 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - uses: actions/setup-node@v3 13 | with: 14 | node-version: 18 15 | cache: yarn 16 | cache-dependency-path: "./website/yarn.lock" 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: 3.9 21 | - name: Install Pkg + Dependencies 22 | run: | 23 | python -m pip install .[dev] 24 | - name: Build website 25 | run: bash ./scripts/make_docs.sh -n 26 | # Popular action to deploy to GitHub Pages: 27 | # Docs: https://github.com/peaceiris/actions-gh-pages#%EF%B8%8F-docusaurus 28 | - name: Deploy to GitHub Pages 29 | uses: peaceiris/actions-gh-pages@v3 30 | with: 31 | github_token: ${{ secrets.GITHUB_TOKEN }} 32 | # Build output to publish to the `gh-pages` branch: 33 | publish_dir: ./website/build 34 | # The following lines assign commit authorship to the official 35 | # GH-Actions bot for deploys to `gh-pages` branch: 36 | # https://github.com/actions/checkout/issues/13#issuecomment-724415212 37 | # The GH actions bot is used by default if you didn't specify the two fields. 38 | # You can swap them out with your own user credentials. 
39 | user_name: github-actions[bot] 40 | user_email: 41898282+github-actions[bot]@users.noreply.github.com 41 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | # This workflow uploads a Python Package using Twine when a release is published. 2 | # For more information, see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Deploy 5 | 6 | on: 7 | release: 8 | types: [published] 9 | 10 | jobs: 11 | tests: 12 | runs-on: ubuntu-latest 13 | strategy: 14 | matrix: 15 | python-version: ['3.9', '3.10', '3.11'] 16 | fail-fast: false 17 | steps: 18 | - name: Checkout 19 | uses: actions/checkout@v4 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | - name: Install Pkg + Dependencies 25 | run: | 26 | python -m pip install .[dev] 27 | - name: Test with pytest 28 | run: | 29 | python -m pytest -ra 30 | - name: Build wheels pkg 31 | run: | 32 | python setup.py bdist_wheel 33 | 34 | deploy: 35 | needs: tests # only run if previous step succeeds 36 | runs-on: ubuntu-latest 37 | steps: 38 | - name: Checkout 39 | uses: actions/checkout@v4 40 | - name: Set up Python 41 | uses: actions/setup-python@v5 42 | with: 43 | python-version: "3.10" 44 | - name: Install Pkg + Dependencies 45 | run: | 46 | python -m pip install .[dev] 47 | - name: Fetch all history for all tags and branches 48 | run: git fetch --prune --unshallow 49 | - name: Build wheels pkg 50 | run: | 51 | python setup.py sdist bdist_wheel 52 | - name: Publish a Python distribution to PyPI 53 | uses: pypa/gh-action-pypi-publish@release/v1 54 | with: 55 | user: __token__ 56 | password: ${{ secrets.PYPI_API_TOKEN }} 57 | verbose: true 58 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | 4 | # OSX 5 | *.DS_Store 6 | 7 | # Jupyter Notebook 8 | .ipynb_checkpoints 9 | 10 | # Misc 11 | .DS_Store 12 | .env.local 13 | .env.development.local 14 | .env.test.local 15 | .env.production.local 16 | 17 | 18 | ################## 19 | # Docusaurus 20 | ################## 21 | 22 | # Dependencies 23 | website/node_modules 24 | 25 | # Production 26 | website/build 27 | 28 | # Generated files 29 | .docusaurus 30 | .cache-loader 31 | 32 | npm-debug.log* 33 | yarn-debug.log* 34 | yarn-error.log* 35 | 36 | # FB docusaurus artifacts 37 | website/fb 38 | 39 | # Generated static artifacts 40 | website/static/api_reference 41 | website/static/html/tutorials 42 | 43 | 44 | ################## 45 | # Sphinx 46 | ################## 47 | 48 | # Build artifacts 49 | sphinx/_autosummary 50 | sphinx/_build 51 | 52 | # Auto-generated rst files 53 | balance.*.rst 54 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. 
Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | This Code of Conduct also applies outside the project spaces when there is a 56 | reasonable belief that an individual's behavior may have a negative impact on 57 | the project or its community. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project team at . All 63 | complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq 81 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to balance 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `main`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. 
Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Meta's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## Code Requirements 30 | 31 | ### Coding Style 32 | * 4 spaces for indentation rather than tabs 33 | * 80 character line length 34 | 35 | ### Linting 36 | Run the linter via `flake8` (`pip install flake8`) from the root of the Ax repository. Note that we have a [custom flake8 configuration](https://github.com/facebookresearch/balance/blob/main/.flake8). 37 | 38 | ### Static Type Checking 39 | We use [Pyre](https://pyre-check.org/) for static type checking and require code to be fully type annotated. 40 | 41 | ### Unit testing 42 | We strongly recommend adding unit testing when introducing new code. To run all unit tests, we recommend installing pytest using `pip install pytest` and running `pytest -ra` from the root of the balance repo. 43 | 44 | ### Documentation 45 | * We require docstrings on all public functions and classes (those not prepended with `_`). 46 | * We use the [Google docstring style](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) & use Sphinx to compile API reference documentation. 47 | * Our [website](https://import-balance.org) leverages Docusaurus 2.0 + Sphinx + Jupyter notebook for generating our documentation content. 
48 | * To rule out parsing errors, we suggesting [installing sphinx](https://www.sphinx-doc.org/en/master/usage/installation.html) and running `make html` from the balance/sphinx folder. 49 | 50 | ## Website Development 51 | 52 | ### Overview 53 | balance's website is also open source and part of this repository. balance leverages several open source frameworks for website development. 54 | * [Docusaurus 2](https://docusaurus.io/): The main site is generated using Docusaurus, with the code living under the [website](https://github.com/facebookresearch/balance/tree/main/website) folder). This includes the website template (navbar, footer, sidebars), landing page, and main sections (Blog, Docs, Tutorials, API Reference). 55 | * Markdown is used for the content of several sections, particularly the "Docs" section. Files are under the [docs/](https://github.com/facebookresearch/balance/tree/main/website/docs/docs) folder 56 | * [Jupyter notebook](https://fburl.com/55p6vvxo) is used to generate the notebook tutorials under the "Tutorials" section, based on our ipynb tutorials in our [tutorials](https://github.com/facebookresearch/balance/tree/main/tutorials) folder. 57 | * [Sphinx](https://www.sphinx-doc.org/en/master/index.html) is used for Python documentation generation, populated under the "API Reference" section. Files are under the [sphinx](https://github.com/facebookresearch/balance/tree/main/sphinx) folder. 58 | 59 | 60 | ### Setup 61 | 62 | To install the necessary dependencies for website development, run the following from the repo root: 63 | ``` 64 | python -m pip install git+https://github.com/bbalasub1/glmnet_python.git@1.0 65 | python -m pip install .[dev] 66 | ``` 67 | 68 | ### Adding Notebook Tutorials 69 | All our notebook tutorials are housed under the [tutorials](https://github.com/facebookresearch/balance/tree/main/tutorials) folder at the root of the repo. 
We use these notebooks as the source of truth for the "Tutorials" section of the website, executing & generating HTML pages for each notebook. 70 | 71 | To add a new tutorial: 72 | 1. Check in your notebook (.ipynb) to our [tutorials](https://github.com/facebookresearch/balance/tree/main/tutorials) folder. We strongly suggest clearing notebook output cells. 73 | 2. Extend the "Building tutorial HTML" section of [`scripts/make_docs.sh`](https://github.com/facebookresearch/balance/blob/main/scripts/make_docs.sh) to execute & generate HTML for the new tutorial e.g. `jupyter nbconvert tutorials/my_tutorial.ipynb --execute --to html --output-dir website/static/html/tutorials`. 74 | 3. Introduce a new .mdx page under the [website/docs/tutorials](https://github.com/facebookresearch/balance/tree/main/website/docs/tutorials) folder for the new tutorial. Use HTMLLoader to load the generated HTML e.g. ``. [quickstart.mdx](https://github.com/facebookresearch/balance/blob/main/website/docs/tutorials/quickstart.mdx) is a good reference for the setup 75 | 76 | To test the setup, see the [`Building & Testing Website Changes`](#building--testing-website-changes) section below. 77 | 78 | Note: The generated HTML should not be checked into the main repo. 79 | 80 | ### Building & Testing Website Changes 81 | We've developed a helper script for running the full website build process: 82 | 83 | ``` 84 | ./scripts/make_docs.sh 85 | 86 | # To start up the local webserver 87 | cd website 88 | yarn serve 89 | ``` 90 | Once the local webserver is up, you'll get a link you can follow to visit the newly-built site. See [Docusaurus docs](https://docusaurus.io/docs/deployment#testing-build-locally) for more info. 91 | 92 | ## Deployment 93 | We rely on Github Actions to run our CI/CD. The workflow files can be found [here](https://fburl.com/5kwhksbu). 
94 | 95 | In summary 96 | * On every pull request, we run our "Build & Test" workflow, which includes PyTest tests, Wheels package builds, flake8 linting, and website build. 97 | * We also run the same "Build & Test" suite nightly. 98 | * On every push, we deploy a new version of the website. The `make_docs.sh` script is run from the main branch and the build artifacts are published to the `gh-pages` branch, which is linked to our repo's Github Page's deployment. 99 | 100 | ### Releasing a new version 101 | To create a new release, simply navigate to the ["Release" page](https://github.com/facebookresearch/balance/releases) of the repo, draft a new release, and publish. The Github Action workflow should be triggered on publish and you should see a new version of the package live on PyPi in ~10 mins. You can check the status of the job via the [GH Actions tab](https://github.com/facebookresearch/balance/actions). 102 | 103 | Guidelines when drafting a new release: 104 | * Follow semantic versioning conventions when chosing the next version. 105 | * The release's tag should only be the version itself (e.g. "0.1.0"). Do not add any prefixes like "v" or "version". The build process relies on proper formatting of this tag. 106 | 107 | The Github Actions job is configured at [release.yml](https://github.com/facebookresearch/balance/blob/main/.github/workflows/release.yml). 108 | 109 | ## License 110 | By contributing to balance, you agree that your contributions will be licensed 111 | under the LICENSE file in the root directory of this source tree. 112 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /balance/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | # pyre-unsafe 7 | 8 | 9 | import logging 10 | from typing import Optional 11 | 12 | from balance.balancedf_class import ( # noqa 13 | BalanceCovarsDF, # noqa 14 | BalanceOutcomesDF, # noqa 15 | BalanceWeightsDF, # noqa 16 | ) 17 | from balance.datasets import load_data # noqa 18 | from balance.sample_class import Sample # noqa 19 | from balance.util import TruncationFormatter # noqa 20 | 21 | # TODO: which objects do we want to explicitly externalize? 22 | # TODO: verify this works. 
23 | 24 | global __version__ 25 | __version__ = "0.10.0" 26 | 27 | 28 | def setup_logging( 29 | logger_name: Optional[str] = __package__, 30 | level: str = "INFO", 31 | removeHandler: bool = True, 32 | ) -> logging.Logger: 33 | """ 34 | Initiates a nicely formatted logger called "balance", with level "info". 35 | """ 36 | if removeHandler: 37 | for handler in logging.root.handlers[:]: 38 | logging.root.removeHandler(handler) 39 | 40 | logger = logging.getLogger(logger_name) 41 | 42 | logger.setLevel(getattr(logging, level)) 43 | formatter = TruncationFormatter( 44 | "%(levelname)s (%(asctime)s) [%(module)s/%(funcName)s (line %(lineno)d)]: %(message)s" 45 | ) 46 | handler = logging.StreamHandler() 47 | handler.setFormatter(formatter) 48 | logger.addHandler(handler) 49 | 50 | return logger 51 | 52 | 53 | logger: logging.Logger = setup_logging() 54 | logger.info(f"Using {__package__} version {__version__}") 55 | 56 | 57 | # TODO: add example in the notebooks for using this function. 58 | def set_warnings(level: str = "WARNING") -> None: 59 | logger.setLevel(getattr(logging, level)) 60 | -------------------------------------------------------------------------------- /balance/datasets/sim_data_cbps.R: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | # R code to re-create the simulated data from the CBPS R package. 7 | # The code here is kept for reference, and is not run directly from the balance package. 

# install.packages("CBPS")
library("CBPS")

set.seed(123456)
n <- 500
# mvrnorm comes from MASS, which is loaded as a dependency of CBPS.
X <- mvrnorm(n, mu = rep(0, 4), Sigma = diag(4))
prop <- 1 / (1 + exp(X[,1] - 0.5 * X[,2] +
    0.25*X[,3] + 0.1 * X[,4]))
treat <- rbinom(n, 1, prop)
y <- 210 + 27.4*X[,1] + 13.7*X[,2] + 13.7*X[,3] + 13.7*X[,4] + rnorm(n)

##Estimate CBPS with a misspecified model
X.mis <- cbind(exp(X[,1]/2), X[,2]*(1+exp(X[,1]))^(-1)+10,
    (X[,1]*X[,3]/25+.6)^3, (X[,2]+X[,4]+20)^2)
fit1 <- CBPS(treat ~ X.mis, ATT = 1) # we treat 1 as the "target population"
summary(fit1)
# Call:
# CBPS(formula = treat ~ X.mis, ATT = 1)

# Coefficients:
#             Estimate Std. Error z value Pr(>|z|)
# (Intercept)    -4.34       1.51   -2.86  0.00419 **
# X.mis1         -1.6       0.125   -12.8    0.000 ***
# X.mis2        0.548       0.144    3.82 0.000136 ***
# X.mis3         1.99      0.0425    46.9    0.000 ***
# X.mis4     0.000537       0.163  0.0033    0.997
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# J - statistic: 0.005098105
# Log-Likelihood: -299.7309

df <- data.frame(treat, X.mis, cbps_weights = fit1$weights, y)
head(df)
# > head(df)
#   treat        X1        X2         X3       X4 cbps_weights        y
# 1     0 1.0744852 10.320361 0.21255104 463.2808  0.005368889 227.5325
# 2     1 0.7237691  9.911956 0.18948750 383.7598  0.003937008 199.8175
# 3     0 0.6909134 10.645240 0.21737631 424.2880  0.011736818 196.8860
# 4     1 0.3470712  9.907768 0.09670587 399.3661  0.003937008 174.6853
# 5     0 0.5016829  9.594918 0.23255891 472.8546  0.009463430 191.2977
# 6     0 1.5231081 10.031016 0.32572077 438.3759  0.002758207 280.4517
with(df, boxplot(X1~treat))
with(df, boxplot(X2~treat))
with(df, boxplot(X3~treat))
with(df, boxplot(X4~treat))
with(df, boxplot(cbps_weights~treat)) # Showing that the target "treat==1" all have the same weight

with(df[df$treat == 1,], sum(cbps_weights * y) / sum(cbps_weights)) # 199.5444
with(df[df$treat == 0,], sum(cbps_weights * y) / sum(cbps_weights)) # 206.8441
# FIX: the original line was tapply(df[,"treat"], df[,"treat"], mean), which by
# construction just returns 0 and 1. The recorded output below (220.6768 /
# 199.5444) is the unweighted mean of y by treatment group, so the first
# argument must be y:
tapply(df[,"y"], df[,"treat"], mean)
#        0        1
# 220.6768 199.5444

df$id <- 1:nrow(df)

write.csv(df, "~/Downloads/simulated_cbps_data.csv", row.names = FALSE)


sessionInfo()
# R version 3.6.1 (2019-07-05)
# Platform: x86_64-apple-darwin15.6.0 (64-bit)
# Running under: macOS 10.16

# Matrix products: default
# LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib

# locale:
# [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

# attached base packages:
# [1] stats graphics grDevices utils datasets methods base

# other attached packages:
# [1] CBPS_0.23 glmnet_4.1-3 Matrix_1.2-17 numDeriv_2016.8-1.1
# [5] nnet_7.3-12 MatchIt_4.5.0 MASS_7.3-51.4

# loaded via a namespace (and not attached):
# [1] Rcpp_1.0.8 lattice_0.20-38 codetools_0.2-16 foreach_1.4.4 grid_3.6.1
# [6] backports_1.4.1 splines_3.6.1 iterators_1.0.13 tools_3.6.1 yaml_2.2.0
# [11] survival_3.1-11 compiler_3.6.1 shape_1.4.6
# --------------------------------------------------------------------------------
# /balance/stats_and_plots/__init__.py:
# --------------------------------------------------------------------------------
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------------------------------
# /balance/stats_and_plots/general_stats.py:
# --------------------------------------------------------------------------------
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# pyre-unsafe

from typing import Optional

import pandas as pd


def relative_response_rates(
    df: pd.DataFrame,
    df_target: Optional[pd.DataFrame] = None,
    per_column: bool = True,
) -> pd.DataFrame:
    """Produces a summary table of number of responses and proportion of completed responses.

    Args:
        df (pd.DataFrame): A DataFrame to calculate aggregated response rates for.
        df_target (Optional[pd.DataFrame], optional): Defaults to None.
            Determines what is the denominator from which notnull are a fraction of.
            If None - it's the number of rows in df.
            If some df is provided - then it is assumed that df is a subset of df_target, and the
            response rate is calculated as the fraction of notnull values in df from (divided by)
            the number of notnull values in df_target.
        per_column (bool, optional): Default is True.
29 | The per_column argument is relevant only if df_target is other than None (i.e.: trying to compare df to some df_target). 30 | If per_column is True (default) - it indicates that the relative response rates of columns in df will be 31 | by comparing each column in df to the same column in target. 32 | If this is True, the columns in df and df_target must be identical. 33 | If per_column is False then df is compared to the overall number of nonnull rows in the target df. 34 | 35 | Returns: 36 | pd.DataFrame: A column per column in the original df, and two rows: 37 | One row with number of non-null observations, and 38 | A second row with the proportion of non-null observations. 39 | 40 | Examples: 41 | :: 42 | 43 | import numpy as np 44 | import pandas as pd 45 | from balance.stats_and_plots.general_stats import relative_response_rates 46 | 47 | df = pd.DataFrame({"o1": (7, 8, 9, 10), "o2": (7, 8, 9, np.nan), "id": (1, 2, 3, 4)}) 48 | 49 | relative_response_rates(df).to_dict() 50 | 51 | # {'o1': {'n': 4.0, '%': 100.0}, 52 | # 'o2': {'n': 3.0, '%': 75.0}, 53 | # 'id': {'n': 4.0, '%': 100.0}} 54 | 55 | df_target = pd.concat([df, df]) 56 | relative_response_rates(df, df_target).to_dict() 57 | 58 | # {'o1': {'n': 4.0, '%': 50.0}, 59 | # 'o2': {'n': 3.0, '%': 50.0}, 60 | # 'id': {'n': 4.0, '%': 50.0}} 61 | 62 | 63 | # Dividing by number of total notnull rows in df_rarget 64 | df_target.notnull().all(axis=1).sum() # == 6 65 | relative_response_rates(df, df_target, False).to_dict() 66 | 67 | # {'o1': {'n': 4.0, '%': 66.66666666666666}, 68 | # 'o2': {'n': 3.0, '%': 50.0}, 69 | # 'id': {'n': 4.0, '%': 66.66666666666666}} 70 | 71 | """ 72 | df_n_notnull_rows = df.notnull().sum() 73 | 74 | if df_target is None: 75 | target_n_notnull_rows = df.shape[0] 76 | elif per_column: # number of notnull rows, *per column*, in df_target 77 | # verify that the columns of df and df_target are identical: 78 | if (len(df.columns) != len(df_target.columns)) or ( 79 | df.columns.tolist() != 
df_target.columns.tolist() 80 | ): 81 | raise ValueError( 82 | f""" 83 | df and df_target must have the exact same columns. 84 | Instead, thes column names are, (df, df_target) = ({df.columns.tolist()}, {df_target.columns.tolist()}) 85 | """ 86 | ) 87 | 88 | # If they are, we can proceed forward: 89 | target_n_notnull_rows = df_target.notnull().sum() 90 | else: # number of notnull *rows* (i.e.: complete rows) in df_target 91 | target_n_notnull_rows = df_target.notnull().all(axis=1).sum() 92 | 93 | if any(df_n_notnull_rows > target_n_notnull_rows): 94 | raise ValueError( 95 | f""" 96 | The number of (notnull) rows in df MUST be smaller or equal to the number of rows in df_target. 97 | These were, (df_n_notnull_rows, target_n_notnull_rows) = ({df_n_notnull_rows}, {target_n_notnull_rows}) 98 | """ 99 | ) 100 | 101 | return pd.DataFrame( 102 | { 103 | "n": df_n_notnull_rows, 104 | "%": 100 * (df_n_notnull_rows / target_n_notnull_rows), 105 | } 106 | ).transpose() 107 | -------------------------------------------------------------------------------- /balance/testutil.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | # pyre-unsafe 7 | 8 | import io 9 | import re 10 | import sys 11 | 12 | import unittest 13 | from contextlib import contextmanager 14 | from typing import Any, Union 15 | 16 | import numpy as np 17 | import numpy.typing as npt 18 | import pandas as pd 19 | 20 | 21 | def _assert_frame_equal_lazy( 22 | x: pd.DataFrame, y: pd.DataFrame, lazy: bool = True 23 | ) -> None: 24 | """Wrapper around pd.testing.assert_frame_equal, which transforms the 25 | dataframes to ignore some errors. 
26 | 27 | Ignores order of columns 28 | 29 | Args: 30 | x (pd.DataFrame): DataFrame to compare 31 | y (pd.DataFrame): DataFrame to compare 32 | lazy (bool, optional): Should Ignores be applied. Defaults to True. 33 | 34 | Returns: 35 | None. 36 | """ 37 | if lazy: 38 | x = x.sort_index(axis=0).sort_index(axis=1) 39 | y = y.sort_index(axis=0).sort_index(axis=1) 40 | 41 | return pd.testing.assert_frame_equal(x, y) 42 | 43 | 44 | def _assert_index_equal_lazy(x: pd.Index, y: pd.Index, lazy: bool = True) -> None: 45 | """ 46 | Wrapper around pd.testing.assert_index_equal which transforms the 47 | dataindexs to ignore some errors. 48 | 49 | Ignores: 50 | - order of entries 51 | 52 | Args: 53 | x (pd.Index): Index to compare 54 | y (pd.Index): Index to compare 55 | lazy (bool, optional): Should Ignores be applied. Defaults to True. 56 | """ 57 | if lazy: 58 | x = x.sort_values() 59 | y = y.sort_values() 60 | 61 | return pd.testing.assert_index_equal(x, y) 62 | 63 | 64 | @contextmanager 65 | def _capture_output(): 66 | redirect_out, redirect_err = io.StringIO(), io.StringIO() 67 | original_out, original_err = sys.stdout, sys.stderr 68 | try: 69 | sys.stdout, sys.stderr = redirect_out, redirect_err 70 | yield sys.stdout, sys.stderr 71 | finally: 72 | sys.stdout, sys.stderr = original_out, original_err 73 | 74 | 75 | class BalanceTestCase(unittest.TestCase): 76 | # Some Warns 77 | def assertIfWarns(self, callable, *args, **kwargs) -> None: 78 | with self.assertLogs(level="NOTSET") as cm: 79 | callable(*args, **kwargs) 80 | self.assertTrue(len(cm.output) > 0, "No warning produced.") 81 | 82 | def assertNotWarns(self, callable, *args, **kwargs) -> None: 83 | output = None 84 | try: 85 | with self.assertLogs() as cm: 86 | callable(*args, **kwargs) 87 | output = cm 88 | except AssertionError: 89 | return 90 | raise AssertionError(f"Warning produced {output.output}.") 91 | 92 | def assertWarnsRegexp(self, regexp, callable, *args, **kwargs) -> None: 93 | with 
self.assertLogs(level="NOTSET") as cm: 94 | callable(*args, **kwargs) 95 | self.assertTrue( 96 | any((re.search(regexp, c) is not None) for c in cm.output), 97 | f"Warning {cm.output} does not match regex {regexp}.", 98 | ) 99 | 100 | def assertNotWarnsRegexp(self, regexp, callable, *args, **kwargs) -> None: 101 | with self.assertLogs(level="NOTSET") as cm: 102 | callable(*args, **kwargs) 103 | self.assertFalse( 104 | any((re.search(regexp, c) is not None) for c in cm.output), 105 | f"Warning {cm.output} matches regex {regexp}.", 106 | ) 107 | 108 | # Some Equal 109 | def assertEqual( 110 | self, 111 | first: Union[npt.NDArray, pd.DataFrame, pd.Index, pd.Series, Any], 112 | second: Union[npt.NDArray, pd.DataFrame, pd.Index, pd.Series, Any], 113 | msg: Any = ..., 114 | **kwargs, 115 | ) -> None: 116 | """ 117 | Check if first and second are equal. 118 | Uses np.testing.assert_array_equal for np.ndarray, 119 | _assert_frame_equal_lazy for pd.DataFrame, 120 | assert_series_equal for pd.DataFrame, 121 | _assert_index_equal_lazy for pd.Index, 122 | or unittest.TestCase.assertEqual otherwise. 123 | 124 | Args: 125 | first (Union[np.ndarray, pd.DataFrame, pd.Index, pd.Series]): first element to compare. 126 | second (Union[np.ndarray, pd.DataFrame, pd.Index, pd.Series]): second element to compare. 127 | msg (Any, optional): The error message on failure. 
128 | """ 129 | lazy: bool = kwargs.get("lazy", False) 130 | if isinstance(first, np.ndarray) or isinstance(second, np.ndarray): 131 | np.testing.assert_array_equal(first, second, **kwargs) 132 | elif isinstance(first, pd.DataFrame) or isinstance(second, pd.DataFrame): 133 | _assert_frame_equal_lazy( 134 | first, 135 | second, 136 | lazy, 137 | ) 138 | elif isinstance(first, pd.Series) or isinstance(second, pd.Series): 139 | pd.testing.assert_series_equal(first, second) 140 | elif isinstance(first, pd.Index) or isinstance(second, pd.Index): 141 | _assert_index_equal_lazy(first, second, lazy) 142 | else: 143 | super().assertEqual(first, second, msg=msg, **kwargs) 144 | 145 | # Some Prints 146 | def assertPrints(self, callable, *args, **kwargs) -> None: 147 | with _capture_output() as (out, err): 148 | callable(*args, **kwargs) 149 | out, err = out.getvalue(), err.getvalue() 150 | self.assertTrue((len(out) + len(err)) > 0, "No printed output.") 151 | 152 | def assertNotPrints(self, callable, *args, **kwargs) -> None: 153 | with _capture_output() as (out, err): 154 | callable(*args, **kwargs) 155 | out, err = out.getvalue(), err.getvalue() 156 | self.assertTrue( 157 | (len(out) + len(err)) == 0, 158 | f"Printed output is longer than 0: {(out, err)}.", 159 | ) 160 | 161 | def assertPrintsRegexp(self, regexp, callable, *args, **kwargs) -> None: 162 | with _capture_output() as (out, err): 163 | callable(*args, **kwargs) 164 | out, err = out.getvalue(), err.getvalue() 165 | self.assertTrue( 166 | any((re.search(regexp, o) is not None) for o in (out, err)), 167 | f"Printed output {(out, err)} does not match regex {regexp}.", 168 | ) 169 | 170 | def assertNotPrintsRegexp(self, regexp, callable, *args, **kwargs) -> None: 171 | with _capture_output() as (out, err): 172 | callable(*args, **kwargs) 173 | out, err = out.getvalue(), err.getvalue() 174 | self.assertFalse( 175 | any((re.search(regexp, o) is not None) for o in (out, err)), 176 | f"Printed output {(out, err)} matches 
regex {regexp}.", 177 | ) 178 | -------------------------------------------------------------------------------- /balance/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | # pyre-unsafe 7 | 8 | # LICENSE file in the root directory of this source tree. 9 | 10 | from pathlib import Path 11 | from typing import AnyStr, IO, Union 12 | 13 | FilePathOrBuffer = Union[str, Path, IO[AnyStr]] 14 | -------------------------------------------------------------------------------- /balance/weighting_methods/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /balance/weighting_methods/adjust_null.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | # pyre-unsafe 7 | 8 | from __future__ import absolute_import, division, print_function, unicode_literals 9 | 10 | import logging 11 | from typing import Dict, Union 12 | 13 | import pandas as pd 14 | 15 | logger: logging.Logger = logging.getLogger(__package__) 16 | 17 | 18 | def adjust_null( 19 | sample_df: pd.DataFrame, 20 | sample_weights: pd.Series, 21 | target_df: pd.DataFrame, 22 | target_weights: pd.Series, 23 | *args, 24 | **kwargs, 25 | ) -> Dict[str, Union[Dict[str, str], pd.Series]]: 26 | """Doesn't apply any adjustment to the data. 
Returns the design weights as they are. 27 | This may be useful when one needs the output of Sample.adjust() (i.e.: an adjusted object), 28 | but wishes to not run any model for it. 29 | 30 | Args: 31 | sample_df (pd.DataFrame): a dataframe representing the sample 32 | sample_weights (pd.Series): design weights for sample 33 | target_df (pd.DataFrame): a dataframe representing the target 34 | target_weights (pd.Series): design weights for target 35 | 36 | Returns: 37 | Dict[str, Union[Dict[str, str], pd.Series]]: Dict of weights (original sample weights) and model (with method = null_adjustment) 38 | """ 39 | 40 | return { 41 | "weight": sample_weights, 42 | "model": { 43 | "method": "null_adjustment", 44 | }, 45 | } 46 | -------------------------------------------------------------------------------- /balance/weighting_methods/poststratify.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | 6 | # pyre-unsafe 7 | 8 | from __future__ import absolute_import, division, print_function, unicode_literals 9 | 10 | import logging 11 | from typing import Dict, List, Optional, Union 12 | 13 | import pandas as pd 14 | 15 | from balance import adjustment as balance_adjustment, util as balance_util 16 | 17 | logger: logging.Logger = logging.getLogger(__package__) 18 | 19 | 20 | # TODO: Add tests for all arguments of function 21 | # TODO: Add argument for na_action 22 | def poststratify( 23 | sample_df: pd.DataFrame, 24 | sample_weights: pd.Series, 25 | target_df: pd.DataFrame, 26 | target_weights: pd.Series, 27 | variables: Optional[List[str]] = None, 28 | transformations: str = "default", 29 | transformations_drop: bool = True, 30 | *args, 31 | **kwargs, 32 | ) -> Dict[str, Union[pd.Series, Dict[str, str]]]: 33 | """Perform cell-based post-stratification. The output weights take into account 34 | the design weights and the post-stratification weights. 35 | Reference: https://docs.wfp.org/api/documents/WFP-0000121326/download/ 36 | 37 | Args: 38 | sample_df (pd.DataFrame): a dataframe representing the sample 39 | sample_weights (pd.Series): design weights for sample 40 | target_df (pd.DataFrame): a dataframe representing the target 41 | target_weights (pd.Series): design weights for target 42 | variables (Optional[List[str]], optional): list of variables to include in the model. 43 | If None all joint variables of sample_df and target_df are used 44 | transformations (str, optional): what transformations to apply to data before fitting the model. 45 | Default is "default" (see apply_transformations function) 46 | transformations_drop (bool, optional): whether the function should drop non-transformed variables. 47 | Default is True. 
48 | Raises: 49 | ValueError: _description_ 50 | ValueError: _description_ 51 | 52 | Returns: 53 | Dict[str, Union[pd.Series, Dict[str, str]]]: 54 | weight (pd.Series): final weights (sum up to target's sum of weights) 55 | model (dict): method of adjustment 56 | 57 | Dict shape: 58 | { 59 | "weight": w, 60 | "model": {"method": "poststratify"}, 61 | } 62 | """ 63 | balance_util._check_weighting_methods_input(sample_df, sample_weights, "sample") 64 | balance_util._check_weighting_methods_input(target_df, target_weights, "target") 65 | 66 | if ("weight" in sample_df.columns.values) or ("weight" in target_df.columns.values): 67 | raise ValueError( 68 | "weight can't be a name of a column in sample or target when applying poststratify" 69 | ) 70 | 71 | variables = balance_util.choose_variables(sample_df, target_df, variables=variables) 72 | logger.debug(f"Join variables for sample and target: {variables}") 73 | 74 | sample_df = sample_df.loc[:, variables] 75 | target_df = target_df.loc[:, variables] 76 | 77 | sample_df, target_df = balance_adjustment.apply_transformations( 78 | (sample_df, target_df), 79 | transformations=transformations, 80 | drop=transformations_drop, 81 | ) 82 | variables = list(sample_df.columns) 83 | logger.debug(f"Final variables in the model after transformations: {variables}") 84 | 85 | target_df = target_df.assign(weight=target_weights) 86 | target_cell_props = target_df.groupby(list(variables))["weight"].sum() 87 | 88 | sample_df = sample_df.assign(design_weight=sample_weights) 89 | sample_cell_props = sample_df.groupby(list(variables))["design_weight"].sum() 90 | 91 | combined = pd.merge( 92 | target_cell_props, 93 | sample_cell_props, 94 | right_index=True, 95 | left_index=True, 96 | how="outer", 97 | ) 98 | 99 | # check that all combinations of cells in sample_df are also in target_df 100 | if any(combined["weight"].isna()): 101 | raise ValueError("all combinations of cells in sample_df must be in target_df") 102 | 103 | 
combined["weight"] = combined["weight"] / combined["design_weight"] 104 | sample_df = sample_df.join(combined["weight"], on=variables) 105 | w = sample_df.weight * sample_df.design_weight 106 | 107 | return { 108 | "weight": w, 109 | "model": {"method": "poststratify"}, 110 | } 111 | -------------------------------------------------------------------------------- /scripts/make_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Meta Platforms, Inc. and affiliates. 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # Exit if any error occurs 8 | set -e 9 | 10 | usage() { 11 | echo "Usage: $0 [-n] [-o]" 12 | echo "" 13 | echo " -n Enable openssl legacy providers from Node (for Docusaurus builds)" 14 | echo " -o Only Docusaurus (skip Sphinx, tutorials). Useful when just make change to Docusaurus settings." 15 | echo "Build balance documentation. Must be executed from root of balance repository." 
16 | echo "" 17 | exit 1 18 | } 19 | 20 | NODE_LEGACY_SSL=false 21 | ONLY_DOCUSAURUS=false 22 | 23 | autosync_doc() { 24 | sed -i -e '/<\!--AUTOGENERATED START-->/r '"$1" \ 25 | -e 's/<\!--AUTOGENERATED END-->/&/' "$2" || exit 26 | } 27 | 28 | cleanup_doc() { 29 | sed -i '/<\!--AUTOGENERATED START-->/,/<\!--AUTOGENERATED END-->/{/^<\!--AUTOGENERATED START-->/!{/<\!--AUTOGENERATED END-->/!d}}' "$1" || exit 30 | } 31 | 32 | while getopts 'hno' flag; do 33 | case "${flag}" in 34 | h) 35 | usage 36 | ;; 37 | n) 38 | NODE_LEGACY_SSL=true 39 | ;; 40 | o) 41 | ONLY_DOCUSAURUS=true 42 | ;; 43 | *) 44 | usage 45 | ;; 46 | esac 47 | done 48 | 49 | 50 | echo "--------------------------------------------" 51 | echo "Pre-process Docusaurus docs" 52 | echo "--------------------------------------------" 53 | # For idempotency/safety, remove any content between the autogen delimiters 54 | cleanup_doc "website/docs/docs/overview.md" 55 | cleanup_doc "website/docs/docs/contributing.md" 56 | 57 | # Copy the contents of README.md -> website/docs/docs/overview.md 58 | autosync_doc "README.md" "website/docs/docs/overview.md" 59 | autosync_doc "CONTRIBUTING.md" "website/docs/docs/contributing.md" 60 | 61 | if [[ $ONLY_DOCUSAURUS == false ]]; then 62 | echo "-----------------------------------" 63 | echo "Generating API reference via Sphinx" 64 | echo "-----------------------------------" 65 | cd sphinx || exit 66 | make html 67 | cd .. 
|| exit 68 | 69 | echo "--------------------------------------------" 70 | echo "Moving Sphinx artifacts to Docusaurus" 71 | echo "--------------------------------------------" 72 | mkdir -p "website/static/api_reference/" 73 | cp -r sphinx/_build/* website/static/api_reference/ || exit 74 | 75 | echo "-----------------------------------" 76 | echo "Building tutorial HTML" 77 | echo "-----------------------------------" 78 | jupyter nbconvert tutorials/balance_quickstart.ipynb --execute --to html \ 79 | --output-dir website/static/html/tutorials 80 | jupyter nbconvert tutorials/balance_quickstart_cbps.ipynb --execute --to html \ 81 | --output-dir website/static/html/tutorials 82 | jupyter nbconvert tutorials/balance_quickstart_rake.ipynb --execute --to html \ 83 | --output-dir website/static/html/tutorials 84 | jupyter nbconvert tutorials/balance_transformations_and_formulas.ipynb --execute --to html \ 85 | --output-dir website/static/html/tutorials 86 | jupyter nbconvert tutorials/comparing_cbps_in_r_vs_python_using_sim_data.ipynb --execute --to html \ 87 | --output-dir website/static/html/tutorials 88 | fi 89 | 90 | echo "-----------------------------------" 91 | echo "Getting Docusaurus deps" 92 | echo "-----------------------------------" 93 | cd website || exit 94 | yarn 95 | 96 | echo "-----------------------------------" 97 | echo "Building static Docusaurus site" 98 | echo "-----------------------------------" 99 | if [[ $NODE_LEGACY_SSL == true ]]; then 100 | NODE_OPTIONS=--openssl-legacy-provider yarn build 101 | else 102 | yarn build 103 | fi 104 | cd .. 
|| exit 105 | 106 | 107 | echo "--------------------------------------------" 108 | echo "Clean up" 109 | echo "--------------------------------------------" 110 | cleanup_doc "website/docs/docs/overview.md" 111 | cleanup_doc "website/docs/docs/contributing.md" 112 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | from setuptools import find_packages, setup 7 | 8 | """ 9 | Core library deps 10 | 11 | Version requirements 12 | * pandas<=2.0.3: Newer versions lead to "AttributeError: module 'pandas.core.arrays.numpy_' has no attribute 'PandasArray'" 13 | * scipy<=1.10.1 and scikit-learn<=1.2.2: Necessary for numerical tests to pass. May be possible to relax these without major issues. 14 | """ 15 | REQUIRES = [ 16 | "numpy", 17 | "pandas<=2.0.3", 18 | "ipython", 19 | "scipy<=1.10.1", 20 | "patsy", 21 | "seaborn", 22 | "plotly", 23 | "matplotlib", 24 | "statsmodels", 25 | "scikit-learn<=1.2.2", 26 | "ipfn", 27 | "session-info", 28 | ] 29 | 30 | # Development deps (e.g. pytest, builds) 31 | # TODO[scubasteve]: Add dev requirements 32 | DEV_REQUIRES = [ 33 | "setuptools_scm", 34 | "wheel", 35 | "pytest", 36 | "sphinx", 37 | "notebook", 38 | "nbconvert", 39 | ] 40 | 41 | DESCRIPTION = ( 42 | "balance is a Python package offering a simple workflow and methods for " 43 | "dealing with biased data samples when looking to infer from them to " 44 | "some target population of interest." 
45 | ) 46 | 47 | 48 | def setup_package() -> None: 49 | """Used for building/installing the balance package.""" 50 | with open("README.md", "r", encoding="utf-8") as fh: 51 | long_description = fh.read() 52 | 53 | setup( 54 | name="balance", 55 | description=DESCRIPTION, 56 | author="Facebook, Inc.", 57 | license="MIT", 58 | url="https://github.com/facebookresearch/balance", 59 | keywords=[""], 60 | long_description=long_description, 61 | long_description_content_type="text/markdown", 62 | python_requires=">=3.9", 63 | install_requires=REQUIRES, 64 | packages=find_packages(include=["balance*"]), 65 | # Include all csv files 66 | package_data={"": ["*.csv"]}, 67 | extras_require={ 68 | "dev": DEV_REQUIRES, 69 | }, 70 | use_scm_version={ 71 | "write_to": "version.py", 72 | }, 73 | setup_requires=["setuptools_scm"], 74 | classifiers=[ 75 | "Development Status :: 3 - Alpha", 76 | "Intended Audience :: Science/Research", 77 | "Topic :: Scientific/Engineering", 78 | "Programming Language :: Python :: 3", 79 | "Programming Language :: Python :: 3.9", 80 | "Programming Language :: Python :: 3.10", 81 | "Programming Language :: Python :: 3.11", 82 | "License :: OSI Approved :: MIT License", 83 | ], 84 | ) 85 | 86 | 87 | if __name__ == "__main__": 88 | setup_package() 89 | -------------------------------------------------------------------------------- /sphinx/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /sphinx/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * 4 | * This software may be used and distributed according to the terms of the 5 | * GNU General Public License version 2. 6 | */ 7 | 8 | /**/ 9 | -------------------------------------------------------------------------------- /sphinx/_templates/autosummary/class.rst: -------------------------------------------------------------------------------- 1 | {{ fullname | escape | underline}} 2 | 3 | .. currentmodule:: {{ module }} 4 | 5 | .. autoclass:: {{ objname }} 6 | :members: 7 | :show-inheritance: 8 | :inherited-members: 9 | :special-members: __call__, __add__, __mul__ 10 | 11 | {% block methods %} 12 | {% if methods %} 13 | .. rubric:: {{ _('Methods') }} 14 | 15 | .. autosummary:: 16 | :nosignatures: 17 | {% for item in methods %} 18 | {%- if not item.startswith('_') %} 19 | ~{{ name }}.{{ item }} 20 | {%- endif -%} 21 | {%- endfor %} 22 | {% endif %} 23 | {% endblock %} 24 | 25 | {% block attributes %} 26 | {% if attributes %} 27 | .. rubric:: {{ _('Attributes') }} 28 | 29 | .. 
autosummary:: 30 | {% for item in attributes %} 31 | ~{{ name }}.{{ item }} 32 | {%- endfor %} 33 | {% endif %} 34 | {% endblock %} 35 | -------------------------------------------------------------------------------- /sphinx/_templates/autosummary/module.rst: -------------------------------------------------------------------------------- 1 | {{ fullname | escape | underline}} 2 | 3 | .. automodule:: {{ fullname }} 4 | :members: 5 | 6 | {% block modules %} 7 | {% if modules %} 8 | 9 | **Sub-Modules** 10 | 11 | .. autosummary:: 12 | :toctree: 13 | :recursive: 14 | {% for item in modules %} 15 | {{ item }} 16 | {%- endfor %} 17 | {% endif %} 18 | {% endblock %} 19 | -------------------------------------------------------------------------------- /sphinx/balance.rst: -------------------------------------------------------------------------------- 1 | balance 2 | ======= 3 | 4 | .. automodule:: balance 5 | :members: 6 | 7 | 8 | 9 | 10 | .. autosummary:: 11 | :toctree: 12 | :recursive: 13 | 14 | balance.adjustment 15 | balance.balancedf_class 16 | balance.cli 17 | balance.sample_class 18 | balance.stats_and_plots 19 | balance.testutil 20 | balance.typing 21 | balance.util 22 | balance.weighting_methods 23 | 24 | -------------------------------------------------------------------------------- /sphinx/conf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | import os 7 | import sys 8 | 9 | # Configuration file for the Sphinx documentation builder. 
10 | # 11 | # For the full list of built-in configuration values, see the documentation: 12 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 13 | 14 | # -- Project information ----------------------------------------------------- 15 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 16 | 17 | project = "balance" 18 | copyright = "Copyright © 2022 Meta Platforms, Inc." 19 | author = "Meta Platforms" 20 | 21 | # If extensions (or modules to document with autodoc) are in another directory, 22 | # add these directories to sys.path here. If the directory is relative to the 23 | # documentation root, use os.path.abspath to make it absolute, like shown here. 24 | sys.path.insert(0, os.path.abspath("..")) 25 | 26 | # -- General configuration --------------------------------------------------- 27 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 28 | 29 | extensions = [ 30 | "sphinx.ext.autodoc", # HTML generation from Py docstrings 31 | "sphinx.ext.autosummary", # Generate Py docs recursively 32 | "sphinx.ext.napoleon", # Support Google style docstrings 33 | "sphinx.ext.doctest", 34 | "sphinx.ext.todo", 35 | "sphinx.ext.coverage", 36 | "sphinx.ext.mathjax", 37 | "sphinx.ext.ifconfig", 38 | "sphinx.ext.viewcode", 39 | ] 40 | autosummary_generate = True # Turn on sphinx.ext.autosummary 41 | templates_path = ["_templates"] 42 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 43 | 44 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 45 | html_show_sphinx = False 46 | 47 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 48 | html_show_copyright = False 49 | 50 | # The name of the Pygments (syntax highlighting) style to use. 
51 | pygments_style = "sphinx" 52 | 53 | # -- Options for HTML output ------------------------------------------------- 54 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 55 | 56 | html_theme = "basic" 57 | html_static_path = ["_static"] 58 | -------------------------------------------------------------------------------- /sphinx/index.rst: -------------------------------------------------------------------------------- 1 | .. balance documentation master file, created by 2 | sphinx-quickstart on Wed Nov 9 13:42:48 2022. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | API Reference 7 | =============================================== 8 | 9 | .. toctree:: 10 | 11 | balance 12 | 13 | .. autosummary:: 14 | :toctree: 15 | :recursive: 16 | -------------------------------------------------------------------------------- /sphinx/make.bat: -------------------------------------------------------------------------------- 1 | @REM Copyright (c) Meta Platforms, Inc. and affiliates. 2 | @REM 3 | @REM This software may be used and distributed according to the terms of the 4 | @REM GNU General Public License version 2. 5 | 6 | @ECHO OFF 7 | 8 | pushd %~dp0 9 | 10 | REM Command file for Sphinx documentation 11 | 12 | if "%SPHINXBUILD%" == "" ( 13 | set SPHINXBUILD=sphinx-build 14 | ) 15 | set SOURCEDIR=. 16 | set BUILDDIR=_build 17 | 18 | %SPHINXBUILD% >NUL 2>NUL 19 | if errorlevel 9009 ( 20 | echo. 21 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 22 | echo.installed, then set the SPHINXBUILD environment variable to point 23 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 24 | echo.may add the Sphinx directory to PATH. 25 | echo. 
26 | echo.If you don't have Sphinx installed, grab it from 27 | echo.https://www.sphinx-doc.org/ 28 | exit /b 1 29 | ) 30 | 31 | if "%1" == "" goto help 32 | 33 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 34 | goto end 35 | 36 | :help 37 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 38 | 39 | :end 40 | popd 41 | -------------------------------------------------------------------------------- /tests/test_adjust_null.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | # pyre-unsafe 7 | 8 | from __future__ import absolute_import, division, print_function, unicode_literals 9 | 10 | import balance.testutil 11 | import pandas as pd 12 | 13 | from balance.sample_class import Sample 14 | from balance.weighting_methods import adjust_null as balance_adjust_null 15 | 16 | 17 | sample = Sample.from_frame( 18 | df=pd.DataFrame( 19 | { 20 | "a": (1, 2, 3, 1), 21 | "b": (-42, 8, 2, -42), 22 | "o": (7, 8, 9, 10), 23 | "c": ("x", "y", "z", "x"), 24 | "id": (1, 2, 3, 4), 25 | "w": (0.5, 2, 1, 1), 26 | } 27 | ), 28 | id_column="id", 29 | weight_column="w", 30 | outcome_columns="o", 31 | ) 32 | 33 | target = Sample.from_frame( 34 | pd.DataFrame( 35 | { 36 | "a": (1, 2, 3), 37 | "b": (-42, 8, 2), 38 | "c": ("x", "y", "z"), 39 | "id": (1, 2, 3), 40 | "w": (2, 0.5, 1), 41 | } 42 | ), 43 | id_column="id", 44 | weight_column="w", 45 | ) 46 | 47 | 48 | class TestAdjustmentNull( 49 | balance.testutil.BalanceTestCase, 50 | ): 51 | def test_adjust_null(self): 52 | res = balance_adjust_null.adjust_null( 53 | pd.DataFrame({"a": [1, 2, 3]}), 54 | pd.Series([4, 5, 6]), 55 | pd.DataFrame({"a": [7, 8, 9]}), 56 | pd.Series([10, 11, 12]), 57 | ) 58 | self.assertEqual(res["weight"], pd.Series([4, 5, 6])) 59 | 
self.assertEqual(res["model"]["method"], "null_adjustment") 60 | 61 | result = sample.adjust(target, method="null") 62 | self.assertEqual(sample.weights().df, result.weights().df) 63 | -------------------------------------------------------------------------------- /tests/test_datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | # pyre-unsafe 7 | 8 | import balance.testutil 9 | import numpy as np 10 | 11 | from balance.datasets import load_data 12 | 13 | 14 | class TestDatasets( 15 | balance.testutil.BalanceTestCase, 16 | ): 17 | def test_load_data(self): 18 | target_df, sample_df = load_data() 19 | 20 | self.assertEqual(sample_df.shape, (1000, 5)) 21 | self.assertEqual(target_df.shape, (10000, 5)) 22 | 23 | self.assertEqual( 24 | target_df.columns.to_numpy().tolist(), 25 | ["id", "gender", "age_group", "income", "happiness"], 26 | ) 27 | self.assertEqual( 28 | sample_df.columns.to_numpy().tolist(), 29 | ["id", "gender", "age_group", "income", "happiness"], 30 | ) 31 | 32 | o = sample_df.head().round(2).to_dict() 33 | e = { 34 | "id": {0: "0", 1: "1", 2: "2", 3: "3", 4: "4"}, 35 | "gender": {0: "Male", 1: "Female", 2: "Male", 3: np.nan, 4: np.nan}, 36 | "age_group": {0: "25-34", 1: "18-24", 2: "18-24", 3: "18-24", 4: "18-24"}, 37 | "income": {0: 6.43, 1: 9.94, 2: 2.67, 3: 10.55, 4: 2.69}, 38 | "happiness": {0: 26.04, 1: 66.89, 2: 37.09, 3: 49.39, 4: 72.3}, 39 | } 40 | self.assertEqual(o.__str__(), e.__str__()) 41 | # NOTE: using .__str__() since doing o==e will give False 42 | 43 | o = target_df.head().round(2).to_dict() 44 | e = { 45 | "id": {0: "100000", 1: "100001", 2: "100002", 3: "100003", 4: "100004"}, 46 | "gender": {0: "Male", 1: "Male", 2: "Male", 3: np.nan, 4: np.nan}, 47 | "age_group": {0: "45+", 1: "45+", 2: 
"35-44", 3: "45+", 4: "25-34"}, 48 | "income": {0: 10.18, 1: 6.04, 2: 5.23, 3: 5.75, 4: 4.84}, 49 | "happiness": {0: 61.71, 1: 79.12, 2: 44.21, 3: 83.99, 4: 49.34}, 50 | } 51 | self.assertEqual(o.__str__(), e.__str__()) 52 | 53 | def test_load_data_cbps(self): 54 | target_df, sample_df = load_data("sim_data_cbps") 55 | 56 | self.assertEqual(sample_df.shape, (246, 7)) 57 | self.assertEqual(target_df.shape, (254, 7)) 58 | 59 | self.assertEqual( 60 | target_df.columns.to_numpy().tolist(), 61 | ["X1", "X2", "X3", "X4", "cbps_weights", "y", "id"], 62 | ) 63 | self.assertEqual( 64 | sample_df.columns.to_numpy().tolist(), 65 | target_df.columns.to_numpy().tolist(), 66 | ) 67 | 68 | o = sample_df.head().round(2).to_dict() 69 | e = { 70 | "X1": {0: 1.07, 2: 0.69, 4: 0.5, 5: 1.52, 6: 1.03}, 71 | "X2": {0: 10.32, 2: 10.65, 4: 9.59, 5: 10.03, 6: 9.79}, 72 | "X3": {0: 0.21, 2: 0.22, 4: 0.23, 5: 0.33, 6: 0.22}, 73 | "X4": {0: 463.28, 2: 424.29, 4: 472.85, 5: 438.38, 6: 436.39}, 74 | "cbps_weights": {0: 0.01, 2: 0.01, 4: 0.01, 5: 0.0, 6: 0.0}, 75 | "y": {0: 227.53, 2: 196.89, 4: 191.3, 5: 280.45, 6: 227.07}, 76 | "id": {0: 1, 2: 3, 4: 5, 5: 6, 6: 7}, 77 | } 78 | self.assertEqual(o.__str__(), e.__str__()) 79 | # NOTE: using .__str__() since doing o==e will give False 80 | 81 | o = target_df.head().round(2).to_dict() 82 | e = { 83 | "X1": {1: 0.72, 3: 0.35, 11: 0.69, 12: 0.78, 13: 0.82}, 84 | "X2": {1: 9.91, 3: 9.91, 11: 10.73, 12: 9.56, 13: 9.8}, 85 | "X3": {1: 0.19, 3: 0.1, 11: 0.21, 12: 0.18, 13: 0.21}, 86 | "X4": {1: 383.76, 3: 399.37, 11: 398.31, 12: 370.18, 13: 434.45}, 87 | "cbps_weights": {1: 0.0, 3: 0.0, 11: 0.0, 12: 0.0, 13: 0.0}, 88 | "y": {1: 199.82, 3: 174.69, 11: 189.58, 12: 208.18, 13: 214.28}, 89 | "id": {1: 2, 3: 4, 11: 12, 12: 13, 13: 14}, 90 | } 91 | self.assertEqual(o.__str__(), e.__str__()) 92 | -------------------------------------------------------------------------------- /tests/test_logging.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | # pyre-unsafe 7 | 8 | 9 | from __future__ import absolute_import, division, print_function, unicode_literals 10 | 11 | import logging 12 | 13 | import balance 14 | import balance.testutil 15 | 16 | 17 | class TestBalanceSetWarnings(balance.testutil.BalanceTestCase): 18 | def test_balance_set_warnings(self): 19 | logger = logging.getLogger(__package__) 20 | 21 | balance.set_warnings("WARNING") 22 | self.assertNotWarns(logger.debug, "test_message") 23 | 24 | balance.set_warnings("DEBUG") 25 | self.assertWarnsRegexp("test_message", logger.debug, "test_message") 26 | self.assertWarnsRegexp("test_message", logger.warning, "test_message") 27 | 28 | balance.set_warnings("WARNING") 29 | self.assertWarnsRegexp("test_message", logger.warning, "test_message") 30 | self.assertNotWarns(logger.debug, "test_message") 31 | -------------------------------------------------------------------------------- /tests/test_poststratify.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | 6 | # pyre-unsafe 7 | 8 | from __future__ import absolute_import, division, print_function, unicode_literals 9 | 10 | import balance.testutil 11 | 12 | import numpy as np 13 | import pandas as pd 14 | from balance.sample_class import Sample 15 | from balance.weighting_methods.poststratify import poststratify 16 | 17 | 18 | class Testpoststratify( 19 | balance.testutil.BalanceTestCase, 20 | ): 21 | def test_poststratify(self): 22 | s = pd.DataFrame( 23 | { 24 | "a": (0, 1, 0, 1), 25 | "c": ("a", "a", "b", "b"), 26 | }, 27 | ) 28 | s_weights = pd.Series([4, 2, 1, 1]) 29 | t = s 30 | t_weights = pd.Series([4, 2, 2, 8]) 31 | result = poststratify( 32 | sample_df=s, sample_weights=s_weights, target_df=t, target_weights=t_weights 33 | )["weight"] 34 | self.assertEqual(result, t_weights.astype("float64")) 35 | 36 | # same example when dataframe of elements are all related to weights of one 37 | s = pd.DataFrame( 38 | { 39 | "a": (0, 0, 0, 0, 1, 1, 0, 1), 40 | "c": ("a", "a", "a", "a", "a", "a", "b", "b"), 41 | }, 42 | ) 43 | s_weights = pd.Series([1, 1, 1, 1, 1, 1, 1, 1]) 44 | result = poststratify( 45 | sample_df=s, sample_weights=s_weights, target_df=t, target_weights=t_weights 46 | )["weight"] 47 | self.assertEqual(result, pd.Series((1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 8.0))) 48 | 49 | # same example with normalized weights 50 | s = pd.DataFrame( 51 | { 52 | "a": (0, 1, 0, 1), 53 | "c": ("a", "a", "b", "b"), 54 | }, 55 | ) 56 | s_weights = pd.Series([1 / 2, 1 / 4, 1 / 8, 1 / 8]) 57 | result = poststratify( 58 | sample_df=s, sample_weights=s_weights, target_df=t, target_weights=t_weights 59 | )["weight"] 60 | self.assertEqual(result, t_weights.astype("float64")) 61 | 62 | # test through adjustment 63 | # TODO: test the previous example through adjustment as well 64 | sample = Sample.from_frame( 65 | df=pd.DataFrame( 66 | { 67 | "a": (1, 2, 3, 1), 68 | "b": (-42, 8, 2, -42), 69 | "o": (7, 8, 9, 10), 70 | "c": ("x", "y", "z", "x"), 71 | "id": (1, 2, 3, 4), 72 | "w": 
(0.5, 2, 1, 1), 73 | } 74 | ), 75 | id_column="id", 76 | weight_column="w", 77 | outcome_columns="o", 78 | ) 79 | target = Sample.from_frame( 80 | pd.DataFrame( 81 | { 82 | "a": (1, 2, 3), 83 | "b": (-42, 8, 2), 84 | "c": ("x", "y", "z"), 85 | "id": (1, 2, 3), 86 | "w": (2, 0.5, 1), 87 | } 88 | ), 89 | id_column="id", 90 | weight_column="w", 91 | ) 92 | result = sample.adjust(target, method="poststratify", transformations=None) 93 | expected = pd.Series( 94 | ( 95 | (2 / 1.5 * 0.5), 96 | (0.5 / 2 * 2), 97 | (1 / 1 * 1), 98 | (2 / 1.5 * 1), 99 | ) 100 | ) 101 | self.assertEqual(expected, result.weights().df.iloc[:, 0].values) 102 | 103 | def test_poststratify_variables_arg(self): 104 | s = pd.DataFrame( 105 | { 106 | "a": (0, 1, 0, 1), 107 | "c": ("a", "a", "b", "b"), 108 | }, 109 | ) 110 | s_weights = pd.Series([4, 2, 2, 3]) 111 | t = s 112 | t_weights = pd.Series([4, 2, 2, 8]) 113 | result = poststratify( 114 | sample_df=s, 115 | sample_weights=s_weights, 116 | target_df=t, 117 | target_weights=t_weights, 118 | variables=["a"], 119 | )["weight"] 120 | self.assertEqual(result, pd.Series([4.0, 4.0, 2.0, 6.0])) 121 | 122 | def test_poststratify_transformations(self): 123 | # for numeric 124 | size = 10000 125 | s = pd.DataFrame({"age": np.random.uniform(0, 1, size)}) 126 | tmp = int(size * 0.2) 127 | t = pd.DataFrame( 128 | { 129 | "age": np.concatenate( 130 | ( 131 | np.random.uniform(0, 0.4, tmp), 132 | np.random.uniform(0.4, 1, size - tmp), 133 | ) 134 | ) 135 | } 136 | ) 137 | result = poststratify( 138 | sample_df=s, 139 | sample_weights=pd.Series([1] * size), 140 | target_df=t, 141 | target_weights=pd.Series([1] * size), 142 | )["weight"] 143 | 144 | # age>0.4 has 4 times as many people than age <0.4 in the target 145 | # Check that the weights come out as 0.2 and 0.8 146 | eps = 0.05 147 | self.assertTrue(abs(result[s.age < 0.4].sum() / size - 0.2) < eps) 148 | self.assertTrue(abs(result[s.age >= 0.4].sum() / size - 0.8) < eps) 149 | 150 | # for strings 151 | 
size = 10000 152 | s = pd.DataFrame( 153 | {"x": np.random.choice(("a", "b", "c"), size, p=(0.95, 0.035, 0.015))} 154 | ) 155 | t = pd.DataFrame( 156 | {"x": np.random.choice(("a", "b", "c"), size, p=(0.95, 0.015, 0.035))} 157 | ) 158 | result = poststratify( 159 | sample_df=s, 160 | sample_weights=pd.Series([1] * size), 161 | target_df=t, 162 | target_weights=pd.Series([1] * size), 163 | )["weight"] 164 | 165 | # Output weights should ignore the difference between values 'b' and 'c' 166 | # since these are combined in default transformations (into '_lumped_other'). 167 | # Hence their frequency would be as in sample 168 | eps = 0.05 169 | self.assertTrue(abs(result[s.x == "a"].sum() / size - 0.95) < eps) 170 | self.assertTrue(abs(result[s.x == "b"].sum() / size - 0.035) < eps) 171 | self.assertTrue(abs(result[s.x == "c"].sum() / size - 0.015) < eps) 172 | 173 | def test_poststratify_exceptions(self): 174 | # column with name weight 175 | s = pd.DataFrame( 176 | { 177 | "weight": (0, 1, 0, 1), 178 | "c": ("a", "a", "b", "b"), 179 | }, 180 | ) 181 | s_weights = pd.Series([4, 2, 1, 1]) 182 | t = pd.DataFrame( 183 | { 184 | "a": (0, 1, 0, 1), 185 | "c": ("a", "a", "b", "b"), 186 | }, 187 | ) 188 | t_weights = pd.Series([4, 2, 2, 8]) 189 | with self.assertRaisesRegex( 190 | ValueError, 191 | "weight can't be a name of a column in sample or target when applying poststratify", 192 | ): 193 | poststratify(s, s_weights, t, t_weights) 194 | with self.assertRaisesRegex( 195 | ValueError, 196 | "weight can't be a name of a column in sample or target when applying poststratify", 197 | ): 198 | poststratify(t, t_weights, s, s_weights) 199 | 200 | # not all sample cells are in target 201 | s = pd.DataFrame( 202 | { 203 | "a": ("x", "y"), 204 | "b": (0, 1), 205 | }, 206 | ) 207 | s_weights = pd.Series([1] * 2) 208 | t = pd.DataFrame( 209 | { 210 | "a": ("x", "x", "y"), 211 | "b": (0, 1, 0), 212 | }, 213 | ) 214 | t_weights = pd.Series([2] * 3) 215 | with self.assertRaisesRegex( 
216 | ValueError, "all combinations of cells in sample_df must be in target_df" 217 | ): 218 | poststratify(s, s_weights, t, t_weights) 219 | -------------------------------------------------------------------------------- /tests/test_testutil.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | # pyre-unsafe 7 | 8 | import logging 9 | import sys 10 | 11 | import balance.testutil 12 | 13 | import numpy as np 14 | import pandas as pd 15 | 16 | 17 | class TestTestUtil( 18 | balance.testutil.BalanceTestCase, 19 | ): 20 | def test_testutil(self): 21 | # _assert_frame_equal_lazy 22 | self.assertRaises( 23 | AssertionError, 24 | balance.testutil._assert_frame_equal_lazy, 25 | pd.DataFrame({"a": (1, 2, 3)}), 26 | pd.DataFrame({"a": (1, 2, 4)}), 27 | ) 28 | 29 | self.assertRaises( 30 | AssertionError, 31 | balance.testutil._assert_frame_equal_lazy, 32 | pd.DataFrame({"a": (1, 2, 3), "b": (1, 2, 3)}), 33 | pd.DataFrame({"a": (1, 2, 4), "c": (1, 2, 3)}), 34 | ) 35 | 36 | a = pd.DataFrame({"a": (1, 2, 3), "b": (4, 5, 6)}) 37 | b = pd.DataFrame({"a": (1, 2, 3), "b": (4, 5, 6)}, columns=("b", "a")) 38 | 39 | # Doesn't raise an error 40 | balance.testutil._assert_frame_equal_lazy(a, b) 41 | 42 | self.assertRaises( 43 | AssertionError, balance.testutil._assert_frame_equal_lazy, a, b, False 44 | ) 45 | 46 | # _assert_index_equal_lazy 47 | self.assertRaises( 48 | AssertionError, 49 | balance.testutil._assert_index_equal_lazy, 50 | pd.Index([1, 2, 3]), 51 | pd.Index([1, 2, 4]), 52 | ) 53 | 54 | a = pd.Index([1, 2, 3]) 55 | b = pd.Index([1, 3, 2]) 56 | # Doesn't raise an error 57 | balance.testutil._assert_index_equal_lazy(a, b) 58 | self.assertRaises( 59 | AssertionError, balance.testutil._assert_index_equal_lazy, a, b, False 60 | ) 61 | 62 | 63 | 
class TestTestUtil_BalanceTestCase_Equal( 64 | balance.testutil.BalanceTestCase, 65 | ): 66 | def test_additional_equality_tests_mixin(self): 67 | # np.array 68 | self.assertRaises(AssertionError, self.assertEqual, 1, 2) 69 | self.assertRaises( 70 | AssertionError, self.assertEqual, np.array((1, 2)), np.array((2, 1)) 71 | ) 72 | 73 | # Does not raise 74 | self.assertEqual(1, 1) 75 | self.assertEqual(np.array((1, 2)), np.array((1, 2))) 76 | 77 | # pd.DataFrames 78 | # The default is for non-lazy testing of pandas DataFrames 79 | a = pd.DataFrame({"a": (1, 2, 3), "b": (4, 5, 6)}) 80 | b = pd.DataFrame({"a": (1, 2, 3), "b": (4, 5, 6)}, columns=("b", "a")) 81 | 82 | # Does raise an error by default or if lazy=False 83 | self.assertRaises(AssertionError, self.assertEqual, a, b) 84 | self.assertRaises(AssertionError, self.assertEqual, a, b, lazy=False) 85 | 86 | # Doesn't raise an error 87 | self.assertEqual(a, b, lazy=True) 88 | 89 | # pd.Series 90 | self.assertEqual(pd.Series([1, 2]), pd.Series([1, 2])) 91 | self.assertRaises( 92 | AssertionError, self.assertEqual, pd.Series([1, 2]), pd.Series([2, 1]) 93 | ) 94 | 95 | # Indices 96 | self.assertEqual(pd.Index((1, 2)), pd.Index((1, 2))) 97 | self.assertRaises( 98 | AssertionError, self.assertEqual, pd.Index((1, 2)), pd.Index((2, 1)) 99 | ) 100 | self.assertRaises( 101 | AssertionError, 102 | self.assertEqual, 103 | pd.Index((1, 2)), 104 | pd.Index((2, 1)), 105 | lazy=False, 106 | ) 107 | self.assertEqual(pd.Index((1, 2)), pd.Index((2, 1)), lazy=True) 108 | 109 | # Other types 110 | self.assertEqual("a", "a") 111 | self.assertRaises(AssertionError, self.assertEqual, "a", "b") 112 | 113 | 114 | class TestTestUtil_BalanceTestCase_Warns( 115 | balance.testutil.BalanceTestCase, 116 | ): 117 | def test_unit_test_warning_mixin(self): 118 | logger = logging.getLogger(__package__) 119 | 120 | self.assertIfWarns(lambda: logger.warning("test")) 121 | self.assertNotWarns(lambda: "x") 122 | 123 | self.assertWarnsRegexp("abc", lambda: 
logger.warning("abcde")) 124 | self.assertRaises( 125 | AssertionError, 126 | self.assertWarnsRegexp, 127 | "abcdef", 128 | lambda: logger.warning("abcde"), 129 | ) 130 | 131 | self.assertNotWarnsRegexp("abcdef", lambda: logger.warning("abcde")) 132 | 133 | 134 | class TestTestUtil_BalanceTestCase_Print( 135 | balance.testutil.BalanceTestCase, 136 | ): 137 | def test_unit_test_print_mixin(self): 138 | self.assertPrints(lambda: print("x")) 139 | self.assertNotPrints(lambda: "x") 140 | 141 | self.assertPrintsRegexp("abc", lambda: print("abcde")) 142 | self.assertRaises( 143 | AssertionError, self.assertPrintsRegexp, "abcdef", lambda: print("abcde") 144 | ) 145 | 146 | # assertPrintsRegexp() doesn't necessarily work with logging.warning(), 147 | # as logging handlers can change (e.g. in PyTest) 148 | self.assertPrintsRegexp("abc", lambda: print("abcde", file=sys.stderr)) 149 | -------------------------------------------------------------------------------- /website/.npmrc: -------------------------------------------------------------------------------- 1 | # Stop people use npm instead of yarn by accident 2 | engine-strict = true 3 | -------------------------------------------------------------------------------- /website/README.md: -------------------------------------------------------------------------------- 1 | # Website 2 | 3 | This website is built using [Docusaurus 2](https://docusaurus.io/), a modern static website generator. 4 | 5 | API Reference documentation was built using [Sphinx](https://www.sphinx-doc.org/en/master/index.html), a python documentation generator 6 | 7 | FontAwesome icons were used under the 8 | [Creative Commons Attribution 4.0 International](https://fontawesome.com/license/free). 9 | 10 | ### Installation 11 | 12 | ``` 13 | $ yarn 14 | ``` 15 | 16 | ### Local Development 17 | 18 | ``` 19 | $ yarn start 20 | ``` 21 | 22 | This command starts a local development server and opens up a browser window. 
Most changes are reflected live without having to restart the server. 23 | 24 | ### Build 25 | 26 | balance's website uses Sphinx & Docusaurus for website generation. We suggest running our custom build script for generating all 27 | artifacts: 28 | 29 | ``` 30 | // Run from repo root folder (balance) 31 | $ ./scripts/make_docs.sh 32 | ``` 33 | 34 | This command generates static content into the `build` directory and can be served using any static contents hosting service. 35 | -------------------------------------------------------------------------------- /website/babel.config.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * 4 | * This software may be used and distributed according to the terms of the 5 | * GNU General Public License version 2. 6 | */ 7 | 8 | module.exports = { 9 | presets: [require.resolve('@docusaurus/core/lib/babel/preset')], 10 | }; 11 | -------------------------------------------------------------------------------- /website/blog/2023/01/09/bringing-balance-to-your-data.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Bringing "balance" to your data 3 | authors: 4 | - name: Roee Eilat 5 | title: Research Scientist Manager @ Meta 6 | url: https://research.facebook.com/people/eilat-roee/ 7 | tags: [] 8 | hide_table_of_contents: true 9 | --- 10 | 11 | In research and data science, we sometimes encounter biased data: that is, data that has not been sampled completely randomly and suffers from an over- or under-indexing toward the population of interest. Survey data is an example in this regard. Surveys play an important role in providing measurements on subjective user experience indicators, such as sentiment and opinions, which cannot be measured by other means. But because survey data is collected from a self-selected group of participants, it needs to be analyzed carefully. 
12 | 13 | 14 | 15 | Bias in survey data is often the result of survey non-response or when the data collection suffers from [sampling bias](https://en.wikipedia.org/wiki/Sampling_bias). A similar issue arises in [observational studies](https://en.wikipedia.org/wiki/Observational_study) when comparing treatment groups, and in any data that suffers from [selection bias](https://en.wikipedia.org/wiki/Selection_bias). Directly inferring insights from data with such biases or training ML models on such data can result in erroneous estimates or underperforming models. Hence, it is important for practitioners to understand if and how data is biased and, when possible, use statistical methods to minimize such biases. 16 | 17 | For example, say we invite a random set of adults from a population of interest to participate in a survey we are running to estimate the sentiment towards some brand. If, for example, younger people have a higher propensity to participate in our survey, the proportion of younger people from the survey respondents will be higher than their proportion in the population. If younger people also have stronger affiliation with the brand, a simple average of the survey responses will be a biased estimate of the average sentiment in the population. 18 | 19 | 20 | The field of [survey statistics](https://en.wikipedia.org/wiki/Survey_methodology) offers methods for mitigating bias in samples, at least partially, by relying on auxiliary information (a.k.a., “covariates” or “features”). When such information is available for all items in the sample as well as for the population from which it was sampled, it can be used to create weights. Applying the weights to the data allows us to produce less biased estimates or models. In the example above, we may wish to adjust for non-response using not only age but also other demographic information such as gender, education, etc. This can be done by weighting the sample to the population using the auxiliary information.
21 | 22 | The figure below shows a sample (red) vs. population (blue) distribution representing the age bias described in the brand sentiment estimation example above. This figure was produced on simulated data which we set to be biased on other variables such as gender, education as well. 23 | 24 | ![sample_vs_target_bar_chart](./sample_vs_target_bar_chart.webp) 25 | 26 | When weights are calculated and applied the weighted sample distribution (green) becomes much closer to the population distribution, and the weighted average will also be less biased to the extent the response is correlated with respondent’s age. Notice the weighted distribution is not fully corrected, mainly because of bias-variance considerations. 27 | 28 | ## balance: a Python package for adjusting biased samples 29 | 30 | With survey data playing a key role in research and product work at Meta, we observed a growing need for software tools that make survey statistics methods accessible for researchers and engineers. This has led us to develop “_balance”_: A Python package for adjusting biased data samples. In _balance_ we introduce a simple easy-to-use framework for weighting data and evaluating its biases with and without adjustments. The package is designed to provide best practices for weights fitting and offers several modeling approaches. The methodology in “balance” already supports ongoing automated survey data processing at Meta, as well as ad-hoc analyses of survey data by dozens of researchers every month. 31 | 32 | The main workflow API of _balance_ includes three steps: (1) **understanding** the initial bias in the data relative to a target we would like to infer, (2) **adjusting** the data to correct for the bias by producing weights for each unit in the sample based on propensity scores, and (3) **evaluating** the final bias and the variance inflation after applying the fitted weights. The adjustment step provides several alternatives for the researcher to choose from. 
Current options include: Inverse propensity weighting of the form of logistic regression model based on LASSO (Least Absolute Shrinkage and Selection Operator [1]), Covariate Balancing Propensity Scores [2], and post-stratification. The focus is on providing a simple to use API, based on Pandas DataFrame structure, that can be used by researchers from a wide spectrum of fields. 33 | 34 | **We’re releasing “_balance_” as a Meta Open Source project.** We want researchers, data scientists, engineers, and other practitioners to be able to apply these practices when they work in Python, benefiting from Meta’s long research and experience in the field. With relation to “_balance_” we hope to also create an active community of data science practitioners where people can come together to discuss methodology and build tools that benefit survey-based research across academia and industry. If you work in Python with potentially biased data, we encourage you to use “balance” in your project. 35 | 36 | “_balance_“ website: [https://import-balance.org/](https://import-balance.org/) 37 | 38 | github repository: [https://github.com/facebookresearch/balance](https://github.com/facebookresearch/balance) 39 | 40 | **References** 41 | 42 | [1] Tibshirani, R. (1996). Regression shrinkage and selection via the lasso. Journal of the Royal Statistical Society: Series B (Methodological), 58(1), 267-288. 43 | 44 | [2] Imai, K., & Ratkovic, M. (2014). Covariate balancing propensity score. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 76(1), 243-263. 
45 | -------------------------------------------------------------------------------- /website/blog/2023/01/09/sample_vs_target_bar_chart.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/balance/f20a0a707bac57bf785f135f6ccef3c692f5b862/website/blog/2023/01/09/sample_vs_target_bar_chart.webp -------------------------------------------------------------------------------- /website/docs/api_reference/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: API Reference 3 | sidebar_position: 3 4 | hide_table_of_contents: true 5 | hide_title: true 6 | --- 7 | 8 | import HTMLLoader from '@site/src/components/HTMLLoader'; 9 | import useBaseUrl from '@docusaurus/useBaseUrl'; 10 | 11 | 12 | -------------------------------------------------------------------------------- /website/docs/docs/contributing.md: -------------------------------------------------------------------------------- 1 | --- 2 | id: contributing 3 | title: Contributing 4 | sidebar_position: 4 5 | --- 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /website/docs/docs/general_framework/adjusting_sample_to_population.md: -------------------------------------------------------------------------------- 1 | --- 2 | id: adjusting_sample_to_population 3 | title: Adjusting Sample to Population 4 | description: How to produce weights for a sample to represent the target population of interest 5 | sidebar_position: 2 6 | keywords: 7 | - adjustment 8 | --- 9 | 10 | To produce the balancing weights, use the ```Sample.adjust()``` method to adjust a sample to population: 11 | 12 | ``` 13 | adjusted = sample.adjust() 14 | ``` 15 | The output of this method is an adjusted `Sample` class object of the form: 16 | 17 | ``` 18 | Adjusted balance Sample object with target set using ipw 19 | 1000 observations x 3 variables: 
gender,age_group,income 20 | id_column: id, weight_column: weight, 21 | outcome_columns: happiness 22 | 23 | target: 24 | 25 | balance Sample object 26 | 10000 observations x 3 variables: gender,age_group,income 27 | id_column: id, weight_column: weight, 28 | outcome_columns: None 29 | 30 | 3 common variables: income,gender,age_group 31 | ``` 32 | Note that the `adjust` method in balance performs three main steps: 33 | 1. **Pre-processing** of the data - getting data ready for adjustment using best practices in the field: 34 | * Handling missing values - balance handles missing values automatically by adding a column '_is_na' to any variable that contains missing values. The advantage of this is that these are then considered as a separate category for the adjustment. 35 | * Feature engineering - by default, balance applies feature engineering to be able to fit the covariate distribution better, and not only the first moment. Specifically, each continuous variable is bucketed into 10 quantile buckets. Furthermore, rare categories in categorical variables are grouped together so as to avoid overfitting rare events. 36 | 2. **Fitting the model** and calculating the weights: the model fitted depends on the ```method``` chosen by the user. Current options are [inverse propensity score weighting](../statistical_methods/ipw.md) using regularized logistic regression (```ipw```), [covariate balancing propensity score](../statistical_methods/cbps.md) (```cbps```), [post-stratification](../statistical_methods/poststratify.md) (```poststratify```), and [raking](../statistical_methods/rake.md) (```rake```). 37 | 3. **Post-processing** of the weights: 38 | * Trimming weights - balance trims the weights in order to avoid overfitting of the model and unnecessary variance inflation. 39 | * Normalizing weights to population size. The resulting weights of balance can be described as approximating the number of units in the population this unit of the sample represents.
40 | 41 | ## Optional arguments 42 | 43 | * **`method`**: `ipw`, `poststratify`, `rake`, or `cbps`. Default is `ipw`. 44 | * `ipw`: stands for [Inverse Propensity Weighting](https://en.wikipedia.org/wiki/Inverse_probability_weighting). The propensity scores are calculated with [LASSO](https://en.wikipedia.org/wiki/Lasso_(statistics)) [logistic regression](https://en.wikipedia.org/wiki/Logistic_regression). Details about the implementation can be found [here](../../statistical_methods/ipw/). For a quick-start tutorial, see [here](https://import-balance.org/docs/tutorials/quickstart/). 45 | * `cbps`: stands for [Covariate Balancing Propensity Score](https://imai.fas.harvard.edu/software/CBPS.html). The CBPS algorithm estimates the propensity score in a way that optimizes prediction of the probability of sample inclusion as well as the covariates balance. Its main advantage is in cases when the researcher wants better balance on the covariates than traditional propensity score methods - because one believes the assignment model might be misspecified and would like to avoid an iterative procedure of balancing the covariates. Details about the implementation can be found [here](../../statistical_methods/cbps/). For a quick-start tutorial, see [here](https://import-balance.org/docs/tutorials/quickstart_cbps/). 46 | * `poststratify`: stands for post-stratification. Details about the implementation can be found [here](../../statistical_methods/poststratify/). 47 | * `rake`: Details about the implementation can be found [here](../../statistical_methods/rake/). For a quick-start tutorial, see [here](https://import-balance.org/docs/tutorials/quickstart_rake/). 48 | 49 | * **`variables`**: allows user to pass a list of the covariates that they want to adjust for; if variables argument is not specified, all joint variables in sample and target are used. 50 | 51 | * **`transformations`**: which transformations to apply to data before fitting the model. 
Default is cutting numeric variables into 10 quantile buckets and lumping together infrequent levels with less than 5% prevalence into `lumped_other` category. The transformations are done on both the sample dataframe and the target dataframe together. User can also specify specific transformations in a dictionary format. For a quick-start tutorial on transformations and formulas, see [here](https://import-balance.org/docs/tutorials/balance_transformations_and_formulas/). 52 | 53 | * **`max_de`**: (for `ipw` and `cbps` methods): The default value is 1.5. It limits the [**design effect**](https://en.wikipedia.org/wiki/Design_effect) to be within 1.5. If set to None, the optimization is performed by cross-validation of the logistic model for ipw (see the `choose_regularization` function for more details) or without constrained optimization for cbps. Setting `max_de` to `None` can sometimes significantly improve the running time of the code. 54 | 55 | * **`weight_trimming_mean_ratio`** **or** **`weight_trimming_percentile`**: (only one of these arguments can be specified). `weight_trimming_mean_ratio` indicates the ratio from above according to which the weights are trimmed by mean(weights) * ratio. Default is 20. If `weight_trimming_percentile` is not none, [winsorization](https://en.wikipedia.org/wiki/Winsorizing) is applied. Default is None, i.e. trimming from above is applied. However, note that when `max_de` is not None (and default is 1.5), the trimming-ratio is optimized by `ipw` and these arguments are ignored. 56 | 57 | * **`na_action`** (for `ipw` method): how to handle missing values in the data (sample and target). Default is to replace NAs with 0's and add indicator for which observations were NA (this is done after applying the transformations). Another option is `drop`, which drops all observations with NA values. 58 | 59 | * **`formula`** (for `ipw` and `cbps` methods): The formula according to which build the model matrix for the logistic regression. 
Default is a linear additive formula of all covariates. For a quick-start tutorial on transformations and formulas, see [here](https://import-balance.org/docs/tutorials/balance_transformations_and_formulas/). 60 | 61 | * **`penalty_factor`** (for `ipw` method): the penalty used in the regularized logistic regression. 62 | -------------------------------------------------------------------------------- /website/docs/docs/general_framework/evaluation_of_results.md: -------------------------------------------------------------------------------- 1 | --- 2 | id: evaluation_of_results 3 | title: Evaluating and using the adjustment weights 4 | description: Diagnosing, evaluating, and using the weighted adjusted sample 5 | sidebar_position: 3 6 | keywords: 7 | - diagnostics 8 | - evaluation 9 | - results 10 | --- 11 | 12 | After weights are fitted in order to balance the sample, the results should be evaluated so to understand the quality of the weighting. 13 | 14 | ## Summary statistics 15 | 16 | ### Summary 17 | 18 | Printing the adjusted object gives a high level overview of the content of the object: 19 | 20 | ```python 21 | print(adjusted) 22 | ``` 23 | 24 | Output: 25 | 26 | ``` 27 | Adjusted balance Sample object with target set using ipw 28 | 1000 observations x 3 variables: gender,age_group,income 29 | id_column: id, weight_column: weight, 30 | outcome_columns: happiness 31 | 32 | target: 33 | 34 | balance Sample object 35 | 10000 observations x 3 variables: gender,age_group,income 36 | id_column: id, weight_column: weight, 37 | outcome_columns: None 38 | 39 | 3 common variables: income,age_group,gender 40 | ``` 41 | 42 | 43 | To generate a summary of the data, use the summary method: 44 | 45 | ```python 46 | print(adjusted.summary()) 47 | ``` 48 | 49 | This will return several results: 50 | - Covariate mean ASMD improvement: ASMD is "Absolute Standardized Mean Difference". 
For continuous variables, this measure is the same as taking the absolute value of [Cohen's d statistic](https://en.wikipedia.org/wiki/Effect_size#Cohen's_d) (also related to [SSMD](https://en.wikipedia.org/wiki/Strictly_standardized_mean_difference)), when using the (weighted) standard deviation of the target population. For categorical variables it uses [one-hot encoding](https://en.wikipedia.org/wiki/One-hot). 51 | - [Design effect](https://en.wikipedia.org/wiki/Design_effect) 52 | - Covariate mean Adjusted Standardized Mean Deviation (ASMD) versus Unadjusted covariate mean ASMD 53 | - Model proportion deviance explained (if inverse propensity weighting method was used) 54 | 55 | Output: 56 | 57 | ``` 58 | Covar ASMD reduction: 62.3%, design effect: 2.249 59 | Covar ASMD (7 variables): 0.335 -> 0.126 60 | Model performance: Model proportion deviance explained: 0.174 61 | ``` 62 | 63 | Note that although we had 3 variables in our original data (age_group, gender, income), the asmd counts each level of the categorical variables as a separate variable, and thus it considered 7 variables for the covar ASMD improvement. 64 | 65 | ## Covariate Balance 66 | 67 | 68 | We can check the mean of each variable before and after applying the weights using `.mean()`: 69 | 70 | ```python 71 | adjusted.covars().mean().T 72 | ``` 73 | 74 | To get: 75 | 76 | ``` 77 | source self target unadjusted 78 | _is_na_gender[T.True] 0.103449 0.089800 0.08800 79 | age_group[T.25-34] 0.279072 0.297400 0.30900 80 | age_group[T.35-44] 0.290137 0.299200 0.17200 81 | age_group[T.45+] 0.150714 0.206300 0.04600 82 | gender[Female] 0.410664 0.455100 0.26800 83 | gender[Male] 0.485887 0.455100 0.64400 84 | gender[_NA] 0.103449 0.089800 0.08800 85 | income 9.519935 12.737608 5.99102 86 | ``` 87 | 88 | The `self` column shows the adjusted (weighted) means, while `unadjusted` shows the unadjusted (unweighted) means. 
89 | 90 | 91 | And `.asmd()` to get ASMD: 92 | 93 | ```python 94 | adjusted.covars().asmd().T 95 | ``` 96 | 97 | To get: 98 | 99 | ``` 100 | source self unadjusted unadjusted - self 101 | age_group[T.25-34] 0.040094 0.025375 -0.014719 102 | age_group[T.35-44] 0.019792 0.277771 0.257980 103 | age_group[T.45+] 0.137361 0.396127 0.258765 104 | gender[Female] 0.089228 0.375699 0.286472 105 | gender[Male] 0.061820 0.379314 0.317494 106 | gender[_NA] 0.047739 0.006296 -0.041444 107 | income 0.246918 0.517721 0.270802 108 | mean(asmd) 0.126310 0.334860 0.208551 109 | ``` 110 | 111 | We can see that on average the ASMD improved from 0.33 to 0.12 thanks to the weights. We got improvements in income, gender, and age_group. 112 | Although we can see that `age_group[T.25-34]` didn't get improved. 113 | 114 | 115 | ## Understanding the model 116 | 117 | For a summary of the diagnostics measures, use: 118 | 119 | ```python 120 | adjusted.diagnostics() 121 | ``` 122 | 123 | This will give a long table that can be filterred to focus on various diagnostics metrics. For example, when the `.adjust()` method is run with `model="ipw"` (the default method), then the rows from the diagnostics output with `metric == "model_coef"` represent the coefficients of the variables in the model. These can be used to understand the model that was fitted (after transformations and regularization). 124 | 125 | ## Visualization post adjustments 126 | 127 | We can create all (interactive) plots using: 128 | 129 | ```python 130 | adjusted.covars().plot() 131 | ``` 132 | 133 | And get: 134 | 135 | ![](../img/fig_04_qqplot_income_after.png) 136 | 137 | ![](../img/fig_05_barplot_age_after.png) 138 | 139 | ![](../img/fig_06_barplot_gender_after.png) 140 | 141 | We can also use different plots, using the seaborn library, for example with the "kde" dist_type. 
142 | 143 | ```python 144 | adjusted.covars().plot(library = "seaborn", dist_type = "kde") 145 | ``` 146 | 147 | And get: 148 | 149 | ![](../img/fig_07_seaborn_after.png) 150 | 151 | 152 | ## Distribution of Weights 153 | 154 | We can look at the distribution of weights using the following method call: 155 | 156 | 157 | ```python 158 | adjusted.weights().plot() 159 | ``` 160 | 161 | And get: 162 | 163 | ![](../img/fig_08_weights_kde.png) 164 | 165 | Or calculate the design effect using: 166 | 167 | ```python 168 | adjusted.weights().design_effect() 169 | # 2.24937 170 | ``` 171 | 172 | ## Analyzing the outcome 173 | 174 | The `.summary()` method gives us the response rates (if we have missing values in the outcome), and the weighted means before and after applying the weights: 175 | 176 | ```python 177 | print(adjusted.outcomes().summary()) 178 | ``` 179 | 180 | To get: 181 | ``` 182 | 183 | 1 outcomes: ['happiness'] 184 | Mean outcomes: 185 | happiness 186 | source 187 | self 54.221388 188 | unadjusted 48.392784 189 | 190 | Response rates (relative to number of respondents in sample): 191 | happiness 192 | n 1000.0 193 | % 100.0 194 | ``` 195 | 196 | For example, we see that the estimated mean happiness according to our sample is 48 without any adjustment and 54 with adjustment. 
The following shows the distribution of happiness before and after applying the weights: 197 | 198 | ```python 199 | adjusted.outcomes().plot() 200 | ``` 201 | 202 | And we get: 203 | 204 | ![](../img/fig_09_seaborn_outcome_kde_after.png) 205 | -------------------------------------------------------------------------------- /website/docs/docs/general_framework/general_framework.md: -------------------------------------------------------------------------------- 1 | --- 2 | id: general_framework 3 | title: General Framework 4 | description: The main workflow of balance 5 | sidebar_position: 2 6 | 7 | keywords: 8 | - framework 9 | --- 10 | 11 | Following the [total survey error](https://en.wikipedia.org/wiki/Total_survey_error) framework, survey responses are often biased due to coverage error, sampling error and non-response bias [1]. Weighting is often an important step when analyzing survey data. For each unit in the sample (e.g. respondent to a survey), we attach a weight that can be understood as the approximate number of people from the target population that this respondent represents. 12 | 13 | ![total_survey_error_img](../img/total_survey_error_flow_v02.png) 14 | 15 | The weighting of survey data through balance is done in 7 main steps: 16 | 17 | 1. **Loading data of the respondents** of the survey. This is done by loading a ```pandas``` DataFrame (using ```pandas.read_csv()``` for example), and then converting the DataFrame into a balance `Sample` class object: 18 | ``` 19 | sample = Sample.from_frame(sample_df) 20 | ``` 21 | 2. **Loading data about the target population** we would like to correct for. Similarly to loading the respondents data, the next step is to load the target population data and keep it as a balance Sample object. 
Then, we combine the two `Sample` objects by setting the target object as the target of the sample object: 22 | ``` 23 | target = Sample.from_frame(target_df) 24 | sample_with_target = sample.set_target(target) 25 | ``` 26 | 3. **Diagnostics of the sample covariates** so to evaluate whether weighting is needed. Several diagnostics tools are available to provide insights on the need to adjust the sample to match the target population. See the [Pre-Adjustment Diagnostics](pre_adjustment_diagnostics.md) page for details. 27 | 28 | 4. **Adjusting the sample to the target**. Producing the weights for sample to represent the target population distributions. See the [Adjusting Sample to Population](adjusting_sample_to_population.md) page for details. 29 | ``` 30 | adjusted = sample_with_target.adjust() 31 | ``` 32 | 5. **Evaluation of the results**. Several tools are available for evaluation of the resulted weights, the amount of bias that the weights reduce, and the estimated [design effect](https://en.wikipedia.org/wiki/Design_effect#Haphazard_weights_with_estimated_ratio-mean_(%7F'%22%60UNIQ--postMath-0000003A-QINU%60%22'%7F)_-_Kish's_design_effect) due to weighting. See the [Evaluation of the Results](evaluation_of_results.md) page for details. 33 | 34 | 6. **Use the weights for producing population level estimations**. The produced weights are then used to evaluate the population outcome, often the population average (first moment), by using: 35 | ``` 36 | adjusted.outcomes().summary() 37 | ``` 38 | 39 | 7. **Saving the output weights**. To save the output weights, use: 40 | ``` 41 | adjusted.to_download() 42 | ``` 43 | 44 | All these steps are described in more details in the rest of this website. Examples are available in the [tutorials section](../../tutorials). 45 | For a quick start of using balance with your survey data, take a look at this [notebook](../../tutorials/quickstart.mdx) 46 | 47 | 48 | 49 | 50 | ## References 51 | [1] Salganik, Matthew J. 2017. 
Bit by Bit: Social Research in the Digital Age. Princeton, NJ: Princeton University Press. Open review edition. 52 | -------------------------------------------------------------------------------- /website/docs/docs/general_framework/pre_adjustment_diagnostics.md: -------------------------------------------------------------------------------- 1 | --- 2 | id: pre_adjustment_diagnostics 3 | title: Pre-Adjustment Diagnostics 4 | description: Diagnostics for an unadjusted Sample 5 | sidebar_position: 1 6 | keywords: 7 | - unadjusted sample 8 | - diagnostics 9 | --- 10 | 11 | ## Covariate balance 12 | 13 | A way to check if adjustments are needed is looking at covariate balance by comparing the distribution of covariates in our sample (the respondents before any adjustment), to the distribution of covariates of the population. The same methods will be later used to evaluate the quality of the adjustment in [evaluating the results](evaluation_of_results.md). 14 | 15 | There are various methods for comparing covariate balance, either via summary statistics, or through visualizations. The visualizations are implemented either via [plotly](https://plotly.com/python/) (offering an interactive interface) or [seaborn](https://seaborn.pydata.org/) (leading to a static image). 16 | 17 | The methods implemented in balance include: 18 | 1. Summary statistics 19 | 1. Means 20 | 2. ASMD (Absolute Standardized Mean Difference) 21 | 2. Visualizations 22 | 1. Numerical variables 23 | 1. QQ-plots (interactive) 24 | 2. Kernel density estimation (static) 25 | 3. Empirical Cumulative Distribution Function (static) 26 | 4. Histogram (static) 27 | 2. Categorical variables 28 | 1. Barplots (interactive or static) 29 | 2. 
Probability scatter plot (static) 30 | 31 | 32 | ## Summary statistics 33 | 34 | ### Means and ASMD (Absolute Standardized Mean Difference) 35 | 36 | The mean of the covariates in the sample versus the target is a basic measure to evaluate the distance of the sample from the target population of interest. 37 | 38 | For categorical variables the means are calculated to each of the [one-hot encoding](https://en.wikipedia.org/wiki/One-hot) of the categories of the variable. This is basically the proportion of observations in that bucket. 39 | 40 | It can be calculated simply by running: 41 | ```python 42 | sample_with_target.covars().mean().T 43 | ``` 44 | An example of the output: 45 | 46 | ``` 47 | source self target 48 | _is_na_gender[T.True] 0.08800 0.089800 49 | age_group[T.25-34] 0.30900 0.297400 50 | age_group[T.35-44] 0.17200 0.299200 51 | age_group[T.45+] 0.04600 0.206300 52 | gender[Female] 0.26800 0.455100 53 | gender[Male] 0.64400 0.455100 54 | gender[_NA] 0.08800 0.089800 55 | income 5.99102 12.737608 56 | ``` 57 | 58 | (TODO: the one hot encoding acts a bit differently for different variables - this will be resolved in future releases) 59 | 60 | The limitation of the mean is that it is not easily comparable between different variables since they may have different variances. The simplest attempt in addressing this issue is using the ASMD. 61 | 62 | The ASMD (Absolute Standardized Mean Deviation) measures the difference per covariate between the sample and target. It uses weighted average and std for the calculations (e.g.: to take design weights into account). 63 | This measure is the same as taking the absolute value of [Cohen's d statistic](https://en.wikipedia.org/wiki/Effect_size#Cohen's_d) (also related to [SSMD](https://en.wikipedia.org/wiki/Strictly_standardized_mean_difference)), when using the (weighted) standard deviation of the target population. 
Other options that occur in the literature includes using the standard deviation based on the sample, or some average of the std of the sample and the target. In order to allow this to be compared across different samples and adjustments, we opted to use the std of the target as the default. 64 | 65 | It can be calculated simply by running: 66 | ```python 67 | sample_with_target.covars().asmd().T 68 | ``` 69 | An example of the output: 70 | 71 | ``` 72 | source self 73 | age_group[T.25-34] 0.025375 74 | age_group[T.35-44] 0.277771 75 | age_group[T.45+] 0.396127 76 | gender[Female] 0.375699 77 | gender[Male] 0.379314 78 | gender[_NA] 0.006296 79 | income 0.517721 80 | mean(asmd) 0.334860 81 | ``` 82 | 83 | For categorical variables the ASMD can be calculated as the average of the ASMD applied to each of the [one-hot encoding](https://en.wikipedia.org/wiki/One-hot) of the categories of the variable by using the `aggregate_by_main_covar` argument: 84 | 85 | ```python 86 | sample_with_target.covars().asmd(aggregate_by_main_covar = True).T 87 | ``` 88 | 89 | The output: 90 | 91 | ``` 92 | source self 93 | age_group 0.233091 94 | gender 0.253769 95 | income 0.517721 96 | mean(asmd) 0.334860 97 | ``` 98 | 99 | An average ASMD is calculated for all covariates. It is a simple average of the ASMD for each covariate. Each ASMD value of categorical variable is used once after aggregated the ASMD from all the [dummy variables](https://en.wikipedia.org/wiki/Dummy_variable_(statistics)). 100 | 101 | ## Visualizations 102 | 103 | ### Q-Q Plot (plotly) 104 | 105 | We provide [Q-Q Plots](https://en.wikipedia.org/wiki/Q%E2%80%93Q_plot) as a visual to compare two distributions to one another. 
106 | 107 | For example, the plot below is a Q-Q plot for the income covariate for the sample against a straight line of the target population: 108 | 109 | ![](../img/fig_01_qqplot_income_before.png) 110 | 111 | The closer the line is to the 45-degree-line the better (i.e.: the less bias is observed in the sample as compared to the target population). 112 | 113 | To make a QQ-plot for a specific variable, simply use the following method (the default uses QQ plot with the plotly engine): 114 | 115 | ```python 116 | sample_with_target.covars().plot(variables = ['income',]) 117 | ``` 118 | 119 | ### Barplots 120 | 121 | [Barplots](https://en.wikipedia.org/wiki/Bar_chart) provide a way to visually compare the sample and target for categorical covariates. 122 | 123 | Here is an example of the plot for age_group and gender before adjustment: 124 | 125 | 126 | ![](../img/fig_02_barplot_age_before.png) 127 | 128 | ![](../img/fig_03_barplot_gender_before.png) 129 | 130 | To make these plots, simply use the following: 131 | 132 | ```python 133 | sample_with_target.covars().plot(variables = ['age_group', 'gender', ]) 134 | ``` 135 | 136 | ### Plotting all variables 137 | 138 | If you do not specify a variables list in the plot method, all covariates of your sample object will be plotted: 139 | 140 | ```python 141 | sample_with_target.covars().plot() 142 | ``` 143 | -------------------------------------------------------------------------------- /website/docs/docs/img/fig_01_qqplot_income_before.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/balance/f20a0a707bac57bf785f135f6ccef3c692f5b862/website/docs/docs/img/fig_01_qqplot_income_before.png -------------------------------------------------------------------------------- /website/docs/docs/img/fig_02_barplot_age_before.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/facebookresearch/balance/f20a0a707bac57bf785f135f6ccef3c692f5b862/website/docs/docs/img/fig_02_barplot_age_before.png -------------------------------------------------------------------------------- /website/docs/docs/img/fig_03_barplot_gender_before.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/balance/f20a0a707bac57bf785f135f6ccef3c692f5b862/website/docs/docs/img/fig_03_barplot_gender_before.png -------------------------------------------------------------------------------- /website/docs/docs/img/fig_04_qqplot_income_after.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/balance/f20a0a707bac57bf785f135f6ccef3c692f5b862/website/docs/docs/img/fig_04_qqplot_income_after.png -------------------------------------------------------------------------------- /website/docs/docs/img/fig_05_barplot_age_after.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/balance/f20a0a707bac57bf785f135f6ccef3c692f5b862/website/docs/docs/img/fig_05_barplot_age_after.png -------------------------------------------------------------------------------- /website/docs/docs/img/fig_06_barplot_gender_after.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/balance/f20a0a707bac57bf785f135f6ccef3c692f5b862/website/docs/docs/img/fig_06_barplot_gender_after.png -------------------------------------------------------------------------------- /website/docs/docs/img/fig_07_seaborn_after.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/balance/f20a0a707bac57bf785f135f6ccef3c692f5b862/website/docs/docs/img/fig_07_seaborn_after.png 
-------------------------------------------------------------------------------- /website/docs/docs/img/fig_08_weights_kde.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/balance/f20a0a707bac57bf785f135f6ccef3c692f5b862/website/docs/docs/img/fig_08_weights_kde.png -------------------------------------------------------------------------------- /website/docs/docs/img/fig_09_seaborn_outcome_kde_after.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/balance/f20a0a707bac57bf785f135f6ccef3c692f5b862/website/docs/docs/img/fig_09_seaborn_outcome_kde_after.png -------------------------------------------------------------------------------- /website/docs/docs/img/total_survey_error_flow_v02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/balance/f20a0a707bac57bf785f135f6ccef3c692f5b862/website/docs/docs/img/total_survey_error_flow_v02.png -------------------------------------------------------------------------------- /website/docs/docs/img/total_survey_error_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/balance/f20a0a707bac57bf785f135f6ccef3c692f5b862/website/docs/docs/img/total_survey_error_image.png -------------------------------------------------------------------------------- /website/docs/docs/overview.md: -------------------------------------------------------------------------------- 1 | --- 2 | id: overview 3 | title: Overview 4 | sidebar_position: 1 5 | hide_title: true 6 | --- 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /website/docs/docs/statistical_methods/index.md: 
-------------------------------------------------------------------------------- 1 | --- 2 | id: index 3 | title: Statistical Methods 4 | sidebar_position: 3 5 | --- 6 | This section describes the statistical methodologies used in balance for weighting: 7 | 1. [Inverse propensity score weighting](ipw.md) 8 | 2. [Covariate balancing propensity score](cbps.md) 9 | 3. [Post-stratification](poststratify.md) 10 | 4. [Raking](rake.md) 11 | -------------------------------------------------------------------------------- /website/docs/docs/statistical_methods/ipw.md: -------------------------------------------------------------------------------- 1 | --- 2 | id: ipw 3 | title: Inverse Propensity Score Weighting 4 | description: Inverse Propensity Score Weighting 5 | sidebar_position: 1 6 | keywords: 7 | - inverse propensity score weighting 8 | - ipw 9 | - ipsw 10 | --- 11 | ## Introduction 12 | The inverse propensity score weighting is a statistical method to adjust a non-random sample to represent a population by weighting the sample units. It assumes two samples: 13 | 14 | (1) A sample of respondents to a survey (or in a more general framework, a biased panel), will be referred to as "sample". 15 | 16 | (2) A sample of a target population, often referred to as "reference sample" or "reference survey" [1], will be referred to as "target". This sample includes a larger coverage of the population or better sampling properties in a way that represents the population better. It often includes only a limited number of covariates and doesn't include the outcome variables (the survey responses). In different cases it can be the whole target population (in case it is available), census data (based on a survey) or an existing survey. 17 | 18 | 19 | ## Mathematical model 20 | 21 | Let $S$ represent the sample of respondents, with $n$ units, and $T$ represent the target population, with $N$ units. 
We may assume each unit $i$ in the sample and target have a base weight, which is referred to as a design weight, $d_i$. These are often set to be 1 for the sample (assuming unknown sampling probabilities), and are based on the sampling procedure for the target. In addition, we assume all units in sample and target have a covariates vector attached, $x_i$. Note that we assume that the same covariates are available for the sample and the target, otherwise we ignore the non-overlapping covariates. 22 | 23 | 24 | 25 | Define the propensity score as the probability to be included in the sample (the respondents group) conditioned on the characteristics of the unit, i.e. let $p_i = Pr\{i \in S | x_i\}$, $i=1...n$. $p_i$ is then estimated using logistic regression, assuming a linear relation between the covariates and the logit of the probability: $\ln(\frac{p_i}{1-p_i})=\beta_0+\beta_1 x_i$. 26 | 27 | 28 | Note that balance's implementation for ```ipw``` uses a regularized logistic model through using [LASSO](https://en.wikipedia.org/wiki/Lasso_(statistics)) (by using [glmnet-python](https://glmnet-python.readthedocs.io/en/latest/glmnet_vignette.html)). This is in order to keep the inflation of the variance as minimal as possible while still addressing the meaningful differences in the covariates between the sample and the target. 29 | 30 | ### How are the regularization parameter and trimming ratio parameter chosen? 31 | There are two options to choose the regularization parameter and trimming ratio parameter in balance: 32 | 33 | 1. Bounding the design effect by setting ```max_de = X```. In this case the regularization parameter and the trimming ratio parameter are chosen by a grid search over the 10 models with the largest design effect. This is based on the assumption that a larger design effect often implies better covariate balancing. Within these 10 models, the model with the smallest ASMD is chosen. 34 | 35 | 2. 
Choosing the regularization parameter by the "1se rule" (or "One Standard Error Rule") of cross validation, i.e. the largest penalty factor $\lambda$ at which the MSE is at most 1 standard error from the minimal MSE. This is applied when ```max_de``` is set to ```None```. In this case the trimming ratio parameter is set by the user, and defaults to 20. 36 | 37 | ### Weights estimation 38 | 39 | The estimated propensity scores are then used to estimate the weights of the sample by setting $w_i = \frac{1-p_i}{p_i} d_i$. 40 | 41 | 42 | 43 | 44 | 45 | ## References 46 | [1] Lee, S., & Valliant, R. (2009). Estimation for volunteer panel web surveys using propensity score adjustment and calibration adjustment. Sociological Methods & Research, 37(3), 319-343. 47 | 48 | - More about [Inverse Probability Weighting](https://en.wikipedia.org/wiki/Inverse_probability_weighting) in Wikipedia. 49 | -------------------------------------------------------------------------------- /website/docs/docs/statistical_methods/poststratify.md: -------------------------------------------------------------------------------- 1 | --- 2 | id: poststratify 3 | title: Post-Stratification 4 | description: Post-Stratification 5 | sidebar_position: 3 6 | keywords: 7 | - Post-Stratification 8 | - poststratify 9 | --- 10 | ## Introduction 11 | 12 | Post-stratification is one of the most common weighting approaches in survey statistics. It originates from a stratified sample, where the population is divided into subpopulations (strata) and the sample is conducted independently on each of them. However, when one doesn't know in advance the subpopulations to sample from (for example, when the stratum of the units in the sample is unknown in advance), or when non-response is present, stratification can be done after the sample has been selected. 13 | 14 | The goal of post-stratification is to have the sample match exactly the joint-distribution of the target population. 
However, this is also the main limitation of this method. It is limited by the number of variables we are able to use for adjustment due to the nature of fitting the target exactly, and thus requires a minimal number of respondents in each stratum. Hence, usually at most 2 to 4 variables are used (with a limited number of buckets). In addition, continuous variables cannot be used for adjustment (unless bucketed). A more general approach is the inverse propensity score weighting ([ipw](../ipw)). 15 | 16 | ## Methodology 17 | The idea behind post-stratification is simple. For each cell (stratum) in the population, compute the percent of the total population in this cell. Then fit weights so that they adjust the sample to have the same proportions for each stratum as in the population. 18 | 19 | We will illustrate this with an example. Assume that we have sampled people from a certain population to a survey and asked for their age and gender so as to use these for weighting. Assume also that the joint distribution of age and gender in this population is known from a census, and is the following: 20 | 21 | | | Young adults | Adults | Total | 22 | |--------|--------------|--------|-------| 23 | | Female | 120 | 380 | 500 | 24 | | Male | 80 | 420 | 500 | 25 | | Total | 200 | 800 | 1000 | 26 | 27 | 28 | In addition, assume that for the specific survey we ran young adults tend to reply more, so that the distribution of responses in the survey is the following: 29 | 30 | | | Young adults | Adults | Total | 31 | |--------|--------------|--------|-------| 32 | | Female | 30 | 10 | 40 | 33 | | Male | 50 | 10 | 60 | 34 | | Total | 80 | 20 | 100 | 35 | 36 | The post-stratification weights are then computed as follows: 37 | 38 | - Proportion of Female young adults in the population is $120/1000 = 0.12$ 39 | - Proportion of Female young adults in the sample is $30/100 = 0.3$ 40 | 41 | Inflation factor - this is the inverse probability factor indicating by how much we need to multiply the 
total sample size to get to the total population size. It is equal to population size / sample size. In our case it is: $1000/100 = 10$. 42 | 43 | Calculate weights for each Female young adult in the sample: (population %) / (sample %) * (inflation factor). In our example this is: $0.12/0.3 * 10= 0.4 * 10= 4$. 44 | 45 | This means that the assigned weight of each Female young adult in the sample is 4. 46 | 47 | Similarly, we can compute the weight for people from each cell in the table: 48 | 49 | | | Young adults | Adults | 50 | |--------|----------------------|---------------------| 51 | | Female | $0.12/0.3 * 10 = 4$ | $0.38/0.1 * 10 = 38$| 52 | | Male | $0.08/0.5 * 10 = 1.6$| $0.42/0.1 *10 = 42$ | 53 | 54 | 55 | 56 | ## References 57 | - More about post-stratification: [Introduction to post-stratification](https://docs.wfp.org/api/documents/WFP-0000121326/download/) 58 | - Kolenikov, Stas. 2016. “Post-Stratification or Non-Response Adjustment?” Survey Practice 9 (3). https://doi.org/10.29115/SP-2016-0014. 59 | -------------------------------------------------------------------------------- /website/docs/docs/statistical_methods/rake.md: -------------------------------------------------------------------------------- 1 | --- 2 | id: rake 3 | title: Raking 4 | description: rake 5 | sidebar_position: 5 6 | keywords: 7 | - rake 8 | - raking 9 | --- 10 | ## Introduction 11 | 12 | Raking, also known as iterative proportional fitting, is a statistical technique widely used in survey sampling to adjust weights and enhance the representativeness of the collected data. When a sample is drawn from a population, there might be differences in the distribution of certain variables between the sample and the population. Raking, similar to other methods in the `balance` package, helps to account for these differences, making the sample's distribution closely resemble that of the population. 
13 | 14 | Raking is an iterative process that involves adjusting the weights of sampled units based on the marginal distributions of certain variables in the population. Typically, we have access to such marginal distributions, but not their combined joint distribution. The variables chosen for raking are usually demographic variables, such as age, gender, education, income, and other socioeconomic variables, which are known to influence survey outcomes. By adjusting the weights of the sampled units, raking helps to correct for potential biases that may arise due to nonresponse, undercoverage, or oversampling of certain groups. 15 | 16 | 17 | ## Methodology 18 | 19 | Raking essentially applies [post-stratification](https://import-balance.org/docs/docs/statistical_methods/poststratify/) repeatedly over all the covariates. For example, we may have the marginal distribution of age\*gender and education. Raking would first adjust weights to match the age\*gender distribution and then take these weights as input to adjust for education. It would then adjust again to age\*gender and then again to education, and so forth. This process will repeat until either a max_iteration is met, or the weights have converged and no longer seem to change from one iteration to another. 20 | 21 | Raking is a valuable technique for addressing potential biases and enhancing the representativeness of survey data. By iteratively adjusting the weights of sampled units based on the marginal distribution of key variables, raking ensures that survey estimates are more accurate and reliable. 22 | 23 | You can see a detailed example of how to perform raking in `balance` in the tutorial: [**quickstart_rake**](https://import-balance.org/docs/tutorials/quickstart_rake/). 
24 | 25 | ## References 26 | - https://en.wikipedia.org/wiki/Raking 27 | - https://www.pewresearch.org/methods/2018/01/26/how-different-weighting-methods-work/ 28 | - Practical Considerations in Raking Survey Data ([url](https://www.surveypractice.org/article/2953-practical-considerations-in-raking-survey-data)) 29 | -------------------------------------------------------------------------------- /website/docs/tutorials/balance_transformations_and_formulas.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: balance_transformations_and_formulas 3 | sidebar_position: 5 4 | hide_table_of_contents: true 5 | hide_title: true 6 | --- 7 | 8 | import HTMLLoader from '@site/src/components/HTMLLoader'; 9 | import useBaseUrl from '@docusaurus/useBaseUrl'; 10 | 11 | Link to notebook: [balance_transformations_and_formulas.ipynb](https://github.com/facebookresearch/balance/blob/main/tutorials/balance_transformations_and_formulas.ipynb) 12 | 13 | 14 | -------------------------------------------------------------------------------- /website/docs/tutorials/comparing_cbps_in_r_vs_python_using_sim_data.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: Comparing_cbps_in_r_vs_python_using_sim_data 3 | sidebar_position: 6 4 | hide_table_of_contents: true 5 | hide_title: true 6 | --- 7 | 8 | import HTMLLoader from '@site/src/components/HTMLLoader'; 9 | import useBaseUrl from '@docusaurus/useBaseUrl'; 10 | 11 | Link to notebook: [comparing_cbps_in_r_vs_python_using_sim_data.ipynb](https://github.com/facebookresearch/balance/blob/main/tutorials/comparing_cbps_in_r_vs_python_using_sim_data.ipynb) 12 | 13 | 14 | -------------------------------------------------------------------------------- /website/docs/tutorials/index.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: Tutorials and notebooks Overview 3 | sidebar_position: 1 4 | 
--- 5 | 6 | The tutorials here present various examples of applying balance end-to-end. Each tutorial is linked to a Jupyter notebook you can download and run from your own environment. 7 | 8 | **Requirements**: You will need a [Jupyter installation](https://jupyter.org/) to run these notebooks yourselves. We also assume you have the [balance pkg](/docs/docs/overview) installed. 9 | 10 | If you are new to balance, we suggest getting started with the [balance Quickstart](./quickstart) tutorial. 11 | 12 | ## Tutorials list (more tutorials to be added soon): 13 | 1. [**quickstart**](./quickstart) - this is based on simulated data and presents the simple end-to-end workflow of the balance package with default arguments. It demonstrates the process from reading the data, through understanding the biases in the sample, producing weights, evaluating the results and producing the population estimations. 14 | 2. [**quickstart_cbps**](./quickstart_cbps) - like the [**quickstart**](./quickstart) tutorial, but shows how to use the CBPS algorithm and compares the results to IPW (logistic regression with LASSO). 15 | 3. [**quickstart_rake**](./quickstart_rake) - like the [**quickstart**](./quickstart) tutorial, but shows how to use the rake (raking) algorithm and compares the results to IPW (logistic regression with LASSO). 16 | 4. [**balance_transformations_and_formulas**](./balance_transformations_and_formulas) - This tutorial showcases ways in which transformations, formulas and penalty can be included in your pre-processing of the covariates before adjusting for them. 17 | 5. [**comparing_cbps_in_r_vs_python_using_sim_data**](./comparing_cbps_in_r_vs_python_using_sim_data) - This notebook compares the results of running CBPS in R and Python. In R using the `CBPS` package, and in Python using the `balance` package. The results are almost identical. 
18 | -------------------------------------------------------------------------------- /website/docs/tutorials/quickstart.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: Quickstart 3 | sidebar_position: 2 4 | hide_table_of_contents: true 5 | hide_title: true 6 | --- 7 | 8 | import HTMLLoader from '@site/src/components/HTMLLoader'; 9 | import useBaseUrl from '@docusaurus/useBaseUrl'; 10 | 11 | Link to notebook: [balance_quickstart.ipynb](https://github.com/facebookresearch/balance/blob/main/tutorials/balance_quickstart.ipynb) 12 | 13 | 14 | -------------------------------------------------------------------------------- /website/docs/tutorials/quickstart_cbps.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: Quickstart_cbps 3 | sidebar_position: 3 4 | hide_table_of_contents: true 5 | hide_title: true 6 | --- 7 | 8 | import HTMLLoader from '@site/src/components/HTMLLoader'; 9 | import useBaseUrl from '@docusaurus/useBaseUrl'; 10 | 11 | Link to notebook: [balance_quickstart_cbps.ipynb](https://github.com/facebookresearch/balance/blob/main/tutorials/balance_quickstart_cbps.ipynb) 12 | 13 | 14 | -------------------------------------------------------------------------------- /website/docs/tutorials/quickstart_rake.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: Quickstart_rake 3 | sidebar_position: 4 4 | hide_table_of_contents: true 5 | hide_title: true 6 | --- 7 | 8 | import HTMLLoader from '@site/src/components/HTMLLoader'; 9 | import useBaseUrl from '@docusaurus/useBaseUrl'; 10 | 11 | Link to notebook: [balance_quickstart_rake.ipynb](https://github.com/facebookresearch/balance/blob/main/tutorials/balance_quickstart_rake.ipynb) 12 | 13 | 14 | -------------------------------------------------------------------------------- /website/docusaurus.config.js: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * 4 | * This software may be used and distributed according to the terms of the 5 | * GNU General Public License version 2. 6 | */ 7 | 8 | const lightCodeTheme = require('prism-react-renderer/themes/github'); 9 | const darkCodeTheme = require('prism-react-renderer/themes/dracula'); 10 | const {fbContent} = require('docusaurus-plugin-internaldocs-fb/internal'); 11 | const math = require('remark-math'); 12 | const katex = require('rehype-katex'); 13 | 14 | // With JSDoc @type annotations, IDEs can provide config autocompletion 15 | /** @type {import('@docusaurus/types').DocusaurusConfig} */ 16 | (module.exports = { 17 | title: 'balance', 18 | tagline: 'A python package for balancing biased data samples', 19 | // TODO[scubasteve]: Migrate to final URL once set up 20 | url: 'https://internalfb.com', 21 | baseUrl: '/', 22 | onBrokenLinks: 'throw', 23 | onBrokenMarkdownLinks: 'throw', 24 | trailingSlash: true, 25 | favicon: 'img/balance_logo/icon.png', 26 | organizationName: 'facebook', 27 | projectName: 'balance', 28 | 29 | presets: [ 30 | [ 31 | require.resolve('docusaurus-plugin-internaldocs-fb/docusaurus-preset'), 32 | /** @type {import('docusaurus-plugin-internaldocs-fb').PresetOptions} */ 33 | ({ 34 | docs: { 35 | sidebarPath: require.resolve('./sidebars.js'), 36 | editUrl: fbContent({ 37 | internal: 'https://www.internalfb.com/intern/diffusion/FBS/browse/master/fbcode/core_stats/balance/parent_balance/website', 38 | external: 'https://github.com/facebookresearch/balance/tree/main/website', 39 | }), 40 | remarkPlugins: [math], 41 | rehypePlugins: [katex], 42 | }, 43 | theme: { 44 | customCss: require.resolve('./src/css/custom.css'), 45 | }, 46 | staticDocsProject: 'Balance', 47 | trackingFile: 'fbcode/core_stats/balance/WATCHED_FILES', 48 | 'remark-code-snippets': { 49 | baseDir: '..', 50 | }, 51 | enableEditor: true, 
52 | }), 53 | ], 54 | ], 55 | 56 | stylesheets: [ 57 | { 58 | href: 'https://cdn.jsdelivr.net/npm/katex@0.13.24/dist/katex.min.css', 59 | type: 'text/css', 60 | integrity: 61 | 'sha384-odtC+0UGzzFL/6PNoE8rX/SPcQDXBJ+uRepguP4QkPCm2LBxH3FA3y+fKSiJ+AmM', 62 | crossorigin: 'anonymous', 63 | }, 64 | ], 65 | 66 | themeConfig: 67 | /** @type {import('@docusaurus/preset-classic').ThemeConfig} */ 68 | ({ 69 | navbar: { 70 | title: 'balance', 71 | logo: { 72 | alt: 'balance Logo', 73 | src: 'img/balance_logo/icon.svg', 74 | }, 75 | items: [ 76 | {to: 'blog', label: 'Blog', position: 'right'}, 77 | { 78 | type: 'doc', 79 | docId: 'docs/overview', 80 | position: 'right', 81 | label: 'Docs', 82 | }, 83 | { 84 | type: 'doc', 85 | docId: 'tutorials/index', 86 | position: 'right', 87 | label: 'Tutorials', 88 | }, 89 | { 90 | type: 'doc', 91 | docId: 'api_reference/index', 92 | position: 'right', 93 | label: 'API Reference', 94 | }, 95 | // Please keep GitHub link to the right for consistency. 96 | { 97 | href: 'https://github.com/facebookresearch/balance', 98 | label: 'GitHub', 99 | position: 'right', 100 | }, 101 | ], 102 | }, 103 | footer: { 104 | style: 'dark', 105 | links: [ 106 | { 107 | title: 'Legal', 108 | // Please do not remove the privacy and terms, it's a legal requirement. 109 | items: [ 110 | { 111 | label: 'Privacy', 112 | href: 'https://opensource.fb.com/legal/privacy/', 113 | }, 114 | { 115 | label: 'Terms', 116 | href: 'https://opensource.fb.com/legal/terms/', 117 | }, 118 | { 119 | label: 'Data Policy', 120 | href: 'https://opensource.fb.com/legal/data-policy/', 121 | }, 122 | { 123 | label: 'Cookie Policy', 124 | href: 'https://opensource.fb.com/legal/cookie-policy/', 125 | }, 126 | ], 127 | }, 128 | ], 129 | logo: { 130 | alt: 'Meta Open Source Logo', 131 | // This default includes a positive & negative version, allowing for 132 | // appropriate use depending on your site's style. 
133 | src: '/img/meta_opensource_logo_negative.svg', 134 | href: 'https://opensource.fb.com', 135 | }, 136 | copyright: ` 137 | Copyright © ${new Date().getFullYear()} Meta Platforms, Inc. Built with Docusaurus.
138 | Documentation Content Licensed Under CC-BY-4.0.
` 139 | }, 140 | prism: { 141 | theme: lightCodeTheme, 142 | darkTheme: darkCodeTheme, 143 | }, 144 | colorMode: { 145 | disableSwitch: true, 146 | }, 147 | }), 148 | }); 149 | -------------------------------------------------------------------------------- /website/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "staticdocs-starter", 3 | "version": "0.0.0", 4 | "private": true, 5 | "scripts": { 6 | "docusaurus": "docusaurus", 7 | "start": "docusaurus start", 8 | "build": "docusaurus build", 9 | "swizzle": "docusaurus swizzle", 10 | "deploy": "docusaurus deploy", 11 | "clear": "docusaurus clear", 12 | "clean": "docusaurus clear", 13 | "serve": "docusaurus serve", 14 | "write-translations": "docusaurus write-translations", 15 | "write-heading-ids": "docusaurus write-heading-ids" 16 | }, 17 | "dependencies": { 18 | "@docusaurus/core": "2.4.3", 19 | "@docusaurus/preset-classic": "2.4.3", 20 | "@mdx-js/react": "^1.6.21", 21 | "clsx": "^1.1.1", 22 | "docusaurus-plugin-internaldocs-fb": "1.8.0", 23 | "hast-util-is-element": "1.1.0", 24 | "prism-react-renderer": "^1.3.3", 25 | "react": "^17.0.2", 26 | "react-dom": "^17.0.2", 27 | "rehype-katex": "5", 28 | "remark-math": "3" 29 | }, 30 | "browserslist": { 31 | "production": [ 32 | ">0.5%", 33 | "not dead", 34 | "not op_mini all" 35 | ], 36 | "development": [ 37 | "last 1 chrome version", 38 | "last 1 firefox version", 39 | "last 1 safari version" 40 | ] 41 | }, 42 | "engines": { 43 | "node": ">=16", 44 | "npm": "use yarn instead", 45 | "yarn": "^1.5" 46 | }, 47 | "resolutions": { 48 | "node-fetch": "^2.6.7", 49 | "nth-check": "^2.0.1", 50 | "ansi-regex": "^5.0.1", 51 | "follow-redirects": "^1.14.8", 52 | "minimist": "^1.2.6", 53 | "minimatch": "^3.0.5", 54 | "d3-color": "^3.1.0", 55 | "cross-fetch": "^3.1.5", 56 | "terser": "^5.14.2", 57 | "trim": "0.0.3", 58 | "got": "^11.8.5", 59 | "eta": "^2.0.0", 60 | "ua-parser-js": "^0.7.33", 61 | "katex": ">=0.16.21", 
62 | "dompurify": ">=3.2.4", 63 | "prismjs": ">=0.30.0", 64 | "@babel/helpers": ">=7.26.10", 65 | "@babel/runtime": ">=7.26.10", 66 | "@babel/runtime-corejs3": ">=7.26.10" 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /website/sidebars.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * 4 | * This software may be used and distributed according to the terms of the 5 | * GNU General Public License version 2. 6 | */ 7 | 8 | /** 9 | * Creating a sidebar enables you to: 10 | - create an ordered group of docs 11 | - render a sidebar for each doc of that group 12 | - provide next/previous navigation 13 | 14 | The sidebars can be generated from the filesystem, or explicitly defined here. 15 | 16 | Create as many sidebars as you want. 17 | */ 18 | 19 | 20 | /* 21 | TODO: change to Manual configuration, which is more recommended: 22 | https://staticdocs.internalfb.com/staticdocs/docs/documenting/create-a-page/#manual-configuration---recommended 23 | */ 24 | module.exports = { 25 | docsSidebar: [ 26 | { 27 | type: 'autogenerated', 28 | dirName: 'docs', 29 | }, 30 | ], 31 | tutorialsSidebar: [ 32 | { 33 | type: 'autogenerated', 34 | dirName: 'tutorials', 35 | }, 36 | ] 37 | }; 38 | -------------------------------------------------------------------------------- /website/src/components/HTMLLoader.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) Meta Platforms, Inc. and affiliates. 3 | * 4 | * This software may be used and distributed according to the terms of the 5 | * GNU General Public License version 2. 
6 | */ 7 | 8 | import React, { useRef } from 'react'; 9 | import BrowserOnly from '@docusaurus/BrowserOnly'; 10 | 11 | export default function HTMLLoader(props) { 12 | let src = props.docFile; 13 | const resize = (frame) => { 14 | const doc = frame.contentWindow.document; 15 | frame.height = frame.contentWindow.document.body.scrollHeight + 'px'; 16 | }; 17 | const onLoad = (e) => { 18 | const frame = e.target; 19 | const doc = frame.contentWindow.document; 20 | const observer = new MutationObserver((list, obj) => { resize(frame); }); 21 | observer.observe(doc.body, {attributes:true, childList:true, subtree: true}); 22 | resize(frame); 23 | }; 24 | const frameRef = useRef(null) 25 | const f =