├── .DS_Store ├── .drone.yml ├── .github ├── .DS_Store └── workflows │ ├── image-pub.yml.bak │ └── release.yml.bak ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── config └── config.yaml ├── docker ├── Dockerfile ├── Dockerfile-arm ├── components_version.txt ├── entrypoint-arm.sh ├── entrypoint.sh ├── prepare.sh ├── release.sh ├── run.sh ├── runtime.Dockerfile └── wkhtmltox_0.12.6-1.focal_amd64.deb ├── docs └── dev.md ├── examples └── docs │ ├── 1. 股东会-会议通知.docx │ ├── maoxuan_full.pdf │ ├── maoxuan_intro_with_table.jpg │ ├── maoxuan_sample.docx │ ├── maoxuan_sample1.jpg │ ├── maoxuan_scan.pdf │ ├── maoxuan_v1.pdf │ ├── maoxuan_volumn_v1.txt │ ├── maoxuan_wikipedia.html │ ├── sw-flp-1965-v1.pdf │ ├── table_test_001.jpg │ ├── table_x1.tsv │ ├── table_x2.csv │ ├── test.md │ ├── test.xlsx │ ├── tests-example.xls │ ├── tests-example.xlsx │ ├── 毛泽东课件.ppt │ └── 静静的顿河-wiki.html ├── pyproject.toml ├── requirements.txt ├── scripts ├── install_latex.sh └── run.sh ├── setup.py ├── src └── bisheng_unstructured │ ├── __init__.py │ ├── __version__.py │ ├── api │ ├── __init__.py │ ├── any2pdf.py │ ├── main.py │ ├── pipeline.py │ └── types.py │ ├── cleaners │ ├── __init__.py │ ├── core.py │ ├── extract.py │ └── translate.py │ ├── common │ ├── __init__.py │ ├── logger.py │ └── timer.py │ ├── config │ ├── __init__.py │ ├── config.yaml │ └── settings.py │ ├── documents │ ├── __init__.py │ ├── base.py │ ├── coordinates.py │ ├── elements.py │ ├── email_elements.py │ ├── html.py │ ├── html_utils.py │ ├── layout.py │ ├── markdown.py │ ├── pdf_parser │ │ ├── __init__.py │ │ ├── blob.py │ │ ├── idp │ │ │ ├── __init__.py │ │ │ ├── image.py │ │ │ └── pdf.py │ │ ├── image.py │ │ ├── pdf.py │ │ ├── pdf_creator.py │ │ └── test_pdf.py │ └── xml.py │ ├── file_utils │ ├── __init__.py │ ├── encoding.py │ ├── exploration.py │ ├── file_conversion.py │ ├── filetype.py │ ├── google_filetype.py │ └── metadata.py │ ├── middlewares │ ├── __init__.py │ └── http_middleware.py │ ├── models │ ├── __init__.py │ ├── common.py │ ├── formula_agent.py │ ├── idp │ │ ├── __init__.py │ │ ├── config.py │ │ ├── dummy_ocr_agent.py │ │ ├── layout_agent.py │ │ ├── ocr_agent.py │ │ └── table_agent.py │ ├── layout_agent.py │ ├── ocr_agent.py │ ├── readme.md │ └── table_agent.py │ ├── nlp │ ├── __init__.py │ ├── english-words.txt │ ├── english_words.py │ ├── partition.py │ ├── patterns.py │ └── tokenize.py │ ├── partition │ ├── __init__.py │ ├── api.py │ ├── auto.py │ ├── common.py │ ├── csv.py │ ├── doc.py │ ├── docx.py │ ├── email.py │ ├── epub.py │ ├── html.py │ ├── image.py │ ├── json.py │ ├── md.py │ ├── msg.py │ ├── odt.py │ ├── org.py │ ├── pdf.py │ ├── ppt.py │ ├── pptx.py │ ├── rst.py │ ├── rtf.py │ ├── strategies.py │ ├── text.py │ ├── text_type.py │ ├── tsv.py │ ├── xls.py │ ├── xlsx.py │ └── xml.py │ ├── staging │ ├── __init__.py │ ├── argilla.py │ ├── base.py │ ├── baseplate.py │ ├── datasaur.py │ ├── huggingface.py │ ├── label_box.py │ ├── label_studio.py │ ├── prodigy.py │ └── weaviate.py │ ├── topdf │ ├── __init__.py │ ├── docx2pdf.py │ ├── excel2pdf.py │ ├── pptx2pdf.py │ └── text2pdf.py │ └── utils.py ├── tests ├── regression │ ├── deploy.sh │ ├── deploy_model.json │ ├── run_test_api.sh │ ├── test_config_update.py │ ├── test_container.sh │ └── test_etl4llm.py ├── run_test.sh ├── test_client.py ├── test_cmd_with_blank_char.py ├── test_doc.py ├── test_docx2pdf.py ├── test_docx_layout.py ├── test_docx_template_replace.py ├── test_excel2pdf.py ├── test_formula.py ├── test_idp_models_sdk.py ├── test_image.py ├── test_partition.py ├── test_partition_image.py ├── test_pdf.py ├── test_pdf_creator.py ├── test_pdf_parse_idp.py ├── test_pdf_parser.py ├── test_pipeline.py ├── test_pptx.py ├── test_pptx2pdf.py ├── test_pptx_layout.py ├── test_pptx_to_tex.py ├── test_text2pdf.py └── test_xlsx.py ├── unstructured.LICENSE └── version.txt /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/.DS_Store -------------------------------------------------------------------------------- /.drone.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/.drone.yml -------------------------------------------------------------------------------- /.github/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/.github/.DS_Store -------------------------------------------------------------------------------- /.github/workflows/image-pub.yml.bak: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/.github/workflows/image-pub.yml.bak -------------------------------------------------------------------------------- /.github/workflows/release.yml.bak: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/.github/workflows/release.yml.bak -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/.gitignore -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/.pre-commit-config.yaml -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/README.md -------------------------------------------------------------------------------- /config/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/config/config.yaml -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/docker/Dockerfile -------------------------------------------------------------------------------- /docker/Dockerfile-arm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/docker/Dockerfile-arm -------------------------------------------------------------------------------- /docker/components_version.txt: -------------------------------------------------------------------------------- 1 | bisheng-unstructured=0.0.3.post4 2 | -------------------------------------------------------------------------------- /docker/entrypoint-arm.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/docker/entrypoint-arm.sh -------------------------------------------------------------------------------- /docker/entrypoint.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/docker/entrypoint.sh -------------------------------------------------------------------------------- /docker/prepare.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/docker/prepare.sh -------------------------------------------------------------------------------- /docker/release.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/docker/release.sh -------------------------------------------------------------------------------- /docker/run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/docker/run.sh -------------------------------------------------------------------------------- /docker/runtime.Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/docker/runtime.Dockerfile -------------------------------------------------------------------------------- /docker/wkhtmltox_0.12.6-1.focal_amd64.deb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/docker/wkhtmltox_0.12.6-1.focal_amd64.deb -------------------------------------------------------------------------------- /docs/dev.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 开发基线: e4aa7373e2d9011e96525e0c122a6668e5eb449b (unstructured) 4 | -------------------------------------------------------------------------------- /examples/docs/1. 股东会-会议通知.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/examples/docs/1. 股东会-会议通知.docx -------------------------------------------------------------------------------- /examples/docs/maoxuan_full.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/examples/docs/maoxuan_full.pdf -------------------------------------------------------------------------------- /examples/docs/maoxuan_intro_with_table.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/examples/docs/maoxuan_intro_with_table.jpg -------------------------------------------------------------------------------- /examples/docs/maoxuan_sample.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/examples/docs/maoxuan_sample.docx -------------------------------------------------------------------------------- /examples/docs/maoxuan_sample1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/examples/docs/maoxuan_sample1.jpg -------------------------------------------------------------------------------- /examples/docs/maoxuan_scan.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/examples/docs/maoxuan_scan.pdf -------------------------------------------------------------------------------- /examples/docs/maoxuan_v1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/examples/docs/maoxuan_v1.pdf -------------------------------------------------------------------------------- /examples/docs/maoxuan_volumn_v1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/examples/docs/maoxuan_volumn_v1.txt -------------------------------------------------------------------------------- /examples/docs/maoxuan_wikipedia.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/examples/docs/maoxuan_wikipedia.html -------------------------------------------------------------------------------- /examples/docs/sw-flp-1965-v1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/examples/docs/sw-flp-1965-v1.pdf -------------------------------------------------------------------------------- /examples/docs/table_test_001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/examples/docs/table_test_001.jpg -------------------------------------------------------------------------------- /examples/docs/table_x1.tsv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/examples/docs/table_x1.tsv -------------------------------------------------------------------------------- /examples/docs/table_x2.csv: -------------------------------------------------------------------------------- 1 | a,b,c 2 | 1,2,3 3 | 4,5,6 4 | -------------------------------------------------------------------------------- /examples/docs/test.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/examples/docs/test.md -------------------------------------------------------------------------------- /examples/docs/test.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/examples/docs/test.xlsx -------------------------------------------------------------------------------- /examples/docs/tests-example.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/examples/docs/tests-example.xls -------------------------------------------------------------------------------- /examples/docs/tests-example.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/examples/docs/tests-example.xlsx -------------------------------------------------------------------------------- /examples/docs/毛泽东课件.ppt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/examples/docs/毛泽东课件.ppt -------------------------------------------------------------------------------- /examples/docs/静静的顿河-wiki.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/examples/docs/静静的顿河-wiki.html -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/pyproject.toml -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/requirements.txt -------------------------------------------------------------------------------- /scripts/install_latex.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/scripts/install_latex.sh -------------------------------------------------------------------------------- /scripts/run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/scripts/run.sh -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/setup.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/bisheng_unstructured/__version__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.1" # pragma: no cover 2 | -------------------------------------------------------------------------------- /src/bisheng_unstructured/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/bisheng_unstructured/api/any2pdf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/api/any2pdf.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/api/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/api/main.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/api/pipeline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/api/pipeline.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/api/types.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/api/types.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/cleaners/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/bisheng_unstructured/cleaners/core.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/cleaners/core.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/cleaners/extract.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/cleaners/extract.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/cleaners/translate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/cleaners/translate.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/common/__init__.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/common/logger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/common/logger.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/common/timer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/common/timer.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/config/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/bisheng_unstructured/config/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/config/config.yaml -------------------------------------------------------------------------------- /src/bisheng_unstructured/config/settings.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/config/settings.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/documents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/bisheng_unstructured/documents/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/documents/base.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/documents/coordinates.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/documents/coordinates.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/documents/elements.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/documents/elements.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/documents/email_elements.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/documents/email_elements.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/documents/html.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/documents/html.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/documents/html_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/documents/html_utils.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/documents/layout.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/documents/layout.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/documents/markdown.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/documents/markdown.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/documents/pdf_parser/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/bisheng_unstructured/documents/pdf_parser/blob.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/documents/pdf_parser/blob.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/documents/pdf_parser/idp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/bisheng_unstructured/documents/pdf_parser/idp/image.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/documents/pdf_parser/idp/image.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/documents/pdf_parser/idp/pdf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/documents/pdf_parser/idp/pdf.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/documents/pdf_parser/image.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/documents/pdf_parser/image.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/documents/pdf_parser/pdf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/documents/pdf_parser/pdf.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/documents/pdf_parser/pdf_creator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/documents/pdf_parser/pdf_creator.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/documents/pdf_parser/test_pdf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/documents/pdf_parser/test_pdf.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/documents/xml.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/documents/xml.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/file_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/bisheng_unstructured/file_utils/encoding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/file_utils/encoding.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/file_utils/exploration.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/file_utils/exploration.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/file_utils/file_conversion.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/file_utils/file_conversion.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/file_utils/filetype.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/file_utils/filetype.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/file_utils/google_filetype.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/file_utils/google_filetype.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/file_utils/metadata.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/file_utils/metadata.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/middlewares/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/bisheng_unstructured/middlewares/http_middleware.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/middlewares/http_middleware.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/models/__init__.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/models/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/models/common.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/models/formula_agent.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/models/formula_agent.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/models/idp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/bisheng_unstructured/models/idp/config.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/bisheng_unstructured/models/idp/dummy_ocr_agent.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/models/idp/dummy_ocr_agent.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/models/idp/layout_agent.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/models/idp/layout_agent.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/models/idp/ocr_agent.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/models/idp/ocr_agent.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/models/idp/table_agent.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/models/idp/table_agent.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/models/layout_agent.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/models/layout_agent.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/models/ocr_agent.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/models/ocr_agent.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/models/readme.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/models/readme.md -------------------------------------------------------------------------------- /src/bisheng_unstructured/models/table_agent.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/models/table_agent.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/nlp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/bisheng_unstructured/nlp/english-words.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/nlp/english-words.txt -------------------------------------------------------------------------------- /src/bisheng_unstructured/nlp/english_words.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/nlp/english_words.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/nlp/partition.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/nlp/partition.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/nlp/patterns.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/nlp/patterns.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/nlp/tokenize.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/nlp/tokenize.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/api.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/api.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/auto.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/auto.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/common.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/csv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/csv.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/doc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/doc.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/docx.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/docx.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/email.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/email.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/epub.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/epub.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/html.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/html.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/image.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/image.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/json.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/json.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/md.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/md.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/msg.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/msg.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/odt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/odt.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/org.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/org.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/pdf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/pdf.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/ppt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/ppt.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/pptx.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/pptx.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/rst.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/rst.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/rtf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/rtf.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/strategies.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/strategies.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/text.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/text.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/text_type.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/text_type.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/tsv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/tsv.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/xls.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/xls.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/xlsx.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/xlsx.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/partition/xml.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/partition/xml.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/staging/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/bisheng_unstructured/staging/argilla.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/staging/argilla.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/staging/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/staging/base.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/staging/baseplate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/staging/baseplate.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/staging/datasaur.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/staging/datasaur.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/staging/huggingface.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/staging/huggingface.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/staging/label_box.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/staging/label_box.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/staging/label_studio.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/staging/label_studio.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/staging/prodigy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/staging/prodigy.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/staging/weaviate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/staging/weaviate.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/topdf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/topdf/__init__.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/topdf/docx2pdf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/topdf/docx2pdf.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/topdf/excel2pdf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/topdf/excel2pdf.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/topdf/pptx2pdf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/topdf/pptx2pdf.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/topdf/text2pdf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/topdf/text2pdf.py -------------------------------------------------------------------------------- /src/bisheng_unstructured/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/src/bisheng_unstructured/utils.py -------------------------------------------------------------------------------- /tests/regression/deploy.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/regression/deploy.sh -------------------------------------------------------------------------------- /tests/regression/deploy_model.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/regression/deploy_model.json -------------------------------------------------------------------------------- /tests/regression/run_test_api.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/regression/run_test_api.sh -------------------------------------------------------------------------------- /tests/regression/test_config_update.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/regression/test_config_update.py -------------------------------------------------------------------------------- /tests/regression/test_container.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/regression/test_container.sh -------------------------------------------------------------------------------- /tests/regression/test_etl4llm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/regression/test_etl4llm.py -------------------------------------------------------------------------------- /tests/run_test.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/run_test.sh -------------------------------------------------------------------------------- /tests/test_client.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/test_client.py -------------------------------------------------------------------------------- /tests/test_cmd_with_blank_char.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/test_cmd_with_blank_char.py -------------------------------------------------------------------------------- /tests/test_doc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/test_doc.py -------------------------------------------------------------------------------- /tests/test_docx2pdf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/test_docx2pdf.py -------------------------------------------------------------------------------- /tests/test_docx_layout.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/test_docx_layout.py -------------------------------------------------------------------------------- /tests/test_docx_template_replace.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/test_docx_template_replace.py -------------------------------------------------------------------------------- /tests/test_excel2pdf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/test_excel2pdf.py -------------------------------------------------------------------------------- /tests/test_formula.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/test_formula.py -------------------------------------------------------------------------------- /tests/test_idp_models_sdk.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/test_idp_models_sdk.py -------------------------------------------------------------------------------- /tests/test_image.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/test_image.py -------------------------------------------------------------------------------- /tests/test_partition.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/test_partition.py -------------------------------------------------------------------------------- /tests/test_partition_image.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/test_partition_image.py -------------------------------------------------------------------------------- /tests/test_pdf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/test_pdf.py -------------------------------------------------------------------------------- /tests/test_pdf_creator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/test_pdf_creator.py -------------------------------------------------------------------------------- /tests/test_pdf_parse_idp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/test_pdf_parse_idp.py -------------------------------------------------------------------------------- /tests/test_pdf_parser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/test_pdf_parser.py -------------------------------------------------------------------------------- /tests/test_pipeline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/test_pipeline.py -------------------------------------------------------------------------------- /tests/test_pptx.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/test_pptx.py -------------------------------------------------------------------------------- /tests/test_pptx2pdf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/test_pptx2pdf.py -------------------------------------------------------------------------------- /tests/test_pptx_layout.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/test_pptx_layout.py -------------------------------------------------------------------------------- /tests/test_pptx_to_tex.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/test_pptx_to_tex.py -------------------------------------------------------------------------------- /tests/test_text2pdf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/test_text2pdf.py -------------------------------------------------------------------------------- /tests/test_xlsx.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/tests/test_xlsx.py -------------------------------------------------------------------------------- /unstructured.LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataelement/bisheng-unstructured/HEAD/unstructured.LICENSE -------------------------------------------------------------------------------- /version.txt: -------------------------------------------------------------------------------- 1 | v0.0.2 2 | --------------------------------------------------------------------------------