├── .dockerignore ├── data ├── docs │ ├── adv.gif │ ├── input_dino.gif │ ├── pipeline_overview.png │ └── bundlesdf_pipeline.png ├── samples │ └── retail_item │ │ ├── left │ │ ├── left000000.png │ │ ├── left000001.png │ │ ├── left000002.png │ │ ├── left000003.png │ │ ├── left000004.png │ │ ├── left000005.png │ │ ├── left000006.png │ │ ├── left000007.png │ │ ├── left000008.png │ │ ├── left000009.png │ │ ├── left000010.png │ │ ├── left000011.png │ │ ├── left000012.png │ │ ├── left000013.png │ │ ├── left000014.png │ │ ├── left000015.png │ │ ├── left000016.png │ │ ├── left000017.png │ │ ├── left000018.png │ │ ├── left000019.png │ │ ├── left000020.png │ │ ├── left000021.png │ │ ├── left000022.png │ │ ├── left000023.png │ │ ├── left000024.png │ │ ├── left000025.png │ │ ├── left000026.png │ │ ├── left000027.png │ │ ├── left000028.png │ │ ├── left000029.png │ │ ├── left000030.png │ │ ├── left000031.png │ │ ├── left000032.png │ │ ├── left000033.png │ │ ├── left000034.png │ │ ├── left000035.png │ │ └── left000036.png │ │ └── right │ │ ├── right000000.png │ │ ├── right000001.png │ │ ├── right000002.png │ │ ├── right000003.png │ │ ├── right000004.png │ │ ├── right000005.png │ │ ├── right000006.png │ │ ├── right000007.png │ │ ├── right000008.png │ │ ├── right000009.png │ │ ├── right000010.png │ │ ├── right000011.png │ │ ├── right000012.png │ │ ├── right000013.png │ │ ├── right000014.png │ │ ├── right000015.png │ │ ├── right000016.png │ │ ├── right000017.png │ │ ├── right000018.png │ │ ├── right000019.png │ │ ├── right000020.png │ │ ├── right000021.png │ │ ├── right000022.png │ │ ├── right000023.png │ │ ├── right000024.png │ │ ├── right000025.png │ │ ├── right000026.png │ │ ├── right000027.png │ │ ├── right000028.png │ │ ├── right000029.png │ │ ├── right000030.png │ │ ├── right000031.png │ │ ├── right000032.png │ │ ├── right000033.png │ │ ├── right000034.png │ │ ├── right000035.png │ │ └── right000036.png ├── LICENSE └── configs │ └── base.yaml ├── src ├── nvidia │ ├── __init__.py │ └── objectreconstruction │ │ ├── cli │ │ ├── __init__.py │ │ └── main.py │ │ ├── dataloader │ │ ├── __init__.py │ │ └── reconstruction_dataloader.py │ │ ├── utils │ │ ├── __init__.py │ │ ├── structures.py │ │ └── preprocessing.py │ │ ├── configs │ │ ├── __init__.py │ │ └── schema.py │ │ ├── networks │ │ ├── __init__.py │ │ ├── foundationstereo.py │ │ └── sam2infer.py │ │ └── __init__.py ├── requirements.txt ├── setup.py ├── MANIFEST.in ├── README.md └── pyproject.toml ├── .gitignore ├── .github ├── PULL_REQUEST_TEMPLATE.md ├── ISSUE_TEMPLATE │ ├── config.yml │ ├── documentation_request_new.yml │ ├── documentation_request_correction.yml │ ├── bug_report_form.yml │ └── feature_request_form.yml └── CODEOWNERS ├── CITATION.md ├── SECURITY.md ├── deploy └── compose │ └── docker-compose.yml ├── CHANGELOG.md ├── print_env.sh ├── CONTRIBUTING.md ├── CODE_OF_CONDUCT.md ├── LICENSE ├── .gitattributes └── docker └── Dockerfile /.dockerignore: -------------------------------------------------------------------------------- 1 | data/output/ 2 | notebooks/.ipynb_checkpoints 3 | **/__pycache__/ -------------------------------------------------------------------------------- /data/docs/adv.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/3DObjectReconstruction/main/data/docs/adv.gif -------------------------------------------------------------------------------- /data/docs/input_dino.gif: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/3DObjectReconstruction/main/data/docs/input_dino.gif -------------------------------------------------------------------------------- /data/docs/pipeline_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/3DObjectReconstruction/main/data/docs/pipeline_overview.png -------------------------------------------------------------------------------- /data/docs/bundlesdf_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/3DObjectReconstruction/main/data/docs/bundlesdf_pipeline.png -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000000.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:68985599fe71f6dc055407e0f76f50e8bcb8408cc476a8f27be761efb5082fb9 3 | size 12328835 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000001.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2746f8871ff093274775fb4b79a65c6970509aa0b988e370d0bef3dced907581 3 | size 12645958 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000002.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a43e5f30816b9587665ad3d5804499934a19ffddfae1702c3595dc557277652b 3 | size 12901737 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000003.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d85d7ee84e8590359da375799f4ad4229a3a587165642556146e8ba01d556f17 3 | size 13136485 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000004.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:26877db244dc839d62740e409a0d2244a07835f6f9efb2a8c79f8d378b0619e7 3 | size 13373248 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000005.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:877451413167d59ecf065f02cf1c4d69afcf544468598ace4471e3d2522838bb 3 | size 13383554 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000006.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:aea8389834d151ec74cda682754d09043a0780f96f19a4235d1e55cac546e994 3 | size 13373851 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000007.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid 
sha256:72cc382a208a6b5ce834d98a38dc1291aa9588c982779d8c3d1516ed107c7520 3 | size 13500831 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000008.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:7ae3e9d6fa0ab9b606aa875ad4c9633d0eb8edb5de7fab820ca79fca44f11de8 3 | size 13588682 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000009.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ad87e70fd50b80c48de1fffa805b389388e7f5bb077616ef99e34d4c6a3844b5 3 | size 13536425 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000010.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d2de80254f165ba9af319dfc38f21a4e098a50f32dc04367564086ae85061cfe 3 | size 13417513 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000011.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:1e2e711be062493a5ebaa8651757aac47a6c0c7af168f0116082004991994afe 3 | size 13267914 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000012.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:33f0771c52d544bef69aab4fe8e93a9592daa77e4fa29f5771ee425d7f8d3950 3 | size 13330359 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000013.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ad8ad9a26c234da03e2da8ec6224578d38e339a239f9da83b8997b5122836132 3 | size 13444945 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000014.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c01f5c411eb24750a924f402f4aef014c350b44424dc0d4123820942e36cd8ad 3 | size 13884438 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000015.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f1a51b9a266bb1229986cf90da9a6a33d679519146c94cfb43bbe650b1c4ef4b 3 | size 14303246 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000016.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c656c108c1c7a5a4d7723c10330b521c827fff56a2f207d9c9298114b401c689 3 | size 14432480 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000017.png: -------------------------------------------------------------------------------- 1 | version 
https://git-lfs.github.com/spec/v1 2 | oid sha256:fd40e0d79962dc9aea79a9b83b99081ff55fc7c4b1587b2958e138d7cf085cff 3 | size 14454634 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000018.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:789e3866539b30b044cb6a20c8d62b07e2334888e0189df546a1338e95acaa41 3 | size 14286318 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000019.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:705593bccb0eedb2adfbe9f4e96b9eeba1eea8783a71853c0dd7fb728a6f7f0e 3 | size 14215593 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000020.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:767e3ae666f6112ba9cba2b377d46f81d3c451b6436c721835d993ca943e6233 3 | size 14076802 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000021.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:616c3f25b6b61a7c0982c81bbf68a2e2ca3867a227abfb5df5d16cb5532164c9 3 | size 13681923 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000022.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:48422c0d80fa448d72f15cfd037750a076f5e55c4fe10189858c06602ee5a0e6 3 | size 13125454 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000023.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:66ac600ac985e642604eec8aad827caebc13ac92862323db8320d1832bf8ee5c 3 | size 12530119 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000024.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:fb405139a307425f147cd1cb0ee99a0c744b9e9f1e1d53d1fccffac058bb289b 3 | size 12430620 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000025.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:11b526b859367b6959284bc7053e3f2be16a0fe6b95b9a4292d8b48fbc08152f 3 | size 12214426 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000026.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:baa326185701e7220667e70e60324a8fdfc4f06ca05474bdab459dba30742610 3 | size 12310511 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000027.png: 
-------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b93891800b183c4afd3c91f8eee2c88342cdfd04b26b05236a335cfc3fcd07af 3 | size 12370352 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000028.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b39383c3e0b983c9e916b2bd4cfb61d1982fd5fbbf8e14b0beaf866efabfa3ca 3 | size 12429603 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000029.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:105062f742fe427fd22e5ef3b8d02f133e4dacdb18706f5e085ec67c32a8b292 3 | size 12301322 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000030.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2936bf6a7dc69abfe383ab0ec81cdb76f88f36554b27ca43b85c46d215c8ad54 3 | size 12403034 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000031.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b2b3be7ab9fa5e692cb169d93e4e1be41b5e339fcb769061711a81dc08f55512 3 | size 12329273 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000032.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:fec230cc65ddd61e4352ae6a1d915b239d2e62a7b7844e46e3359fa881f42185 3 | size 12209108 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000033.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e93f0e786c257f98f092949c11517f2be585168b15877bf5b36bc1b4a2095293 3 | size 12484521 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000034.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:07d89d7d879b350e827643a0ae253f103c3841597954795d871a96c9031dd49f 3 | size 12734645 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000035.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:dd29e837c3664469da918eb78cf69b07c9297b29583a5247c1bc7f4e32155813 3 | size 12645619 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000036.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:926ee1c89388dc7663333fa5f0ecea6beb2742de7a6ee6e50c1af13d2be261a3 3 | size 12604569 4 | -------------------------------------------------------------------------------- 
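Note: the sample frames above are stored as Git LFS pointer files (the `version` / `oid sha256:...` / `size` stanzas), not as raw PNG data. A minimal sketch of fetching the real images, assuming a standard Git LFS setup with the `git lfs` CLI installed:

```bash
# One-time setup: register the Git LFS filters for your user
git lfs install

# Cloning normally replaces LFS pointer files with the real PNGs at checkout
git clone https://github.com/NVIDIA/3DObjectReconstruction.git
cd 3DObjectReconstruction

# If LFS smudging was skipped (e.g., GIT_LFS_SKIP_SMUDGE=1), fetch just the sample frames
git lfs pull --include="data/samples/retail_item/**"
```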
/data/samples/retail_item/right/right000000.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8e4034609bf574176f920ac4ad20a3f14b87a340e1d7e2c000cabc4a0e7b440e 3 | size 12307906 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000001.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:88115a8a54b5ed07d3fdd96fe9a5a2c875ce99598618279375ac8cd8bf5c4c26 3 | size 12293559 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000002.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:131d4d4168cbd1077f8073ef09342999020414bbd11d7aadd5143c89bd6e7ced 3 | size 12564149 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000003.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:1d45a197af4133dd8269969c27330d6b59f9acd8ccd5904de3c1196337aea043 3 | size 12808280 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000004.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5b5ef4edae33b0c08bd002aa204c12db6814aaa9d5bb20e88f42a30637fad615 3 | size 13056049 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000005.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:23c61ba9d90eb651d4fb95399cf125504fbc8a21bfb9942d002df5b673d4aa62 3 | size 13244619 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000006.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8b21d5a1ac30c374066502bc8a1bcc2639d611401ea3cfb611e7cd7994ec982e 3 | size 13229861 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000007.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5c76813368be67e309a1d1e1f4493ce7e01c060ba234369328c7e8a96dd7c5fd 3 | size 13204653 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000008.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b1bd3edcfc08202dc61265787eeb34b9174f10fb340043dcbc419149e8c8f577 3 | size 13342051 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000009.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:08e8cf210cf291e76191fde7047ef62cc964fd66a0b2c15334e0e076fb503496 3 | size 13409813 4 | 
-------------------------------------------------------------------------------- /data/samples/retail_item/right/right000010.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:eea81615e6646aca2a36f62113437e1fb7ac0d3fc871e66fa6f3253c00edb749 3 | size 13390113 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000011.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2748c4290add290351ce5c2b9fa5c268b4c1239962bcc49d2a5c783982307bfe 3 | size 13248047 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000012.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:efc8c30f58a8d7db6e5c872e50c742141e89c7b830152a5b3fdded25ce03898d 3 | size 13016947 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000013.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d13a128dbff945de945b13924004da5724d4eca458a871838c657e65c8fe8196 3 | size 13065264 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000014.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5956cbe858c3c1c1f450b20bc60afb5778628aa99cc2929fb98386cdaa86baf5 3 | size 13213599 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000015.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e2f8a63ace3b74ed519cd53bcfd62b85b6826dfa6113e56da26c16a70a7f6d38 3 | size 13774114 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000016.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:498abff2f00065db5ca824f24196743ee0a7cfe87f6ede7270b51515d5ce9eeb 3 | size 14068457 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000017.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ea77d130df3b02cd4ef4e26b5ccb94f54c8f7921366e9c42494fe48ca64612a5 3 | size 14304233 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000018.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:eec46fb29932d53824c28aa82fe4a0aeaf721991b4350db963aa3b61de4596af 3 | size 14118226 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000019.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid 
sha256:3d0effe946906a30827ed6567b4e5d32a37a9e019209c33869f1f9d0033e957c 3 | size 14019921 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000020.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:26c5617ee88fcdbacd766ef3037441acf85ea8203757a952f0611104ecf985d4 3 | size 13940315 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000021.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6158ba1c18170380d50bbab089982ab75e8cd33004b0cca61700d63dd3f2acd0 3 | size 13685382 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000022.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:62f101d0869faa5aa570ea49fed99ea847d611d2d0ed2da1a1b3584f6b411978 3 | size 13263611 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000023.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:710cb7e20dcd157b32f77e23a74c0cfee4cfe4d148d4f235fd0b0e420e9f6855 3 | size 12688863 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000024.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b18d5b4d56355da128da4e9cbd96adecf5e06d721c29ffa50bb60ea04d83c4fb 3 | size 12231639 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000025.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:549eb2a5197e45557ae18d2b1e9a7212681815063bc304ec03c44171391f845f 3 | size 12055552 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000026.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:36acb14e260f58539d6424f35b51ddccda09f84421ce6ac66188c5a39c57aa7f 3 | size 12070464 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000027.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:4aaaa6eac9bca39177ce9a7922073e8d5c38d5aef252ad35d17f40cda6e07976 3 | size 12192296 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000028.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8a9298dbb57130763f03f95820031a66717b21ca64d55e8cb3385307eea519b5 3 | size 12229413 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000029.png: -------------------------------------------------------------------------------- 1 | 
version https://git-lfs.github.com/spec/v1 2 | oid sha256:38cb95acd5bf7c074ad6942cd74c02ef24016a541c8da63aaa268bf7f5fdb8ab 3 | size 12204798 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000030.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:21ddcf8e517c69ef1d3d9bf0d4de1171676793f8628e2cabbcaa1a83bc75ef30 3 | size 12209473 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000031.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d25dbf139f894ba6b37b6697fba8396c79fb6f1b3cf7a18c309ee1a81371e615 3 | size 12118436 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000032.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:7dc9825de6e885c105414dc4f960b1330cd6c96ed0ea280471c501cabf5dca19 3 | size 12043599 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000033.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:58e19246104c0fe4d99648159c37be0371a9b54da13fdcb14dd2701ed9e48dbd 3 | size 12375658 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000034.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d35b8de816e854a62ff25bb33e9a8e6838a73926b56d40477f520e6afbeb24a2 3 | size 12525838 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000035.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:48721abf0cf2cefb2a093b4cf830ad634ba3e210206a24994fa9366ccee360cf 3 | size 12429751 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000036.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a355d7ecc831546852bfb2400ccb9a1a8ef9c233be181e8b17e4649d4bcad6c1 3 | size 12356552 4 | -------------------------------------------------------------------------------- /src/nvidia/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | NVIDIA namespace package. 3 | 4 | This is a namespace package that allows multiple NVIDIA packages to coexist. 
5 | """ 6 | 7 | __path__ = __import__('pkgutil').extend_path(__path__, __name__) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/weights 2 | data/output 3 | datasets/ 4 | weights/ 5 | __pycache__/ 6 | .vscode/ 7 | **/*.pyc 8 | notebooks/.ipynb_checkpoints/ 9 | data/samples/test/ 10 | src/.ipynb_checkpoints/README-checkpoint.md 11 | src/test.py 12 | -------------------------------------------------------------------------------- /src/nvidia/objectreconstruction/cli/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Command-Line Interface for 3D Object Reconstruction. 3 | 4 | This module provides command-line tools for running the reconstruction pipeline 5 | and its individual components. 6 | """ 7 | 8 | from .main import main 9 | 10 | __all__ = [ 11 | 'main', 12 | ] -------------------------------------------------------------------------------- /src/requirements.txt: -------------------------------------------------------------------------------- 1 | ruamel_yaml 2 | pycuda==2025.1 3 | imageio 4 | numpy==1.26.4 5 | trimesh==4.6.1 6 | libigl==2.5.1 7 | iopath 8 | joblib==1.4.2 9 | scipy==1.15.1 10 | scikit-learn==1.6.1 11 | opencv-python==4.11.0.86 12 | python-multipart==0.0.20 13 | pytest 14 | omegaconf==2.3.0 15 | flash-attn==2.7.3 16 | xatlas==0.0.10 17 | transformations 18 | -------------------------------------------------------------------------------- /src/setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Setup script for nvidia-3d-object-reconstruction package. 3 | 4 | This file provides backward compatibility with older Python packaging tools. 5 | All configuration is now handled by pyproject.toml. 6 | """ 7 | 8 | from setuptools import setup 9 | 10 | # All configuration is in pyproject.toml 11 | # This file exists only for backward compatibility 12 | setup() -------------------------------------------------------------------------------- /src/nvidia/objectreconstruction/dataloader/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data Loading Components for 3D Object Reconstruction. 3 | 4 | This module provides data readers and transformations for various input formats 5 | used in the reconstruction pipeline. 6 | """ 7 | 8 | from .reconstruction_dataloader import ReconstructionDataLoader 9 | 10 | __all__ = [ 11 | 'ReconstructionDataLoader' 12 | ] -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | 4 | 5 | 6 | ## Checklist 7 | - [ ] I am familiar with the [Contributing Guidelines](https://github.com/NVIDIA/3DObjectReconstruction/blob/main/CONTRIBUTING.md). 8 | - [ ] New or existing tests using the default retail item example cover these changes. 9 | - [ ] The documentation is up to date with these changes. 
10 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | # GitHub info on config.yml 2 | # https://docs.github.com/en/communities/using-templates-to-encourage-useful-issues-and-pull-requests/configuring-issue-templates-for-your-repository#configuring-the-template-chooser 3 | # Set to 'false' if you only want the templates to be used. 4 | blank_issues_enabled: false 5 | 6 | # When using discussions instead of Question issue templates, 7 | # link that below to have it show up in the 'Submit Issue' page 8 | contact_links: 9 | - name: Report an issue 10 | url: https://github.com/NVIDIA/3DObjectReconstruction/issues/new/choose 11 | about: Please raise any issues here. 12 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | #package code owners 2 | src/ @3d-object-reconstruction-team/3d-object-reconstruction-codeowners 3 | 4 | #build/ops code owners 5 | .github/ @3d-object-reconstruction-team/3d-object-reconstruction-codeowners 6 | deploy/ @3d-object-reconstruction-team/3d-object-reconstruction-codeowners 7 | docker/ @3d-object-reconstruction-team/3d-object-reconstruction-codeowners 8 | 9 | # docs and examples code owners 10 | docs/ @3d-object-reconstruction-team/3d-object-reconstruction-codeowners 11 | notebooks/ @3d-object-reconstruction-team/3d-object-reconstruction-codeowners 12 | data/ @3d-object-reconstruction-team/3d-object-reconstruction-codeowners -------------------------------------------------------------------------------- /src/MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Include the package README 2 | include README.md 3 | 4 | # Include package configuration files 5 | recursive-include nvidia/objectreconstruction/configs *.yaml *.yml 6 | recursive-include nvidia/objectreconstruction/data *.txt 7 | 8 | # Include Python package files 9 | recursive-include nvidia *.py 10 | recursive-include nvidia *.pyi 11 | 12 | # Exclude compiled files and cache 13 | global-exclude *.pyc 14 | global-exclude *.pyo 15 | global-exclude *.so 16 | global-exclude __pycache__ 17 | global-exclude .git* 18 | global-exclude .DS_Store 19 | 20 | # Exclude development and build files 21 | exclude .gitignore 22 | exclude .pre-commit-config.yaml 23 | exclude .github 24 | exclude tox.ini 25 | exclude .coverage 26 | exclude .pytest_cache 27 | exclude build 28 | exclude dist 29 | exclude *.egg-info -------------------------------------------------------------------------------- /CITATION.md: -------------------------------------------------------------------------------- 1 | # Citation 2 | 3 | Please cite the following papers when using this workflow: 4 | 5 | **FoundationStereo**: 6 | ```bibtex 7 | @article{wen2025stereo, 8 | title={FoundationStereo: Zero-Shot Stereo Matching}, 9 | author={Bowen Wen and Matthew Trepte and Joseph Aribido and Jan Kautz and Orazio Gallo and Stan Birchfield}, 10 | journal={CVPR}, 11 | year={2025} 12 | } 13 | ``` 14 | 15 | **BundleSDF**: 16 | ```bibtex 17 | @InProceedings{bundlesdfwen2023, 18 | author = {Bowen Wen and Jonathan Tremblay and Valts Blukis and Stephen Tyree and Thomas M\"{u}ller and Alex Evans and Dieter Fox and Jan Kautz and Stan Birchfield}, 19 | title = {{BundleSDF}: {N}eural 6-{DoF} Tracking and {3D} Reconstruction of Unknown Objects}, 
20 | booktitle = {CVPR}, 21 | year = {2023}, 22 | } 23 | ``` 24 | -------------------------------------------------------------------------------- /src/nvidia/objectreconstruction/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility Functions for 3D Object Reconstruction. 3 | 4 | This module provides essential utility functions for data preprocessing, 5 | structure conversion, and I/O operations used throughout the reconstruction pipeline. 6 | """ 7 | 8 | from .preprocessing import ( 9 | load_config, 10 | setup_experiment_directory, 11 | process_video_frames, 12 | depth2xyzmap, 13 | toOpen3dCloud, 14 | read_video 15 | ) 16 | 17 | from .structures import dataclass_to_dict 18 | 19 | __all__ = [ 20 | # Preprocessing functions 21 | 'load_config', 22 | 'setup_experiment_directory', 23 | 'process_video_frames', 24 | 'depth2xyzmap', 25 | 'toOpen3dCloud', 26 | 'read_video', 27 | 28 | # Structure utilities 29 | 'dataclass_to_dict', 30 | ] -------------------------------------------------------------------------------- /src/nvidia/objectreconstruction/configs/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Configuration Management for 3D Object Reconstruction. 3 | 4 | This module provides configuration schemas, default values, and validation 5 | for all components of the reconstruction pipeline. 6 | """ 7 | 8 | from .schema import ( 9 | NVBundleSDFConfig, 10 | BundleTrackConfig, 11 | NeRFConfig, 12 | FoundationStereoConfig, 13 | SAM2Config, 14 | RoMaConfig, 15 | CameraConfig, 16 | TextureBakeConfig, 17 | SegmentationConfig, 18 | DepthProcessingConfig, 19 | BasePathConfig 20 | ) 21 | 22 | __all__ = [ 23 | # Main configuration 24 | 'NVBundleSDFConfig', 25 | 26 | # Component configurations 27 | 'BundleTrackConfig', 28 | 'NeRFConfig', 29 | 'FoundationStereoConfig', 30 | 'SAM2Config', 31 | 'RoMaConfig', 32 | 'CameraConfig', 33 | 'TextureBakeConfig', 34 | 'SegmentationConfig', 35 | 'DepthProcessingConfig', 36 | 'BasePathConfig', 37 | ] -------------------------------------------------------------------------------- /src/nvidia/objectreconstruction/utils/structures.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | def dataclass_to_dict(obj): 4 | """ 5 | Recursively convert a dataclass object and its nested attributes to a dictionary. 
6 | 7 | Args: 8 | obj: A dataclass object or any other Python object 9 | 10 | Returns: 11 | dict: A dictionary representation of the object with all nested objects converted 12 | """ 13 | if obj is None: 14 | return {} 15 | 16 | # Copy the object's attribute dict so the loop below doesn't mutate the original object 17 | if hasattr(obj, '__dict__'): 18 | result = dict(vars(obj)) 19 | else: 20 | return obj 21 | 22 | # Recursively convert nested objects 23 | for key, value in result.items(): 24 | if hasattr(value, '__dict__'): 25 | result[key] = dataclass_to_dict(value) 26 | elif isinstance(value, (list, tuple)): 27 | result[key] = [dataclass_to_dict(item) if hasattr(item, '__dict__') else item for item in value] 28 | elif isinstance(value, dict): 29 | result[key] = {k: dataclass_to_dict(v) if hasattr(v, '__dict__') else v for k, v in value.items()} 30 | elif isinstance(value, Path): 31 | result[key] = str(value) 32 | 33 | return result -------------------------------------------------------------------------------- /src/nvidia/objectreconstruction/networks/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Neural Network Models for 3D Object Reconstruction. 3 | 4 | This module contains all the neural network components used in the reconstruction pipeline: 5 | - FoundationStereoProcessor: Stereo depth estimation 6 | - NVBundleSDF: Main reconstruction pipeline combining BundleTrack and NeRF 7 | - Sam2Infer: SAM2-based object segmentation 8 | - FeatureMatchingInfer: RoMa-based feature matching 9 | - NerfRunner: Neural Radiance Field implementation 10 | - Tool utilities: Point cloud processing, mesh operations 11 | """ 12 | 13 | from .foundationstereo import FoundationStereoProcessor, FoundationStereoNet, run_depth_estimation 14 | from .nvbundlesdf import NVBundleSDF, vis_camera_poses 15 | from .sam2infer import Sam2Infer, run_mask_extraction 16 | from .roma import FeatureMatchingInfer 17 | from .nerf_runner import NerfRunner, ModelRendererOffscreen 18 | from .tool import ( 19 | PointCloudProcessor, 20 | MeshProcessor, 21 | TensorUtils, 22 | PoseUtils, 23 | compute_scene_bounds, 24 | set_seed 25 | ) 26 | 27 | __all__ = [ 28 | # Main pipeline 29 | 'NVBundleSDF', 30 | 31 | # Individual processors 32 | 'FoundationStereoProcessor', 33 | 'FoundationStereoNet', 34 | 'run_depth_estimation', 35 | 'Sam2Infer', 36 | 'run_mask_extraction', 37 | 'FeatureMatchingInfer', 38 | 'NerfRunner', 39 | 40 | # Utility classes 41 | 'PointCloudProcessor', 42 | 'MeshProcessor', 43 | 'TensorUtils', 44 | 'PoseUtils', 45 | 46 | # Utility functions 47 | 'compute_scene_bounds', 48 | 'set_seed', 49 | 'ModelRendererOffscreen', 50 | 'vis_camera_poses' 51 | ] -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | ## Security 2 | 3 | NVIDIA is dedicated to the security and trust of our software products and services, including all source code repositories managed through our organization. 4 | 5 | If you need to report a security issue, please use the appropriate contact points outlined below.
**Please do not report security vulnerabilities through GitHub.** 6 | 7 | ## Reporting Potential Security Vulnerability in an NVIDIA Product 8 | 9 | To report a potential security vulnerability in any NVIDIA product: 10 | - Web: [Security Vulnerability Submission Form](https://www.nvidia.com/object/submit-security-vulnerability.html) 11 | - E-Mail: psirt@nvidia.com 12 | - We encourage you to use the following PGP key for secure email communication: [NVIDIA public PGP Key for communication](https://www.nvidia.com/en-us/security/pgp-key) 13 | - Please include the following information: 14 | - Product/Driver name and version/branch that contains the vulnerability 15 | - Type of vulnerability (code execution, denial of service, buffer overflow, etc.) 16 | - Instructions to reproduce the vulnerability 17 | - Proof-of-concept or exploit code 18 | - Potential impact of the vulnerability, including how an attacker could exploit the vulnerability 19 | 20 | While NVIDIA currently does not have a bug bounty program, we do offer acknowledgement when an externally reported security issue is addressed under our coordinated vulnerability disclosure policy. Please visit our [Product Security Incident Response Team (PSIRT)](https://www.nvidia.com/en-us/security/psirt-policies/) policies page for more information. 21 | 22 | ## NVIDIA Product Security 23 | 24 | For all security-related concerns, please visit NVIDIA's Product Security portal at https://www.nvidia.com/en-us/security 25 | -------------------------------------------------------------------------------- /src/nvidia/objectreconstruction/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | NVIDIA 3D Object Reconstruction Package. 3 | 4 | A comprehensive framework for high-quality 3D object reconstruction from RGB-D input 5 | using neural implicit surfaces, bundle adjustment, and advanced feature matching. 
6 | 7 | Key Features: 8 | - BundleTrack for camera pose tracking 9 | - FoundationStereo for depth estimation 10 | - SAM2 for object segmentation 11 | - Neural Implicit Surface representation 12 | - Texture baking for photorealistic results 13 | 14 | Example Usage: 15 | >>> from nvidia.objectreconstruction.networks import NVBundleSDF 16 | >>> from nvidia.objectreconstruction.configs.schema import NVBundleSDFConfig 17 | >>> 18 | >>> config = NVBundleSDFConfig() 19 | >>> pipeline = NVBundleSDF(config.nerf, config.bundletrack, config.roma) 20 | >>> pipeline.run_track(reader) 21 | >>> pipeline.run_global_sdf(reader) 22 | """ 23 | 24 | __version__ = "1.0.0" 25 | __author__ = "NVIDIA Corporation" 26 | __email__ = "support@nvidia.com" 27 | 28 | # Main pipeline imports 29 | from .networks.nvbundlesdf import NVBundleSDF 30 | from .configs.schema import NVBundleSDFConfig 31 | 32 | # Individual component imports 33 | from .networks.foundationstereo import FoundationStereoProcessor, run_depth_estimation 34 | from .networks.sam2infer import Sam2Infer, run_mask_extraction 35 | from .networks.roma import FeatureMatchingInfer 36 | from .dataloader.reconstruction_dataloader import ReconstructionDataLoader 37 | 38 | __all__ = [ 39 | 'NVBundleSDF', 40 | 'NVBundleSDFConfig', 41 | 'FoundationStereoProcessor', 42 | 'run_depth_estimation', 43 | 'Sam2Infer', 44 | 'run_mask_extraction', 45 | 'FeatureMatchingInfer', 46 | 'ReconstructionDataLoader', 47 | '__version__' 48 | ] -------------------------------------------------------------------------------- /deploy/compose/docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | reconstruction-app: 3 | # Build configuration (used when BUILD_MODE=local) 4 | build: 5 | context: ../../ 6 | dockerfile: docker/Dockerfile 7 | args: 8 | - BUILDKIT_INLINE_CACHE=1 9 | 10 | # Image configuration - can be overridden with IMAGE_NAME env var 11 | image: ${IMAGE_NAME:-3d-object-reconstruction:latest} 12 | container_name: 3d-object-reconstruction-container-${USER:-default} 13 | 14 | # Shared memory size - useful for heavy workloads (adjust this as needed) 15 | shm_size: 8gb 16 | 17 | # GPU support 18 | deploy: 19 | resources: 20 | reservations: 21 | devices: 22 | - driver: nvidia 23 | count: all 24 | capabilities: [gpu] 25 | 26 | # Environment variables 27 | environment: 28 | - NVIDIA_VISIBLE_DEVICES=all 29 | - CUDA_VISIBLE_DEVICES=0 30 | - PYTHONPATH=/workspace/3d-object-reconstruction 31 | 32 | # Working directory 33 | working_dir: /workspace/3d-object-reconstruction 34 | 35 | # Volume mounts 36 | volumes: 37 | # Mount source code 38 | - ../../src:/workspace/3d-object-reconstruction/src 39 | # Mount data folder 40 | - ../../data:/workspace/3d-object-reconstruction/data 41 | # Mount notebooks for development 42 | - ../../notebooks:/workspace/3d-object-reconstruction/notebooks 43 | # Mount README.md 44 | - ../../README.md:/workspace/3d-object-reconstruction/README.md 45 | 46 | # Port mappings (for Jupyter notebook) - dynamically allocated 47 | ports: 48 | - "${JUPYTER_HOST_PORT:-8888}:8888" 49 | 50 | # Keep container running 51 | stdin_open: true 52 | tty: true 53 | 54 | # Restart policy 55 | restart: unless-stopped 56 | 57 | # User-specific network 58 | networks: 59 | - reconstruction-network 60 | 61 | 62 | # User-specific network to avoid conflicts 63 | networks: 64 | reconstruction-network: 65 | name: 3d-recon-network-${USER:-default} 66 | driver: bridge 67 | 68 | 69 | volumes: 70 | weights: 71 | driver: local 72 |
output: 73 | driver: local -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation_request_new.yml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | name: Documentation - New Documentation Request 17 | description: Request additions to 3D Object Reconstruction documentation 18 | title: "[DOC]: " 19 | labels: ["doc"] 20 | 21 | body: 22 | - type: markdown 23 | attributes: 24 | value: | 25 | Thanks for taking the time to improve our documentation! 26 | 27 | - type: dropdown 28 | id: criticality 29 | attributes: 30 | label: How would you describe the priority of this documentation request 31 | options: 32 | - Critical (currently preventing usage) 33 | - High 34 | - Medium 35 | - Low (would be nice) 36 | validations: 37 | required: true 38 | 39 | - type: textarea 40 | id: problem 41 | attributes: 42 | label: Describe the future/missing documentation 43 | placeholder: A code snippet mentions function foo(args) but I cannot find any documentation on it. 44 | validations: 45 | required: true 46 | 47 | - type: textarea 48 | id: search_locs 49 | attributes: 50 | label: Where have you looked? 51 | placeholder: | 52 | https://github.com/NVIDIA/3DObjectReconstruction/blob/main/README.md 53 | 54 | - type: checkboxes 55 | id: terms 56 | attributes: 57 | label: Code of Conduct 58 | description: By submitting this issue, you agree to follow our [Code of Conduct](https://github.com/NVIDIA/3DObjectReconstruction/blob/main/CODE_OF_CONDUCT.md) 59 | options: 60 | - label: I agree to follow 3D Object Reconstruction's Code of Conduct 61 | required: true 62 | - label: I have searched the [open documentation issues](https://github.com/NVIDIA/3DObjectReconstruction/issues?q=is%3Aopen+is%3Aissue+label%3Adoc) and have found no duplicates for this bug report 63 | required: true 64 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 3D Object Reconstruction 0.1.0 (18 Jul 2025) 2 | 3 | ## New Features 4 | 5 | - **End-to-End 3D Reconstruction Workflow:** Initial release of the 3D Object Reconstruction workflow, providing a complete workflow to convert stereo video inputs into high-quality 3D assets. 6 | - **State-of-the-Art Model Integration:** The workflow integrates several cutting-edge models for robust and accurate reconstruction: 7 | - **FoundationStereo:** A transformer-based model for high-accuracy stereo depth estimation. 8 | - **SAM2 (Segment Anything Model 2):** Used for precise and consistent object segmentation in video sequences. 9 | - **RoMA (Robust Matching):** Employs robust feature matching to establish reliable correspondences between images. 
10 | - **BundleSDF:** Implements neural 6-DoF tracking and 3D reconstruction for unknown objects, ensuring geometric accuracy. 11 | - **Sample Inference Data:** Includes a sample dataset of a retail item with corresponding configuration files, allowing users to quickly test and validate the reconstruction workflow. 12 | - **Docker Compose-Based Deployment:** 13 | - **Simplified Setup:** A single script (`deploy.sh`) automates the entire setup process, including downloading model weights, building container images, and managing external dependencies. 14 | - **Pre-configured Environment:** The Dockerfile is based on DeepStream base images and includes all necessary components to run the workflow out of the box. 15 | - **Interactive Jupyter Notebook:** 16 | - **Step-by-Step Guidance:** A demo notebook (`3d_object_reconstruction_demo.ipynb`) provides an interactive, step-by-step guide through the reconstruction process. 17 | - **Easy to Use:** Allows users to experiment with the workflow and visualize results in real time. 18 | - **Command-Line Interface (CLI):** 19 | - **Automated Workflows:** Provides a CLI for running the reconstruction workflow, enabling batch processing and integration into automated pipelines. 20 | 21 | ## Improvements 22 | 23 | - **High-Quality Mesh and Texture Generation:** The workflow is optimized to produce production-ready 3D meshes with photorealistic textures, suitable for digital twin creation, synthetic data generation, and more. 24 | - **Performance:** Generates a complete 3D asset in under 30 minutes on an NVIDIA RTX A6000 GPU. 25 | - **Extensibility:** The modular architecture allows for customization and integration of new models or components. 26 | 27 | ## Bug Fixes 28 | 29 | - No major bug fixes in this initial release. 30 | -------------------------------------------------------------------------------- /print_env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright (c) 2025, NVIDIA CORPORATION. 3 | # Reports relevant environment information useful for diagnosing and 4 | # debugging 3D Object Reconstruction issues.
5 | # Usage: 6 | # "./print_env.sh" - prints to stdout 7 | # "./print_env.sh > env.txt" - prints to file "env.txt" 8 | 9 | print_env() { 10 | echo "**git***" 11 | if [ "$(git rev-parse --is-inside-work-tree 2>/dev/null)" == "true" ]; then 12 | git log --decorate -n 1 13 | echo "**git submodules***" 14 | git submodule status --recursive 15 | else 16 | echo "Not inside a git repository" 17 | fi 18 | echo 19 | 20 | echo "***OS Information***" 21 | cat /etc/*-release 22 | uname -a 23 | echo 24 | 25 | echo "***GPU Information***" 26 | nvidia-smi 27 | echo 28 | 29 | echo "***CPU***" 30 | lscpu 31 | echo 32 | 33 | echo "***Docker***" 34 | which docker && docker --version 35 | echo 36 | 37 | echo "***Docker Compose***" 38 | if command -v docker-compose &> /dev/null; then 39 | docker-compose --version 40 | elif docker compose version &> /dev/null; then 41 | docker compose version 42 | else 43 | echo "docker-compose or docker compose not found" 44 | fi 45 | echo 46 | 47 | echo "***NVIDIA Container Toolkit***" 48 | which nvidia-container-toolkit && nvidia-container-toolkit --version 49 | echo 50 | 51 | echo "***CMake***" 52 | which cmake && cmake --version 53 | echo 54 | 55 | echo "***g++***" 56 | which g++ && g++ --version 57 | echo 58 | 59 | echo "***nvcc***" 60 | which nvcc && nvcc --version 61 | echo 62 | 63 | echo "***Python***" 64 | which python && python -c "import sys; print('Python {0}.{1}.{2}'.format(sys.version_info[0], sys.version_info[1], sys.version_info[2]))" 65 | echo 66 | 67 | echo "***Environment Variables***" 68 | 69 | printf '%-32s: %s\n' PATH $PATH 70 | 71 | printf '%-32s: %s\n' LD_LIBRARY_PATH $LD_LIBRARY_PATH 72 | 73 | printf '%-32s: %s\n' NUMBAPRO_NVVM $NUMBAPRO_NVVM 74 | 75 | printf '%-32s: %s\n' NUMBAPRO_LIBDEVICE $NUMBAPRO_LIBDEVICE 76 | 77 | printf '%-32s: %s\n' CONDA_PREFIX $CONDA_PREFIX 78 | 79 | printf '%-32s: %s\n' PYTHON_PATH $PYTHON_PATH 80 | 81 | echo 82 | 83 | 84 | # Print conda packages if conda exists 85 | if type "conda" &> /dev/null; then 86 | echo '***conda packages***' 87 | which conda && conda list 88 | echo 89 | # Print pip packages if pip exists 90 | elif type "pip" &> /dev/null; then 91 | echo "conda not found" 92 | echo "***pip packages***" 93 | which pip && pip list 94 | echo 95 | else 96 | echo "conda not found" 97 | echo "pip not found" 98 | fi 99 | } 100 | 101 | echo "
<details><summary>Click here to see environment details</summary><pre>
"
102 | echo "     "
103 | print_env | while read -r line; do
104 |     echo "     $line"
105 | done
106 | echo "</pre></details>"
" 107 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation_request_correction.yml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | name: Documentation - Correction/Update Request 17 | description: Request corrections or updates to existing documentation 18 | title: "[DOC]: " 19 | labels: ["doc"] 20 | 21 | body: 22 | - type: markdown 23 | attributes: 24 | value: | 25 | Thanks for taking the time to improve our documentation! 26 | 27 | - type: dropdown 28 | id: criticality 29 | attributes: 30 | label: How would you describe the priority of this documentation request 31 | options: 32 | - Critical (currently preventing usage) 33 | - High 34 | - Medium 35 | - Low (would be nice) 36 | validations: 37 | required: true 38 | 39 | - type: input 40 | id: correction_location 41 | attributes: 42 | label: Please provide a link or source to the relevant docs 43 | placeholder: "ex: https://github.com/NVIDIA/3DObjectReconstruction/blob/main/README.md" 44 | validations: 45 | required: true 46 | 47 | - type: textarea 48 | id: problem 49 | attributes: 50 | label: Describe the problems in the documentation 51 | placeholder: The documents say to use foo.func(args) however an AttributeError is thrown 52 | validations: 53 | required: true 54 | 55 | - type: textarea 56 | id: correction 57 | attributes: 58 | label: (Optional) Propose a correction 59 | placeholder: foo.func() was deprecated, replace documentation with foo.new_func() 60 | 61 | - type: checkboxes 62 | id: terms 63 | attributes: 64 | label: Code of Conduct 65 | description: By submitting this issue, you agree to follow our [Code of Conduct](https://github.com/NVIDIA/3DObjectReconstruction/blob/main/CODE_OF_CONDUCT.md) 66 | options: 67 | - label: I agree to follow 3D Object Reconstruction's Code of Conduct 68 | required: true 69 | - label: I have searched the [open documentation issues](https://github.com/NVIDIA/3DObjectReconstruction/issues?q=is%3Aopen+is%3Aissue+label%3Adoc) and have found no duplicates for this bug report 70 | required: true 71 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report_form.yml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | name: Bug Report 17 | description: File a bug report 18 | title: "[BUG]: " 19 | labels: ["bug"] 20 | 21 | body: 22 | - type: markdown 23 | attributes: 24 | value: | 25 | Thanks for taking the time to fill out this bug report! 26 | 27 | - type: input 28 | id: version 29 | attributes: 30 | label: Version 31 | description: What version of 3D Object Reconstruction are you running? 32 | placeholder: "example: 0.1.0" 33 | validations: 34 | required: true 35 | 36 | - type: dropdown 37 | id: installation-method 38 | attributes: 39 | label: Which installation method(s) does this occur on? 40 | multiple: true 41 | options: 42 | - Docker 43 | - Conda 44 | - Pip 45 | - Source 46 | 47 | - type: textarea 48 | id: description 49 | attributes: 50 | label: Describe the bug. 51 | description: Also tell us, what did you expect to happen? 52 | placeholder: XYZ occurred, I expected QRS results 53 | validations: 54 | required: true 55 | 56 | - type: textarea 57 | id: mvr 58 | attributes: 59 | label: Minimum reproducible example 60 | description: Please supply a [minimum reproducible code example](https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) here 61 | render: shell 62 | 63 | - type: textarea 64 | id: logs 65 | attributes: 66 | label: Relevant log output 67 | description: Please paste relevant error and log output here 68 | render: shell 69 | 70 | - type: textarea 71 | id: env-printout 72 | attributes: 73 | label: Full env printout 74 | description: Please run and paste the output of the `print_env.sh` script here, to gather any other relevant environment details 75 | render: shell 76 | 77 | - type: textarea 78 | id: misc 79 | attributes: 80 | label: Other/Misc. 81 | description: Please enter any other helpful information here. 82 | 83 | - type: textarea 84 | id: dataset 85 | attributes: 86 | label: Dataset 87 | description: Please provide a public link to the dataset you are using along with the output directory for repro if possible. 88 | 89 | - type: checkboxes 90 | id: terms 91 | attributes: 92 | label: Code of Conduct 93 | description: By submitting this issue, you agree to follow our [Code of Conduct](https://github.com/NVIDIA/3DObjectReconstruction/blob/main/CODE_OF_CONDUCT.md) 94 | options: 95 | - label: I agree to follow 3D Object Reconstruction's Code of Conduct 96 | required: true 97 | - label: I have searched the [open bugs](https://github.com/NVIDIA/3DObjectReconstruction/issues?q=is%3Aopen+is%3Aissue+label%3Abug) and have found no duplicates for this bug report 98 | required: true 99 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to 3D Object Reconstruction 2 | 3 | If you are interested in contributing to 3D Object Reconstruction, your contributions will fall 4 | into three categories: 5 | 1.
You want to report a bug, feature request, or documentation issue 6 | - File an [issue](https://github.com/NVIDIA/3DObjectReconstruction/issues/new/choose) 7 | describing what you encountered or what you want to see changed. 8 | - Please run and paste the output of the `3DObjectReconstruction/print_env.sh` script while 9 | reporting a bug to gather and report relevant environment details. 10 | - The 3D Object Reconstruction team will evaluate the issues and triage them, scheduling 11 | them for a release. If you believe the issue needs priority attention, 12 | comment on the issue to notify the team. 13 | 2. You want to propose a new Feature and implement it 14 | - Post about your intended feature, and we shall discuss the design and 15 | implementation. 16 | - Once we agree that the plan looks good, go ahead and implement it, using 17 | the [code contributions](#code-contributions) guide below. 18 | 3. You want to implement a feature or bug-fix for an outstanding issue 19 | - Follow the [code contributions](#code-contributions) guide below. 20 | - If you need more context on a particular issue, please ask and we shall 21 | provide. 22 | 23 | ## Code contributions 24 | 25 | ### Your first issue 26 | 27 | 1. Read the project's [README.md](https://github.com/NVIDIA/3DObjectReconstruction/blob/main/README.md) 28 | to learn how to set up the development environment. 29 | 2. Find an issue to work on. The best way is to look for the [good first issue](https://github.com/nvidia/3DObjectReconstruction/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) 30 | or [help wanted](https://github.com/nvidia/3DObjectReconstruction/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) labels. 31 | 3. Comment on the issue saying you are going to work on it. 32 | 4. Code! Make sure to contribute any unit tests and validate that the workflow works with the default inference example! 33 | 5. When done, [create your pull request](https://github.com/nvidia/3DObjectReconstruction/compare). 34 | 6. Wait for other developers to review your code and update code as needed. 35 | 7. Once reviewed and approved, a 3D Object Reconstruction developer will merge your pull request. 36 | 37 | Remember, if you are unsure about anything, don't hesitate to comment on issues and ask for clarifications! 38 | 39 | ### Managing PR labels 40 | 41 | Each PR must be labeled according to whether it is a "breaking" or "non-breaking" change (using GitHub labels). This is used to highlight changes that users should know about when upgrading. 42 | 43 | For 3D Object Reconstruction, a "breaking" change is one that modifies the codebase in a 44 | non-backward-compatible way. 45 | 46 | Additional labels must be applied to indicate whether the change is a feature, improvement, bugfix, or documentation change. 47 | 48 | ### Branch naming 49 | 50 | Branches used to create PRs should have a name of the form `<type>-<name>` 51 | which conforms to the following conventions: 52 | - Type: 53 | - fea - For branches that add a new feature(s) 54 | - enh - For branches that enhance an existing feature(s) 55 | - bug - For branches that fix a bug(s) or regression(s) 56 | - Name: 57 | - A name to convey what is being worked on 58 | - Please use dashes or underscores between words as opposed to spaces (for example, `fea-multi_gpu_support` or `bug-depth-scale-fix`).
59 | 60 | ## Attribution 61 | Portions adopted from https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md 62 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Overview 4 | 5 | Define the code of conduct followed and enforced for 3D Object Reconstruction. 6 | 7 | ### Intended audience 8 | 9 | Community | Developers | Project Leads 10 | 11 | ## Our Pledge 12 | 13 | In the interest of fostering an open and welcoming environment, we as 14 | contributors and maintainers pledge to making participation in our project and 15 | our community a harassment-free experience for everyone, regardless of age, body 16 | size, disability, ethnicity, sex characteristics, gender identity and expression, 17 | level of experience, education, socio-economic status, nationality, personal 18 | appearance, race, religion, or sexual identity and orientation. 19 | 20 | ## Our Standards 21 | 22 | Examples of behavior that contributes to creating a positive environment 23 | include: 24 | 25 | * Using welcoming and inclusive language 26 | * Being respectful of differing viewpoints and experiences 27 | * Gracefully accepting constructive criticism 28 | * Focusing on what is best for the community 29 | * Showing empathy towards other community members 30 | 31 | Examples of unacceptable behavior by participants include: 32 | 33 | * The use of sexualized language or imagery and unwelcome sexual attention or 34 | advances 35 | * Trolling, insulting/derogatory comments, and personal or political attacks 36 | * Public or private harassment 37 | * Publishing others' private information, such as a physical or electronic 38 | address, without explicit permission 39 | * Other conduct which could reasonably be considered inappropriate in a 40 | professional setting 41 | 42 | ## Our Responsibilities 43 | 44 | Project maintainers are responsible for clarifying the standards of acceptable 45 | behavior and are expected to take appropriate and fair corrective action in 46 | response to any instances of unacceptable behavior. 47 | 48 | Project maintainers have the right and responsibility to remove, edit, or 49 | reject comments, commits, code, wiki edits, issues, and other contributions 50 | that are not aligned to this Code of Conduct, or to ban temporarily or 51 | permanently any contributor for other behaviors that they deem inappropriate, 52 | threatening, offensive, or harmful. 53 | 54 | ## Scope 55 | 56 | This Code of Conduct applies both within project spaces and in public spaces 57 | when an individual is representing the project or its community. Examples of 58 | representing a project or community include using an official project e-mail 59 | address, posting via an official social media account, or acting as an appointed 60 | representative at an online or offline event. Representation of a project may be 61 | further defined and clarified by project maintainers. 62 | 63 | ## Enforcement 64 | 65 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 66 | reported by contacting GitHub_Conduct@nvidia.com. All complaints will be reviewed and 67 | investigated and will result in a response that is deemed necessary and appropriate 68 | to the circumstances. The project team is obligated to maintain confidentiality with 69 | regard to the reporter of an incident. 
Further details of specific enforcement policies 70 | may be posted separately. 71 | 72 | Project maintainers who do not follow or enforce the Code of Conduct in good 73 | faith may face temporary or permanent repercussions as determined by other 74 | members of the project's leadership. 75 | 76 | ## Attribution 77 | 78 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 79 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 80 | 81 | [homepage]: https://www.contributor-covenant.org 82 | 83 | For answers to common questions about this code of conduct, see 84 | https://www.contributor-covenant.org/faq 85 | -------------------------------------------------------------------------------- /src/README.md: -------------------------------------------------------------------------------- 1 | # NVIDIA 3D Object Reconstruction Package 2 | 3 | A comprehensive workflow for high-quality 3D object reconstruction from RGB-D input using neural implicit surfaces, bundle adjustment, and advanced feature matching. 4 | 5 | ## Installation 6 | 7 | ```bash 8 | pip install nvidia-3d-object-reconstruction 9 | ``` 10 | 11 | ## Quick Start 12 | 13 | ```python 14 | from nvidia.objectreconstruction.networks import NVBundleSDF 15 | from nvidia.objectreconstruction.configs.schema import NVBundleSDFConfig 16 | 17 | # Initialize configuration 18 | config = NVBundleSDFConfig() 19 | 20 | # Create reconstruction workflow 21 | workflow = NVBundleSDF( 22 | config_nerf=config.nerf, 23 | cfg_bundletrack=config.bundletrack, 24 | roma_config=config.roma 25 | ) 26 | 27 | # Run the reconstruction workflow; 'reader' is the RGB-D sequence loader (see nvidia.objectreconstruction.dataloader) 28 | workflow.run_track(reader) 29 | workflow.run_global_sdf(reader) 30 | workflow.run_texture_bake(reader) 31 | ``` 32 | 33 | ## Package Components 34 | 35 | ### Networks (`nvidia.objectreconstruction.networks`) 36 | 37 | - **NVBundleSDF**: Main reconstruction workflow 38 | - **FoundationStereoProcessor**: Stereo depth estimation 39 | - **Sam2Infer**: SAM2-based object segmentation 40 | - **FeatureMatchingInfer**: RoMa feature matching 41 | - **NerfRunner**: Neural Radiance Field implementation 42 | 43 | ### Configuration (`nvidia.objectreconstruction.configs`) 44 | 45 | - **NVBundleSDFConfig**: Main configuration schema 46 | - **BundleTrackConfig**: Bundle adjustment settings 47 | - **NeRFConfig**: Neural field parameters 48 | - **FoundationStereoConfig**: Stereo depth settings 49 | - **SAM2Config**: Segmentation parameters 50 | 51 | ### Utilities (`nvidia.objectreconstruction.utils`) 52 | 53 | - **preprocessing**: Data preprocessing functions 54 | - **structures**: Data structure utilities 55 | 56 | ## Individual Component Usage 57 | 58 | ### Stereo Depth Estimation 59 | 60 | ```python 61 | from nvidia.objectreconstruction.networks import FoundationStereoProcessor 62 | 63 | processor = FoundationStereoProcessor(config, rgb_path, output_path) 64 | processor.run() 65 | ``` 66 | 67 | ### Object Segmentation 68 | 69 | ```python 70 | from nvidia.objectreconstruction.networks import Sam2Infer 71 | 72 | sam2 = Sam2Infer(config) 73 | sam2.run(rgb_path, mask_path) 74 | ``` 75 | 76 | ### Feature Matching 77 | 78 | ```python 79 | from nvidia.objectreconstruction.networks import FeatureMatchingInfer 80 | 81 | matcher = FeatureMatchingInfer(config) 82 | ``` 83 | 84 | ## Configuration Management 85 | 86 | ```python 87 | from nvidia.objectreconstruction.configs.schema import ( 88 | NVBundleSDFConfig, 89 | FoundationStereoConfig, 90 | SAM2Config,
91 | BundleTrackConfig, 92 | NeRFConfig 93 | ) 94 | 95 | # Create and customize configuration 96 | config = NVBundleSDFConfig() 97 | config.nerf.n_step = 5000 98 | config.foundation_stereo.scale = 0.5 99 | config.sam2.bbox = [1144, 627, 2227, 2232] 100 | ``` 101 | 102 | ## Command Line Interface 103 | 104 | ```bash 105 | # Run reconstruction workflow 106 | nvidia-3d-reconstruct --config config.yaml --data-path /path/to/data 107 | 108 | # Get help 109 | nvidia-3d-reconstruct --help 110 | ``` 111 | 112 | ## Key Features 113 | 114 | - **BundleTrack**: Camera pose tracking and bundle adjustment 115 | - **FoundationStereo**: Advanced stereo depth estimation 116 | - **SAM2**: Object segmentation using Segment Anything Model 2 117 | - **Neural Implicit Surfaces**: High-quality 3D reconstruction using NeRF 118 | - **Texture Baking**: Photorealistic texture generation 119 | 120 | ## Requirements 121 | 122 | - **GPU**: NVIDIA GPU with CUDA support (minimum requirements: Compute Capability 7.0 with at least 24GB VRAM) 123 | - **Memory**: 32GB+ RAM recommended 124 | - **Storage**: 100GB+ free space recommended 125 | - **OS**: Ubuntu 22.04+ 126 | 127 | ## License 128 | 129 | NVIDIA License (Non-Commercial) - see LICENSE file for details. 130 | 131 | **Important**: This software is for non-commercial use only. This package incorporates third-party components under different licenses including CC BY-NC-SA 4.0. Review the complete LICENSE file for all terms and attributions. 132 | 133 | ## Support 134 | 135 | For issues and questions, please visit the project repository. -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request_form.yml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | name: Feature Request Form 17 | description: Request new or improved functionality or changes to existing functionality 18 | title: "[FEA]: " 19 | labels: ["feature request"] 20 | 21 | body: 22 | - type: markdown 23 | attributes: 24 | value: | 25 | Thanks for taking the time to fill out this feature request! 26 | 27 | - type: dropdown 28 | id: new_or_improvement 29 | attributes: 30 | label: Is this a new feature, an improvement, or a change to existing functionality? 
31 | options: 32 | - New Feature 33 | - Improvement 34 | - Change 35 | validations: 36 | required: true 37 | 38 | - type: dropdown 39 | id: criticality 40 | attributes: 41 | label: How would you describe the priority of this feature request 42 | options: 43 | - Critical (currently preventing usage) 44 | - High 45 | - Medium 46 | - Low (would be nice) 47 | validations: 48 | required: true 49 | 50 | - type: textarea 51 | id: problem 52 | attributes: 53 | label: Please provide a clear description of the problem this feature solves 54 | description: Real usage examples (non-code) are especially helpful. 55 | validations: 56 | required: true 57 | 58 | - type: textarea 59 | id: Feature_Description 60 | attributes: 61 | label: Feature Description 62 | description: Please provide a clear description of the feature you request (refer to [User Story format](https://www.atlassian.com/agile/project-management/user-stories#:~:text=User%20story%20template%20and%20examples) and [EARS format](https://ieeexplore.ieee.org/document/5328509)) 63 | placeholder: > 64 | For a new feature request, please use one of the following formats to describe the feature 65 | 1. From the end-user perspective, use the following user story format 66 | As a <persona>, I <want to>, <so that>. 67 | 2. From the system perspective, use the following EARS format 68 | The <system> shall <system response> 69 | For changing or improving an existing feature, it is recommended to provide the previous Feature Request ID. 70 | validations: 71 | required: true 72 | 73 | - type: textarea 74 | id: solution 75 | attributes: 76 | label: Describe your ideal solution 77 | description: Please describe the functionality you would like added. 78 | placeholder: > 79 | A new function that takes in the information in this form, and triages the issue 80 | 81 | def feature_request(form_info): 82 | parse(form_info) 83 | return triage_outcome 84 | validations: 85 | required: true 86 | 87 | - type: textarea 88 | id: alternatives 89 | attributes: 90 | label: Describe any alternatives you have considered 91 | description: List any other libraries, or approaches you have looked at or tried. 92 | placeholder: I have looked at library xyz and qrs, but they do not offer GPU acceleration 93 | 94 | - type: textarea 95 | id: misc 96 | attributes: 97 | label: Additional context 98 | description: Add any other context, code examples, or references to existing implementations about the feature request here. If applicable, please list the modules affected. 99 | 100 | - type: checkboxes 101 | id: terms 102 | attributes: 103 | label: Code of Conduct 104 | description: By submitting this issue, you agree to follow our [Code of Conduct](https://github.com/NVIDIA/3DObjectReconstruction/blob/main/CODE_OF_CONDUCT.md) 105 | options: 106 | - label: I agree to follow 3D Object Reconstruction's Code of Conduct 107 | required: true 108 | - label: I have searched the [open feature requests](https://github.com/NVIDIA/3DObjectReconstruction/issues?q=is%3Aopen+is%3Aissue+label%3A%22feature+request%22%2Cimprovement%2Cenhancement) and have found no duplicates for this feature request 109 | required: true 110 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | NVIDIA License 2 | 3 | 1. Definitions 4 | 5 | "Licensor" means any person or entity that distributes its Work.
6 | 7 | "Work" means (a) the original work of authorship made available under this license, which may include software, documentation, or other files, and (b) any additions to or derivative works thereof that are made available under this license. 8 | 9 | The terms "reproduce," "reproduction," "derivative works," and "distribution" have the meaning as provided under U.S. copyright law; provided, however, that for the purposes of this license, derivative works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work. 10 | 11 | Works are "made available" under this license by including in or with the Work either (a) a copyright notice referencing the applicability of this license to the Work, or (b) a copy of this license. 12 | 13 | 2. License Grant 14 | 15 | 2.1 Copyright Grant. Subject to the terms and conditions of this license, each Licensor grants to you a perpetual, worldwide, non-exclusive, royalty-free, copyright license to use, reproduce, prepare derivative works of, publicly display, publicly perform, sublicense and distribute its Work and any resulting derivative works in any form. 16 | 17 | 3. Limitations 18 | 19 | 3.1 Redistribution. You may reproduce or distribute the Work only if (a) you do so under this license, (b) you include a complete copy of this license with your distribution, and (c) you retain without modification any copyright, patent, trademark, or attribution notices that are present in the Work. 20 | 21 | 3.2 Derivative Works. You may specify that additional or different terms apply to the use, reproduction, and distribution of your derivative works of the Work ("Your Terms") only if (a) Your Terms provide that the use limitation in Section 3.3 applies to your derivative works, and (b) you identify the specific derivative works that are subject to Your Terms. Notwithstanding Your Terms, this license (including the redistribution requirements in Section 3.1) will continue to apply to the Work itself. 22 | 23 | 3.3 Use Limitation. The Work and any derivative works thereof only may be used or intended for use non-commercially. Notwithstanding the foregoing, NVIDIA Corporation and its affiliates may use the Work and any derivative works commercially. As used herein, "non-commercially" means for research or evaluation purposes only. 24 | 25 | 3.4 Patent Claims. If you bring or threaten to bring a patent claim against any Licensor (including any claim, cross-claim or counterclaim in a lawsuit) to enforce any patents that you allege are infringed by any Work, then your rights under this license from such Licensor (including the grant in Section 2.1) will terminate immediately. 26 | 27 | 3.5 Trademarks. This license does not grant any rights to use any Licensor's or its affiliates' names, logos, or trademarks, except as necessary to reproduce the notices described in this license. 28 | 29 | 3.6 Termination. If you violate any term of this license, then your rights under this license (including the grant in Section 2.1) will terminate immediately. 30 | 31 | 4. Disclaimer of Warranty 32 | 33 | THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS LICENSE. 34 | 35 | 5. 
Limitation of Liability 36 | 37 | EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 38 | 39 | ================================================================================ 40 | THIRD-PARTY COMPONENTS 41 | ================================================================================ 42 | 43 | This project incorporates code from third-party sources under different licenses: 44 | 45 | BundleTrack CUDA Components 46 | --------------------------- 47 | Source: https://github.com/NVlabs/BundleSDF/tree/master/BundleTrack/src/cuda 48 | License: Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0) 49 | Copyright: NVIDIA Corporation 50 | 51 | This component is licensed under CC BY-NC-SA 4.0. You may: 52 | - Share and adapt the material for non-commercial purposes only 53 | - Must provide appropriate attribution 54 | - Must distribute any derivative works under the same license 55 | - Cannot use for commercial purposes 56 | 57 | Full license text: https://creativecommons.org/licenses/by-nc-sa/4.0/ 58 | 59 | Note: The CC BY-NC-SA 4.0 non-commercial restriction applies to any derivative works 60 | of the BundleTrack CUDA components. Users must ensure compliance with both the main 61 | project license and this third-party license when using this software. 
62 | -------------------------------------------------------------------------------- /src/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=65.0", "wheel", "setuptools-scm"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "nvidia-3d-object-reconstruction" 7 | version = "0.1.0" 8 | description = "NVIDIA 3D Object Reconstruction Framework using BundleTrack and Neural Implicit Surfaces" 9 | readme = "README.md" 10 | license = {text = "NVIDIA License (Non-Commercial)"} 11 | authors = [ 12 | {name = "NVIDIA Corporation", email = "support@nvidia.com"} 13 | ] 14 | maintainers = [ 15 | {name = "NVIDIA Corporation", email = "support@nvidia.com"} 16 | ] 17 | keywords = [ 18 | "3D reconstruction", 19 | "neural implicit surfaces", 20 | "bundle adjustment", 21 | "computer vision", 22 | "NVIDIA", 23 | "stereo vision", 24 | "depth estimation", 25 | "NeRF" 26 | ] 27 | classifiers = [ 28 | "Development Status :: 4 - Beta", 29 | "Intended Audience :: Developers", 30 | "Intended Audience :: Science/Research", 31 | "Operating System :: POSIX :: Linux", 32 | "Programming Language :: Python :: 3", 33 | "Programming Language :: Python :: 3.8", 34 | "Programming Language :: Python :: 3.9", 35 | "Programming Language :: Python :: 3.10", 36 | "Programming Language :: Python :: 3.11", 37 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 38 | "Topic :: Scientific/Engineering :: Image Recognition", 39 | "Topic :: Multimedia :: Graphics :: 3D Modeling", 40 | ] 41 | requires-python = ">=3.8" 42 | dependencies = [ 43 | # Core ML/CV libraries 44 | "torch>=2.0.0", 45 | "torchvision>=0.15.0", 46 | "numpy==1.26.4", 47 | "opencv-python>=4.5.0", 48 | "Pillow>=8.0.0", 49 | "imageio>=2.9.0", 50 | "scikit-image>=0.18.0", 51 | "scikit-learn>=1.0.0", 52 | 53 | # 3D processing 54 | "open3d>=0.15.0", 55 | "trimesh>=3.15.0", 56 | "pyrender>=0.1.45", 57 | 58 | # Configuration and data handling 59 | "omegaconf>=2.1.0", 60 | "pyyaml>=6.0", 61 | "tqdm>=4.60.0", 62 | "joblib>=1.1.0", 63 | 64 | # Utilities 65 | "typing-extensions>=4.0.0", 66 | "ipympl", 67 | 68 | # External models (these may need to be installed separately) 69 | # "foundation-stereo>=1.0.0", # Custom package 70 | # "roma>=1.0.0", # RoMa matcher 71 | # "sam2>=1.0.0", # SAM2 segmentation 72 | "xformers" 73 | ] 74 | 75 | [project.optional-dependencies] 76 | dev = [ 77 | "pytest>=7.0.0", 78 | "pytest-cov>=3.0.0", 79 | "black>=22.0.0", 80 | "isort>=5.10.0", 81 | "flake8>=4.0.0", 82 | "mypy>=0.950", 83 | "pre-commit>=2.17.0", 84 | ] 85 | jupyter = [ 86 | "jupyter>=1.0.0", 87 | "jupyterlab>=3.0.0", 88 | "notebook>=6.4.0", 89 | "ipywidgets>=7.6.0", 90 | ] 91 | viz = [ 92 | "matplotlib>=3.5.0", 93 | "plotly>=5.0.0", 94 | "seaborn>=0.11.0", 95 | ] 96 | all = [ 97 | "nvidia-3d-object-reconstruction[dev,jupyter,viz]" 98 | ] 99 | 100 | [project.urls] 101 | Homepage = "https://github.com/NVIDIA/3DObjectReconstruction" 102 | Documentation = "https://github.com/NVIDIA/3DObjectReconstruction" 103 | Repository = "https://github.com/NVIDIA/3DObjectReconstruction.git" 104 | Issues = "https://github.com/NVIDIA/3DObjectReconstruction/issues" 105 | Changelog = "https://github.com/NVIDIA/3DObjectReconstruction/blob/main/CHANGELOG.md" 106 | 107 | [project.scripts] 108 | nvidia-3d-reconstruct = "nvidia.objectreconstruction.cli.main:main" 109 | 110 | [tool.setuptools] 111 | zip-safe = false 112 | include-package-data = true 113 | 114 | [tool.setuptools.packages.find] 
115 | where = ["."] 116 | include = ["nvidia*"] 117 | namespaces = true 118 | 119 | [tool.setuptools.package-data] 120 | "nvidia.objectreconstruction" = [ 121 | "configs/*.yaml", 122 | "configs/*.yml", 123 | "data/*.txt", 124 | "*.md" 125 | ] 126 | 127 | # Development tools configuration 128 | [tool.black] 129 | line-length = 100 130 | target-version = ['py38', 'py39', 'py310', 'py311'] 131 | include = '\.pyi?$' 132 | extend-exclude = ''' 133 | /( 134 | # directories 135 | \.eggs 136 | | \.git 137 | | \.hg 138 | | \.mypy_cache 139 | | \.tox 140 | | \.venv 141 | | build 142 | | dist 143 | )/ 144 | ''' 145 | 146 | [tool.isort] 147 | profile = "black" 148 | line_length = 100 149 | multi_line_output = 3 150 | include_trailing_comma = true 151 | force_grid_wrap = 0 152 | use_parentheses = true 153 | ensure_newline_before_comments = true 154 | 155 | [tool.mypy] 156 | python_version = "3.8" 157 | warn_return_any = true 158 | warn_unused_configs = true 159 | disallow_untyped_defs = true 160 | disallow_incomplete_defs = true 161 | check_untyped_defs = true 162 | disallow_untyped_decorators = true 163 | no_implicit_optional = true 164 | warn_redundant_casts = true 165 | warn_unused_ignores = true 166 | warn_no_return = true 167 | warn_unreachable = true 168 | strict_equality = true 169 | 170 | [tool.pytest.ini_options] 171 | minversion = "7.0" 172 | addopts = "-ra -q --strict-markers --strict-config" 173 | testpaths = [ 174 | "tests", 175 | ] 176 | filterwarnings = [ 177 | "error", 178 | "ignore::UserWarning", 179 | "ignore::DeprecationWarning", 180 | ] -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | data/samples/retail_item/ filter=lfs diff=lfs merge=lfs -text 2 | data/samples/retail_item/left/left000002.png filter=lfs diff=lfs merge=lfs -text 3 | data/samples/retail_item/left/left000005.png filter=lfs diff=lfs merge=lfs -text 4 | data/samples/retail_item/left/left000026.png filter=lfs diff=lfs merge=lfs -text 5 | data/samples/retail_item/left/left000034.png filter=lfs diff=lfs merge=lfs -text 6 | data/samples/retail_item/left/left000006.png filter=lfs diff=lfs merge=lfs -text 7 | data/samples/retail_item/left/left000007.png filter=lfs diff=lfs merge=lfs -text 8 | data/samples/retail_item/left/left000014.png filter=lfs diff=lfs merge=lfs -text 9 | data/samples/retail_item/left/left000028.png filter=lfs diff=lfs merge=lfs -text 10 | data/samples/retail_item/left/left000035.png filter=lfs diff=lfs merge=lfs -text 11 | data/samples/retail_item/left/left000003.png filter=lfs diff=lfs merge=lfs -text 12 | data/samples/retail_item/left/left000015.png filter=lfs diff=lfs merge=lfs -text 13 | data/samples/retail_item/left/left000018.png filter=lfs diff=lfs merge=lfs -text 14 | data/samples/retail_item/left/left000022.png filter=lfs diff=lfs merge=lfs -text 15 | data/samples/retail_item/left/left000031.png filter=lfs diff=lfs merge=lfs -text 16 | data/samples/retail_item/left/left000001.png filter=lfs diff=lfs merge=lfs -text 17 | data/samples/retail_item/left/left000010.png filter=lfs diff=lfs merge=lfs -text 18 | data/samples/retail_item/left/left000017.png filter=lfs diff=lfs merge=lfs -text 19 | data/samples/retail_item/left/left000030.png filter=lfs diff=lfs merge=lfs -text 20 | data/samples/retail_item/left/left000032.png filter=lfs diff=lfs merge=lfs -text 21 | data/samples/retail_item/left/left000000.png filter=lfs diff=lfs merge=lfs -text 22 | 
data/samples/retail_item/left/left000013.png filter=lfs diff=lfs merge=lfs -text 23 | data/samples/retail_item/left/left000021.png filter=lfs diff=lfs merge=lfs -text 24 | data/samples/retail_item/left/left000023.png filter=lfs diff=lfs merge=lfs -text 25 | data/samples/retail_item/left/left000025.png filter=lfs diff=lfs merge=lfs -text 26 | data/samples/retail_item/left/left000027.png filter=lfs diff=lfs merge=lfs -text 27 | data/samples/retail_item/left/left000011.png filter=lfs diff=lfs merge=lfs -text 28 | data/samples/retail_item/left/left000024.png filter=lfs diff=lfs merge=lfs -text 29 | data/samples/retail_item/left/left000009.png filter=lfs diff=lfs merge=lfs -text 30 | data/samples/retail_item/left/left000020.png filter=lfs diff=lfs merge=lfs -text 31 | data/samples/retail_item/left/left000004.png filter=lfs diff=lfs merge=lfs -text 32 | data/samples/retail_item/left/left000008.png filter=lfs diff=lfs merge=lfs -text 33 | data/samples/retail_item/left/left000012.png filter=lfs diff=lfs merge=lfs -text 34 | data/samples/retail_item/left/left000016.png filter=lfs diff=lfs merge=lfs -text 35 | data/samples/retail_item/left/left000019.png filter=lfs diff=lfs merge=lfs -text 36 | data/samples/retail_item/left/left000029.png filter=lfs diff=lfs merge=lfs -text 37 | data/samples/retail_item/left/left000033.png filter=lfs diff=lfs merge=lfs -text 38 | data/samples/retail_item/left/left000036.png filter=lfs diff=lfs merge=lfs -text 39 | data/samples/retail_item/right/right000013.png filter=lfs diff=lfs merge=lfs -text 40 | data/samples/retail_item/right/right000014.png filter=lfs diff=lfs merge=lfs -text 41 | data/samples/retail_item/right/right000020.png filter=lfs diff=lfs merge=lfs -text 42 | data/samples/retail_item/right/right000023.png filter=lfs diff=lfs merge=lfs -text 43 | data/samples/retail_item/right/right000031.png filter=lfs diff=lfs merge=lfs -text 44 | data/samples/retail_item/right/right000005.png filter=lfs diff=lfs merge=lfs -text 45 | data/samples/retail_item/right/right000010.png filter=lfs diff=lfs merge=lfs -text 46 | data/samples/retail_item/right/right000019.png filter=lfs diff=lfs merge=lfs -text 47 | data/samples/retail_item/right/right000029.png filter=lfs diff=lfs merge=lfs -text 48 | data/samples/retail_item/right/right000033.png filter=lfs diff=lfs merge=lfs -text 49 | data/samples/retail_item/right/right000034.png filter=lfs diff=lfs merge=lfs -text 50 | data/samples/retail_item/right/right000000.png filter=lfs diff=lfs merge=lfs -text 51 | data/samples/retail_item/right/right000002.png filter=lfs diff=lfs merge=lfs -text 52 | data/samples/retail_item/right/right000027.png filter=lfs diff=lfs merge=lfs -text 53 | data/samples/retail_item/right/right000036.png filter=lfs diff=lfs merge=lfs -text 54 | data/samples/retail_item/right/right000008.png filter=lfs diff=lfs merge=lfs -text 55 | data/samples/retail_item/right/right000024.png filter=lfs diff=lfs merge=lfs -text 56 | data/samples/retail_item/right/right000032.png filter=lfs diff=lfs merge=lfs -text 57 | data/samples/retail_item/right/right000035.png filter=lfs diff=lfs merge=lfs -text 58 | data/samples/retail_item/right/right000001.png filter=lfs diff=lfs merge=lfs -text 59 | data/samples/retail_item/right/right000016.png filter=lfs diff=lfs merge=lfs -text 60 | data/samples/retail_item/right/right000012.png filter=lfs diff=lfs merge=lfs -text 61 | data/samples/retail_item/right/right000028.png filter=lfs diff=lfs merge=lfs -text 62 | data/samples/retail_item/right/right000006.png filter=lfs diff=lfs 
merge=lfs -text 63 | data/samples/retail_item/right/right000009.png filter=lfs diff=lfs merge=lfs -text 64 | data/samples/retail_item/right/right000018.png filter=lfs diff=lfs merge=lfs -text 65 | data/samples/retail_item/right/right000030.png filter=lfs diff=lfs merge=lfs -text 66 | data/samples/retail_item/right/right000007.png filter=lfs diff=lfs merge=lfs -text 67 | data/samples/retail_item/right/right000015.png filter=lfs diff=lfs merge=lfs -text 68 | data/samples/retail_item/right/right000017.png filter=lfs diff=lfs merge=lfs -text 69 | data/samples/retail_item/right/right000021.png filter=lfs diff=lfs merge=lfs -text 70 | data/samples/retail_item/right/right000022.png filter=lfs diff=lfs merge=lfs -text 71 | data/samples/retail_item/right/right000004.png filter=lfs diff=lfs merge=lfs -text 72 | data/samples/retail_item/right/right000011.png filter=lfs diff=lfs merge=lfs -text 73 | data/samples/retail_item/right/right000026.png filter=lfs diff=lfs merge=lfs -text 74 | data/samples/retail_item/right/right000003.png filter=lfs diff=lfs merge=lfs -text 75 | data/samples/retail_item/right/right000025.png filter=lfs diff=lfs merge=lfs -text 76 | -------------------------------------------------------------------------------- /data/LICENSE: -------------------------------------------------------------------------------- 1 | NVIDIA ASSET LICENSE 2 | 3 | 4 | 5 | IMPORTANT NOTICE – PLEASE READ AND AGREE BEFORE USING THE ASSET. 6 | 7 | 8 | 9 | This license agreement (“Agreement”) is a legal agreement between you, whether an individual or entity ("you”) and NVIDIA Corporation ("NVIDIA") and governs your use of the NVIDIA Sample Images for Demonstrating the 3D Object Reconstruction Workflow, provided under this Agreement (the “Asset”). 10 | 11 | 12 | 13 | This Agreement can be accepted only by an adult of legal age of majority in the country in which the Asset is used. If you do not have the required age or authority to accept this Agreement or if you don’t accept all the terms and conditions of this Agreement, do not use the Asset. 14 | 15 | 16 | 17 | You agree to use the Asset only for purposes that are permitted by this Agreement and any applicable law or regulation in the relevant jurisdictions. 18 | 19 | 20 | 21 | 1. License. Subject to the terms of this Agreement, NVIDIA grants you a limited, non-exclusive, revocable, non-transferable, non-sublicensable license to (a) use and reproduce the Asset solely for research or evaluation purposes, with NVIDIA software or hardware and consistent with the limitations in Section 2 below. 22 | 23 | 24 | 25 | 2. Limitations. Your license to use the Asset and Derivative Works is restricted. Except as expressly permitted in Section 1 above, you may not: (a) change or remove copyright or other proprietary notices in the Asset and Derivative Works; (b) sell, rent, sublicense, transfer, distribute, or otherwise make the Asset and Derivative Works available to others; (c) train or test AI models using the Asset; (d) offer or distribute the Asset on a stand-alone basis; (e) use the Asset or assist or facilitate using the Asset in any manner inconsistent with NVIDIA’s Trustworthy AI Terms available at https://www.nvidia.com/en-us/agreements/trustworthy-ai/terms/. 26 | 27 | 28 | 29 | 3. Ownership. The Asset, including all intellectual property rights, are and will remain the sole and exclusive property of NVIDIA or its licensors. 
Except as expressly granted in this Agreement, (a) NVIDIA reserves all rights, interests, and remedies in connection with the Asset, and (b) no other license or right is granted to you by implication, estoppel or otherwise. 30 | 31 | 32 | 33 | 4. Feedback. You may, but you are not obligated to, provide suggestions, requests, fixes, modifications, enhancements, or other feedback regarding the Asset (collectively, “Feedback”). Feedback, even if designated as confidential by you, will not create any confidentiality obligation for NVIDIA or its affiliates. If you provide Feedback, you hereby grant NVIDIA, its affiliates and its designees a non-exclusive, perpetual, irrevocable, sublicensable, worldwide, royalty-free, fully paid-up and transferable license, under your intellectual property rights, to publicly perform, publicly display, reproduce, use, make, have made, sell, offer for sale, distribute (through multiple tiers of distribution), import, create derivative works of and otherwise commercialize and exploit the Feedback at NVIDIA’s discretion. 34 | 35 | 36 | 37 | 5. Term and Termination. This Agreement will automatically terminate without notice from NVIDIA if you fail to comply with any of the terms in this Agreement or if you commence or participate in any legal proceeding against NVIDIA with respect to the Asset. Additionally, NVIDIA may terminate this Agreement at any time with prior written notice. Upon any termination, you must immediately stop using and destroy all copies of the Asset. Upon written request, you will certify in writing that you have complied with your commitments under this section. All provisions will survive termination, except for the licenses granted to you. 38 | 39 | 40 | 41 | 6. Disclaimer of Warranties. THE ASSET IS PROVIDED BY NVIDIA AS-IS AND WITH ALL FAULTS. TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, NVIDIA DISCLAIMS ALL WARRANTIES AND REPRESENTATIONS OF ANY KIND, WHETHER EXPRESS, IMPLIED OR STATUTORY, RELATING TO OR ARISING UNDER THIS AGREEMENT, INCLUDING, WITHOUT LIMITATION, THE WARRANTIES OF TITLE, NONINFRINGEMENT, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, USAGE OF TRADE AND COURSE OF DEALING.  42 | 43 | 44 | 45 | 7. Limitations of Liability. TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, WILL NVIDIA BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY TYPE ARISING OUT OF OR AS A RESULT OF THIS AGREEMENT OR THE USE OR INABILITY TO USE THE ASSET (INCLUDING BUT NOT LIMITED TO DAMAGES FOR LOSS OF GOODWILL, WORK STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER DAMAGES OR LOSSES), EVEN IF NVIDIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 46 | 47 | 48 | 49 | 8. Indemnity. You will indemnify and hold harmless NVIDIA, its affiliates, their employees, officers, directors and agents (“Indemnified Parties”) and, at NVIDIA’s election, defend the Indemnified Parties from all third-party claims or lawsuits, costs, damages, expenses, liabilities, including attorney’s fees, arising out of or in connection with your use of the Asset. 50 | 51 | 52 | 53 | 10. Governing Law and Jurisdiction. This Agreement will be governed in all respects by the laws of the United States and the laws of the State of Delaware, without regard to conflict of laws principles or the United Nations Convention on Contracts for the International Sale of Goods. 
The state and federal courts residing in Santa Clara County, California will have exclusive jurisdiction over any dispute or claim arising out of or related to this Agreement, and the parties irrevocably consent to personal jurisdiction and venue in those courts; except that either party may apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction.  54 | 55 | 56 | 57 | 11. No Assignment. NVIDIA may assign, delegate or transfer its rights or obligations under this Agreement by any means or operation of law. You may not, without NVIDIA's prior written consent, assign, delegate or transfer any of your rights or obligations under this Agreement by any means or operation of law, and any attempt to do so is null and void. 58 | 59 | 60 | 61 | 12. Export. The Asset is subject to United States export laws and regulations. You agree to comply with all applicable export, import, trade and economic sanctions laws and regulations, including the Export Administration Regulations and Office of Foreign Assets Control regulations. These laws include restrictions on destinations, end-users and end-use.  62 | 63 | 64 | 65 | 13. Entire Agreement. Regarding the subject matter of this Agreement, the parties agree that this Agreement constitutes the entire and exclusive agreement between the parties and supersedes all prior and contemporaneous communications. If a court of competent jurisdiction rules that a provision of this Agreement is unenforceable, that provision will be deemed modified to the extent necessary to make it enforceable and the remainder of this Agreement will continue in full force and effect. Any amendment to this Agreement must be in writing and signed by authorized representatives of both parties. 66 | -------------------------------------------------------------------------------- /src/nvidia/objectreconstruction/utils/preprocessing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import sys 4 | import yaml 5 | import logging 6 | 7 | import numpy as np 8 | import open3d as o3d 9 | import torch.nn.functional as F 10 | from tqdm import tqdm 11 | 12 | from omegaconf import OmegaConf 13 | from pathlib import Path 14 | from typing import Dict, Any 15 | 16 | from ..networks.foundationstereo import FoundationStereoProcessor 17 | from ..networks.sam2infer import Sam2Infer 18 | 19 | logger = logging.getLogger("preprocessing") 20 | 21 | def load_config(config_path: str) -> Dict[str, Any]: 22 | """ 23 | Load and validate configuration file. 24 | 25 | Args: 26 | config_path: Path to the configuration YAML file. 27 | 28 | Returns: 29 | Dict containing configuration parameters. 30 | 31 | Raises: 32 | SystemExit: If the config file is missing or cannot be parsed 33 | (the error is logged and the process exits with status 1). 34 | """ 35 | try: 36 | with open(config_path, 'r') as f: 37 | return yaml.safe_load(f)  # yaml.load() without a Loader fails on PyYAML >= 6 38 | except FileNotFoundError: 39 | logger.error(f"Configuration file not found: {config_path}") 40 | sys.exit(1) 41 | except yaml.YAMLError as e: 42 | logger.error(f"Error parsing configuration file: {e}") 43 | sys.exit(1) 44 | 45 | def setup_experiment_directory(config: Dict[str, Any]) -> tuple[Path, Path]: 46 | """ 47 | Create and validate experiment directory structure. 48 | 49 | Args: 50 | config: Configuration dictionary.
51 | 52 | Returns: 53 | Tuple of (experiment_path, rgb_path) 54 | """ 55 | exp_path = Path(config['base_path']['base_folder']) 56 | exp_path.mkdir(exist_ok=True) 57 | rgb_path = Path(config['base_path']['image_folder']) 58 | rgb_path.mkdir(exist_ok=True) 59 | logger.info(f"Using experiment directory: {exp_path}") 60 | 61 | return exp_path, rgb_path 62 | 63 | def process_video_frames(config: Dict[str, Any], exp_path: Path, rgb_path: Path) -> None: 64 | """ 65 | Extract frames from input video if not already processed. 66 | 67 | Args: 68 | config: Configuration dictionary. 69 | exp_path: Path to experiment directory. 70 | rgb_path: Path to RGB frames directory. 71 | """ 72 | if rgb_path.exists() and any(rgb_path.iterdir()): 73 | logger.info("RGB frames already extracted") 74 | return 75 | 76 | rgb_path.mkdir(exist_ok=True) 77 | logger.info("Extracting video frames...") 78 | read_video(config['video']['input_path'], str(exp_path), config) 79 | 80 | 81 | def depth2xyzmap(depth:np.ndarray, K, uvs:np.ndarray=None, zmin=0.1): 82 | invalid_mask = (depth < zmin) 83 | H, W = depth.shape[:2] 84 | if uvs is None: 85 | vs, us = np.meshgrid(np.arange(0, H), np.arange(0, W), sparse=False, indexing='ij') 86 | vs = vs.reshape(-1) 87 | us = us.reshape(-1) 88 | else: 89 | uvs = uvs.round().astype(int) 90 | us = uvs[:, 0] 91 | vs = uvs[:, 1] 92 | zs = depth[vs, us] 93 | xs = (us - K[0, 2]) * zs / K[0, 0] 94 | ys = (vs - K[1, 2]) * zs / K[1, 1] 95 | pts = np.stack((xs.reshape(-1), ys.reshape(-1), zs.reshape(-1)), 1) 96 | xyz_map = np.zeros((H, W, 3), dtype=np.float32) 97 | xyz_map[vs, us] = pts 98 | if invalid_mask.any(): 99 | xyz_map[invalid_mask] = 0 100 | return xyz_map 101 | 102 | def toOpen3dCloud(points, colors=None, normals=None): 103 | cloud = o3d.geometry.PointCloud() 104 | cloud.points = o3d.utility.Vector3dVector(points.astype(np.float64)) 105 | 106 | if colors is not None: 107 | if colors.max() > 1: 108 | colors = colors / 255.0 109 | cloud.colors = o3d.utility.Vector3dVector(colors.astype(np.float64)) 110 | if normals is not None: 111 | cloud.normals = o3d.utility.Vector3dVector(normals.astype(np.float64)) 112 | return cloud 113 | 114 | 115 | def read_video(input_video_path, base_folder, config=None): 116 | """ 117 | Read stereo video and split it into left and right frames 118 | Save the frames in the respective folders 119 | """ 120 | 121 | # Validate input video path 122 | if not os.path.exists(input_video_path): 123 | logger.error(f"Input video file not found: {input_video_path}") 124 | return False 125 | 126 | # Try different backends in order of preference 127 | cap = None 128 | backends_to_try = [ 129 | (cv2.CAP_FFMPEG, "FFmpeg"), 130 | (cv2.CAP_GSTREAMER, "GStreamer"), 131 | (cv2.CAP_ANY, "Default") 132 | ] 133 | 134 | for backend_id, backend_name in backends_to_try: 135 | logger.info(f"Trying {backend_name} backend...") 136 | cap = cv2.VideoCapture(input_video_path, backend_id) 137 | if cap.isOpened(): 138 | logger.info(f"Successfully opened with {backend_name} backend") 139 | break 140 | else: 141 | logger.warning(f"{backend_name} backend failed") 142 | if cap: 143 | cap.release() 144 | 145 | # Check if any backend worked 146 | if not cap or not cap.isOpened(): 147 | logger.error(f"Failed to open video file with any backend: {input_video_path}") 148 | logger.error("This could be due to missing codec support or corrupted file") 149 | return False 150 | 151 | # Get video properties for validation 152 | total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) 153 | fps = 
cap.get(cv2.CAP_PROP_FPS) 154 | width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) 155 | height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 156 | 157 | logger.info(f"Video properties: {total_frames} frames, {fps:.2f} FPS, {width}x{height}") 158 | 159 | if total_frames == 0: 160 | logger.error("Video contains no frames or frame count could not be determined") 161 | cap.release() 162 | return False 163 | 164 | # Set default step if config not provided 165 | step = 1 166 | if config and 'video' in config and 'step' in config['video']: 167 | step = config['video']['step'] 168 | 169 | logger.info(f"Processing every {step} frame(s)") 170 | 171 | # Read the video and split it into frames 172 | ret = True 173 | count = 0 174 | frames_saved = 0 175 | 176 | left_path = os.path.join(base_folder, 'left') 177 | if not os.path.exists(left_path): 178 | os.makedirs(left_path) 179 | 180 | right_path = os.path.join(base_folder, 'right') 181 | if not os.path.exists(right_path): 182 | os.makedirs(right_path) 183 | 184 | # Create progress bar 185 | pbar = tqdm(total=total_frames, desc="Processing frames", unit="frames") 186 | 187 | while ret: 188 | ret, image = cap.read() 189 | 190 | if not ret: 191 | break 192 | 193 | if image is None: 194 | logger.warning(f"Frame {count} is None, skipping") 195 | count += 1 196 | pbar.update(1) 197 | continue 198 | 199 | h, w = image.shape[:2] 200 | 201 | # Validate that we have a stereo image (should be twice as wide) 202 | if w < 100: # Minimum reasonable width 203 | logger.error(f"Image width {w} is too small for stereo video") 204 | break 205 | 206 | # separate the stereo video into left and right 207 | left = image[:, :w//2] 208 | right = image[:, w//2:] 209 | 210 | # Save frames based on step interval 211 | if count % step == 0: 212 | left_save_path = os.path.join(left_path, '{}.png'.format(str(frames_saved).zfill(6))) 213 | right_save_path = os.path.join(right_path, '{}.png'.format(str(frames_saved).zfill(6))) 214 | 215 | success_left = cv2.imwrite(left_save_path, left) 216 | success_right = cv2.imwrite(right_save_path, right) 217 | 218 | if success_left and success_right: 219 | frames_saved += 1 220 | else: 221 | logger.error(f"Failed to save frame {count}") 222 | 223 | count += 1 224 | 225 | # Update progress bar with current status 226 | pbar.set_postfix({ 227 | 'saved': frames_saved, 228 | 'step': f'1/{step}' if step > 1 else 'all' 229 | }) 230 | pbar.update(1) 231 | 232 | pbar.close() 233 | 234 | cap.release() 235 | cv2.destroyAllWindows() 236 | 237 | if frames_saved > 0: 238 | logger.info(f"Successfully saved {frames_saved} frame pairs from {count} total frames") 239 | return True 240 | else: 241 | logger.error(f"No frames were saved! 
Processed {count} frames but none could be saved.") 242 | return False -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/deepstream:7.1-triton-multiarch 2 | 3 | ARG CMAKE_VERSION_MAJOR=3 4 | ARG CMAKE_VERSION_MINOR=25 5 | ARG CMAKE_VERSION_PATCH=3 6 | 7 | ARG EIGEN_VERSION_MAJOR=3 8 | ARG EIGEN_VERSION_MINOR=4 9 | ARG EIGEN_VERSION_PATCH=0 10 | 11 | ARG OPENCV_VERSION_MAJOR=4 12 | ARG OPENCV_VERSION_MINOR=11 13 | ARG OPENCV_VERSION_PATCH=0 14 | 15 | ARG PCL_VERSION_MAJOR=1 16 | ARG PCL_VERSION_MINOR=10 17 | ARG PCL_VERSION_PATCH=0 18 | 19 | ARG PYBIND11_VERSION_MAJOR=2 20 | ARG PYBIND11_VERSION_MINOR=13 21 | ARG PYBIND11_VERSION_PATCH=0 22 | 23 | ARG YAML_CPP_VERSION_MAJOR=0 24 | ARG YAML_CPP_VERSION_MINOR=8 25 | ARG YAML_CPP_VERSION_PATCH=0 26 | 27 | # Install dependencies 28 | RUN apt-get update && apt-get install -y --no-install-recommends \ 29 | python3-pip \ 30 | python3-dev \ 31 | libglib2.0-0 \ 32 | libsm6 \ 33 | libxext6 \ 34 | libxrender-dev \ 35 | libblas-dev \ 36 | libssl-dev \ 37 | liblapack-dev \ 38 | gfortran \ 39 | gnupg \ 40 | software-properties-common \ 41 | libflann-dev \ 42 | libboost-filesystem-dev \ 43 | libboost-date-time-dev \ 44 | libboost-iostreams-dev \ 45 | libboost-system-dev \ 46 | libboost-program-options-dev \ 47 | libzmq3-dev \ 48 | ffmpeg \ 49 | && rm -rf /var/lib/apt/lists/* 50 | 51 | # Install CMake 52 | RUN cd / &&\ 53 | wget https://www.cmake.org/files/v${CMAKE_VERSION_MAJOR}.${CMAKE_VERSION_MINOR}/cmake-${CMAKE_VERSION_MAJOR}.${CMAKE_VERSION_MINOR}.${CMAKE_VERSION_PATCH}.tar.gz &&\ 54 | tar xf cmake-${CMAKE_VERSION_MAJOR}.${CMAKE_VERSION_MINOR}.${CMAKE_VERSION_PATCH}.tar.gz &&\ 55 | cd cmake-${CMAKE_VERSION_MAJOR}.${CMAKE_VERSION_MINOR}.${CMAKE_VERSION_PATCH} &&\ 56 | ./configure &&\ 57 | make &&\ 58 | make install 59 | 60 | # Install Eigen 61 | RUN cd / && \ 62 | wget https://gitlab.com/libeigen/eigen/-/archive/${EIGEN_VERSION_MAJOR}.${EIGEN_VERSION_MINOR}.${EIGEN_VERSION_PATCH}/eigen-${EIGEN_VERSION_MAJOR}.${EIGEN_VERSION_MINOR}.${EIGEN_VERSION_PATCH}.tar.gz && \ 63 | tar xf eigen-${EIGEN_VERSION_MAJOR}.${EIGEN_VERSION_MINOR}.${EIGEN_VERSION_PATCH}.tar.gz && \ 64 | cd eigen-${EIGEN_VERSION_MAJOR}.${EIGEN_VERSION_MINOR}.${EIGEN_VERSION_PATCH} && \ 65 | mkdir build && \ 66 | cd build && \ 67 | cmake .. && \ 68 | make install && \ 69 | cd / && \ 70 | rm -rf eigen-${EIGEN_VERSION_MAJOR}.${EIGEN_VERSION_MINOR}.${EIGEN_VERSION_PATCH}.tar.gz eigen-${EIGEN_VERSION_MAJOR}.${EIGEN_VERSION_MINOR}.${EIGEN_VERSION_PATCH} 71 | 72 | RUN pip3 install -U torch==2.6.0 torchvision==0.21.0 73 | 74 | # Install OpenCV 75 | RUN cd / && \ 76 | git clone --depth 1 --branch ${OPENCV_VERSION_MAJOR}.${OPENCV_VERSION_MINOR}.${OPENCV_VERSION_PATCH} https://github.com/opencv/opencv && \ 77 | git clone --depth 1 --branch ${OPENCV_VERSION_MAJOR}.${OPENCV_VERSION_MINOR}.${OPENCV_VERSION_PATCH} https://github.com/opencv/opencv_contrib && \ 78 | mkdir -p /opencv/build && \ 79 | cd /opencv/build && \ 80 | cmake ..
-DCMAKE_BUILD_TYPE=Release \ 81 | -DBUILD_CUDA_STUBS=OFF \ 82 | -DBUILD_DOCS=OFF \ 83 | -DWITH_MATLAB=OFF \ 84 | -Dopencv_dnn_BUILD_TORCH_IMPORTER=OFF \ 85 | -DCUDA_FAST_MATH=ON \ 86 | -DMKL_WITH_OPENMP=ON \ 87 | -DOPENCV_ENABLE_NONFREE=ON \ 88 | -DWITH_OPENMP=ON \ 89 | -DWITH_QT=ON \ 90 | -DWITH_OPENEXR=ON \ 91 | -DENABLE_PRECOMPILED_HEADERS=OFF \ 92 | -DBUILD_opencv_cudacodec=OFF \ 93 | -DINSTALL_PYTHON_EXAMPLES=OFF \ 94 | -DWITH_TIFF=OFF \ 95 | -DWITH_WEBP=OFF \ 96 | -DWITH_FFMPEG=ON \ 97 | -DOPENCV_EXTRA_MODULES_PATH=../../opencv_contrib/modules \ 98 | -DCMAKE_CXX_FLAGS=-std=c++17 \ 99 | -DENABLE_CXX11=OFF \ 100 | -DBUILD_opencv_xfeatures2d=OFF \ 101 | -DOPENCV_DNN_OPENCL=OFF \ 102 | -DWITH_CUDA=ON \ 103 | -DWITH_OPENCL=OFF \ 104 | -DBUILD_opencv_wechat_qrcode=OFF \ 105 | -DCMAKE_CXX_STANDARD=17 \ 106 | -DCMAKE_CXX_STANDARD_REQUIRED=ON \ 107 | -DOPENCV_CUDA_OPTIONS_opencv_test_cudev=-std=c++17 \ 108 | -DCUDA_ARCH_BIN="7.0 7.5 8.0 8.6 9.0" \ 109 | -DCMAKE_INSTALL_PREFIX=/usr/local \ 110 | -DCMAKE_INSTALL_LIBDIR=lib \ 111 | -DINSTALL_PKGCONFIG=ON \ 112 | -DOPENCV_GENERATE_PKGCONFIG=ON \ 113 | -DPKG_CONFIG_PATH=/usr/local/lib/pkgconfig \ 115 | -DINSTALL_C_EXAMPLES=OFF && \ 116 | make -j$(nproc) && \ 117 | make install && \ 118 | cd / && \ 119 | rm -rf /opencv /opencv_contrib 120 | 121 | # Install PCL 122 | RUN cd / && \ 123 | git clone --depth 1 --branch pcl-${PCL_VERSION_MAJOR}.${PCL_VERSION_MINOR}.${PCL_VERSION_PATCH} https://github.com/PointCloudLibrary/pcl && \ 124 | mkdir -p /pcl/build && \ 125 | cd /pcl/build && \ 126 | cmake .. \ 127 | -DCMAKE_BUILD_TYPE=Release \ 128 | -DBUILD_apps=OFF \ 129 | -DBUILD_GPU=OFF \ 130 | -DBUILD_CUDA=OFF \ 131 | -DBUILD_examples=OFF \ 132 | -DBUILD_global_tests=OFF \ 133 | -DBUILD_simulation=OFF \ 134 | -DCUDA_BUILD_EMULATION=OFF \ 135 | -DCMAKE_CXX_FLAGS=-std=c++17 \ 136 | -DPCL_ENABLE_SSE=ON \ 137 | -DPCL_SHARED_LIBS=ON \ 138 | -DWITH_VTK=OFF \ 139 | -DPCL_ONLY_CORE_POINT_TYPES=ON \ 140 | -DPCL_COMMON_WARNINGS=OFF && \ 141 | make -j$(nproc) && \ 142 | make install && \ 143 | cd / && \ 144 | rm -rf /pcl 145 | 146 | # Install Pybind11 147 | RUN cd / && \ 148 | git clone --depth 1 --branch v${PYBIND11_VERSION_MAJOR}.${PYBIND11_VERSION_MINOR}.${PYBIND11_VERSION_PATCH} https://github.com/pybind/pybind11 && \ 149 | mkdir -p /pybind11/build && \ 150 | cd /pybind11/build && \ 151 | cmake .. -DCMAKE_BUILD_TYPE=Release -DPYBIND11_INSTALL=ON -DPYBIND11_TEST=OFF && \ 152 | make -j$(nproc) && \ 153 | make install && \ 154 | cd / && \ 155 | rm -rf /pybind11 156 | 157 | # Install YAML-CPP 158 | RUN cd / && \ 159 | git clone --depth 1 --branch ${YAML_CPP_VERSION_MAJOR}.${YAML_CPP_VERSION_MINOR}.${YAML_CPP_VERSION_PATCH} https://github.com/jbeder/yaml-cpp && \ 160 | mkdir -p /yaml-cpp/build && \ 161 | cd /yaml-cpp/build && \ 162 | cmake .. 
\ 163 | -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \ 164 | -DBUILD_TESTING=OFF \ 165 | -DCMAKE_BUILD_TYPE=Release \ 166 | -DINSTALL_GTEST=OFF \ 167 | -DYAML_CPP_BUILD_TESTS=OFF \ 168 | -DYAML_BUILD_SHARED_LIBS=ON && \ 169 | make -j$(nproc) && \ 170 | make install && \ 171 | cd / && \ 172 | rm -rf /yaml-cpp 173 | 174 | # Create workspace directory 175 | WORKDIR /workspace 176 | 177 | COPY src/requirements.txt /workspace/ 178 | # Install Python dependencies 179 | RUN pip3 install --no-cache-dir -r /workspace/requirements.txt 180 | 181 | # Install additional Python dependencies 182 | RUN pip3 install -U pip && pip3 install --no-cache-dir \ 183 | scikit-learn scikit-image --force-reinstall \ 184 | && pip3 install --no-cache-dir --ignore-installed open3d \ 185 | && pip3 install --no-cache-dir kaolin==0.17.0 -f https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.5.1_cu124.html \ 186 | && pip3 install --no-cache-dir --no-index pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/py310_cu121_pyt241/download.html \ 187 | && pip3 install --no-cache-dir numpy==1.26.4 scipy joblib scikit-learn scikit-image --force-reinstall \ 188 | && pip3 install --no-cache-dir pyrender \ 189 | && pip3 install --no-cache-dir jupyter jupyterlab notebook 190 | 191 | # Set up environment 192 | ENV PATH="/bin/python3:${PATH}" 193 | RUN alias python="/bin/python3" 194 | RUN echo 'alias python="/bin/python3"' >> /etc/bash.bashrc && \ 195 | update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \ 196 | update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \ 197 | echo "export PYTHONPATH=/usr/local/lib/python3.10/dist-packages:\${PYTHONPATH}" >> /etc/bash.bashrc 198 | 199 | RUN cd / && git clone https://github.com/NVlabs/BundleSDF.git 200 | 201 | RUN cp -r /BundleSDF/mycuda /customize_cuda 202 | RUN cd /customize_cuda && \ 203 | TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0" FORCE_CUDA=1 pip install . --no-build-isolation 204 | 205 | # Set up OpenMPI 206 | RUN mkdir -p /opt/hpcx/ompi/lib/x86_64-linux-gnu 207 | RUN ln -s /opt/hpcx/ompi /opt/hpcx/ompi/lib/x86_64-linux-gnu/openmpi 208 | RUN apt remove -y libvtk9-dev || true 209 | 210 | # Reinstall VTK package for OpenCV compatibility 211 | RUN apt-get update && apt-get install -y libvtk9-dev && \ 212 | rm -rf /var/lib/apt/lists/* 213 | 214 | # Build and install BundleTrack (copy only necessary artifacts) 215 | RUN cp -r /BundleSDF/BundleTrack /tmp/BundleTrack 216 | RUN cd /tmp/BundleTrack && \ 217 | mkdir -p build && \ 218 | cd build && \ 219 | cmake .. -DCMAKE_BUILD_TYPE=Release && \ 220 | make -j$(nproc) && \ 221 | # Copy only the essential built artifacts to standard locations \ 222 | cp my_cpp*.so /usr/local/lib/python3.10/dist-packages/ && \ 223 | cp libBundleTrack.so /usr/local/lib/ && \ 224 | cp libMY_CUDA_LIB.so /usr/local/lib/ && \ 225 | # Update library cache \ 226 | ldconfig && \ 227 | # Remove all source code and build artifacts \ 228 | cd / && \ 229 | rm -rf /tmp/BundleTrack 230 | 231 | RUN cd / && \ 232 | git clone https://github.com/NVlabs/FoundationStereo.git 233 | 234 | ENV PYTHONPATH=/FoundationStereo/core:$PYTHONPATH 235 | 236 | # Install sam2 and roma libraries 237 | RUN cd / && git clone https://github.com/facebookresearch/sam2.git &&\ 238 | cd sam2 &&\ 239 | SAM2_BUILD_CUDA=0 pip install -e ".[notebooks]" && \ 240 | python3 setup.py build_ext --inplace 241 | 242 | RUN cd / && git clone https://github.com/Parskatt/RoMa.git &&\ 243 | cd RoMa &&\ 244 | pip3 install . 
&&\ 245 | cd / && rm -rf /RoMa 246 | 247 | # Final cleanup and ldconfig 248 | RUN ldconfig && \ 249 | apt-get autoremove -y && \ 250 | apt-get clean && \ 251 | rm -rf /var/lib/apt/lists/* && \ 252 | rm -rf /root/.cache/pip && \ 253 | # Remove any remaining temporary files \ 254 | rm -rf /tmp/* 255 | 256 | WORKDIR /workspace 257 | 258 | # Copy the entire package structure for proper installation 259 | COPY src /workspace/3d-object-reconstruction/src 260 | COPY README.md /workspace/3d-object-reconstruction/ 261 | COPY notebooks /workspace/3d-object-reconstruction/notebooks 262 | COPY data /workspace/3d-object-reconstruction/data 263 | 264 | # Install the package in editable mode (package files are now under src/) 265 | WORKDIR /workspace/3d-object-reconstruction 266 | RUN pip3 install -e src/ 267 | 268 | # Create Jupyter configuration 269 | RUN jupyter notebook --generate-config && \ 270 | echo "c.NotebookApp.ip = '0.0.0.0'" >> ~/.jupyter/jupyter_notebook_config.py && \ 271 | echo "c.NotebookApp.port = 8888" >> ~/.jupyter/jupyter_notebook_config.py && \ 272 | echo "c.NotebookApp.open_browser = False" >> ~/.jupyter/jupyter_notebook_config.py && \ 273 | echo "c.NotebookApp.allow_root = True" >> ~/.jupyter/jupyter_notebook_config.py && \ 274 | echo "c.NotebookApp.token = ''" >> ~/.jupyter/jupyter_notebook_config.py && \ 275 | echo "c.NotebookApp.password = ''" >> ~/.jupyter/jupyter_notebook_config.py 276 | 277 | # Expose Jupyter port 278 | EXPOSE 8888 279 | 280 | # Set the default command to start Jupyter notebook 281 | CMD ["jupyter", "notebook", "--notebook-dir=/workspace/3d-object-reconstruction", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root", "--NotebookApp.token=''", "--NotebookApp.password=''"] 282 | 283 | -------------------------------------------------------------------------------- /src/nvidia/objectreconstruction/configs/schema.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import List, Optional, Tuple 3 | from omegaconf import MISSING 4 | 5 | """BundleTrack and NVBundleSDF config schemas""" 6 | 7 | @dataclass 8 | class RoMaConfig: 9 | """RoMa configuration.""" 10 | coarse_res: int = 560 11 | upsample_res: Tuple[int, int] = (864, 864) 12 | device: str = "cuda" 13 | weights: str = "/workspace/3d-object-reconstruction/data/weights/roma/roma_outdoor.pth" 14 | dinov2_weights: str = "/workspace/3d-object-reconstruction/data/weights/roma/dinov2_vitl14_pretrain.pth" 15 | 16 | 17 | @dataclass 18 | class CameraConfig: 19 | """Camera configuration.""" 20 | step: int = field(default=1, metadata={"validate": lambda x: x > 0}) 21 | intrinsic: List[float] = field(default_factory=lambda: [3.0796e+03, 0, 2.0000e+03, 0, 3.0751e+03, 1.50001e+03, 0, 0, 1]) 22 | 23 | @dataclass 24 | class FoundationStereoConfig: 25 | """Foundation Stereo configuration.""" 26 | # inference_uri: str = MISSING 27 | pth_path: str = '/workspace/3d-object-reconstruction/data/weights/foundationstereo/model_best_bp2.pth' 28 | cfg_path: str = '/workspace/3d-object-reconstruction/data/weights/foundationstereo/cfg.yaml' 29 | vit_size: str = 'vitl' 30 | scale: float = 0.3 31 | hiera: int = 0 32 | z_far: float = 10 33 | remove_invisible: bool = True 34 | intrinsic: List[float] = field(default_factory=lambda: [3.0796e+03, 0, 2.0000e+03, 0, 3.0751e+03, 1.50001e+03, 0, 0, 1]) 35 | baseline: float = 0.0657696127 36 | 37 | @dataclass 38 | class SAM2Config: 39 | """SAM2 configuration.""" 40 | checkpoint_path: str = 
"/workspace/3d-object-reconstruction/data/weights/sam2/sam2.1_hiera_large.pt" 41 | model_config: str = "/workspace/3d-object-reconstruction/data/weights/sam2/sam2.1_hiera_l.yaml" 42 | bbox: List[int] = field(default_factory=lambda: [1144, 627, 2227, 2232]) 43 | device: str = "cuda" 44 | 45 | @dataclass 46 | class TextureBakeConfig: 47 | """Texture baking configuration.""" 48 | downscale: float = 1.0 49 | texture_res: int = 2048 50 | 51 | @dataclass 52 | class SegmentationConfig: 53 | """Segmentation configuration.""" 54 | ob_scales: List[float] = field(default_factory=lambda: [0.3, 0.3, 0.3]) 55 | tolerance: float = 0.03 56 | 57 | @dataclass 58 | class DepthProcessingConfig: 59 | """Depth processing configuration.""" 60 | zfar: float = 1.0 61 | 62 | @dataclass 63 | class ErodeConfig: 64 | radius: int = 1 65 | diff: float = 0.001 66 | ratio: float = 0.8 # If ratio larger than this, depth set to 0 67 | 68 | erode: ErodeConfig = field(default_factory=ErodeConfig) 69 | 70 | @dataclass 71 | class BilateralFilterConfig: 72 | radius: int = 2 73 | sigma_D: int = 2 74 | sigma_R: int = 100000 75 | 76 | bilateral_filter: BilateralFilterConfig = field(default_factory=BilateralFilterConfig) 77 | 78 | @dataclass 79 | class OutlierRemovalConfig: 80 | num: int = 30 81 | std_mul: int = 3 82 | 83 | outlier_removal: OutlierRemovalConfig = field(default_factory=OutlierRemovalConfig) 84 | 85 | edge_normal_thres: int = 10 # Deg between normal and ray 86 | denoise_cloud: bool = False 87 | percentile: int = 95 88 | 89 | @dataclass 90 | class BundleConfig: 91 | num_iter_outter: int = 7 92 | num_iter_inner: int = 5 93 | window_size: int = 5 # Exclude keyframes, include new frame 94 | max_BA_frames: int = 10 95 | subset_selection_method: str = "normal_orientation_nearest" 96 | depth_association_radius: int = 5 # Used for depth point association 97 | non_neighbor_max_rot: int = 90 98 | non_neighbor_min_visible: float = 0.1 # Ratio of pixel visible 99 | icp_pose_rot_thres: int = 60 # Rotation larger than XX deg is ignored for icp 100 | w_rpi: int = 0 101 | w_p2p: int = 1 # Used in loss.cpp 102 | w_fm: int = 1 103 | w_sdf: int = 0 104 | w_pm: int = 0 105 | robust_delta: float = 0.005 106 | min_fm_edges_newframe: int = 15 107 | image_downscale: List[int] = field(default_factory=lambda: [4]) 108 | feature_edge_dist_thres: float = 0.01 109 | feature_edge_normal_thres: int = 30 # Normal angle should be within this range 110 | max_optimized_feature_loss: float = 0.03 111 | 112 | @dataclass 113 | class KeyframeConfig: 114 | min_interval: int = 1 115 | min_feat_num: int = 0 116 | min_trans: int = 0 117 | min_rot: int = 5 118 | min_visible: int = 1 119 | 120 | @dataclass 121 | class SiftConfig: 122 | scales: List[int] = field(default_factory=lambda: [2, 4, 8]) 123 | max_match_per_query: int = 5 124 | nOctaveLayers: int = 3 125 | contrastThreshold: float = 0.01 126 | edgeThreshold: int = 50 127 | sigma: float = 1.6 128 | 129 | @dataclass 130 | class FeatureCorresConfig: 131 | mutual: bool = True 132 | map_points: bool = True 133 | max_dist_no_neighbor: float = 0.01 134 | max_normal_no_neighbor: int = 20 135 | max_dist_neighbor: float = 0.02 136 | max_normal_neighbor: int = 30 137 | suppression_patch_size: int = 5 138 | max_view_normal_angle: int = 180 139 | min_match_with_ref: int = 5 140 | resize: int = 800 141 | rematch_after_nerf: bool = False 142 | 143 | @dataclass 144 | class RansacConfig: 145 | max_iter: int = 2000 146 | num_sample: int = 3 147 | inlier_dist: float = 0.01 148 | inlier_normal_angle: int = 20 149 | 
desired_succ_rate: float = 0.99 150 | max_trans_neighbor: float = 0.02 # ransac model estimated pose shouldnt be too far 151 | max_rot_deg_neighbor: int = 30 152 | max_trans_no_neighbor: float = 0.01 153 | max_rot_no_neighbor: int = 10 154 | epipolar_thres: int = 1 155 | min_match_after_ransac: int = 5 156 | 157 | @dataclass 158 | class P2PConfig: 159 | projective: bool = False 160 | max_dist: float = 0.02 161 | max_normal_angle: int = 45 162 | 163 | @dataclass 164 | class SDFEdgeConfig: 165 | max_dist: float = 0.02 166 | 167 | @dataclass 168 | class ShapeConfig: 169 | res: float = 0.005 170 | xrange: Tuple[float, float] = (-0.2, 0.2) 171 | yrange: Tuple[float, float] = (-0.2, 0.2) 172 | zrange: Tuple[float, float] = (-0.2, 0.2) 173 | max_weight: int = 100 174 | 175 | @dataclass 176 | class BundleTrackConfig: 177 | debug_dir: str = MISSING 178 | SPDLOG: int = 2 179 | USE_GRAY: bool = False 180 | port: str = "5555" 181 | nerf_port: str = "9999" 182 | downscale: float = 1.0 183 | erode_mask: int = 3 184 | visible_angle: int = 70 # Angle between normal and point to camera origin within XXX is regarded as visible 185 | 186 | segmentation: SegmentationConfig = field(default_factory=SegmentationConfig) 187 | depth_processing: DepthProcessingConfig = field(default_factory=DepthProcessingConfig) 188 | bundle: BundleConfig = field(default_factory=BundleConfig) 189 | keyframe: KeyframeConfig = field(default_factory=KeyframeConfig) 190 | sift: SiftConfig = field(default_factory=SiftConfig) 191 | feature_corres: FeatureCorresConfig = field(default_factory=FeatureCorresConfig) 192 | ransac: RansacConfig = field(default_factory=RansacConfig) 193 | p2p: P2PConfig = field(default_factory=P2PConfig) 194 | sdf_edge: SDFEdgeConfig = field(default_factory=SDFEdgeConfig) 195 | shape: ShapeConfig = field(default_factory=ShapeConfig) 196 | 197 | 198 | @dataclass 199 | class NeRFConfig: 200 | """NeRF configuration.""" 201 | batch_size: int = 32 202 | downscale: float = 0.5 203 | n_step: int = 2000 204 | save_dir: str = MISSING 205 | 206 | # Network architecture 207 | netdepth: int = 8 208 | netwidth: int = 256 209 | netdepth_fine: int = 8 210 | netwidth_fine: int = 256 211 | 212 | # Training parameters 213 | N_rand: int = 2048 214 | lrate: float = 0.01 215 | lrate_pose: float = 0.01 216 | decay_rate: float = 0.1 217 | chunk: int = 99999999999 218 | netchunk: int = 6553600 219 | no_batching: int = 0 220 | amp: bool = True 221 | 222 | # Sampling parameters 223 | N_samples: int = 64 224 | N_samples_around_depth: int = 256 225 | N_importance: int = 0 226 | perturb: int = 1 227 | use_viewdirs: int = 1 228 | 229 | # Embedding parameters 230 | i_embed: int = 1 231 | i_embed_views: int = 2 232 | multires: int = 8 233 | multires_views: int = 3 234 | feature_grid_dim: int = 2 235 | raw_noise_std: int = 0 236 | 237 | # Logging options 238 | i_img: int = 99999 239 | i_weights: int = 999999 240 | i_mesh: int = 999999 241 | i_pose: int = 999999 242 | i_print: int = 999999 243 | 244 | # Hash encoding parameters 245 | finest_res: int = 256 246 | base_res: int = 16 247 | num_levels: int = 16 248 | log2_hashmap_size: int = 22 249 | 250 | # Octree parameters 251 | use_octree: int = 1 252 | first_frame_weight: int = 1 253 | denoise_depth_use_octree_cloud: bool = True 254 | octree_embed_base_voxel_size: float = 0.02 255 | octree_smallest_voxel_size: float = 0.02 256 | octree_raytracing_voxel_size: float = 0.02 257 | octree_dilate_size: float = 0.02 258 | down_scale_ratio: int = 1 259 | 260 | # Scene parameters 261 | bounding_box: 
List[List[float]] = field(default_factory=lambda: [[-1, -1, -1], [1, 1, 1]]) 262 | use_mask: int = 1 263 | dilate_mask_size: int = 0 264 | rays_valid_depth_only: bool = True 265 | near: float = 0.1 266 | far: float = 1.0 267 | 268 | # Loss weights 269 | rgb_weight: int = 10 270 | depth_weight: int = 0 271 | sdf_lambda: int = 5 272 | trunc: float = 0.002 273 | trunc_start: float = 0.002 274 | neg_trunc_ratio: int = 1 275 | trunc_decay_type: str = "" 276 | fs_weight: int = 100 277 | empty_weight: int = 2 278 | fs_rgb_weight: int = 0 279 | fs_sdf: float = 0.1 280 | trunc_weight: int = 6000 281 | tv_loss_weight: int = 0 282 | frame_features: int = 2 283 | optimize_poses: int = 0 284 | pose_reg_weight: int = 0 285 | feature_reg_weight: float = 0.1 286 | share_coarse_fine: int = 1 287 | eikonal_weight: int = 0 288 | 289 | # Rendering mode and mesh extraction 290 | mode: str = "sdf" 291 | mesh_resolution: float = 0.002 292 | max_trans: float = 0.02 293 | max_rot: int = 20 294 | 295 | 296 | @dataclass 297 | class MeshSmoothingConfig: 298 | enabled: bool = True 299 | iterations: int = 2 300 | lambda_: float = 0.5 301 | use_taubin: bool = True 302 | 303 | mesh_smoothing: MeshSmoothingConfig = field(default_factory=MeshSmoothingConfig) 304 | save_octree_clouds: bool = True 305 | 306 | @dataclass 307 | class BasePathConfig: 308 | """Base path configuration.""" 309 | base_folder: str = MISSING 310 | image_folder: str = MISSING 311 | save_dir: str = MISSING 312 | 313 | @dataclass 314 | class NVBundleSDFConfig: 315 | """NVBundleSDF configuration.""" 316 | data_path: str = MISSING 317 | workdir: str = MISSING 318 | downscale: float = 1.0 319 | camera_config: CameraConfig = field(default_factory=CameraConfig) 320 | bundletrack: BundleTrackConfig = field(default_factory=BundleTrackConfig) 321 | foundation_stereo: FoundationStereoConfig = field(default_factory=FoundationStereoConfig) 322 | sam2: SAM2Config = field(default_factory=SAM2Config) 323 | nerf: NeRFConfig = field(default_factory=NeRFConfig) 324 | texture_bake: TextureBakeConfig = field(default_factory=TextureBakeConfig) 325 | roma: RoMaConfig = field(default_factory=RoMaConfig) 326 | base_path: BasePathConfig = field(default_factory=BasePathConfig) 327 | -------------------------------------------------------------------------------- /src/nvidia/objectreconstruction/cli/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Command-line interface for NVIDIA 3D Object Reconstruction. 3 | 4 | This module provides the main entry point for the CLI tool. 
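In order, it runs SAM2 mask extraction, FoundationStereo depth estimation, NVBundleSDF tracking and SDF training, and texture baking, writing the results and timing information to the output directory.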
5 | """ 6 | 7 | import argparse 8 | import logging 9 | import sys 10 | import os 11 | import shutil 12 | from pathlib import Path 13 | import uuid 14 | import yaml 15 | import time 16 | import torch 17 | 18 | from nvidia.objectreconstruction.networks import NVBundleSDF 19 | from nvidia.objectreconstruction.dataloader import ReconstructionDataLoader 20 | from nvidia.objectreconstruction.utils.structures import dataclass_to_dict 21 | from nvidia.objectreconstruction.networks.foundationstereo import run_depth_estimation 22 | from nvidia.objectreconstruction.networks.sam2infer import run_mask_extraction 23 | from nvidia.objectreconstruction.utils.preprocessing import setup_experiment_directory 24 | 25 | 26 | def setup_logging(verbose: bool = False): 27 | """Setup logging configuration.""" 28 | level = logging.DEBUG if verbose else logging.INFO 29 | logging.basicConfig( 30 | level=level, 31 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 32 | ) 33 | 34 | def validate_config_file(config_path: str) -> dict: 35 | """ 36 | Load and validate configuration file. 37 | 38 | Args: 39 | config_path: Path to configuration file 40 | 41 | Returns: 42 | dict: Loaded configuration 43 | 44 | Raises: 45 | FileNotFoundError: If config file doesn't exist 46 | yaml.YAMLError: If config file is invalid 47 | ValueError: If config is missing required sections 48 | """ 49 | if not os.path.exists(config_path): 50 | raise FileNotFoundError(f"Configuration file not found: {config_path}") 51 | 52 | try: 53 | with open(config_path, 'r') as f: 54 | config = yaml.safe_load(f) 55 | except yaml.YAMLError as e: 56 | raise yaml.YAMLError(f"Invalid YAML in config file {config_path}: {e}") 57 | 58 | if not isinstance(config, dict): 59 | raise ValueError(f"Configuration file must contain a dictionary, got {type(config)}") 60 | 61 | # Validate required sections 62 | required_sections = ['bundletrack', 'nerf', 'roma', 'sam2', 'foundation_stereo', 'texture_bake'] 63 | missing_sections = [section for section in required_sections if section not in config] 64 | if missing_sections: 65 | raise ValueError(f"Configuration missing required sections: {missing_sections}") 66 | 67 | return config 68 | 69 | def validate_data_path(data_path: str) -> Path: 70 | """ 71 | Validate and return data path. 
72 | 73 | Args: 74 | data_path: Path to data directory 75 | 76 | Returns: 77 | Path: Validated path object 78 | 79 | Raises: 80 | FileNotFoundError: If data path doesn't exist 81 | NotADirectoryError: If data path is not a directory 82 | """ 83 | path = Path(data_path) 84 | 85 | if not path.exists(): 86 | raise FileNotFoundError(f"Data path does not exist: {data_path}") 87 | 88 | if not path.is_dir(): 89 | raise NotADirectoryError(f"Data path is not a directory: {data_path}") 90 | 91 | return path 92 | 93 | def main(): 94 | """Main CLI entry point with comprehensive error handling.""" 95 | parser = argparse.ArgumentParser( 96 | description="NVIDIA 3D Object Reconstruction Framework", 97 | formatter_class=argparse.RawDescriptionHelpFormatter, 98 | epilog=""" 99 | Examples: 100 | nvidia-3d-reconstruct --help 101 | nvidia-3d-reconstruct --config config.yaml --data-path /path/to/data 102 | """ 103 | ) 104 | 105 | parser.add_argument( 106 | "--config", 107 | type=str, 108 | default="/workspace/3d-object-reconstruction/data/configs/base.yaml", 109 | help="Path to configuration file" 110 | ) 111 | parser.add_argument( 112 | "--data-path", 113 | type=str, 114 | default="/workspace/3d-object-reconstruction/data/samples/retail_item/", 115 | help="Path to input data directory" 116 | ) 117 | parser.add_argument( 118 | "--output-path", 119 | type=str, 120 | default=f"/workspace/3d-object-reconstruction/data/output/{uuid.uuid4()}", 121 | help="Path to output directory for reconstruction results" 122 | ) 123 | parser.add_argument( 124 | "--verbose", "-v", 125 | action="store_true", 126 | help="Enable verbose logging" 127 | ) 128 | parser.add_argument( 129 | "--version", 130 | action="version", 131 | version="nvidia-3d-object-reconstruction 0.1.0" 132 | ) 133 | 134 | # Parse arguments with error handling 135 | try: 136 | args = parser.parse_args() 137 | except SystemExit as e: 138 | # argparse calls sys.exit on error, catch and re-raise 139 | return e.code if e.code is not None else 1 140 | 141 | # Setup logging 142 | setup_logging(args.verbose) 143 | logger = logging.getLogger(__name__) 144 | 145 | try: 146 | start_total = time.time() 147 | logger.info("NVIDIA 3D Object Reconstruction CLI") 148 | 149 | # Validate inputs 150 | logger.info("Validating configuration and inputs...") 151 | config = validate_config_file(args.config) 152 | exp_path = validate_data_path(args.data_path) 153 | 154 | # Create output directory 155 | output_path = Path(args.output_path) 156 | os.makedirs(output_path, exist_ok=True) 157 | logger.info(f"Output directory: {output_path}") 158 | 159 | # Setup configuration paths 160 | config['workdir'] = output_path 161 | config['bundletrack']['debug_dir'] = output_path / "bundletrack" 162 | config['nerf']['save_dir'] = output_path 163 | 164 | # Extract configuration sections 165 | bundletrack_config = config['bundletrack'] 166 | nerf_config = config['nerf'] 167 | roma_config = config['roma'] 168 | sam2_config = config['sam2'] 169 | foundation_stereo_config = config['foundation_stereo'] 170 | texture_config = config['texture_bake'] 171 | 172 | logger.info(f"Starting reconstruction pipeline for: {exp_path}") 173 | 174 | # Copy contents of input data path to output folder 175 | logger.info("Copying input data to output folder...") 176 | for item in exp_path.iterdir(): 177 | if item.is_dir(): 178 | shutil.copytree(item, output_path / item.name, dirs_exist_ok=True) 179 | else: 180 | shutil.copy2(item, output_path) 181 | logger.info("Input data copied successfully") 182 | 183 | # Step 1: Mask 
extraction 184 | logger.info("Step 1/4: Running mask extraction...") 185 | try: 186 | start_mask = time.time() 187 | run_mask_extraction(sam2_config, output_path, output_path / 'left', mask_path=output_path / 'masks') 188 | logger.info("Mask extraction completed successfully") 189 | time_mask = time.time() - start_mask 190 | except Exception as e: 191 | logger.error(f"Mask extraction failed: {e}") 192 | raise RuntimeError(f"Mask extraction step failed: {e}") 193 | 194 | 195 | # Step 2: Depth estimation 196 | logger.info("Step 2/4: Running depth estimation...") 197 | try: 198 | start_depth = time.time() 199 | response = run_depth_estimation(foundation_stereo_config, output_path, output_path / 'left', depth_path=output_path / 'depth') 200 | if not response: 201 | raise RuntimeError("Depth estimation failed") 202 | logger.info("Depth estimation completed successfully") 203 | time_depth = time.time() - start_depth 204 | except Exception as e: 205 | logger.error(f"Depth estimation failed: {e}") 206 | raise RuntimeError(f"Depth estimation step failed: {e}") 207 | 208 | # Step 3: Initialize tracker and datasets 209 | logger.info("Step 3/4: Initializing reconstruction components...") 210 | try: 211 | start_pipeline = time.time() 212 | tracker = NVBundleSDF(nerf_config, bundletrack_config, roma_config, texture_config, logger=logger) 213 | 214 | track_dataset = ReconstructionDataLoader( 215 | str(output_path), 216 | config, 217 | downscale=bundletrack_config['downscale'], 218 | min_resolution=bundletrack_config['min_resolution'] 219 | ) 220 | nerf_dataset = ReconstructionDataLoader( 221 | str(output_path), 222 | config, 223 | downscale=nerf_config['downscale'], 224 | min_resolution=nerf_config['min_resolution'] 225 | ) 226 | texture_dataset = ReconstructionDataLoader( 227 | str(output_path), 228 | config, 229 | downscale=texture_config['downscale'], 230 | min_resolution=texture_config['min_resolution'] 231 | ) 232 | logger.info("Components initialized successfully") 233 | except Exception as e: 234 | logger.error(f"Component initialization failed: {e}") 235 | raise RuntimeError(f"Failed to initialize reconstruction components: {e}") 236 | 237 | # Step 4: Run reconstruction pipeline 238 | logger.info("Step 4/4: Running reconstruction pipeline...") 239 | 240 | # Object tracking 241 | logger.info(" 4a. Running object tracking...") 242 | try: 243 | start_track = time.time() 244 | tracker.run_track(track_dataset) 245 | logger.info(" Object tracking completed") 246 | time_track = time.time() - start_track 247 | except Exception as e: 248 | logger.error(f" Object tracking failed: {e}") 249 | raise RuntimeError(f"Object tracking failed: {e}") 250 | 251 | # SDF training 252 | logger.info(" 4b. Running SDF training...") 253 | try: 254 | start_sdf = time.time() 255 | tracker.run_global_sdf(nerf_dataset) 256 | logger.info(" SDF training completed") 257 | time_sdf = time.time() - start_sdf 258 | except Exception as e: 259 | logger.error(f" SDF training failed: {e}") 260 | raise RuntimeError(f"SDF training failed: {e}") 261 | 262 | # Texture baking 263 | logger.info(" 4c. 
Running texture baking...") 264 | try: 265 | start_texture = time.time() 266 | tracker.run_texture_bake(texture_dataset) 267 | logger.info(" Texture baking completed") 268 | time_texture = time.time() - start_texture 269 | except Exception as e: 270 | logger.error(f" Texture baking failed: {e}") 271 | raise RuntimeError(f"Texture baking failed: {e}") 272 | 273 | logger.info(f"Reconstruction completed successfully for {output_path}") 274 | time_pipeline = time.time() - start_pipeline 275 | times = { 276 | "total": time.time() - start_total, 277 | "mask": time_mask, 278 | "depth": time_depth, 279 | "pipeline": time_pipeline, 280 | "track": time_track, 281 | "sdf": time_sdf, 282 | "texture": time_texture, 283 | "gpu_name": torch.cuda.get_device_name(0), 284 | } 285 | with open(output_path / "run_time.yaml", "w") as f: 286 | yaml.dump(times, f) 287 | return 0 288 | 289 | except KeyboardInterrupt: 290 | logger.warning("Reconstruction interrupted by user (Ctrl+C)") 291 | return 130 # Standard exit code for SIGINT 292 | 293 | except FileNotFoundError as e: 294 | logger.error(f"File not found: {e}") 295 | return 2 296 | 297 | except NotADirectoryError as e: 298 | logger.error(f"Invalid directory: {e}") 299 | return 2 300 | 301 | except yaml.YAMLError as e: 302 | logger.error(f"Configuration file error: {e}") 303 | return 3 304 | 305 | except ValueError as e: 306 | logger.error(f"Configuration validation error: {e}") 307 | return 3 308 | 309 | except RuntimeError as e: 310 | logger.error(f"Processing error: {e}") 311 | return 4 312 | 313 | except MemoryError: 314 | logger.error("Out of memory - try reducing batch size or image resolution") 315 | return 5 316 | 317 | except Exception as e: 318 | logger.error(f"Unexpected error: {e}") 319 | logger.debug("Full traceback:", exc_info=True) 320 | return 1 321 | 322 | if __name__ == "__main__": 323 | sys.exit(main()) -------------------------------------------------------------------------------- /src/nvidia/objectreconstruction/dataloader/reconstruction_dataloader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import numpy as np 4 | import json 5 | class ReconstructionDataLoader: 6 | """ 7 | Data loader for multi-view object reconstruction using Bundle-SDF approach. 8 | 9 | This class manages loading and preprocessing of data for Bundle-SDF reconstruction, 10 | including RGB images, depth maps, and segmentation masks. 11 | 12 | Args: 13 | image_dir (str): Directory containing the dataset with subdirectories: 14 | - left/: RGB images 15 | - depth/: Corresponding depth maps 16 | - masks/: Segmentation masks 17 | - poses/: Camera poses 18 | config (Dict): Configuration dictionary containing: 19 | - camera_config: Camera parameters dictionary with: 20 | - intrinsic: Camera intrinsic matrix (flattened) 21 | downscale (float, optional): Scale factor to resize inputs (1.0 = original size). 22 | Defaults to 1.0. 23 | version (int, optional): Version of the dataloader implementation: 24 | - 1: Original implementation, returns (color, depth, mask) 25 | - 2: Enhanced implementation, returns (left, right, depth, mask, pose, id_str) 26 | where right and pose are None if not available 27 | Defaults to 1. 
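        min_resolution (int, optional): Lower bound on the shorter image side;
            the effective downscale factor is raised if needed so that
            min(H, W) does not drop below roughly this value, while images
            already smaller than it are kept at full size. Defaults to 300.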
28 | 29 | Raises: 30 | ValueError: If directory structure is invalid or files cannot be found 31 | IOError: If image files cannot be read properly 32 | """ 33 | def __init__(self, image_dir, config, downscale=1, version=1, min_resolution=300): 34 | if not os.path.exists(image_dir): 35 | raise ValueError(f"Image directory not found: {image_dir}") 36 | 37 | self.image_dir = image_dir 38 | self.downscale = downscale 39 | self.version = version 40 | 41 | if self.version not in [1, 2]: 42 | raise ValueError(f"Invalid version {version}. Must be 1 or 2.") 43 | 44 | # Validate and process camera intrinsics 45 | if 'camera_config' not in config or 'intrinsic' not in config['camera_config']: 46 | raise ValueError("Config must contain 'camera_config' with 'intrinsic' parameter") 47 | 48 | self.K = np.array(config['camera_config']['intrinsic']).reshape(3, 3) 49 | self.time_step = config['camera_config']['step'] 50 | 51 | # Find and sort frame files 52 | left_dir = os.path.join(self.image_dir, 'left/') 53 | if not os.path.exists(left_dir): 54 | raise ValueError(f"Left image directory not found: {left_dir}") 55 | 56 | # Check for optional directories 57 | right_dir = os.path.join(self.image_dir, 'right/') 58 | depth_dir = os.path.join(self.image_dir, 'depth/') 59 | mask_dir = os.path.join(self.image_dir, 'masks/') 60 | pose_dir = os.path.join(self.image_dir, 'poses/') 61 | 62 | # Track which features are available 63 | self.has_right_images = os.path.exists(right_dir) 64 | self.has_depth_maps = os.path.exists(depth_dir) 65 | self.has_masks = os.path.exists(mask_dir) 66 | self.has_poses = os.path.exists(pose_dir) 67 | 68 | # Validate required directories based on version 69 | if self.version == 2: 70 | missing_dirs = [] 71 | if not self.has_depth_maps: 72 | missing_dirs.append("depth/") 73 | if not self.has_masks: 74 | missing_dirs.append("masks/") 75 | 76 | if missing_dirs: 77 | print(f"Warning: Required directories missing for version 2: {', '.join(missing_dirs)}") 78 | print("Returning None for missing data fields") 79 | 80 | frame_names = [ 81 | p for p in os.listdir(left_dir) 82 | if os.path.splitext(p)[-1].lower() in [".jpg", ".jpeg", ".png"] 83 | ] 84 | 85 | if not frame_names: 86 | raise ValueError(f"No valid image files found in {left_dir}") 87 | 88 | frame_names.sort(key=lambda p: int(os.path.splitext(p)[0][4:])) 89 | 90 | self.color_files = [os.path.join(left_dir, file_name) for file_name in frame_names] 91 | 92 | # Extract frame IDs 93 | self.id_strs = [] 94 | for color_file in self.color_files: 95 | id_str = os.path.basename(color_file)[:-4] 96 | self.id_strs.append(id_str) 97 | 98 | # Get image dimensions from first frame and apply downscaling 99 | first_img_path = self.color_files[0] 100 | first_img = cv2.imread(first_img_path) 101 | if first_img is None: 102 | raise IOError(f"Could not read image: {first_img_path}") 103 | 104 | self.H, self.W = first_img.shape[:2] 105 | if self.H < min_resolution or self.W < min_resolution: 106 | self.downscale = 1.0 107 | else: 108 | scale = min_resolution / min(self.H, self.W) 109 | self.downscale = max(self.downscale, scale) 110 | self.H = int(self.H * self.downscale) 111 | self.W = int(self.W * self.downscale) 112 | 113 | # Scale intrinsics according to downscale factor 114 | self.K[:2] *= self.downscale 115 | self.far = config['nerf']['far'] 116 | 117 | def __len__(self): 118 | """Return the number of frames in the dataset.""" 119 | return len(self.color_files) 120 | 121 | def get_color(self, idx): 122 | """ 123 | Load and preprocess RGB image for 
the specified index. 124 | 125 | Args: 126 | idx (int): Index of the frame to retrieve 127 | 128 | Returns: 129 | np.ndarray: RGB image as np.uint8 with shape (H, W, 3) 130 | 131 | Raises: 132 | IndexError: If idx is out of range 133 | IOError: If image file cannot be read 134 | """ 135 | if idx < 0 or idx >= len(self): 136 | raise IndexError(f"Index {idx} out of range for dataset of length {len(self)}") 137 | 138 | color_path = self.color_files[idx] 139 | color = cv2.imread(color_path) 140 | 141 | if color is None: 142 | raise IOError(f"Failed to load image: {color_path}") 143 | 144 | color = cv2.resize(color, (self.W, self.H), interpolation=cv2.INTER_LINEAR) 145 | return color 146 | 147 | def get_right(self, idx): 148 | """ 149 | Load and preprocess right RGB image for the specified index (if available). 150 | 151 | Args: 152 | idx (int): Index of the frame to retrieve 153 | 154 | Returns: 155 | np.ndarray or None: Right RGB image as np.uint8 with shape (H, W, 3), 156 | or None if not available 157 | 158 | Raises: 159 | IndexError: If idx is out of range 160 | """ 161 | if not self.has_right_images: 162 | return None 163 | 164 | if idx < 0 or idx >= len(self): 165 | raise IndexError(f"Index {idx} out of range for dataset of length {len(self)}") 166 | 167 | right_path = self.color_files[idx].replace('left/', 'right/') 168 | 169 | if not os.path.exists(right_path): 170 | return None 171 | 172 | right = cv2.imread(right_path) 173 | if right is None: 174 | return None 175 | 176 | right = cv2.resize(right, (self.W, self.H), interpolation=cv2.INTER_LINEAR) 177 | return right 178 | 179 | def get_depth(self, idx): 180 | """ 181 | Load and preprocess depth map for the specified index. 182 | 183 | Args: 184 | idx (int): Index of the frame to retrieve 185 | 186 | Returns: 187 | np.ndarray or None: Depth map as np.float32 with shape (H, W), 188 | or None if not available 189 | 190 | Raises: 191 | IndexError: If idx is out of range 192 | """ 193 | if not self.has_depth_maps: 194 | return None 195 | 196 | if idx < 0 or idx >= len(self): 197 | raise IndexError(f"Index {idx} out of range for dataset of length {len(self)}") 198 | 199 | depth_file = self.color_files[idx].replace('left/', 'depth/') 200 | 201 | # if not os.path.exists(depth_file): 202 | # # check if npy file exists 203 | # if os.path.exists(depth_file.replace('.png', '.npy')): 204 | # depth_file = depth_file.replace('.png', '.npy') 205 | # else: 206 | # return None 207 | 208 | if os.path.exists(depth_file.replace('.png', '.npy')): 209 | depth_file = depth_file.replace('.png', '.npy') 210 | 211 | # Support multiple depth formats 212 | try: 213 | if os.path.splitext(depth_file)[1].lower() == '.npy': 214 | depth = np.load(depth_file) 215 | else: 216 | depth = cv2.imread(depth_file, cv2.IMREAD_UNCHANGED) 217 | if depth is None: 218 | return None 219 | depth = depth.astype(np.float32) / 1000.0 220 | 221 | depth = cv2.resize(depth, (self.W, self.H), interpolation=cv2.INTER_NEAREST) 222 | return depth 223 | except Exception as e: 224 | print(f"Warning: Failed to load depth map {depth_file}: {e}") 225 | return None 226 | 227 | def get_mask(self, idx): 228 | """ 229 | Load and preprocess segmentation mask for the specified index. 
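        The raw mask is binarized when multi-channel, resized with
        nearest-neighbor interpolation, then cleaned with a 5x5 erosion
        followed by dilation (a morphological opening) to suppress boundary
        speckle.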
230 | 231 | Args: 232 | idx (int): Index of the frame to retrieve 233 | 234 | Returns: 235 | np.ndarray or None: Binary mask as np.uint8 with shape (H, W), 236 | or None if not available 237 | 238 | Raises: 239 | IndexError: If idx is out of range 240 | """ 241 | if not self.has_masks: 242 | return None 243 | 244 | if idx < 0 or idx >= len(self): 245 | raise IndexError(f"Index {idx} out of range for dataset of length {len(self)}") 246 | 247 | mask_file = self.color_files[idx].replace('left/', 'masks/') 248 | 249 | if not os.path.exists(mask_file): 250 | return None 251 | 252 | try: 253 | mask = cv2.imread(mask_file, -1) 254 | 255 | if mask is None: 256 | return None 257 | 258 | # Ensure binary format 259 | if len(mask.shape) == 3: 260 | mask = (mask.sum(axis=-1) > 0).astype(np.uint8) 261 | 262 | mask = cv2.resize(mask, (self.W, self.H), interpolation=cv2.INTER_NEAREST) 263 | mask = cv2.erode(mask, np.ones((5, 5), np.uint8)) 264 | mask = cv2.dilate(mask, np.ones((5, 5), np.uint8)) #add denoising and smoothing to masks 265 | return mask 266 | except Exception as e: 267 | print(f"Warning: Failed to load mask {mask_file}: {e}") 268 | return None 269 | 270 | def get_pose(self, idx): 271 | """ 272 | Load camera pose for the specified index (if available). 273 | 274 | Args: 275 | idx (int): Index of the frame to retrieve 276 | 277 | Returns: 278 | np.ndarray or None: Camera pose matrix, or None if not available 279 | 280 | Raises: 281 | IndexError: If idx is out of range 282 | """ 283 | if not self.has_poses: 284 | return None 285 | 286 | if idx < 0 or idx >= len(self): 287 | raise IndexError(f"Index {idx} out of range for dataset of length {len(self)}") 288 | 289 | pose_file = self.color_files[idx].replace('left/', 'poses/').replace( 290 | os.path.splitext(self.color_files[idx])[1], '.json') 291 | 292 | if not os.path.exists(pose_file): 293 | return None 294 | 295 | try: 296 | with open(pose_file, 'r') as f: 297 | pose_data = json.load(f) 298 | 299 | # Assuming pose data is a flat list or nested list that can be converted to a matrix 300 | pose = np.array(pose_data) 301 | return pose 302 | except Exception as e: 303 | print(f"Warning: Failed to load pose {pose_file}: {e}") 304 | return None 305 | 306 | def __getitem__(self, idx): 307 | """ 308 | Get preprocessed data for the specified index. 
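        Illustrative version-1 usage: ``color, depth, mask = loader[0]``.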
309 | 310 | Args: 311 | idx (int): Index of the frame to retrieve 312 | 313 | Returns: 314 | If version=1: 315 | Tuple containing: 316 | - RGB image (H, W, 3) as np.uint8 317 | - Depth map (H, W) as np.float32, or None if not available 318 | - Binary mask (H, W) as np.uint8, or None if not available 319 | 320 | If version=2: 321 | Tuple containing: 322 | - left: RGB image (H, W, 3) as np.uint8 323 | - right: RGB image (H, W, 3) as np.uint8, or None if not available 324 | - depth: Depth map (H, W) as np.float32, or None if not available 325 | - mask: Binary mask (H, W) as np.uint8, or None if not available 326 | - pose: Camera pose matrix as np.ndarray, or None if not available 327 | - id_str: ID string of the frame 328 | 329 | Raises: 330 | IndexError: If idx is out of range 331 | """ 332 | if idx < 0 or idx >= len(self): 333 | raise IndexError(f"Index {idx} out of range for dataset of length {len(self)}") 334 | 335 | color = self.get_color(idx) 336 | depth = self.get_depth(idx) 337 | mask = self.get_mask(idx) 338 | id_str = self.id_strs[idx] 339 | 340 | if self.version == 1: 341 | return color, depth, mask 342 | elif self.version == 2: 343 | # Version 2 returns data in a format compatible with ReconstructionDataloader 344 | right = self.get_right(idx) 345 | pose = self.get_pose(idx) 346 | return color, right, depth, mask, pose, id_str 347 | 348 | def get_camera_intrinsics(self): 349 | """ 350 | Get the camera intrinsics matrix adjusted for current downscale factor. 351 | 352 | Returns: 353 | np.ndarray: 3x3 camera intrinsics matrix 354 | """ 355 | return self.K.copy() 356 | 357 | def get_image_dimensions(self): 358 | """ 359 | Get the current image dimensions after downscaling. 360 | 361 | Returns: 362 | Tuple[int, int]: Height and width of the images 363 | """ 364 | return self.H, self.W 365 | 366 | def get_frame_id(self, idx): 367 | """ 368 | Get the ID string for the frame at the specified index. 369 | 370 | Args: 371 | idx (int): Index of the frame 372 | 373 | Returns: 374 | str: ID string of the frame 375 | 376 | Raises: 377 | IndexError: If idx is out of range 378 | """ 379 | if idx < 0 or idx >= len(self): 380 | raise IndexError(f"Index {idx} out of range for dataset of length {len(self)}") 381 | 382 | return self.id_strs[idx] -------------------------------------------------------------------------------- /src/nvidia/objectreconstruction/networks/foundationstereo.py: -------------------------------------------------------------------------------- 1 | """ 2 | FoundationStereo Network Implementation for 3D Object Reconstruction. 3 | 4 | This module provides a wrapper around the FoundationStereo model for stereo 5 | depth estimation. It includes preprocessing utilities, model initialization, 6 | and a high-level processor for batch depth map generation. 
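Metric depth is recovered from the predicted disparity as
depth = fx * baseline / disparity, using the intrinsics and baseline
supplied in the configuration.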
7 | 8 | Classes: 9 | InputPadder: Utility class for padding images to required dimensions 10 | FoundationStereoNet: Wrapper for the FoundationStereo model 11 | FoundationStereoProcessor: High-level processor for stereo depth estimation 12 | 13 | Functions: 14 | run_depth_estimation: Main entry point for depth estimation pipeline 15 | """ 16 | 17 | import cv2 18 | import torch 19 | import imageio 20 | import numpy as np 21 | import torch.nn.functional as F 22 | import sys 23 | sys.path.append('/FoundationStereo/core') 24 | from foundation_stereo import FoundationStereo 25 | from tqdm import tqdm 26 | from pathlib import Path 27 | from typing import Dict, Any, List, Tuple, Union, Optional 28 | from loguru import logger 29 | from omegaconf import OmegaConf 30 | 31 | 32 | class InputPadder: 33 | """ 34 | Utility class for padding images to dimensions divisible by a given factor. 35 | 36 | This class ensures that input images have dimensions that are compatible 37 | with neural network architectures that require specific divisibility 38 | constraints (e.g., divisible by 8 or 32). 39 | 40 | Attributes: 41 | ht (int): Original image height 42 | wd (int): Original image width 43 | _pad (List[int]): Padding values [left, right, top, bottom] 44 | """ 45 | 46 | def __init__( 47 | self, 48 | dims: Tuple[int, ...], 49 | mode: str = 'sintel', 50 | divis_by: int = 8, 51 | force_square: bool = False 52 | ) -> None: 53 | """ 54 | Initialize the InputPadder. 55 | 56 | Args: 57 | dims: Image dimensions tuple (..., H, W) 58 | mode: Padding mode, either 'sintel' or other 59 | divis_by: Factor by which dimensions should be divisible 60 | force_square: If True, pad to make image square 61 | 62 | Example: 63 | >>> padder = InputPadder((1, 3, 480, 640), divis_by=32) 64 | >>> padded_imgs = padder.pad(img1, img2) 65 | """ 66 | self.ht, self.wd = dims[-2:] 67 | 68 | if force_square: 69 | max_side = max(self.ht, self.wd) 70 | pad_ht = ((max_side // divis_by) + 1) * divis_by - self.ht 71 | pad_wd = ((max_side // divis_by) + 1) * divis_by - self.wd 72 | else: 73 | pad_ht = (((self.ht // divis_by) + 1) * divis_by - self.ht) % divis_by 74 | pad_wd = (((self.wd // divis_by) + 1) * divis_by - self.wd) % divis_by 75 | 76 | if mode == 'sintel': 77 | self._pad = [ 78 | pad_wd // 2, pad_wd - pad_wd // 2, 79 | pad_ht // 2, pad_ht - pad_ht // 2 80 | ] 81 | else: 82 | self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, 0, pad_ht] 83 | 84 | def pad(self, *inputs: torch.Tensor) -> List[torch.Tensor]: 85 | """ 86 | Apply padding to input tensors. 87 | 88 | Args: 89 | *inputs: Variable number of 4D tensors to pad 90 | 91 | Returns: 92 | List of padded tensors with same order as inputs 93 | 94 | Raises: 95 | AssertionError: If any input tensor is not 4-dimensional 96 | """ 97 | assert all((x.ndim == 4) for x in inputs), \ 98 | "All inputs must be 4-dimensional tensors" 99 | return [F.pad(x, self._pad, mode='replicate') for x in inputs] 100 | 101 | def unpad(self, x: torch.Tensor) -> torch.Tensor: 102 | """ 103 | Remove padding from a tensor. 
104 | 105 | Args: 106 | x: 4D tensor to unpad 107 | 108 | Returns: 109 | Tensor with padding removed 110 | 111 | Raises: 112 | AssertionError: If input tensor is not 4-dimensional 113 | """ 114 | assert x.ndim == 4, "Input must be a 4-dimensional tensor" 115 | ht, wd = x.shape[-2:] 116 | c = [ 117 | self._pad[2], ht - self._pad[3], 118 | self._pad[0], wd - self._pad[1] 119 | ] 120 | return x[..., c[0]:c[1], c[2]:c[3]] 121 | 122 | 123 | class FoundationStereoNet(FoundationStereo): 124 | """ 125 | Wrapper class for FoundationStereo network. 126 | 127 | This class extends the base FoundationStereo class with additional 128 | functionality for configuration management, weight loading, and 129 | simplified inference interface for stereo depth estimation. 130 | 131 | Attributes: 132 | config (Dict[str, Any]): Model configuration parameters 133 | """ 134 | 135 | def __init__(self, config: Dict[str, Any]) -> None: 136 | """ 137 | Initialize the FoundationStereo network. 138 | 139 | Args: 140 | config: Configuration dictionary containing model parameters 141 | including architecture settings and hyperparameters 142 | 143 | Example: 144 | >>> config = {'hidden_dims': [128, 128], 'corr_levels': 4} 145 | >>> model = FoundationStereoNet(config) 146 | """ 147 | super().__init__(config) 148 | self.config = config 149 | 150 | def load_weights(self) -> None: 151 | """ 152 | Load pre-trained weights from checkpoint file. 153 | 154 | The checkpoint file path should be specified in config['pth_path']. 155 | The checkpoint is expected to contain a 'model' key with the 156 | state dictionary. 157 | 158 | Raises: 159 | FileNotFoundError: If checkpoint file doesn't exist 160 | KeyError: If checkpoint doesn't contain 'model' key 161 | RuntimeError: If state dict loading fails 162 | """ 163 | try: 164 | ckpt = torch.load(self.config['pth_path'], weights_only=False) 165 | self.load_state_dict(ckpt['model']) 166 | logger.info(f"Loaded weights from {self.config['pth_path']}") 167 | except FileNotFoundError as e: 168 | logger.error(f"Checkpoint file not found: {self.config['pth_path']}") 169 | raise e 170 | except KeyError as e: 171 | logger.error(f"Checkpoint missing 'model' key: {e}") 172 | raise e 173 | 174 | def forward( 175 | self, 176 | left: torch.Tensor, 177 | right: torch.Tensor 178 | ) -> torch.Tensor: 179 | """ 180 | Perform forward pass for stereo depth estimation. 181 | 182 | Args: 183 | left: Left stereo image tensor of shape [B, C, H, W] 184 | right: Right stereo image tensor of shape [B, C, H, W] 185 | 186 | Returns: 187 | Disparity map tensor of shape [B, 1, H, W] representing 188 | pixel disparities between left and right images 189 | """ 190 | return super().forward(left, right, iters=32, test_mode=True) 191 | 192 | 193 | class FoundationStereoProcessor: 194 | """ 195 | High-level processor for stereo depth estimation. 196 | 197 | This class manages the complete pipeline from loading stereo image pairs 198 | to generating depth maps. It handles image preprocessing, network inference, 199 | and depth conversion with configurable camera parameters. 
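    The processor assumes the stereo layout used elsewhere in this package:
    left images in a left/ directory and right images in a sibling right/
    directory whose filenames substitute 'right' for 'left'.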
200 | 201 | Attributes: 202 | config (Dict[str, Any]): Configuration parameters 203 | net (FoundationStereoNet): The stereo network model 204 | rgb_path (Path): Path to input RGB images 205 | output_path (Path): Path for output depth maps 206 | left_images (List[Path]): List of left stereo image paths 207 | intrinsic (np.ndarray): Camera intrinsic matrix (3x3) 208 | baseline (float): Baseline distance between cameras 209 | """ 210 | 211 | def __init__( 212 | self, 213 | config: Dict[str, Any], 214 | rgb_path: Path, 215 | output_path: Path 216 | ) -> None: 217 | """ 218 | Initialize the stereo depth estimation processor. 219 | 220 | Args: 221 | config: Configuration dictionary containing: 222 | - pth_path: Path to model weights 223 | - intrinsic: Camera intrinsics matrix (3x3) 224 | - baseline: Baseline distance between cameras 225 | - scale: Resize scale factor for images 226 | rgb_path: Path to directory containing left stereo images 227 | Supports png, jpg, jpeg formats 228 | output_path: Directory path where depth maps will be saved 229 | as .npy files 230 | 231 | Raises: 232 | RuntimeError: If CUDA is not available 233 | FileNotFoundError: If rgb_path doesn't exist 234 | """ 235 | self.config = config 236 | 237 | # Initialize and setup the stereo network 238 | self.net = FoundationStereoNet(config) 239 | self.net.load_weights() 240 | 241 | if not torch.cuda.is_available(): 242 | raise RuntimeError("CUDA is required but not available") 243 | 244 | self.net.cuda() # Move model to GPU 245 | self.net.eval() # Set to evaluation mode 246 | 247 | self.rgb_path = Path(rgb_path) 248 | self.output_path = Path(output_path) 249 | 250 | if not self.rgb_path.exists(): 251 | raise FileNotFoundError(f"RGB path does not exist: {rgb_path}") 252 | 253 | # Discover and sort left stereo images 254 | self._discover_images() 255 | 256 | # Extract camera parameters from configuration 257 | self._setup_camera_params() 258 | 259 | def _discover_images(self) -> None: 260 | """Discover and sort left stereo images from the input directory.""" 261 | left_images = [] 262 | supported_formats = ['*.png', '*.jpg', '*.jpeg'] 263 | 264 | for ext in supported_formats: 265 | left_images.extend(self.rgb_path.glob(ext)) 266 | 267 | self.left_images = sorted(left_images) 268 | 269 | if not self.left_images: 270 | logger.warning(f"No images found in {self.rgb_path}") 271 | 272 | logger.info(f"Found {len(self.left_images)} left images") 273 | 274 | def _setup_camera_params(self) -> None: 275 | """Extract and setup camera parameters from configuration.""" 276 | self.intrinsic = np.array(self.config['intrinsic']).reshape(3, 3) 277 | # Scale intrinsics to match resized images 278 | self.intrinsic[:2] *= self.config['scale'] 279 | self.baseline = self.config['baseline'] 280 | 281 | logger.info(f"Camera baseline: {self.baseline}") 282 | logger.info(f"Image scale factor: {self.config['scale']}") 283 | 284 | def infer( 285 | self, 286 | left_input: Union[str, Path, np.ndarray], 287 | right_input: Union[str, Path, np.ndarray], 288 | return_disparity: bool = False 289 | ) -> np.ndarray: 290 | """ 291 | Perform stereo depth inference on a single pair of images. 
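        Illustrative call (paths are examples only):
        ``depth = processor.infer('left/left000000.png', 'right/right000000.png')``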
292 | 293 | Args: 294 | left_input: Path to left stereo image or numpy array 295 | right_input: Path to right stereo image or numpy array 296 | return_disparity: If True, returns disparity map instead of depth 297 | 298 | Returns: 299 | Depth map or disparity map as numpy array of shape [H, W] 300 | 301 | Raises: 302 | ValueError: If inputs are invalid or incompatible 303 | RuntimeError: If inference fails 304 | """ 305 | try: 306 | # Load images - handle both file paths and numpy arrays 307 | if isinstance(left_input, (str, Path)): 308 | left = imageio.imread(str(left_input)) 309 | right = imageio.imread(str(right_input)) 310 | else: 311 | # Assume numpy arrays passed directly 312 | left = left_input 313 | right = right_input 314 | 315 | # Validate image shapes 316 | if left.shape != right.shape: 317 | raise ValueError( 318 | f"Image shapes don't match: {left.shape} vs {right.shape}" 319 | ) 320 | 321 | # Resize images according to configuration scale 322 | scale = self.config['scale'] 323 | left = cv2.resize( 324 | left, fx=scale, fy=scale, dsize=None, 325 | interpolation=cv2.INTER_LINEAR 326 | ) 327 | right = cv2.resize( 328 | right, fx=scale, fy=scale, dsize=None, 329 | interpolation=cv2.INTER_LINEAR 330 | ) 331 | H, W = left.shape[:2] 332 | 333 | # Convert images to PyTorch tensors and move to GPU 334 | img0 = torch.as_tensor(left).cuda().float()[None].permute(0, 3, 1, 2) 335 | img1 = torch.as_tensor(right).cuda().float()[None].permute(0, 3, 1, 2) 336 | 337 | # Pad images to be divisible by 32 for network processing 338 | padder = InputPadder(img0.shape, divis_by=32, force_square=False) 339 | img0, img1 = padder.pad(img0, img1) 340 | 341 | # Run stereo matching inference 342 | with torch.no_grad(): 343 | disp = self.net(img0, img1) 344 | 345 | # Remove padding and convert to numpy 346 | disp = padder.unpad(disp.float()) 347 | disp = disp.data.cpu().numpy().reshape(H, W) 348 | 349 | if return_disparity: 350 | return disp 351 | 352 | # Convert disparity to metric depth using camera parameters 353 | # Depth = (focal_length * baseline) / disparity 354 | # Avoid division by zero 355 | disp_safe = np.where(disp > 0, disp, np.inf) 356 | depth = self.intrinsic[0, 0] * self.baseline / disp_safe 357 | 358 | return depth 359 | 360 | except Exception as e: 361 | logger.error(f"Inference failed: {e}") 362 | raise RuntimeError(f"Stereo inference failed: {e}") from e 363 | 364 | def run(self) -> None: 365 | """ 366 | Process all stereo image pairs to generate depth maps. 367 | 368 | Main processing loop that: 369 | 1. Loads left/right stereo image pairs 370 | 2. Uses the infer() method for consistent processing 371 | 3. Saves depth maps as numpy arrays 372 | 373 | For each left image, expects corresponding right image with 'left' 374 | replaced by 'right' in the filename. 375 | 376 | Output depth maps are saved as {image_name}.npy in the output directory. 
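        For example, left/left000012.png is paired with right/right000012.png
        and produces left000012.npy.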
377 | 378 | Raises: 379 | FileNotFoundError: If corresponding right image is not found 380 | RuntimeError: If processing fails 381 | """ 382 | if not self.left_images: 383 | logger.warning("No left images found to process") 384 | return 385 | 386 | # Ensure output directory exists 387 | self.output_path.mkdir(parents=True, exist_ok=True) 388 | 389 | successful_count = 0 390 | 391 | for left_path in tqdm(self.left_images, desc="Processing stereo pairs"): 392 | try: 393 | base_name = left_path.stem 394 | 395 | # Construct right image path 396 | right_path = left_path.parent.parent / 'right' / left_path.name.replace('left', 'right') 397 | 398 | if not right_path.exists(): 399 | logger.warning(f"Right image not found: {right_path}") 400 | continue 401 | 402 | # Use the infer method for consistent processing 403 | depth = self.infer( 404 | left_path, right_path, return_disparity=False 405 | ) 406 | 407 | # Save depth map as numpy array 408 | output_file = self.output_path / f"{base_name}.npy" 409 | np.save(output_file, depth) 410 | successful_count += 1 411 | 412 | except Exception as e: 413 | logger.error(f"Failed to process {left_path}: {e}") 414 | continue 415 | 416 | logger.info( 417 | f"Successfully processed {successful_count}/{len(self.left_images)} " 418 | f"stereo pairs" 419 | ) 420 | 421 | 422 | def run_depth_estimation( 423 | config: Dict[str, Any], 424 | exp_path: Path, 425 | rgb_path: Path, 426 | depth_path: Optional[Path] = None 427 | ) -> Optional[bool]: 428 | """ 429 | Set up and run depth estimation pipeline. 430 | 431 | This function orchestrates the complete depth estimation process: 432 | 1. Sets up output directory structure 433 | 2. Checks if depth maps already exist 434 | 3. Runs FoundationStereo processing if needed 435 | 4. Returns success status 436 | 437 | Args: 438 | config: Configuration dictionary containing model and camera parameters 439 | exp_path: Path to experiment directory 440 | rgb_path: Path to RGB frames directory containing left/right images 441 | depth_path: Optional custom path for depth output (defaults to exp_path/depth) 442 | 443 | Returns: 444 | True if successful, False/None if failed 445 | 446 | Example: 447 | >>> config = { 448 | ... 'cfg_path': 'model_config.yaml', 449 | ... 'pth_path': 'weights.pth', 450 | ... 'intrinsic': [[fx, 0, cx], [0, fy, cy], [0, 0, 1]], 451 | ... 'baseline': 0.1, 452 | ... 'scale': 0.5 453 | ... 
422 | def run_depth_estimation(
423 |     config: Dict[str, Any],
424 |     exp_path: Path,
425 |     rgb_path: Path,
426 |     depth_path: Optional[Path] = None
427 | ) -> bool:
428 |     """
429 |     Set up and run depth estimation pipeline.
430 | 
431 |     This function orchestrates the complete depth estimation process:
432 |     1. Sets up the output directory structure
433 |     2. Checks if depth maps already exist
434 |     3. Runs FoundationStereo processing if needed
435 |     4. Returns success status
436 | 
437 |     Args:
438 |         config: Configuration dictionary containing model and camera parameters
439 |         exp_path: Path to experiment directory
440 |         rgb_path: Path to RGB frames directory containing left/right images
441 |         depth_path: Optional custom path for depth output (defaults to exp_path/depth)
442 | 
443 |     Returns:
444 |         True if depth maps are available, False if processing failed
445 | 
446 |     Example:
447 |         >>> config = {
448 |         ...     'cfg_path': 'model_config.yaml',
449 |         ...     'pth_path': 'weights.pth',
450 |         ...     'intrinsic': [[fx, 0, cx], [0, fy, cy], [0, 0, 1]],
451 |         ...     'baseline': 0.1,
452 |         ...     'scale': 0.5
453 |         ... }
454 |         >>> success = run_depth_estimation(config, exp_path, rgb_path)
455 |     """
456 |     # Setup depth output directory
457 |     if depth_path is None:
458 |         depth_path = exp_path / 'depth'
459 |     depth_path.mkdir(parents=True, exist_ok=True)
460 |     logger.info(f"Depth estimation directory: {depth_path}")
461 | 
462 |     try:
463 |         # Check if depth images already exist (either all .npy or all .png)
464 |         depth_images_npy = list(depth_path.glob('*.npy'))
465 |         depth_images_png = list(depth_path.glob('*.png'))
466 |         rgb_images = list(rgb_path.glob('*.png'))
467 | 
468 |         # Check if we have sufficient depth images in either format
469 |         if (depth_images_npy and len(depth_images_npy) >= len(rgb_images)) or \
470 |             (depth_images_png and len(depth_images_png) >= len(rgb_images)):
471 |             logger.info("Depth images already exist, skipping depth estimation")
472 |             return True
473 | 
474 |         # Run depth estimation
475 |         logger.info("Running depth estimation...")
476 | 
477 |         # Load additional model configuration and merge it with the pipeline config
478 |         cfg_model = OmegaConf.load(config['cfg_path'])
479 |         args = OmegaConf.merge(OmegaConf.create(config), cfg_model)
480 | 
481 |         # Initialize and run processor
482 |         processor = FoundationStereoProcessor(args, rgb_path, depth_path)
483 |         processor.run()
484 | 
485 |         logger.info("Depth estimation completed successfully")
486 |         return True
487 | 
488 |     except Exception as e:
489 |         logger.error(f"Error running depth estimation: {e}")
490 |         return False
491 | 
--------------------------------------------------------------------------------
/data/configs/base.yaml:
--------------------------------------------------------------------------------
1 | # Main data paths for input and output
2 | data_path: /workspace/3d-object-reconstruction/data/samples/retail_item/ # Path to input data folder containing images and masks
3 | workdir: /workspace/3d-object-reconstruction/data/output/retail_item/ # Path to output directory for reconstruction results
4 | downscale: 1.0 # Currently unused; each pipeline stage specifies its own downscale factor
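# NOTE: each `intrinsic` list in this file is the row-major flattening of the
# 3x3 camera matrix [[fx, 0, cx], [0, fy, cy], [0, 0, 1]]; for the sample
# Qoocam capture, fx = 3079.6, cx = 2000.0, fy = 3075.1, cy = 1500.01.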
5 | # Camera intrinsic parameters in 3x3 matrix format
6 | camera_config:
7 |   step: 4
8 |   intrinsic: # Camera (Qoocam) intrinsic matrix; replace with your camera's calibration values
9 |   - 3079.6 # fx
10 |   - 0
11 |   - 2000.0 # cx
12 |   - 0
13 |   - 3075.1 # fy
14 |   - 1500.01 # cy
15 |   - 0
16 |   - 0
17 |   - 1
18 | 
19 | # BundleTrack configuration for camera pose estimation and tracking
20 | bundletrack:
21 |   debug_dir: /workspace/3d-object-reconstruction/data/output/retail_item/ # Directory for debug outputs
22 |   SPDLOG: 1 # Logging level
23 |   USE_GRAY: false
24 |   port: '5555'
25 |   nerf_port: '9999'
26 |   downscale: 1.0 # Image downscale factor for tracking
27 |   min_resolution: 300 # Minimum resolution for tracking
28 |   erode_mask: 3 # Mask erosion size to remove boundary artifacts
29 |   visible_angle: 70
30 | 
31 | # Object segmentation parameters, unused for now since SAM2 handles segmentation
32 | segmentation:
33 |   ob_scales:
34 |   - 0.3
35 |   - 0.3
36 |   - 0.3
37 |   tolerance: 0.03
38 | 
39 | # Depth map processing parameters
40 | depth_processing:
41 |   zfar: 1.0 # Depth max bound (same as NeRF far); may need adjustment depending on scene scale
42 |   erode:
43 |     radius: 1
44 |     diff: 0.001
45 |     ratio: 0.8
46 |   bilateral_filter:
47 |     radius: 2
48 |     sigma_D: 2
49 |     sigma_R: 100000
50 |   outlier_removal:
51 |     num: 30
52 |     std_mul: 3
53 |   edge_normal_thres: 10
54 |   denoise_cloud: false
55 |   percentile: 95 # Percentile for depth truncation
56 | 
57 | # Bundle adjustment parameters
58 | bundle:
59 |   num_iter_outter: 7
60 |   num_iter_inner: 5
61 |   window_size: 5 # Window size for non-keyframe saving
62 |   max_BA_frames: 10 # Maximum frames used in bundle adjustment
63 |   subset_selection_method: normal_orientation_nearest
64 |   depth_association_radius: 5
65 |   non_neighbor_max_rot: 90 # Maximum rotation difference between two frames
66 |   non_neighbor_min_visible: 0.1 # Minimum covisibility
67 |   icp_pose_rot_thres: 60
68 |   w_rpi: 0 # Not used
69 |   w_p2p: 1 # Not used
70 |   w_fm: 1 # Not used
71 |   w_sdf: 0 # Not used
72 |   w_pm: 0 # Not used
73 |   robust_delta: 0.005 # Delta scale for the Huber loss
74 |   min_fm_edges_newframe: 15
75 |   image_downscale: # Image downscale factor
76 |   - 4
77 |   feature_edge_dist_thres: 0.01 # Sparse feature edge distance threshold, not used
78 |   feature_edge_normal_thres: 30 # Sparse feature edge normal threshold, not used
79 |   max_optimized_feature_loss: 0.03 # Max optimized feature loss, not used
80 | 
81 | # Keyframe selection parameters
82 | keyframe:
83 |   min_interval: 1
84 |   min_feat_num: 0
85 |   min_trans: 0
86 |   min_rot: 5
87 |   min_visible: 1
88 | 
89 | # SIFT feature detection parameters
90 | sift:
91 |   scales:
92 |   - 2
93 |   - 4
94 |   - 8
95 |   max_match_per_query: 5
96 |   nOctaveLayers: 3
97 |   contrastThreshold: 0.01
98 |   edgeThreshold: 50
99 |   sigma: 1.6
100 | 
101 | # Feature correspondence parameters
102 | feature_corres:
103 |   mutual: true
104 |   map_points: true # Use 3D map points
105 |   max_dist_no_neighbor: 0.01 # Maximum distance for non-neighbors
106 |   max_normal_no_neighbor: 20 # Maximum normal angle for non-neighbors
107 |   max_dist_neighbor: 0.02 # Maximum distance for neighbors
108 |   max_normal_neighbor: 30 # Maximum normal angle for neighbors
109 |   suppression_patch_size: 5
110 |   max_view_normal_angle: 180
111 |   min_match_with_ref: 5 # Minimum matches with reference
112 |   resize: 800
113 |   rematch_after_nerf: false
114 | 
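# Usage sketch (illustrative, not part of the config): this file is consumed
# via OmegaConf, so individual values can be overridden in code before the
# pipeline runs, e.g.
#   cfg = OmegaConf.load('data/configs/base.yaml')
#   cfg.bundle.max_BA_frames = 20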
115 | # RANSAC parameters for robust estimation (possibly unused)
116 | ransac:
117 |   max_iter: 2000 # Maximum RANSAC iterations
118 |   num_sample: 3 # Number of samples per iteration
119 |   inlier_dist: 0.01 # Inlier distance threshold
120 |   inlier_normal_angle: 20 # Inlier normal angle threshold
121 |   desired_succ_rate: 0.99 # Desired success rate
122 |   max_trans_neighbor: 0.02 # Maximum translation for neighbors
123 |   max_rot_deg_neighbor: 30 # Maximum rotation for neighbors
124 |   max_trans_no_neighbor: 0.01 # Maximum translation for non-neighbors
125 |   max_rot_no_neighbor: 10 # Maximum rotation for non-neighbors
126 |   epipolar_thres: 1 # Epipolar constraint threshold
127 |   min_match_after_ransac: 5 # Minimum matches after RANSAC
128 | 
129 | # Point-to-point ICP parameters
130 | p2p:
131 |   projective: false # Use projective ICP
132 |   max_dist: 0.02 # Maximum correspondence distance
133 |   max_normal_angle: 45 # Maximum normal angle difference
134 | 
135 | # SDF edge parameters, not in use
136 | sdf_edge:
137 |   max_dist: 0.02
138 | 
139 | # Shape reconstruction parameters, not in use
140 | shape:
141 |   res: 0.005 # Voxel resolution
142 |   xrange: # X range for reconstruction
143 |   - -0.2
144 |   - 0.2
145 |   yrange: # Y range for reconstruction
146 |   - -0.2
147 |   - 0.2
148 |   zrange: # Z range for reconstruction
149 |   - -0.2
150 |   - 0.2
151 |   max_weight: 100 # Maximum TSDF weight
152 | 
153 | # Foundation Stereo parameters for depth estimation
154 | foundation_stereo:
155 |   pth_path: /workspace/3d-object-reconstruction/data/weights/foundationstereo/model_best_bp2.pth
156 |   cfg_path: /workspace/3d-object-reconstruction/data/weights/foundationstereo/cfg.yaml
157 |   dinov2_path: /workspace/3d-object-reconstruction/data/weights/roma/dinov2_vitl14_pretrain.pth
158 |   vit_size: vitl
159 |   scale: 0.3 # Image scale factor
160 |   hiera: 0
161 |   z_far: 10
162 |   remove_invisible: true
163 |   intrinsic: # Default camera (Qoocam) intrinsic matrix; replace with your camera's calibration values
164 |   - 3079.6
165 |   - 0
166 |   - 2000.0
167 |   - 0
168 |   - 3075.1
169 |   - 1500.01
170 |   - 0
171 |   - 0
172 |   - 1
173 |   baseline: 0.0657696127 # Stereo baseline in meters (Qoocam); replace to match your camera
174 | 
175 | # SAM2 parameters for segmentation
176 | sam2:
177 |   checkpoint_path: /workspace/3d-object-reconstruction/data/weights/sam2/sam2.1_hiera_large.pt
178 |   model_config: /workspace/3d-object-reconstruction/data/weights/sam2/sam2.1_hiera_l.yaml
179 |   bbox: # Bounding box for segmentation
180 |   - 1144
181 |   - 627
182 |   - 2227
183 |   - 2232
184 |   device: cuda
185 | 
186 | # NeRF parameters for neural rendering
187 | nerf:
188 |   batch_size: 32 # Training image batch size; reduce if GPU memory is limited
189 |   downscale: 0.2 # Image downscale factor
190 |   min_resolution: 300 # Minimum resolution for training
191 |   n_step: 3000 # Number of training steps
192 |   save_dir: ??? # Directory for saving models
193 |   netdepth: 8
194 |   netwidth: 256
195 |   netdepth_fine: 8
196 |   netwidth_fine: 256
197 |   N_rand: 2048 # Training rays per batch
198 |   lrate: 0.01 # Learning rate
199 |   lrate_pose: 0.01 # Pose learning rate
200 |   decay_rate: 0.1 # Learning rate decay
201 |   chunk: 99999999999 # Chunk size for inference
202 |   netchunk: 6553600 # Network chunk size
203 |   no_batching: 0 # Disable batching
204 |   amp: false # Use mixed precision
205 |   N_samples: 64 # Number of coarse samples
206 |   N_samples_around_depth: 256 # Samples around depth
207 |   N_importance: 0 # Number of fine samples
208 |   perturb: 1 # Random sampling
209 |   use_viewdirs: 1 # Use view directions
210 |   i_embed: 1 # Position embedding type
211 |   i_embed_views: 2 # View direction embedding
212 |   multires: 8 # Position embedding levels
213 |   multires_views: 3 # View direction embedding levels
214 |   feature_grid_dim: 2 # Feature grid dimension
215 |   raw_noise_std: 0 # Noise standard deviation
216 |   # Logging
217 |   i_img: 99999 # Image save interval
218 |   i_weights: 99999 # Weight save interval
219 |   i_mesh: 99999 # Mesh save interval
220 |   i_pose: 999999 # Pose save interval
221 |   i_print: 99999 # Print interval
222 |   # Hash embedding config
223 |   finest_res: 256 # Finest hash resolution
224 |   base_res: 16 # Base hash resolution
225 |   num_levels: 16 # Number of hash levels
226 |   log2_hashmap_size: 22 # Hash table size
227 |   # Octree config
228 |   use_octree: 1 # Use octree acceleration
229 |   first_frame_weight: 1 # First frame weight
230 |   denoise_depth_use_octree_cloud: true # Use octree for depth denoising
231 |   octree_embed_base_voxel_size: 0.02 # Base octree voxel size
232 |   octree_smallest_voxel_size: 0.02 # Smallest octree voxel
233 |   octree_raytracing_voxel_size: 0.02 # Raytracing voxel size
234 |   octree_dilate_size: 0.02 # Octree dilation size
235 |   down_scale_ratio: 1 # Downscaling ratio
236 |   bounding_box: # Scene bounding box
237 |   - - -1
238 |     - -1
239 |     - -1
240 |   - - 1
241 |     - 1
242 |     - 1
243 |   use_mask: 1 # Use segmentation masks
244 |   dilate_mask_size: 0 # Mask dilation size
245 |   rays_valid_depth_only: true # Only use valid depth rays
246 |   near: 0.1 # Near plane
247 |   far: 1.0 # Far plane
248 |   # Loss weights
249 |   rgb_weight: 10 # RGB loss weight
250 |   depth_weight: 0 # Depth loss, not in use
251 |   sdf_lambda: 5 # SDF weight
252 | 
253 |   neg_trunc_ratio: 1 # Negative truncation ratio
254 | 
255 |   fs_weight: 100 # Free space weight
256 |   empty_weight: 2 # Empty space weight
257 |   fs_rgb_weight: 0
258 |   fs_sdf: 0.1 # Free space threshold
259 |   trunc_weight: 6000 # SDF loss weight; regularizes depth
260 |   tv_loss_weight: 0
261 |   frame_features: 2 # Per-frame feature dimension
262 |   optimize_poses: 0 # Optimize camera poses (1 to enable)
263 |   pose_reg_weight: 0 # Pose regularization
264 |   feature_reg_weight: 0.1 # Feature regularization
265 |   share_coarse_fine: 1 # Share coarse and fine networks; enabled when N_importance > 0
266 |   eikonal_weight: 0 # Eikonal regularization on normals
267 |   mode: sdf # Reconstruction mode
268 | 
269 |   # Mesh quality related parameters
270 |   trunc: 0.004 # TSDF truncation; could be set larger (0.01) if the mesh has holes
271 |   trunc_start: 0.004 # Initial truncation
272 |   trunc_decay_type: '' # Truncation decay type
273 |   mesh_resolution: 0.002 # Grid voxel size for mesh extraction; recommended equal to or smaller than trunc
274 |   max_trans: 0.02 # Maximum translation
275 |   max_rot: 20 # Maximum rotation
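  # NOTE (added guidance): mesh quality is governed jointly by `trunc` and
  # `mesh_resolution` above; e.g. trunc = 0.004 with mesh_resolution = 0.002
  # extracts the mesh on a 2 mm grid inside a roughly +/-4 mm TSDF
  # truncation band.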
276 |   mesh_smoothing: # Mesh smoothing parameters
277 |     enabled: true # Enable smoothing
278 |     iterations: 2 # Number of iterations
279 |     lambda_: 0.5 # Smoothing strength
280 |     use_taubin: true # Use Taubin smoothing
281 |   save_octree_clouds: true # Save octree point clouds
282 | 
283 | # Texture baking parameters
284 | texture_bake:
285 |   downscale: 1.0 # Texture image scale
286 |   min_resolution: 300 # Minimum resolution for texture baking
287 |   texture_res: 2048 # Texture resolution
288 | 
289 | # ROMA feature matching parameters
290 | roma:
291 |   coarse_res: 560 # Coarse resolution
292 |   upsample_res: # Upsampling resolution
293 |   - 864
294 |   - 864
295 |   device: cuda # Device for inference
296 |   weights: /workspace/3d-object-reconstruction/data/weights/roma/roma_outdoor.pth # ROMA weights path
297 |   dinov2_weights: /workspace/3d-object-reconstruction/data/weights/roma/dinov2_vitl14_pretrain.pth # DINOv2 weights path
298 | 
299 | # Base path configuration
300 | base_path:
301 |   base_folder: /workspace/3d-object-reconstruction/data/samples/retail_item/ # Base data folder
302 |   image_folder: /workspace/3d-object-reconstruction/data/samples/retail_item/left/ # Input image folder
303 |   save_dir: /workspace/3d-object-reconstruction/data/output/retail_item/ # Output directory
--------------------------------------------------------------------------------
/src/nvidia/objectreconstruction/networks/sam2infer.py:
--------------------------------------------------------------------------------
1 | """
2 | SAM2 Inference Module for 3D Object Reconstruction.
3 | 
4 | This module provides functionality for running SAM2 (Segment Anything Model 2)
5 | inference on image sequences for mask generation. It includes utilities for
6 | processing single images, directories of images, and video sequences.
7 | 
8 | The module handles PNG image formats and provides compatibility with the
9 | original SAM2 video processing pipeline.
10 | """
11 | 
12 | import os
13 | import glob
14 | import logging
15 | import numpy as np
16 | import torch
17 | import warnings
18 | from tqdm import tqdm
19 | from PIL import Image
20 | from pathlib import Path
21 | from typing import Dict, Any, Tuple, Optional, Union
22 | 
23 | from sam2.build_sam import build_sam2_video_predictor
24 | from sam2.utils import misc
25 | 
26 | # Add monkey patch for torch.load to ensure compatibility with older checkpoints
27 | original_torch_load = torch.load
28 | 
29 | 
30 | def patched_torch_load(*args, **kwargs):
31 |     """
32 |     Patch torch.load to handle compatibility issues with older checkpoints.
33 | 
34 |     This function overrides weights_only=True to False so that older
35 |     checkpoints containing pickled Python objects can still be loaded.
36 | 37 | Args: 38 | *args: Positional arguments passed to torch.load 39 | **kwargs: Keyword arguments passed to torch.load 40 | 41 | Returns: 42 | The result of torch.load with modified parameters 43 | """ 44 | if 'weights_only' in kwargs and kwargs['weights_only'] is True: 45 | kwargs['weights_only'] = False 46 | return original_torch_load(*args, **kwargs) 47 | 48 | 49 | torch.load = patched_torch_load 50 | 51 | # Configure logging 52 | logger = logging.getLogger(__name__) 53 | 54 | # Select the device for computation 55 | if torch.cuda.is_available(): 56 | device = torch.device("cuda") 57 | elif torch.backends.mps.is_available(): 58 | device = torch.device("mps") 59 | else: 60 | device = torch.device("cpu") 61 | 62 | print(f"using device: {device}") 63 | 64 | # Configure device-specific settings 65 | if device.type == "cuda": 66 | # Use bfloat16 for better performance on CUDA 67 | torch.autocast("cuda", dtype=torch.bfloat16).__enter__() 68 | # Enable tfloat32 for Ampere GPUs 69 | if torch.cuda.get_device_properties(0).major >= 8: 70 | torch.backends.cuda.matmul.allow_tf32 = True 71 | torch.backends.cudnn.allow_tf32 = True 72 | elif device.type == "mps": 73 | print( 74 | "\nSupport for MPS devices is preliminary. SAM 2 is trained with CUDA " 75 | "and might give numerically different outputs and sometimes degraded " 76 | "performance on MPS. See e.g. " 77 | "https://github.com/pytorch/pytorch/issues/84936 for a discussion." 78 | ) 79 | 80 | 81 | def png_compatible_load_video_frames( 82 | video_path: str, 83 | image_size: int = 1024, 84 | offload_video_to_cpu: bool = False, 85 | img_mean: Tuple[float, float, float] = (0.485, 0.456, 0.406), 86 | img_std: Tuple[float, float, float] = (0.229, 0.224, 0.225), 87 | async_loading_frames: bool = False, 88 | compute_device: torch.device = torch.device("cuda"), 89 | ) -> Tuple[torch.Tensor, int, int]: 90 | """ 91 | Load video frames from a directory of image files (JPEG and PNG). 92 | 93 | This is a drop-in replacement for misc.load_video_frames that supports 94 | PNG format images in addition to JPEG. 95 | 96 | Args: 97 | video_path: Path to the directory containing image files 98 | image_size: Target size for resizing images 99 | offload_video_to_cpu: Whether to keep images on CPU 100 | img_mean: RGB mean values for normalization 101 | img_std: RGB standard deviation values for normalization 102 | async_loading_frames: Whether to load frames asynchronously (unused) 103 | compute_device: Device to load images to 104 | 105 | Returns: 106 | Tuple containing: 107 | - images: Tensor of shape (N, 3, H, W) containing loaded images 108 | - video_height: Original height of the video frames 109 | - video_width: Original width of the video frames 110 | 111 | Raises: 112 | FileNotFoundError: If video_path doesn't exist 113 | RuntimeError: If no images found or unsupported image format 114 | NotImplementedError: If video_path is not a directory 115 | """ 116 | if not os.path.exists(video_path): 117 | raise FileNotFoundError(f"Video file or folder not found: {video_path}") 118 | 119 | if not os.path.isdir(video_path): 120 | warnings.warn( 121 | "This implementation only supports directories of image files, " 122 | "not video files." 123 | ) 124 | raise NotImplementedError( 125 | "Only image frames are supported. For video files, " 126 | "extract frames to a directory first." 
127 | ) 128 | 129 | img_folder = video_path 130 | # Get all supported image files 131 | frame_names = [] 132 | for ext in [".jpg", ".jpeg", ".JPG", ".JPEG", ".png", ".PNG"]: 133 | frame_names.extend([ 134 | p for p in os.listdir(img_folder) if p.endswith(ext) 135 | ]) 136 | 137 | if not frame_names: 138 | raise RuntimeError(f"No images found in {img_folder}") 139 | 140 | # Sort the filenames 141 | try: 142 | # Try to sort based on filename pattern (assuming frame_xxxx format) 143 | frame_names.sort(key=lambda p: int(os.path.splitext(p)[0][4:])) 144 | except (ValueError, IndexError): 145 | # Fallback to regular sorting 146 | frame_names.sort() 147 | 148 | # Load the images 149 | img_paths = [os.path.join(img_folder, frame_name) for frame_name in frame_names] 150 | img_mean_tensor = torch.tensor(img_mean, dtype=torch.float32)[:, None, None] 151 | img_std_tensor = torch.tensor(img_std, dtype=torch.float32)[:, None, None] 152 | 153 | # Load the first image to get dimensions 154 | first_img = Image.open(img_paths[0]) 155 | video_width, video_height = first_img.size 156 | 157 | # Load all images 158 | num_frames = len(img_paths) 159 | images = torch.zeros( 160 | num_frames, 3, image_size, image_size, dtype=torch.float32 161 | ) 162 | 163 | for n, img_path in enumerate(tqdm(img_paths, desc="Loading frames")): 164 | img_pil = Image.open(img_path).convert("RGB").resize( 165 | (image_size, image_size) 166 | ) 167 | img_np = np.array(img_pil) 168 | if img_np.dtype == np.uint8: 169 | img_np = img_np / 255.0 170 | else: 171 | raise RuntimeError(f"Unknown image dtype: {img_np.dtype} on {img_path}") 172 | images[n] = torch.from_numpy(img_np).permute(2, 0, 1) 173 | 174 | if not offload_video_to_cpu: 175 | images = images.to(compute_device) 176 | img_mean_tensor = img_mean_tensor.to(compute_device) 177 | img_std_tensor = img_std_tensor.to(compute_device) 178 | 179 | # Normalize by mean and std 180 | images -= img_mean_tensor 181 | images /= img_std_tensor 182 | 183 | return images, video_height, video_width 184 | 185 | 186 | def preprocess_single_image( 187 | image_path: str, 188 | image_size: int = 1024, 189 | img_mean: Tuple[float, float, float] = (0.485, 0.456, 0.406), 190 | img_std: Tuple[float, float, float] = (0.229, 0.224, 0.225), 191 | compute_device: torch.device = torch.device("cuda"), 192 | ) -> Tuple[torch.Tensor, int, int]: 193 | """ 194 | Preprocess a single image for SAM2 inference. 
195 | 196 | Args: 197 | image_path: Path to the image file 198 | image_size: Size to resize the image to 199 | img_mean: RGB mean values for normalization 200 | img_std: RGB standard deviation values for normalization 201 | compute_device: Device to load the image to 202 | 203 | Returns: 204 | Tuple containing: 205 | - preprocessed_image: Preprocessed image tensor of shape (1, 3, H, W) 206 | - height: Original image height 207 | - width: Original image width 208 | 209 | Raises: 210 | FileNotFoundError: If image file doesn't exist 211 | RuntimeError: If image has unsupported dtype 212 | """ 213 | if not os.path.exists(image_path): 214 | raise FileNotFoundError(f"Image file not found: {image_path}") 215 | 216 | # Load and preprocess the image 217 | img_pil = Image.open(image_path).convert("RGB") 218 | width, height = img_pil.size 219 | 220 | # Resize and convert to numpy array 221 | img_pil = img_pil.resize((image_size, image_size)) 222 | img_np = np.array(img_pil) 223 | 224 | if img_np.dtype == np.uint8: 225 | img_np = img_np / 255.0 226 | else: 227 | raise RuntimeError(f"Unknown image dtype: {img_np.dtype} on {image_path}") 228 | 229 | # Convert to tensor and normalize 230 | img_tensor = torch.from_numpy(img_np).permute(2, 0, 1).float() 231 | img_tensor = img_tensor.unsqueeze(0) # Add batch dimension 232 | 233 | # Create normalization tensors 234 | img_mean_tensor = torch.tensor(img_mean, dtype=torch.float32)[:, None, None] 235 | img_std_tensor = torch.tensor(img_std, dtype=torch.float32)[:, None, None] 236 | 237 | # Move to device 238 | img_tensor = img_tensor.to(compute_device) 239 | img_mean_tensor = img_mean_tensor.to(compute_device) 240 | img_std_tensor = img_std_tensor.to(compute_device) 241 | 242 | # Normalize 243 | img_tensor -= img_mean_tensor 244 | img_tensor /= img_std_tensor 245 | 246 | return img_tensor, height, width 247 | 248 | 249 | def segment_image_with_bbox( 250 | image_path: str, 251 | bbox: Union[list, np.ndarray], 252 | checkpoint_path: str = "/sam2/checkpoints/sam2.1_hiera_large.pt", 253 | model_config: str = "configs/sam2.1/sam2.1_hiera_l.yaml", 254 | output_path: Optional[str] = None, 255 | image_size: int = 1024, 256 | device: torch.device = torch.device("cuda") 257 | ) -> np.ndarray: 258 | """ 259 | Segment an object in a single image using a 2D bounding box. 
260 | 
261 |     Args:
262 |         image_path: Path to the input image
263 |         bbox: 2D bounding box in format [x1, y1, x2, y2]
264 |         checkpoint_path: Path to the SAM2 checkpoint
265 |         model_config: Path to the model configuration file
266 |         output_path: Optional path to save the output mask
267 |         image_size: Size to resize the image to during processing
268 |         device: Device to run inference on
269 | 
270 |     Returns:
271 |         Binary mask of the segmented object as numpy array
272 | 
273 |     Raises:
274 |         Various exceptions from SAM2 model initialization and inference
275 |     """
276 |     # Build the SAM2 predictor
277 |     predictor = build_sam2_video_predictor(
278 |         model_config, checkpoint_path, device=device
279 |     )
280 | 
281 |     # Preprocess the image
282 |     image_tensor, original_height, original_width = preprocess_single_image(
283 |         image_path,
284 |         image_size=image_size,
285 |         compute_device=device
286 |     )
287 | 
288 |     # Convert bbox to numpy array if it's not already
289 |     bbox = np.array(bbox, dtype=np.float32)
290 | 
291 |     # Calculate the center point of the bounding box for positive click
292 |     center_point = np.array([
293 |         [bbox[0] + (bbox[2] - bbox[0]) / 2, bbox[1] + (bbox[3] - bbox[1]) / 2]
294 |     ], dtype=np.float32)
295 | 
296 |     # Set positive label
297 |     labels = np.array([1], np.int32)
298 | 
299 |     with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
300 |         # SAM2's video predictor expects a frame directory, so initialize the
301 |         # inference state on the image's parent folder
302 |         tmp_dir = os.path.dirname(image_path)
303 |         inference_state = predictor.init_state(video_path=tmp_dir)
304 |         predictor.reset_state(inference_state)
305 | 
306 |         # Set frame index and object ID
307 |         frame_idx = 0
308 |         obj_id = 1
309 | 
310 |         # Add the bounding box
311 |         _, out_obj_ids, out_mask_logits = predictor.add_new_points_or_box(
312 |             inference_state=inference_state,
313 |             frame_idx=frame_idx,
314 |             obj_id=obj_id,
315 |             box=bbox,
316 |             labels=labels,
317 |             points=center_point,
318 |         )
319 | 
320 |         # Get the mask
321 |         mask = (out_mask_logits[0] > 0.0).cpu().numpy()
322 | 
323 |     # Save the mask if output path is provided
324 |     if output_path:
325 |         mask_image = mask.astype(np.uint8) * 255
326 |         mask_image = Image.fromarray(mask_image[0])
327 |         mask_image.save(output_path)
328 |         logger.debug(f"Saved mask to {output_path}")
329 | 
330 |     return mask
331 | 
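# Usage sketch for segment_image_with_bbox (illustrative values; the bbox and
# input path mirror the retail_item sample, and `output_path` is hypothetical):
#   mask = segment_image_with_bbox(
#       image_path="data/samples/retail_item/left/left000000.png",
#       bbox=[1144, 627, 2227, 2232],
#       output_path="mask000000.png",
#   )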
332 | def process_directory_masks(
333 |     rgb_path: str,
334 |     mask_path: str,
335 |     bbox: Optional[Union[list, np.ndarray]] = None,
336 |     checkpoint_path: str = "/sam2/checkpoints/sam2.1_hiera_large.pt",
337 |     model_config: str = "configs/sam2.1/sam2.1_hiera_l.yaml",
338 |     device: torch.device = torch.device("cuda")
339 | ) -> None:
340 |     """
341 |     Process all images in a directory and generate masks using a bounding box.
342 | 
343 |     The bounding box is applied to the first frame and then propagated through
344 |     all subsequent frames using SAM2's video tracking capabilities.
345 | 
346 |     Args:
347 |         rgb_path: Path to the directory containing RGB images
348 |         mask_path: Path to save the generated masks
349 |         bbox: Bounding box in format [x1, y1, x2, y2]. If None, defaults to
350 |             the central 80% of the first frame
351 |         checkpoint_path: Path to the SAM2 checkpoint
352 |         model_config: Path to the model configuration file
353 |         device: Device to run inference on
354 | 
355 |     Raises:
356 |         Various exceptions from image loading and SAM2 inference
357 |     """
358 |     # Create mask directory if it doesn't exist
359 |     os.makedirs(mask_path, exist_ok=True)
360 | 
361 |     # Skip processing if masks were already generated
362 |     if any(os.listdir(mask_path)):
363 |         logger.info("Masks already extracted")
364 |         return
365 | 
366 |     # Get all image files in the directory
367 |     image_files = sorted(glob.glob(os.path.join(rgb_path, "*.png")))
368 |     if not image_files:
369 |         image_files = sorted(glob.glob(os.path.join(rgb_path, "*.jpg")))
370 | 
371 |     if not image_files:
372 |         logger.error("No image files found in RGB frames directory")
373 |         return
374 | 
375 |     # If bbox is None, create a default one
376 |     if bbox is None:
377 |         import cv2
378 | 
379 |         # Read the first image to get dimensions
380 |         first_image = cv2.imread(image_files[0])
381 |         height, width = first_image.shape[:2]
382 | 
383 |         # Create a default bounding box (central 80% of the image)
384 |         margin_x = int(width * 0.1)
385 |         margin_y = int(height * 0.1)
386 |         bbox = [margin_x, margin_y, width - margin_x, height - margin_y]
387 | 
388 |         logger.info(f"Using default bounding box: {bbox}")
389 | 
390 |     # Convert bbox to numpy array
391 |     bbox = np.array(bbox, dtype=np.float32)
392 | 
393 |     # Build the SAM2 predictor
394 |     predictor = build_sam2_video_predictor(
395 |         model_config, checkpoint_path, device=device
396 |     )
397 | 
398 |     logger.info(f"Processing {len(image_files)} frames for mask extraction...")
399 | 
400 |     # Process all frames with the same bounding box
401 |     with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
402 |         # Initialize the inference state
403 |         inference_state = predictor.init_state(video_path=rgb_path)
404 |         predictor.reset_state(inference_state)
405 | 
406 |         # Set frame index and object ID
407 |         ann_frame_idx = 0  # the frame index we interact with
408 |         ann_obj_id = 1  # give a unique id to each object
409 | 
410 |         # Calculate the center point of the bounding box for positive click
411 |         points = np.array([
412 |             [bbox[0] + (bbox[2] - bbox[0]) / 2, bbox[1] + (bbox[3] - bbox[1]) / 2]
413 |         ], dtype=np.float32)
414 | 
415 |         # Set positive label
416 |         labels = np.array([1], np.int32)
417 | 
418 |         # Add the bounding box
419 |         _, out_obj_ids, out_mask_logits = predictor.add_new_points_or_box(
420 |             inference_state=inference_state,
421 |             frame_idx=ann_frame_idx,
422 |             obj_id=ann_obj_id,
423 |             box=bbox,
424 |             labels=labels,
425 |             points=points,
426 |         )
427 | 
428 |         # Run propagation throughout the video and collect the results
429 |         video_segments = {}
430 |         for out_frame_idx, out_obj_ids, out_mask_logits in (
431 |             predictor.propagate_in_video(inference_state)
432 |         ):
433 |             video_segments[out_frame_idx] = {
434 |                 out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
435 |                 for i, out_obj_id in enumerate(out_obj_ids)
436 |             }
437 | 
438 |     # Get frame names
439 |     frame_names = [os.path.basename(image_file) for image_file in image_files]
440 | 
441 |     # Render the segmentation results
442 |     for out_frame_idx in range(len(frame_names)):
443 |         for out_obj_id, out_mask in video_segments[out_frame_idx].items():
444 |             mask_image = out_mask.astype(np.uint8) * 255
445 |             mask_image = Image.fromarray(mask_image[0])
446 |             mask_output_path = os.path.join(
447 |                 mask_path, frame_names[out_frame_idx]
448 |             )
449 |             mask_image.save(mask_output_path)
450 |             logger.debug(f"Saved mask for frame {out_frame_idx}")
451 | 
452 |     logger.info(f"Mask extraction completed. Masks saved to {mask_path}")
453 | 
454 | 
455 | # Replace the original frame loader with the PNG-compatible version
456 | misc.load_video_frames = png_compatible_load_video_frames
457 | 
458 | 
459 | class Sam2Infer:
460 |     """
461 |     SAM2 inference wrapper class.
462 | 
463 |     This class provides a simple interface for running SAM2 mask extraction
464 |     on directories of images using configuration parameters.
465 |     """
466 | 
467 |     def __init__(self, config: Dict[str, Any]) -> None:
468 |         """
469 |         Initialize the SAM2 inference wrapper.
470 | 
471 |         Args:
472 |             config: Configuration dictionary containing:
473 |                 - checkpoint_path: Path to SAM2 model checkpoint
474 |                 - model_config: Path to model configuration file
475 |                 - bbox: Bounding box for segmentation [x1, y1, x2, y2]
476 |                 - device: Device to run inference on
477 |         """
478 |         logger.debug(f"SAM2 config: {config}")
479 |         self.checkpoint_path = config['checkpoint_path']
480 |         self.model_config = config['model_config']
481 |         self.bbox = config['bbox']
482 |         self.device = config['device']
483 | 
484 |     def run(self, rgb_path: str, mask_path: str) -> None:
485 |         """
486 |         Run mask extraction on a directory of images.
487 | 
488 |         Args:
489 |             rgb_path: Path to directory containing RGB images
490 |             mask_path: Path to directory where masks will be saved
491 |         """
492 |         process_directory_masks(
493 |             rgb_path=rgb_path,
494 |             mask_path=mask_path,
495 |             bbox=self.bbox,
496 |             checkpoint_path=self.checkpoint_path,
497 |             model_config=self.model_config,
498 |             device=torch.device(self.device)
499 |         )
500 | 
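# Usage sketch for Sam2Infer (illustrative; paths follow the sample layout in
# data/configs/base.yaml and assume the SAM2 weights have been downloaded):
#   cfg = {
#       'checkpoint_path': '/workspace/3d-object-reconstruction/data/weights/sam2/sam2.1_hiera_large.pt',
#       'model_config': '/workspace/3d-object-reconstruction/data/weights/sam2/sam2.1_hiera_l.yaml',
#       'bbox': [1144, 627, 2227, 2232],
#       'device': 'cuda',
#   }
#   Sam2Infer(cfg).run(
#       rgb_path='/workspace/3d-object-reconstruction/data/samples/retail_item/left/',
#       mask_path='/workspace/3d-object-reconstruction/data/samples/retail_item/masks/',
#   )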
501 | 
502 | def run_mask_extraction(
503 |     config: Dict[str, Any],
504 |     exp_path: Path,
505 |     rgb_path: Path,
506 |     mask_path: Path
507 | ) -> bool:
508 |     """
509 |     Set up and run mask extraction with error handling.
510 | 
511 |     Args:
512 |         config: Configuration dictionary for SAM2 inference
513 |         exp_path: Path to experiment directory (currently unused)
514 |         rgb_path: Path to RGB frames directory
515 |         mask_path: Path where masks will be saved
516 | 
517 |     Returns:
518 |         True if mask extraction was successful, False otherwise
519 |     """
520 |     # Create the mask directory (and any missing parents)
521 |     mask_path.mkdir(parents=True, exist_ok=True)
522 |     logger.info(f"Mask extraction directory: {mask_path}")
523 | 
524 |     # Check if mask images already exist
525 |     mask_images = list(mask_path.glob('*.png'))
526 |     rgb_images = list(rgb_path.glob('*.png'))
527 |     if mask_images and len(mask_images) == len(rgb_images):
528 |         logger.info("Mask images already exist, skipping mask extraction")
529 |         return True
530 | 
531 |     # Run mask extraction
532 |     logger.info("Running mask extraction...")
533 |     sam2_infer = Sam2Infer(config)
534 | 
535 |     try:
536 |         sam2_infer.run(str(rgb_path), str(mask_path))
537 |         logger.info("Mask extraction completed successfully")
538 |         return True
539 |     except Exception as e:
540 |         logger.error(f"Error running mask extraction: {e}")
541 |         return False
542 | 
543 | 
544 | if __name__ == "__main__":
545 |     """Example usage of the SAM2 inference module."""
546 |     sam2_checkpoint = (
547 |         "/workspace/3d-object-reconstruction/data/weights/sam2/"
548 |         "sam2.1_hiera_large.pt"
549 |     )
550 |     model_cfg = (
551 |         "/workspace/3d-object-reconstruction/data/weights/sam2/"
552 |         "sam2.1_hiera_l.yaml"
553 |     )
554 | 
555 |     # Example for processing a video directory with bounding box
556 |     video_dir = "/workspace/3d-object-reconstruction/data/samples/retail_item/left/"
557 |     output_dir = "/workspace/3d-object-reconstruction/data/samples/retail_item/masks/"
558 | 
559 |     # Define a bounding box [x1, y1, x2, y2]
560 |     bbox = [1144, 627, 2227, 2232]
561 | 
562 |     # Process the directory
563 |     process_directory_masks(
564 |         rgb_path=video_dir,
565 |         mask_path=output_dir,
566 |         bbox=bbox,
567 |         checkpoint_path=sam2_checkpoint,
568 |         model_config=model_cfg
569 |     )
570 | 
--------------------------------------------------------------------------------