├── .dockerignore ├── data ├── docs │ ├── adv.gif │ ├── input_dino.gif │ ├── pipeline_overview.png │ └── bundlesdf_pipeline.png ├── samples │ └── retail_item │ │ ├── left │ │ ├── left000000.png │ │ ├── left000001.png │ │ ├── left000002.png │ │ ├── left000003.png │ │ ├── left000004.png │ │ ├── left000005.png │ │ ├── left000006.png │ │ ├── left000007.png │ │ ├── left000008.png │ │ ├── left000009.png │ │ ├── left000010.png │ │ ├── left000011.png │ │ ├── left000012.png │ │ ├── left000013.png │ │ ├── left000014.png │ │ ├── left000015.png │ │ ├── left000016.png │ │ ├── left000017.png │ │ ├── left000018.png │ │ ├── left000019.png │ │ ├── left000020.png │ │ ├── left000021.png │ │ ├── left000022.png │ │ ├── left000023.png │ │ ├── left000024.png │ │ ├── left000025.png │ │ ├── left000026.png │ │ ├── left000027.png │ │ ├── left000028.png │ │ ├── left000029.png │ │ ├── left000030.png │ │ ├── left000031.png │ │ ├── left000032.png │ │ ├── left000033.png │ │ ├── left000034.png │ │ ├── left000035.png │ │ └── left000036.png │ │ └── right │ │ ├── right000000.png │ │ ├── right000001.png │ │ ├── right000002.png │ │ ├── right000003.png │ │ ├── right000004.png │ │ ├── right000005.png │ │ ├── right000006.png │ │ ├── right000007.png │ │ ├── right000008.png │ │ ├── right000009.png │ │ ├── right000010.png │ │ ├── right000011.png │ │ ├── right000012.png │ │ ├── right000013.png │ │ ├── right000014.png │ │ ├── right000015.png │ │ ├── right000016.png │ │ ├── right000017.png │ │ ├── right000018.png │ │ ├── right000019.png │ │ ├── right000020.png │ │ ├── right000021.png │ │ ├── right000022.png │ │ ├── right000023.png │ │ ├── right000024.png │ │ ├── right000025.png │ │ ├── right000026.png │ │ ├── right000027.png │ │ ├── right000028.png │ │ ├── right000029.png │ │ ├── right000030.png │ │ ├── right000031.png │ │ ├── right000032.png │ │ ├── right000033.png │ │ ├── right000034.png │ │ ├── right000035.png │ │ └── right000036.png ├── LICENSE └── configs │ └── base.yaml ├── src ├── nvidia │ ├── __init__.py │ └── objectreconstruction │ │ ├── cli │ │ ├── __init__.py │ │ └── main.py │ │ ├── dataloader │ │ ├── __init__.py │ │ └── reconstruction_dataloader.py │ │ ├── utils │ │ ├── __init__.py │ │ ├── structures.py │ │ └── preprocessing.py │ │ ├── configs │ │ ├── __init__.py │ │ └── schema.py │ │ ├── networks │ │ ├── __init__.py │ │ ├── foundationstereo.py │ │ └── sam2infer.py │ │ └── __init__.py ├── requirements.txt ├── setup.py ├── MANIFEST.in ├── README.md └── pyproject.toml ├── .gitignore ├── .github ├── PULL_REQUEST_TEMPLATE.md ├── ISSUE_TEMPLATE │ ├── config.yml │ ├── documentation_request_new.yml │ ├── documentation_request_correction.yml │ ├── bug_report_form.yml │ └── feature_request_form.yml └── CODEOWNERS ├── CITATION.md ├── SECURITY.md ├── deploy └── compose │ └── docker-compose.yml ├── CHANGELOG.md ├── print_env.sh ├── CONTRIBUTING.md ├── CODE_OF_CONDUCT.md ├── LICENSE ├── .gitattributes └── docker └── Dockerfile /.dockerignore: -------------------------------------------------------------------------------- 1 | data/output/ 2 | notebooks/.ipynb_checkpoints 3 | **/__pycache__/ -------------------------------------------------------------------------------- /data/docs/adv.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/3DObjectReconstruction/main/data/docs/adv.gif -------------------------------------------------------------------------------- /data/docs/input_dino.gif: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/3DObjectReconstruction/main/data/docs/input_dino.gif -------------------------------------------------------------------------------- /data/docs/pipeline_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/3DObjectReconstruction/main/data/docs/pipeline_overview.png -------------------------------------------------------------------------------- /data/docs/bundlesdf_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/3DObjectReconstruction/main/data/docs/bundlesdf_pipeline.png -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000000.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:68985599fe71f6dc055407e0f76f50e8bcb8408cc476a8f27be761efb5082fb9 3 | size 12328835 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000001.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2746f8871ff093274775fb4b79a65c6970509aa0b988e370d0bef3dced907581 3 | size 12645958 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000002.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a43e5f30816b9587665ad3d5804499934a19ffddfae1702c3595dc557277652b 3 | size 12901737 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000003.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d85d7ee84e8590359da375799f4ad4229a3a587165642556146e8ba01d556f17 3 | size 13136485 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000004.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:26877db244dc839d62740e409a0d2244a07835f6f9efb2a8c79f8d378b0619e7 3 | size 13373248 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000005.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:877451413167d59ecf065f02cf1c4d69afcf544468598ace4471e3d2522838bb 3 | size 13383554 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000006.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:aea8389834d151ec74cda682754d09043a0780f96f19a4235d1e55cac546e994 3 | size 13373851 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000007.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid 
sha256:72cc382a208a6b5ce834d98a38dc1291aa9588c982779d8c3d1516ed107c7520 3 | size 13500831 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000008.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:7ae3e9d6fa0ab9b606aa875ad4c9633d0eb8edb5de7fab820ca79fca44f11de8 3 | size 13588682 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000009.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ad87e70fd50b80c48de1fffa805b389388e7f5bb077616ef99e34d4c6a3844b5 3 | size 13536425 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000010.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d2de80254f165ba9af319dfc38f21a4e098a50f32dc04367564086ae85061cfe 3 | size 13417513 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000011.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:1e2e711be062493a5ebaa8651757aac47a6c0c7af168f0116082004991994afe 3 | size 13267914 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000012.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:33f0771c52d544bef69aab4fe8e93a9592daa77e4fa29f5771ee425d7f8d3950 3 | size 13330359 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000013.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ad8ad9a26c234da03e2da8ec6224578d38e339a239f9da83b8997b5122836132 3 | size 13444945 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000014.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c01f5c411eb24750a924f402f4aef014c350b44424dc0d4123820942e36cd8ad 3 | size 13884438 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000015.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f1a51b9a266bb1229986cf90da9a6a33d679519146c94cfb43bbe650b1c4ef4b 3 | size 14303246 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000016.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c656c108c1c7a5a4d7723c10330b521c827fff56a2f207d9c9298114b401c689 3 | size 14432480 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000017.png: -------------------------------------------------------------------------------- 1 | version 
https://git-lfs.github.com/spec/v1 2 | oid sha256:fd40e0d79962dc9aea79a9b83b99081ff55fc7c4b1587b2958e138d7cf085cff 3 | size 14454634 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000018.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:789e3866539b30b044cb6a20c8d62b07e2334888e0189df546a1338e95acaa41 3 | size 14286318 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000019.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:705593bccb0eedb2adfbe9f4e96b9eeba1eea8783a71853c0dd7fb728a6f7f0e 3 | size 14215593 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000020.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:767e3ae666f6112ba9cba2b377d46f81d3c451b6436c721835d993ca943e6233 3 | size 14076802 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000021.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:616c3f25b6b61a7c0982c81bbf68a2e2ca3867a227abfb5df5d16cb5532164c9 3 | size 13681923 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000022.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:48422c0d80fa448d72f15cfd037750a076f5e55c4fe10189858c06602ee5a0e6 3 | size 13125454 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000023.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:66ac600ac985e642604eec8aad827caebc13ac92862323db8320d1832bf8ee5c 3 | size 12530119 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000024.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:fb405139a307425f147cd1cb0ee99a0c744b9e9f1e1d53d1fccffac058bb289b 3 | size 12430620 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000025.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:11b526b859367b6959284bc7053e3f2be16a0fe6b95b9a4292d8b48fbc08152f 3 | size 12214426 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000026.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:baa326185701e7220667e70e60324a8fdfc4f06ca05474bdab459dba30742610 3 | size 12310511 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000027.png: 
-------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b93891800b183c4afd3c91f8eee2c88342cdfd04b26b05236a335cfc3fcd07af 3 | size 12370352 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000028.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b39383c3e0b983c9e916b2bd4cfb61d1982fd5fbbf8e14b0beaf866efabfa3ca 3 | size 12429603 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000029.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:105062f742fe427fd22e5ef3b8d02f133e4dacdb18706f5e085ec67c32a8b292 3 | size 12301322 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000030.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2936bf6a7dc69abfe383ab0ec81cdb76f88f36554b27ca43b85c46d215c8ad54 3 | size 12403034 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000031.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b2b3be7ab9fa5e692cb169d93e4e1be41b5e339fcb769061711a81dc08f55512 3 | size 12329273 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000032.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:fec230cc65ddd61e4352ae6a1d915b239d2e62a7b7844e46e3359fa881f42185 3 | size 12209108 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000033.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e93f0e786c257f98f092949c11517f2be585168b15877bf5b36bc1b4a2095293 3 | size 12484521 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000034.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:07d89d7d879b350e827643a0ae253f103c3841597954795d871a96c9031dd49f 3 | size 12734645 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000035.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:dd29e837c3664469da918eb78cf69b07c9297b29583a5247c1bc7f4e32155813 3 | size 12645619 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/left/left000036.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:926ee1c89388dc7663333fa5f0ecea6beb2742de7a6ee6e50c1af13d2be261a3 3 | size 12604569 4 | -------------------------------------------------------------------------------- 
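Note: the sample frames above are stored as Git LFS pointer files (the `version` / `oid sha256:...` / `size` stanzas), not as raw PNG data. A minimal sketch of fetching the real images, assuming a standard Git LFS setup with the `git lfs` CLI installed:

```bash
# One-time setup: register the Git LFS filters for your user
git lfs install

# Cloning normally replaces LFS pointer files with the real PNGs at checkout
git clone https://github.com/NVIDIA/3DObjectReconstruction.git
cd 3DObjectReconstruction

# If LFS smudging was skipped (e.g., GIT_LFS_SKIP_SMUDGE=1), fetch just the sample frames
git lfs pull --include="data/samples/retail_item/**"
```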
/data/samples/retail_item/right/right000000.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8e4034609bf574176f920ac4ad20a3f14b87a340e1d7e2c000cabc4a0e7b440e 3 | size 12307906 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000001.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:88115a8a54b5ed07d3fdd96fe9a5a2c875ce99598618279375ac8cd8bf5c4c26 3 | size 12293559 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000002.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:131d4d4168cbd1077f8073ef09342999020414bbd11d7aadd5143c89bd6e7ced 3 | size 12564149 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000003.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:1d45a197af4133dd8269969c27330d6b59f9acd8ccd5904de3c1196337aea043 3 | size 12808280 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000004.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5b5ef4edae33b0c08bd002aa204c12db6814aaa9d5bb20e88f42a30637fad615 3 | size 13056049 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000005.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:23c61ba9d90eb651d4fb95399cf125504fbc8a21bfb9942d002df5b673d4aa62 3 | size 13244619 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000006.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8b21d5a1ac30c374066502bc8a1bcc2639d611401ea3cfb611e7cd7994ec982e 3 | size 13229861 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000007.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5c76813368be67e309a1d1e1f4493ce7e01c060ba234369328c7e8a96dd7c5fd 3 | size 13204653 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000008.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b1bd3edcfc08202dc61265787eeb34b9174f10fb340043dcbc419149e8c8f577 3 | size 13342051 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000009.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:08e8cf210cf291e76191fde7047ef62cc964fd66a0b2c15334e0e076fb503496 3 | size 13409813 4 | 
-------------------------------------------------------------------------------- /data/samples/retail_item/right/right000010.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:eea81615e6646aca2a36f62113437e1fb7ac0d3fc871e66fa6f3253c00edb749 3 | size 13390113 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000011.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2748c4290add290351ce5c2b9fa5c268b4c1239962bcc49d2a5c783982307bfe 3 | size 13248047 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000012.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:efc8c30f58a8d7db6e5c872e50c742141e89c7b830152a5b3fdded25ce03898d 3 | size 13016947 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000013.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d13a128dbff945de945b13924004da5724d4eca458a871838c657e65c8fe8196 3 | size 13065264 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000014.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5956cbe858c3c1c1f450b20bc60afb5778628aa99cc2929fb98386cdaa86baf5 3 | size 13213599 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000015.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e2f8a63ace3b74ed519cd53bcfd62b85b6826dfa6113e56da26c16a70a7f6d38 3 | size 13774114 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000016.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:498abff2f00065db5ca824f24196743ee0a7cfe87f6ede7270b51515d5ce9eeb 3 | size 14068457 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000017.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ea77d130df3b02cd4ef4e26b5ccb94f54c8f7921366e9c42494fe48ca64612a5 3 | size 14304233 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000018.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:eec46fb29932d53824c28aa82fe4a0aeaf721991b4350db963aa3b61de4596af 3 | size 14118226 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000019.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid 
sha256:3d0effe946906a30827ed6567b4e5d32a37a9e019209c33869f1f9d0033e957c 3 | size 14019921 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000020.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:26c5617ee88fcdbacd766ef3037441acf85ea8203757a952f0611104ecf985d4 3 | size 13940315 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000021.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6158ba1c18170380d50bbab089982ab75e8cd33004b0cca61700d63dd3f2acd0 3 | size 13685382 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000022.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:62f101d0869faa5aa570ea49fed99ea847d611d2d0ed2da1a1b3584f6b411978 3 | size 13263611 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000023.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:710cb7e20dcd157b32f77e23a74c0cfee4cfe4d148d4f235fd0b0e420e9f6855 3 | size 12688863 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000024.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b18d5b4d56355da128da4e9cbd96adecf5e06d721c29ffa50bb60ea04d83c4fb 3 | size 12231639 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000025.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:549eb2a5197e45557ae18d2b1e9a7212681815063bc304ec03c44171391f845f 3 | size 12055552 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000026.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:36acb14e260f58539d6424f35b51ddccda09f84421ce6ac66188c5a39c57aa7f 3 | size 12070464 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000027.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:4aaaa6eac9bca39177ce9a7922073e8d5c38d5aef252ad35d17f40cda6e07976 3 | size 12192296 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000028.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8a9298dbb57130763f03f95820031a66717b21ca64d55e8cb3385307eea519b5 3 | size 12229413 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000029.png: -------------------------------------------------------------------------------- 1 | 
version https://git-lfs.github.com/spec/v1 2 | oid sha256:38cb95acd5bf7c074ad6942cd74c02ef24016a541c8da63aaa268bf7f5fdb8ab 3 | size 12204798 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000030.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:21ddcf8e517c69ef1d3d9bf0d4de1171676793f8628e2cabbcaa1a83bc75ef30 3 | size 12209473 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000031.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d25dbf139f894ba6b37b6697fba8396c79fb6f1b3cf7a18c309ee1a81371e615 3 | size 12118436 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000032.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:7dc9825de6e885c105414dc4f960b1330cd6c96ed0ea280471c501cabf5dca19 3 | size 12043599 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000033.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:58e19246104c0fe4d99648159c37be0371a9b54da13fdcb14dd2701ed9e48dbd 3 | size 12375658 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000034.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d35b8de816e854a62ff25bb33e9a8e6838a73926b56d40477f520e6afbeb24a2 3 | size 12525838 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000035.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:48721abf0cf2cefb2a093b4cf830ad634ba3e210206a24994fa9366ccee360cf 3 | size 12429751 4 | -------------------------------------------------------------------------------- /data/samples/retail_item/right/right000036.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a355d7ecc831546852bfb2400ccb9a1a8ef9c233be181e8b17e4649d4bcad6c1 3 | size 12356552 4 | -------------------------------------------------------------------------------- /src/nvidia/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | NVIDIA namespace package. 3 | 4 | This is a namespace package that allows multiple NVIDIA packages to coexist. 
5 | """ 6 | 7 | __path__ = __import__('pkgutil').extend_path(__path__, __name__) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/weights 2 | data/output 3 | datasets/ 4 | weights/ 5 | __pycache__/ 6 | .vscode/ 7 | **/*.pyc 8 | notebooks/.ipynb_checkpoints/ 9 | data/samples/test/ 10 | src/.ipynb_checkpoints/README-checkpoint.md 11 | src/test.py 12 | -------------------------------------------------------------------------------- /src/nvidia/objectreconstruction/cli/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Command-Line Interface for 3D Object Reconstruction. 3 | 4 | This module provides command-line tools for running the reconstruction pipeline 5 | and its individual components. 6 | """ 7 | 8 | from .main import main 9 | 10 | __all__ = [ 11 | 'main', 12 | ] -------------------------------------------------------------------------------- /src/requirements.txt: -------------------------------------------------------------------------------- 1 | ruamel_yaml 2 | pycuda==2025.1 3 | imageio 4 | numpy==1.26.4 5 | trimesh==4.6.1 6 | libigl==2.5.1 7 | iopath 8 | joblib==1.4.2 9 | scipy==1.15.1 10 | scikit-learn==1.6.1 11 | opencv-python==4.11.0.86 12 | python-multipart==0.0.20 13 | pytest 14 | omegaconf==2.3.0 15 | flash-attn==2.7.3 16 | xatlas==0.0.10 17 | transformations 18 | -------------------------------------------------------------------------------- /src/setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Setup script for nvidia-3d-object-reconstruction package. 3 | 4 | This file provides backward compatibility with older Python packaging tools. 5 | All configuration is now handled by pyproject.toml. 6 | """ 7 | 8 | from setuptools import setup 9 | 10 | # All configuration is in pyproject.toml 11 | # This file exists only for backward compatibility 12 | setup() -------------------------------------------------------------------------------- /src/nvidia/objectreconstruction/dataloader/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data Loading Components for 3D Object Reconstruction. 3 | 4 | This module provides data readers and transformations for various input formats 5 | used in the reconstruction pipeline. 6 | """ 7 | 8 | from .reconstruction_dataloader import ReconstructionDataLoader 9 | 10 | __all__ = [ 11 | 'ReconstructionDataLoader' 12 | ] -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | 4 | 5 | 6 | ## Checklist 7 | - [ ] I am familiar with the [Contributing Guidelines](https://github.com/NVIDIA/3DObjectReconstruction/blob/main/CONTRIBUTING.md). 8 | - [ ] New or existing tests using the default retail item example cover these changes. 9 | - [ ] The documentation is up to date with these changes. 
10 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | # GitHub info on config.yml 2 | # https://docs.github.com/en/communities/using-templates-to-encourage-useful-issues-and-pull-requests/configuring-issue-templates-for-your-repository#configuring-the-template-chooser 3 | # Set to 'false' if you only want the templates to be used. 4 | blank_issues_enabled: false 5 | 6 | # When using discussions instead of Question issue templates, 7 | # link that below to have it show up in the 'Submit Issue' page 8 | contact_links: 9 | - name: Report an issue 10 | url: https://github.com/NVIDIA/3DObjectReconstruction/issues/new/choose 11 | about: Please raise any issues here. 12 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | #package code owners 2 | src/ @3d-object-reconstruction-team/3d-object-reconstruction-codeowners 3 | 4 | #build/ops code owners 5 | .github/ @3d-object-reconstruction-team/3d-object-reconstruction-codeowners 6 | deploy/ @3d-object-reconstruction-team/3d-object-reconstruction-codeowners 7 | docker/ @3d-object-reconstruction-team/3d-object-reconstruction-codeowners 8 | 9 | # docs and examples code owners 10 | docs/ @3d-object-reconstruction-team/3d-object-reconstruction-codeowners 11 | notebooks/ @3d-object-reconstruction-team/3d-object-reconstruction-codeowners 12 | data/ @3d-object-reconstruction-team/3d-object-reconstruction-codeowners -------------------------------------------------------------------------------- /src/MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Include the package README 2 | include README.md 3 | 4 | # Include package configuration files 5 | recursive-include nvidia/objectreconstruction/configs *.yaml *.yml 6 | recursive-include nvidia/objectreconstruction/data *.txt 7 | 8 | # Include Python package files 9 | recursive-include nvidia *.py 10 | recursive-include nvidia *.pyi 11 | 12 | # Exclude compiled files and cache 13 | global-exclude *.pyc 14 | global-exclude *.pyo 15 | global-exclude *.so 16 | global-exclude __pycache__ 17 | global-exclude .git* 18 | global-exclude .DS_Store 19 | 20 | # Exclude development and build files 21 | exclude .gitignore 22 | exclude .pre-commit-config.yaml 23 | exclude .github 24 | exclude tox.ini 25 | exclude .coverage 26 | exclude .pytest_cache 27 | exclude build 28 | exclude dist 29 | exclude *.egg-info -------------------------------------------------------------------------------- /CITATION.md: -------------------------------------------------------------------------------- 1 | # Citation 2 | 3 | Please cite the following papers when using this workflow: 4 | 5 | **FoundationStereo**: 6 | ```bibtex 7 | @article{wen2025stereo, 8 | title={FoundationStereo: Zero-Shot Stereo Matching}, 9 | author={Bowen Wen and Matthew Trepte and Joseph Aribido and Jan Kautz and Orazio Gallo and Stan Birchfield}, 10 | journal={CVPR}, 11 | year={2025} 12 | } 13 | ``` 14 | 15 | **BundleSDF**: 16 | ```bibtex 17 | @InProceedings{bundlesdfwen2023, 18 | author = {Bowen Wen and Jonathan Tremblay and Valts Blukis and Stephen Tyree and Thomas M\"{u}ller and Alex Evans and Dieter Fox and Jan Kautz and Stan Birchfield}, 19 | title = {{BundleSDF}: {N}eural 6-{DoF} Tracking and {3D} Reconstruction of Unknown Objects}, 
20 | booktitle = {CVPR}, 21 | year = {2023}, 22 | } 23 | ``` 24 | -------------------------------------------------------------------------------- /src/nvidia/objectreconstruction/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility Functions for 3D Object Reconstruction. 3 | 4 | This module provides essential utility functions for data preprocessing, 5 | structure conversion, and I/O operations used throughout the reconstruction pipeline. 6 | """ 7 | 8 | from .preprocessing import ( 9 | load_config, 10 | setup_experiment_directory, 11 | process_video_frames, 12 | depth2xyzmap, 13 | toOpen3dCloud, 14 | read_video 15 | ) 16 | 17 | from .structures import dataclass_to_dict 18 | 19 | __all__ = [ 20 | # Preprocessing functions 21 | 'load_config', 22 | 'setup_experiment_directory', 23 | 'process_video_frames', 24 | 'depth2xyzmap', 25 | 'toOpen3dCloud', 26 | 'read_video', 27 | 28 | # Structure utilities 29 | 'dataclass_to_dict', 30 | ] -------------------------------------------------------------------------------- /src/nvidia/objectreconstruction/configs/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Configuration Management for 3D Object Reconstruction. 3 | 4 | This module provides configuration schemas, default values, and validation 5 | for all components of the reconstruction pipeline. 6 | """ 7 | 8 | from .schema import ( 9 | NVBundleSDFConfig, 10 | BundleTrackConfig, 11 | NeRFConfig, 12 | FoundationStereoConfig, 13 | SAM2Config, 14 | RoMaConfig, 15 | CameraConfig, 16 | TextureBakeConfig, 17 | SegmentationConfig, 18 | DepthProcessingConfig, 19 | BasePathConfig 20 | ) 21 | 22 | __all__ = [ 23 | # Main configuration 24 | 'NVBundleSDFConfig', 25 | 26 | # Component configurations 27 | 'BundleTrackConfig', 28 | 'NeRFConfig', 29 | 'FoundationStereoConfig', 30 | 'SAM2Config', 31 | 'RoMaConfig', 32 | 'CameraConfig', 33 | 'TextureBakeConfig', 34 | 'SegmentationConfig', 35 | 'DepthProcessingConfig', 36 | 'BasePathConfig', 37 | ] -------------------------------------------------------------------------------- /src/nvidia/objectreconstruction/utils/structures.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | def dataclass_to_dict(obj): 4 | """ 5 | Recursively convert a dataclass object and its nested attributes to a dictionary. 
6 | 7 | Args: 8 | obj: A dataclass object or any other Python object 9 | 10 | Returns: 11 | dict: A dictionary representation of the object with all nested objects converted 12 | """ 13 | if obj is None: 14 | return {} 15 | 16 | # Copy the object's attribute dict so the loop below doesn't mutate the original object 17 | if hasattr(obj, '__dict__'): 18 | result = dict(vars(obj)) 19 | else: 20 | return obj 21 | 22 | # Recursively convert nested objects 23 | for key, value in result.items(): 24 | if hasattr(value, '__dict__'): 25 | result[key] = dataclass_to_dict(value) 26 | elif isinstance(value, (list, tuple)): 27 | result[key] = [dataclass_to_dict(item) if hasattr(item, '__dict__') else item for item in value] 28 | elif isinstance(value, dict): 29 | result[key] = {k: dataclass_to_dict(v) if hasattr(v, '__dict__') else v for k, v in value.items()} 30 | elif isinstance(value, Path): 31 | result[key] = str(value) 32 | 33 | return result -------------------------------------------------------------------------------- /src/nvidia/objectreconstruction/networks/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Neural Network Models for 3D Object Reconstruction. 3 | 4 | This module contains all the neural network components used in the reconstruction pipeline: 5 | - FoundationStereoProcessor: Stereo depth estimation 6 | - NVBundleSDF: Main reconstruction pipeline combining BundleTrack and NeRF 7 | - Sam2Infer: SAM2-based object segmentation 8 | - FeatureMatchingInfer: RoMa-based feature matching 9 | - NerfRunner: Neural Radiance Field implementation 10 | - Tool utilities: Point cloud processing, mesh operations 11 | """ 12 | 13 | from .foundationstereo import FoundationStereoProcessor, FoundationStereoNet, run_depth_estimation 14 | from .nvbundlesdf import NVBundleSDF, vis_camera_poses 15 | from .sam2infer import Sam2Infer, run_mask_extraction 16 | from .roma import FeatureMatchingInfer 17 | from .nerf_runner import NerfRunner, ModelRendererOffscreen 18 | from .tool import ( 19 | PointCloudProcessor, 20 | MeshProcessor, 21 | TensorUtils, 22 | PoseUtils, 23 | compute_scene_bounds, 24 | set_seed 25 | ) 26 | 27 | __all__ = [ 28 | # Main pipeline 29 | 'NVBundleSDF', 30 | 31 | # Individual processors 32 | 'FoundationStereoProcessor', 33 | 'FoundationStereoNet', 34 | 'run_depth_estimation', 35 | 'Sam2Infer', 36 | 'run_mask_extraction', 37 | 'FeatureMatchingInfer', 38 | 'NerfRunner', 39 | 40 | # Utility classes 41 | 'PointCloudProcessor', 42 | 'MeshProcessor', 43 | 'TensorUtils', 44 | 'PoseUtils', 45 | 46 | # Utility functions 47 | 'compute_scene_bounds', 48 | 'set_seed', 49 | 'ModelRendererOffscreen', 50 | 'vis_camera_poses' 51 | ] -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | ## Security 2 | 3 | NVIDIA is dedicated to the security and trust of our software products and services, including all source code repositories managed through our organization. 4 | 5 | If you need to report a security issue, please use the appropriate contact points outlined below.
**Please do not report security vulnerabilities through GitHub.** 6 | 7 | ## Reporting Potential Security Vulnerability in an NVIDIA Product 8 | 9 | To report a potential security vulnerability in any NVIDIA product: 10 | - Web: [Security Vulnerability Submission Form](https://www.nvidia.com/object/submit-security-vulnerability.html) 11 | - E-Mail: psirt@nvidia.com 12 | - We encourage you to use the following PGP key for secure email communication: [NVIDIA public PGP Key for communication](https://www.nvidia.com/en-us/security/pgp-key) 13 | - Please include the following information: 14 | - Product/Driver name and version/branch that contains the vulnerability 15 | - Type of vulnerability (code execution, denial of service, buffer overflow, etc.) 16 | - Instructions to reproduce the vulnerability 17 | - Proof-of-concept or exploit code 18 | - Potential impact of the vulnerability, including how an attacker could exploit the vulnerability 19 | 20 | While NVIDIA currently does not have a bug bounty program, we do offer acknowledgement when an externally reported security issue is addressed under our coordinated vulnerability disclosure policy. Please visit our [Product Security Incident Response Team (PSIRT)](https://www.nvidia.com/en-us/security/psirt-policies/) policies page for more information. 21 | 22 | ## NVIDIA Product Security 23 | 24 | For all security-related concerns, please visit NVIDIA's Product Security portal at https://www.nvidia.com/en-us/security 25 | -------------------------------------------------------------------------------- /src/nvidia/objectreconstruction/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | NVIDIA 3D Object Reconstruction Package. 3 | 4 | A comprehensive framework for high-quality 3D object reconstruction from RGB-D input 5 | using neural implicit surfaces, bundle adjustment, and advanced feature matching. 
6 | 7 | Key Features: 8 | - BundleTrack for camera pose tracking 9 | - FoundationStereo for depth estimation 10 | - SAM2 for object segmentation 11 | - Neural Implicit Surface representation 12 | - Texture baking for photorealistic results 13 | 14 | Example Usage: 15 | >>> from nvidia.objectreconstruction.networks import NVBundleSDF 16 | >>> from nvidia.objectreconstruction.configs.schema import NVBundleSDFConfig 17 | >>> 18 | >>> config = NVBundleSDFConfig() 19 | >>> pipeline = NVBundleSDF(config.nerf, config.bundletrack, config.roma) 20 | >>> pipeline.run_track(reader) 21 | >>> pipeline.run_global_sdf(reader) 22 | """ 23 | 24 | __version__ = "1.0.0" 25 | __author__ = "NVIDIA Corporation" 26 | __email__ = "support@nvidia.com" 27 | 28 | # Main pipeline imports 29 | from .networks.nvbundlesdf import NVBundleSDF 30 | from .configs.schema import NVBundleSDFConfig 31 | 32 | # Individual component imports 33 | from .networks.foundationstereo import FoundationStereoProcessor, run_depth_estimation 34 | from .networks.sam2infer import Sam2Infer, run_mask_extraction 35 | from .networks.roma import FeatureMatchingInfer 36 | from .dataloader.reconstruction_dataloader import ReconstructionDataLoader 37 | 38 | __all__ = [ 39 | 'NVBundleSDF', 40 | 'NVBundleSDFConfig', 41 | 'FoundationStereoProcessor', 42 | 'run_depth_estimation', 43 | 'Sam2Infer', 44 | 'run_mask_extraction', 45 | 'FeatureMatchingInfer', 46 | 'ReconstructionDataLoader', 47 | '__version__' 48 | ] -------------------------------------------------------------------------------- /deploy/compose/docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | reconstruction-app: 3 | # Build configuration (used when BUILD_MODE=local) 4 | build: 5 | context: ../../ 6 | dockerfile: docker/Dockerfile 7 | args: 8 | - BUILDKIT_INLINE_CACHE=1 9 | 10 | # Image configuration - can be overridden with IMAGE_NAME env var 11 | image: ${IMAGE_NAME:-3d-object-reconstruction:latest} 12 | container_name: 3d-object-reconstruction-container-${USER:-default} 13 | 14 | # Shared memory size - useful for heavy workloads (adjust this as needed) 15 | shm_size: 8gb 16 | 17 | # GPU support 18 | deploy: 19 | resources: 20 | reservations: 21 | devices: 22 | - driver: nvidia 23 | count: all 24 | capabilities: [gpu] 25 | 26 | # Environment variables 27 | environment: 28 | - NVIDIA_VISIBLE_DEVICES=all 29 | - CUDA_VISIBLE_DEVICES=0 30 | - PYTHONPATH=/workspace/3d-object-reconstruction 31 | 32 | # Working directory 33 | working_dir: /workspace/3d-object-reconstruction 34 | 35 | # Volume mounts 36 | volumes: 37 | # Mount source code 38 | - ../../src:/workspace/3d-object-reconstruction/src 39 | # Mount data folder 40 | - ../../data:/workspace/3d-object-reconstruction/data 41 | # Mount notebooks for development 42 | - ../../notebooks:/workspace/3d-object-reconstruction/notebooks 43 | # Mount README.md 44 | - ../../README.md:/workspace/3d-object-reconstruction/README.md 45 | 46 | # Port mappings (for Jupyter notebook) - dynamically allocated 47 | ports: 48 | - "${JUPYTER_HOST_PORT:-8888}:8888" 49 | 50 | # Keep container running 51 | stdin_open: true 52 | tty: true 53 | 54 | # Restart policy 55 | restart: unless-stopped 56 | 57 | # User-specific network 58 | networks: 59 | - reconstruction-network 60 | 61 | 62 | # User-specific network to avoid conflicts 63 | networks: 64 | reconstruction-network: 65 | name: 3d-recon-network-${USER:-default} 66 | driver: bridge 67 | 68 | 69 | volumes: 70 | weights: 71 | driver: local 72 |
output: 73 | driver: local -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation_request_new.yml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | name: Documentation - New Documentation Request 17 | description: Request additions to 3D Object Reconstruction documentation 18 | title: "[DOC]: " 19 | labels: ["doc"] 20 | 21 | body: 22 | - type: markdown 23 | attributes: 24 | value: | 25 | Thanks for taking the time to improve our documentation! 26 | 27 | - type: dropdown 28 | id: criticality 29 | attributes: 30 | label: How would you describe the priority of this documentation request 31 | options: 32 | - Critical (currently preventing usage) 33 | - High 34 | - Medium 35 | - Low (would be nice) 36 | validations: 37 | required: true 38 | 39 | - type: textarea 40 | id: problem 41 | attributes: 42 | label: Describe the future/missing documentation 43 | placeholder: A code snippet mentions function foo(args) but I cannot find any documentation on it. 44 | validations: 45 | required: true 46 | 47 | - type: textarea 48 | id: search_locs 49 | attributes: 50 | label: Where have you looked? 51 | placeholder: | 52 | https://github.com/NVIDIA/3DObjectReconstruction/blob/main/README.md 53 | 54 | - type: checkboxes 55 | id: terms 56 | attributes: 57 | label: Code of Conduct 58 | description: By submitting this issue, you agree to follow our [Code of Conduct](https://github.com/NVIDIA/3DObjectReconstruction/blob/main/CODE_OF_CONDUCT.md) 59 | options: 60 | - label: I agree to follow 3D Object Reconstruction's Code of Conduct 61 | required: true 62 | - label: I have searched the [open documentation issues](https://github.com/NVIDIA/3DObjectReconstruction/issues?q=is%3Aopen+is%3Aissue+label%3Adoc) and have found no duplicates for this bug report 63 | required: true 64 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 3D Object Reconstruction 0.1.0 (18 Jul 2025) 2 | 3 | ## New Features 4 | 5 | - **End-to-End 3D Reconstruction Workflow:** Initial release of the 3D Object Reconstruction workflow, providing a complete workflow to convert stereo video inputs into high-quality 3D assets. 6 | - **State-of-the-Art Model Integration:** The workflow integrates several cutting-edge models for robust and accurate reconstruction: 7 | - **FoundationStereo:** A transformer-based model for high-accuracy stereo depth estimation. 8 | - **SAM2 (Segment Anything Model 2):** Used for precise and consistent object segmentation in video sequences. 9 | - **RoMA (Robust Matching):** Employs robust feature matching to establish reliable correspondences between images. 
10 | - **BundleSDF:** Implements neural 6-DoF tracking and 3D reconstruction for unknown objects, ensuring geometric accuracy. 11 | - **Sample Inference Data:** Includes a sample dataset of a retail item with corresponding configuration files, allowing users to quickly test and validate the reconstruction workflow. 12 | - **Docker Compose-Based Deployment:** 13 | - **Simplified Setup:** A single script (`deploy.sh`) automates the entire setup process, including downloading model weights, building container images, and managing external dependencies. 14 | - **Pre-configured Environment:** The Dockerfile is based on DeepStream base images and includes all necessary components to run the workflow out of the box. 15 | - **Interactive Jupyter Notebook:** 16 | - **Step-by-Step Guidance:** A demo notebook (`3d_object_reconstruction_demo.ipynb`) provides an interactive, step-by-step guide through the reconstruction process. 17 | - **Easy to Use:** Allows users to experiment with the workflow and visualize results in real time. 18 | - **Command-Line Interface (CLI):** 19 | - **Automated Workflows:** Provides a CLI for running the reconstruction workflow, enabling batch processing and integration into automated pipelines. 20 | 21 | ## Improvements 22 | 23 | - **High-Quality Mesh and Texture Generation:** The workflow is optimized to produce production-ready 3D meshes with photorealistic textures, suitable for digital twin creation, synthetic data generation, and more. 24 | - **Performance:** Generates a complete 3D asset in under 30 minutes on an NVIDIA RTX A6000 GPU. 25 | - **Extensibility:** The modular architecture allows for customization and integration of new models or components. 26 | 27 | ## Bug Fixes 28 | 29 | - No major bug fixes in this initial release. 30 | -------------------------------------------------------------------------------- /print_env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright (c) 2025, NVIDIA CORPORATION. 3 | # Reports relevant environment information useful for diagnosing and 4 | # debugging 3D Object Reconstruction issues.
5 | # Usage: 6 | # "./print_env.sh" - prints to stdout 7 | # "./print_env.sh > env.txt" - prints to file "env.txt" 8 | 9 | print_env() { 10 | echo "**git***" 11 | if [ "$(git rev-parse --is-inside-work-tree 2>/dev/null)" == "true" ]; then 12 | git log --decorate -n 1 13 | echo "**git submodules***" 14 | git submodule status --recursive 15 | else 16 | echo "Not inside a git repository" 17 | fi 18 | echo 19 | 20 | echo "***OS Information***" 21 | cat /etc/*-release 22 | uname -a 23 | echo 24 | 25 | echo "***GPU Information***" 26 | nvidia-smi 27 | echo 28 | 29 | echo "***CPU***" 30 | lscpu 31 | echo 32 | 33 | echo "***Docker***" 34 | which docker && docker --version 35 | echo 36 | 37 | echo "***Docker Compose***" 38 | if command -v docker-compose &> /dev/null; then 39 | docker-compose --version 40 | elif docker compose version &> /dev/null; then 41 | docker compose version 42 | else 43 | echo "docker-compose or docker compose not found" 44 | fi 45 | echo 46 | 47 | echo "***NVIDIA Container Toolkit***" 48 | which nvidia-container-toolkit && nvidia-container-toolkit --version 49 | echo 50 | 51 | echo "***CMake***" 52 | which cmake && cmake --version 53 | echo 54 | 55 | echo "***g++***" 56 | which g++ && g++ --version 57 | echo 58 | 59 | echo "***nvcc***" 60 | which nvcc && nvcc --version 61 | echo 62 | 63 | echo "***Python***" 64 | which python && python -c "import sys; print('Python {0}.{1}.{2}'.format(sys.version_info[0], sys.version_info[1], sys.version_info[2]))" 65 | echo 66 | 67 | echo "***Environment Variables***" 68 | 69 | printf '%-32s: %s\n' PATH $PATH 70 | 71 | printf '%-32s: %s\n' LD_LIBRARY_PATH $LD_LIBRARY_PATH 72 | 73 | printf '%-32s: %s\n' NUMBAPRO_NVVM $NUMBAPRO_NVVM 74 | 75 | printf '%-32s: %s\n' NUMBAPRO_LIBDEVICE $NUMBAPRO_LIBDEVICE 76 | 77 | printf '%-32s: %s\n' CONDA_PREFIX $CONDA_PREFIX 78 | 79 | printf '%-32s: %s\n' PYTHON_PATH $PYTHON_PATH 80 | 81 | echo 82 | 83 | 84 | # Print conda packages if conda exists 85 | if type "conda" &> /dev/null; then 86 | echo '***conda packages***' 87 | which conda && conda list 88 | echo 89 | # Print pip packages if pip exists 90 | elif type "pip" &> /dev/null; then 91 | echo "conda not found" 92 | echo "***pip packages***" 93 | which pip && pip list 94 | echo 95 | else 96 | echo "conda not found" 97 | echo "pip not found" 98 | fi 99 | } 100 | 101 | echo "
<details><summary>Click here to see environment details</summary><pre>
"
102 | echo "     "
103 | print_env | while read -r line; do
104 |     echo "     $line"
105 | done
106 | echo "</pre></details>"
" 107 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation_request_correction.yml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | name: Documentation - Correction/Update Request 17 | description: Request corrections or updates to existing documentation 18 | title: "[DOC]: " 19 | labels: ["doc"] 20 | 21 | body: 22 | - type: markdown 23 | attributes: 24 | value: | 25 | Thanks for taking the time to improve our documentation! 26 | 27 | - type: dropdown 28 | id: criticality 29 | attributes: 30 | label: How would you describe the priority of this documentation request 31 | options: 32 | - Critical (currently preventing usage) 33 | - High 34 | - Medium 35 | - Low (would be nice) 36 | validations: 37 | required: true 38 | 39 | - type: input 40 | id: correction_location 41 | attributes: 42 | label: Please provide a link or source to the relevant docs 43 | placeholder: "ex: https://github.com/NVIDIA/3DObjectReconstruction/blob/main/README.md" 44 | validations: 45 | required: true 46 | 47 | - type: textarea 48 | id: problem 49 | attributes: 50 | label: Describe the problems in the documentation 51 | placeholder: The documents say to use foo.func(args) however an AttributeError is thrown 52 | validations: 53 | required: true 54 | 55 | - type: textarea 56 | id: correction 57 | attributes: 58 | label: (Optional) Propose a correction 59 | placeholder: foo.func() was deprecated, replace documentation with foo.new_func() 60 | 61 | - type: checkboxes 62 | id: terms 63 | attributes: 64 | label: Code of Conduct 65 | description: By submitting this issue, you agree to follow our [Code of Conduct](https://github.com/NVIDIA/3DObjectReconstruction/blob/main/CODE_OF_CONDUCT.md) 66 | options: 67 | - label: I agree to follow 3D Object Reconstruction's Code of Conduct 68 | required: true 69 | - label: I have searched the [open documentation issues](https://github.com/NVIDIA/3DObjectReconstruction/issues?q=is%3Aopen+is%3Aissue+label%3Adoc) and have found no duplicates for this bug report 70 | required: true 71 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report_form.yml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | name: Bug Report 17 | description: File a bug report 18 | title: "[BUG]: " 19 | labels: ["bug"] 20 | 21 | body: 22 | - type: markdown 23 | attributes: 24 | value: | 25 | Thanks for taking the time to fill out this bug report! 26 | 27 | - type: input 28 | id: version 29 | attributes: 30 | label: Version 31 | description: What version of 3D Object Reconstruction are you running? 32 | placeholder: "example: 0.1.0" 33 | validations: 34 | required: true 35 | 36 | - type: dropdown 37 | id: installation-method 38 | attributes: 39 | label: Which installation method(s) does this occur on? 40 | multiple: true 41 | options: 42 | - Docker 43 | - Conda 44 | - Pip 45 | - Source 46 | 47 | - type: textarea 48 | id: description 49 | attributes: 50 | label: Describe the bug. 51 | description: Also tell us, what did you expect to happen? 52 | placeholder: XYZ occurred, I expected QRS results 53 | validations: 54 | required: true 55 | 56 | - type: textarea 57 | id: mvr 58 | attributes: 59 | label: Minimum reproducible example 60 | description: Please supply a [minimum reproducible code example](https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) here 61 | render: shell 62 | 63 | - type: textarea 64 | id: logs 65 | attributes: 66 | label: Relevant log output 67 | description: Please paste relevant error and log output here 68 | render: shell 69 | 70 | - type: textarea 71 | id: env-printout 72 | attributes: 73 | label: Full env printout 74 | description: Please run and paste the output of the `print_env.sh` script here, to gather any other relevant environment details 75 | render: shell 76 | 77 | - type: textarea 78 | id: misc 79 | attributes: 80 | label: Other/Misc. 81 | description: Please enter any other helpful information here. 82 | 83 | - type: textarea 84 | id: dataset 85 | attributes: 86 | label: Dataset 87 | description: Please provide a public link to the dataset you are using along with the output directory for repro if possible. 88 | 89 | - type: checkboxes 90 | id: terms 91 | attributes: 92 | label: Code of Conduct 93 | description: By submitting this issue, you agree to follow our [Code of Conduct](https://github.com/NVIDIA/3DObjectReconstruction/blob/main/CODE_OF_CONDUCT.md) 94 | options: 95 | - label: I agree to follow 3D Object Reconstruction's Code of Conduct 96 | required: true 97 | - label: I have searched the [open bugs](https://github.com/NVIDIA/3DObjectReconstruction/issues?q=is%3Aopen+is%3Aissue+label%3Abug) and have found no duplicates for this bug report 98 | required: true 99 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to 3D Object Reconstruction 2 | 3 | If you are interested in contributing to 3D Object Reconstruction, your contributions will fall 4 | into three categories: 5 | 1.
You want to report a bug, feature request, or documentation issue 6 | - File an [issue](https://github.com/NVIDIA/3DObjectReconstruction/issues/new/choose) 7 | describing what you encountered or what you want to see changed. 8 | - Please run and paste the output of the `3DObjectReconstruction/print_env.sh` script while 9 | reporting a bug to gather and report relevant environment details. 10 | - The 3D Object Reconstruction team will evaluate the issues and triage them, scheduling 11 | them for a release. If you believe the issue needs priority attention, 12 | comment on the issue to notify the team. 13 | 2. You want to propose a new Feature and implement it 14 | - Post about your intended feature, and we shall discuss the design and 15 | implementation. 16 | - Once we agree that the plan looks good, go ahead and implement it, using 17 | the [code contributions](#code-contributions) guide below. 18 | 3. You want to implement a feature or bug-fix for an outstanding issue 19 | - Follow the [code contributions](#code-contributions) guide below. 20 | - If you need more context on a particular issue, please ask and we shall 21 | provide. 22 | 23 | ## Code contributions 24 | 25 | ### Your first issue 26 | 27 | 1. Read the project's [README.md](https://github.com/NVIDIA/3DObjectReconstruction/blob/main/README.md) 28 | to learn how to set up the development environment. 29 | 2. Find an issue to work on. The best way is to look for the [good first issue](https://github.com/nvidia/3DObjectReconstruction/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) 30 | or [help wanted](https://github.com/nvidia/3DObjectReconstruction/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) labels. 31 | 3. Comment on the issue saying you are going to work on it. 32 | 4. Code! Make sure to contribute any unit tests and validate that the workflow works with the default inference example! 33 | 5. When done, [create your pull request](https://github.com/nvidia/3DObjectReconstruction/compare). 34 | 6. Wait for other developers to review your code and update code as needed. 35 | 7. Once reviewed and approved, a 3D Object Reconstruction developer will merge your pull request. 36 | 37 | Remember, if you are unsure about anything, don't hesitate to comment on issues and ask for clarifications! 38 | 39 | ### Managing PR labels 40 | 41 | Each PR must be labeled according to whether it is a "breaking" or "non-breaking" change (using GitHub labels). This is used to highlight changes that users should know about when upgrading. 42 | 43 | For 3D Object Reconstruction, a "breaking" change is one that modifies the codebase in a 44 | non-backward-compatible way. 45 | 46 | Additional labels must be applied to indicate whether the change is a feature, improvement, bugfix, or documentation change. 47 | 48 | ### Branch naming 49 | 50 | Branches used to create PRs should have a name of the form `<type>-<name>` 51 | which conforms to the following conventions: 52 | - Type: 53 | - fea - For branches that add a new feature(s) 54 | - enh - For branches that enhance an existing feature(s) 55 | - bug - For branches that fix a bug(s) or regression(s) 56 | - Name: 57 | - A name to convey what is being worked on 58 | - Please use dashes or underscores between words as opposed to spaces (for example, `fea-multi_gpu_support` or `bug-depth-scale-fix`).
59 | 60 | ## Attribution 61 | Portions adopted from https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md 62 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Overview 4 | 5 | Define the code of conduct followed and enforced for 3D Object Reconstruction. 6 | 7 | ### Intended audience 8 | 9 | Community | Developers | Project Leads 10 | 11 | ## Our Pledge 12 | 13 | In the interest of fostering an open and welcoming environment, we as 14 | contributors and maintainers pledge to making participation in our project and 15 | our community a harassment-free experience for everyone, regardless of age, body 16 | size, disability, ethnicity, sex characteristics, gender identity and expression, 17 | level of experience, education, socio-economic status, nationality, personal 18 | appearance, race, religion, or sexual identity and orientation. 19 | 20 | ## Our Standards 21 | 22 | Examples of behavior that contributes to creating a positive environment 23 | include: 24 | 25 | * Using welcoming and inclusive language 26 | * Being respectful of differing viewpoints and experiences 27 | * Gracefully accepting constructive criticism 28 | * Focusing on what is best for the community 29 | * Showing empathy towards other community members 30 | 31 | Examples of unacceptable behavior by participants include: 32 | 33 | * The use of sexualized language or imagery and unwelcome sexual attention or 34 | advances 35 | * Trolling, insulting/derogatory comments, and personal or political attacks 36 | * Public or private harassment 37 | * Publishing others' private information, such as a physical or electronic 38 | address, without explicit permission 39 | * Other conduct which could reasonably be considered inappropriate in a 40 | professional setting 41 | 42 | ## Our Responsibilities 43 | 44 | Project maintainers are responsible for clarifying the standards of acceptable 45 | behavior and are expected to take appropriate and fair corrective action in 46 | response to any instances of unacceptable behavior. 47 | 48 | Project maintainers have the right and responsibility to remove, edit, or 49 | reject comments, commits, code, wiki edits, issues, and other contributions 50 | that are not aligned to this Code of Conduct, or to ban temporarily or 51 | permanently any contributor for other behaviors that they deem inappropriate, 52 | threatening, offensive, or harmful. 53 | 54 | ## Scope 55 | 56 | This Code of Conduct applies both within project spaces and in public spaces 57 | when an individual is representing the project or its community. Examples of 58 | representing a project or community include using an official project e-mail 59 | address, posting via an official social media account, or acting as an appointed 60 | representative at an online or offline event. Representation of a project may be 61 | further defined and clarified by project maintainers. 62 | 63 | ## Enforcement 64 | 65 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 66 | reported by contacting GitHub_Conduct@nvidia.com. All complaints will be reviewed and 67 | investigated and will result in a response that is deemed necessary and appropriate 68 | to the circumstances. The project team is obligated to maintain confidentiality with 69 | regard to the reporter of an incident. 
Further details of specific enforcement policies 70 | may be posted separately. 71 | 72 | Project maintainers who do not follow or enforce the Code of Conduct in good 73 | faith may face temporary or permanent repercussions as determined by other 74 | members of the project's leadership. 75 | 76 | ## Attribution 77 | 78 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 79 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 80 | 81 | [homepage]: https://www.contributor-covenant.org 82 | 83 | For answers to common questions about this code of conduct, see 84 | https://www.contributor-covenant.org/faq 85 | -------------------------------------------------------------------------------- /src/README.md: -------------------------------------------------------------------------------- 1 | # NVIDIA 3D Object Reconstruction Package 2 | 3 | A comprehensive workflow for high-quality 3D object reconstruction from RGB-D input using neural implicit surfaces, bundle adjustment, and advanced feature matching. 4 | 5 | ## Installation 6 | 7 | ```bash 8 | pip install nvidia-3d-object-reconstruction 9 | ``` 10 | 11 | ## Quick Start 12 | 13 | ```python 14 | from nvidia.objectreconstruction.networks import NVBundleSDF 15 | from nvidia.objectreconstruction.configs.schema import NVBundleSDFConfig 16 | 17 | # Initialize configuration 18 | config = NVBundleSDFConfig() 19 | 20 | # Create reconstruction workflow 21 | workflow = NVBundleSDF( 22 | config_nerf=config.nerf, 23 | cfg_bundletrack=config.bundletrack, 24 | roma_config=config.roma 25 | ) 26 | 27 | # Run the reconstruction workflow; 'reader' is the RGB-D sequence loader (see nvidia.objectreconstruction.dataloader) 28 | workflow.run_track(reader) 29 | workflow.run_global_sdf(reader) 30 | workflow.run_texture_bake(reader) 31 | ``` 32 | 33 | ## Package Components 34 | 35 | ### Networks (`nvidia.objectreconstruction.networks`) 36 | 37 | - **NVBundleSDF**: Main reconstruction workflow 38 | - **FoundationStereoProcessor**: Stereo depth estimation 39 | - **Sam2Infer**: SAM2-based object segmentation 40 | - **FeatureMatchingInfer**: RoMa feature matching 41 | - **NerfRunner**: Neural Radiance Field implementation 42 | 43 | ### Configuration (`nvidia.objectreconstruction.configs`) 44 | 45 | - **NVBundleSDFConfig**: Main configuration schema 46 | - **BundleTrackConfig**: Bundle adjustment settings 47 | - **NeRFConfig**: Neural field parameters 48 | - **FoundationStereoConfig**: Stereo depth settings 49 | - **SAM2Config**: Segmentation parameters 50 | 51 | ### Utilities (`nvidia.objectreconstruction.utils`) 52 | 53 | - **preprocessing**: Data preprocessing functions 54 | - **structures**: Data structure utilities 55 | 56 | ## Individual Component Usage 57 | 58 | ### Stereo Depth Estimation 59 | 60 | ```python 61 | from nvidia.objectreconstruction.networks import FoundationStereoProcessor 62 | 63 | processor = FoundationStereoProcessor(config, rgb_path, output_path) 64 | processor.run() 65 | ``` 66 | 67 | ### Object Segmentation 68 | 69 | ```python 70 | from nvidia.objectreconstruction.networks import Sam2Infer 71 | 72 | sam2 = Sam2Infer(config) 73 | sam2.run(rgb_path, mask_path) 74 | ``` 75 | 76 | ### Feature Matching 77 | 78 | ```python 79 | from nvidia.objectreconstruction.networks import FeatureMatchingInfer 80 | 81 | matcher = FeatureMatchingInfer(config) 82 | ``` 83 | 84 | ## Configuration Management 85 | 86 | ```python 87 | from nvidia.objectreconstruction.configs.schema import ( 88 | NVBundleSDFConfig, 89 | FoundationStereoConfig, 90 | SAM2Config,
91 | BundleTrackConfig, 92 | NeRFConfig 93 | ) 94 | 95 | # Create and customize configuration 96 | config = NVBundleSDFConfig() 97 | config.nerf.n_step = 5000 98 | config.foundation_stereo.scale = 0.5 99 | config.sam2.bbox = [1144, 627, 2227, 2232] 100 | ``` 101 | 102 | ## Command Line Interface 103 | 104 | ```bash 105 | # Run reconstruction workflow 106 | nvidia-3d-reconstruct --config config.yaml --data-path /path/to/data 107 | 108 | # Get help 109 | nvidia-3d-reconstruct --help 110 | ``` 111 | 112 | ## Key Features 113 | 114 | - **BundleTrack**: Camera pose tracking and bundle adjustment 115 | - **FoundationStereo**: Advanced stereo depth estimation 116 | - **SAM2**: Object segmentation using Segment Anything Model 2 117 | - **Neural Implicit Surfaces**: High-quality 3D reconstruction using NeRF 118 | - **Texture Baking**: Photorealistic texture generation 119 | 120 | ## Requirements 121 | 122 | - **GPU**: NVIDIA GPU with CUDA support (minimum requirements: Compute Capability 7.0 with at least 24GB VRAM) 123 | - **Memory**: 32GB+ RAM recommended 124 | - **Storage**: 100GB+ free space recommended 125 | - **OS**: Ubuntu 22.04+ 126 | 127 | ## License 128 | 129 | NVIDIA License (Non-Commercial) - see LICENSE file for details. 130 | 131 | **Important**: This software is for non-commercial use only. This package incorporates third-party components under different licenses including CC BY-NC-SA 4.0. Review the complete LICENSE file for all terms and attributions. 132 | 133 | ## Support 134 | 135 | For issues and questions, please visit the project repository. -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request_form.yml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | name: Feature Request Form 17 | description: Request new or improved functionality or changes to existing functionality 18 | title: "[FEA]: " 19 | labels: ["feature request"] 20 | 21 | body: 22 | - type: markdown 23 | attributes: 24 | value: | 25 | Thanks for taking the time to fill out this feature request! 26 | 27 | - type: dropdown 28 | id: new_or_improvement 29 | attributes: 30 | label: Is this a new feature, an improvement, or a change to existing functionality? 
31 | options: 32 | - New Feature 33 | - Improvement 34 | - Change 35 | validations: 36 | required: true 37 | 38 | - type: dropdown 39 | id: criticality 40 | attributes: 41 | label: How would you describe the priority of this feature request 42 | options: 43 | - Critical (currently preventing usage) 44 | - High 45 | - Medium 46 | - Low (would be nice) 47 | validations: 48 | required: true 49 | 50 | - type: textarea 51 | id: problem 52 | attributes: 53 | label: Please provide a clear description of the problem this feature solves 54 | description: Real usage examples (non-code) are especially helpful. 55 | validations: 56 | required: true 57 | 58 | - type: textarea 59 | id: Feature_Description 60 | attributes: 61 | label: Feature Description 62 | description: Please provide a clear description of the feature you request (refer to [User Story format](https://www.atlassian.com/agile/project-management/user-stories#:~:text=User%20story%20template%20and%20examples) and [EARS format](https://ieeexplore.ieee.org/document/5328509)) 63 | placeholder: > 64 | For a new feature request, please use one of the following formats to describe the feature 65 | 1. From the end-user perspective, use the following user story format 66 | As a <persona>, I <want to>, <so that>. 67 | 2. From the system perspective, use the following EARS format 68 | The <system> shall <system response> 69 | For changing or improving an existing feature, it is recommended to provide the previous Feature Request ID. 70 | validations: 71 | required: true 72 | 73 | - type: textarea 74 | id: solution 75 | attributes: 76 | label: Describe your ideal solution 77 | description: Please describe the functionality you would like added. 78 | placeholder: > 79 | A new function that takes in the information in this form, and triages the issue 80 | 81 | def feature_request(form_info): 82 | parse(form_info) 83 | return triage_outcome 84 | validations: 85 | required: true 86 | 87 | - type: textarea 88 | id: alternatives 89 | attributes: 90 | label: Describe any alternatives you have considered 91 | description: List any other libraries, or approaches you have looked at or tried. 92 | placeholder: I have looked at library xyz and qrs, but they do not offer GPU acceleration 93 | 94 | - type: textarea 95 | id: misc 96 | attributes: 97 | label: Additional context 98 | description: Add any other context, code examples, or references to existing implementations about the feature request here. If applicable, please list the modules affected. 99 | 100 | - type: checkboxes 101 | id: terms 102 | attributes: 103 | label: Code of Conduct 104 | description: By submitting this issue, you agree to follow our [Code of Conduct](https://github.com/NVIDIA/3DObjectReconstruction/blob/main/CODE_OF_CONDUCT.md) 105 | options: 106 | - label: I agree to follow 3D Object Reconstruction's Code of Conduct 107 | required: true 108 | - label: I have searched the [open feature requests](https://github.com/NVIDIA/3DObjectReconstruction/issues?q=is%3Aopen+is%3Aissue+label%3A%22feature+request%22%2Cimprovement%2Cenhancement) and have found no duplicates for this feature request 109 | required: true 110 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | NVIDIA License 2 | 3 | 1. Definitions 4 | 5 | "Licensor" means any person or entity that distributes its Work.
6 | 7 | "Work" means (a) the original work of authorship made available under this license, which may include software, documentation, or other files, and (b) any additions to or derivative works thereof that are made available under this license. 8 | 9 | The terms "reproduce," "reproduction," "derivative works," and "distribution" have the meaning as provided under U.S. copyright law; provided, however, that for the purposes of this license, derivative works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work. 10 | 11 | Works are "made available" under this license by including in or with the Work either (a) a copyright notice referencing the applicability of this license to the Work, or (b) a copy of this license. 12 | 13 | 2. License Grant 14 | 15 | 2.1 Copyright Grant. Subject to the terms and conditions of this license, each Licensor grants to you a perpetual, worldwide, non-exclusive, royalty-free, copyright license to use, reproduce, prepare derivative works of, publicly display, publicly perform, sublicense and distribute its Work and any resulting derivative works in any form. 16 | 17 | 3. Limitations 18 | 19 | 3.1 Redistribution. You may reproduce or distribute the Work only if (a) you do so under this license, (b) you include a complete copy of this license with your distribution, and (c) you retain without modification any copyright, patent, trademark, or attribution notices that are present in the Work. 20 | 21 | 3.2 Derivative Works. You may specify that additional or different terms apply to the use, reproduction, and distribution of your derivative works of the Work ("Your Terms") only if (a) Your Terms provide that the use limitation in Section 3.3 applies to your derivative works, and (b) you identify the specific derivative works that are subject to Your Terms. Notwithstanding Your Terms, this license (including the redistribution requirements in Section 3.1) will continue to apply to the Work itself. 22 | 23 | 3.3 Use Limitation. The Work and any derivative works thereof only may be used or intended for use non-commercially. Notwithstanding the foregoing, NVIDIA Corporation and its affiliates may use the Work and any derivative works commercially. As used herein, "non-commercially" means for research or evaluation purposes only. 24 | 25 | 3.4 Patent Claims. If you bring or threaten to bring a patent claim against any Licensor (including any claim, cross-claim or counterclaim in a lawsuit) to enforce any patents that you allege are infringed by any Work, then your rights under this license from such Licensor (including the grant in Section 2.1) will terminate immediately. 26 | 27 | 3.5 Trademarks. This license does not grant any rights to use any Licensor's or its affiliates' names, logos, or trademarks, except as necessary to reproduce the notices described in this license. 28 | 29 | 3.6 Termination. If you violate any term of this license, then your rights under this license (including the grant in Section 2.1) will terminate immediately. 30 | 31 | 4. Disclaimer of Warranty 32 | 33 | THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS LICENSE. 34 | 35 | 5. 
Limitation of Liability 36 | 37 | EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 38 | 39 | ================================================================================ 40 | THIRD-PARTY COMPONENTS 41 | ================================================================================ 42 | 43 | This project incorporates code from third-party sources under different licenses: 44 | 45 | BundleTrack CUDA Components 46 | --------------------------- 47 | Source: https://github.com/NVlabs/BundleSDF/tree/master/BundleTrack/src/cuda 48 | License: Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0) 49 | Copyright: NVIDIA Corporation 50 | 51 | This component is licensed under CC BY-NC-SA 4.0. You may: 52 | - Share and adapt the material for non-commercial purposes only 53 | - Must provide appropriate attribution 54 | - Must distribute any derivative works under the same license 55 | - Cannot use for commercial purposes 56 | 57 | Full license text: https://creativecommons.org/licenses/by-nc-sa/4.0/ 58 | 59 | Note: The CC BY-NC-SA 4.0 non-commercial restriction applies to any derivative works 60 | of the BundleTrack CUDA components. Users must ensure compliance with both the main 61 | project license and this third-party license when using this software. 
62 | -------------------------------------------------------------------------------- /src/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=65.0", "wheel", "setuptools-scm"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "nvidia-3d-object-reconstruction" 7 | version = "0.1.0" 8 | description = "NVIDIA 3D Object Reconstruction Framework using BundleTrack and Neural Implicit Surfaces" 9 | readme = "README.md" 10 | license = {text = "NVIDIA License (Non-Commercial)"} 11 | authors = [ 12 | {name = "NVIDIA Corporation", email = "support@nvidia.com"} 13 | ] 14 | maintainers = [ 15 | {name = "NVIDIA Corporation", email = "support@nvidia.com"} 16 | ] 17 | keywords = [ 18 | "3D reconstruction", 19 | "neural implicit surfaces", 20 | "bundle adjustment", 21 | "computer vision", 22 | "NVIDIA", 23 | "stereo vision", 24 | "depth estimation", 25 | "NeRF" 26 | ] 27 | classifiers = [ 28 | "Development Status :: 4 - Beta", 29 | "Intended Audience :: Developers", 30 | "Intended Audience :: Science/Research", 31 | "Operating System :: POSIX :: Linux", 32 | "Programming Language :: Python :: 3", 33 | "Programming Language :: Python :: 3.8", 34 | "Programming Language :: Python :: 3.9", 35 | "Programming Language :: Python :: 3.10", 36 | "Programming Language :: Python :: 3.11", 37 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 38 | "Topic :: Scientific/Engineering :: Image Recognition", 39 | "Topic :: Multimedia :: Graphics :: 3D Modeling", 40 | ] 41 | requires-python = ">=3.8" 42 | dependencies = [ 43 | # Core ML/CV libraries 44 | "torch>=2.0.0", 45 | "torchvision>=0.15.0", 46 | "numpy==1.26.4", 47 | "opencv-python>=4.5.0", 48 | "Pillow>=8.0.0", 49 | "imageio>=2.9.0", 50 | "scikit-image>=0.18.0", 51 | "scikit-learn>=1.0.0", 52 | 53 | # 3D processing 54 | "open3d>=0.15.0", 55 | "trimesh>=3.15.0", 56 | "pyrender>=0.1.45", 57 | 58 | # Configuration and data handling 59 | "omegaconf>=2.1.0", 60 | "pyyaml>=6.0", 61 | "tqdm>=4.60.0", 62 | "joblib>=1.1.0", 63 | 64 | # Utilities 65 | "typing-extensions>=4.0.0", 66 | "ipympl", 67 | 68 | # External models (these may need to be installed separately) 69 | # "foundation-stereo>=1.0.0", # Custom package 70 | # "roma>=1.0.0", # RoMa matcher 71 | # "sam2>=1.0.0", # SAM2 segmentation 72 | "xformers" 73 | ] 74 | 75 | [project.optional-dependencies] 76 | dev = [ 77 | "pytest>=7.0.0", 78 | "pytest-cov>=3.0.0", 79 | "black>=22.0.0", 80 | "isort>=5.10.0", 81 | "flake8>=4.0.0", 82 | "mypy>=0.950", 83 | "pre-commit>=2.17.0", 84 | ] 85 | jupyter = [ 86 | "jupyter>=1.0.0", 87 | "jupyterlab>=3.0.0", 88 | "notebook>=6.4.0", 89 | "ipywidgets>=7.6.0", 90 | ] 91 | viz = [ 92 | "matplotlib>=3.5.0", 93 | "plotly>=5.0.0", 94 | "seaborn>=0.11.0", 95 | ] 96 | all = [ 97 | "nvidia-3d-object-reconstruction[dev,jupyter,viz]" 98 | ] 99 | 100 | [project.urls] 101 | Homepage = "https://github.com/NVIDIA/3DObjectReconstruction" 102 | Documentation = "https://github.com/NVIDIA/3DObjectReconstruction" 103 | Repository = "https://github.com/NVIDIA/3DObjectReconstruction.git" 104 | Issues = "https://github.com/NVIDIA/3DObjectReconstruction/issues" 105 | Changelog = "https://github.com/NVIDIA/3DObjectReconstruction/blob/main/CHANGELOG.md" 106 | 107 | [project.scripts] 108 | nvidia-3d-reconstruct = "nvidia.objectreconstruction.cli.main:main" 109 | 110 | [tool.setuptools] 111 | zip-safe = false 112 | include-package-data = true 113 | 114 | [tool.setuptools.packages.find] 
115 | where = ["."] 116 | include = ["nvidia*"] 117 | namespaces = true 118 | 119 | [tool.setuptools.package-data] 120 | "nvidia.objectreconstruction" = [ 121 | "configs/*.yaml", 122 | "configs/*.yml", 123 | "data/*.txt", 124 | "*.md" 125 | ] 126 | 127 | # Development tools configuration 128 | [tool.black] 129 | line-length = 100 130 | target-version = ['py38', 'py39', 'py310', 'py311'] 131 | include = '\.pyi?$' 132 | extend-exclude = ''' 133 | /( 134 | # directories 135 | \.eggs 136 | | \.git 137 | | \.hg 138 | | \.mypy_cache 139 | | \.tox 140 | | \.venv 141 | | build 142 | | dist 143 | )/ 144 | ''' 145 | 146 | [tool.isort] 147 | profile = "black" 148 | line_length = 100 149 | multi_line_output = 3 150 | include_trailing_comma = true 151 | force_grid_wrap = 0 152 | use_parentheses = true 153 | ensure_newline_before_comments = true 154 | 155 | [tool.mypy] 156 | python_version = "3.8" 157 | warn_return_any = true 158 | warn_unused_configs = true 159 | disallow_untyped_defs = true 160 | disallow_incomplete_defs = true 161 | check_untyped_defs = true 162 | disallow_untyped_decorators = true 163 | no_implicit_optional = true 164 | warn_redundant_casts = true 165 | warn_unused_ignores = true 166 | warn_no_return = true 167 | warn_unreachable = true 168 | strict_equality = true 169 | 170 | [tool.pytest.ini_options] 171 | minversion = "7.0" 172 | addopts = "-ra -q --strict-markers --strict-config" 173 | testpaths = [ 174 | "tests", 175 | ] 176 | filterwarnings = [ 177 | "error", 178 | "ignore::UserWarning", 179 | "ignore::DeprecationWarning", 180 | ] -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | data/samples/retail_item/ filter=lfs diff=lfs merge=lfs -text 2 | data/samples/retail_item/left/left000002.png filter=lfs diff=lfs merge=lfs -text 3 | data/samples/retail_item/left/left000005.png filter=lfs diff=lfs merge=lfs -text 4 | data/samples/retail_item/left/left000026.png filter=lfs diff=lfs merge=lfs -text 5 | data/samples/retail_item/left/left000034.png filter=lfs diff=lfs merge=lfs -text 6 | data/samples/retail_item/left/left000006.png filter=lfs diff=lfs merge=lfs -text 7 | data/samples/retail_item/left/left000007.png filter=lfs diff=lfs merge=lfs -text 8 | data/samples/retail_item/left/left000014.png filter=lfs diff=lfs merge=lfs -text 9 | data/samples/retail_item/left/left000028.png filter=lfs diff=lfs merge=lfs -text 10 | data/samples/retail_item/left/left000035.png filter=lfs diff=lfs merge=lfs -text 11 | data/samples/retail_item/left/left000003.png filter=lfs diff=lfs merge=lfs -text 12 | data/samples/retail_item/left/left000015.png filter=lfs diff=lfs merge=lfs -text 13 | data/samples/retail_item/left/left000018.png filter=lfs diff=lfs merge=lfs -text 14 | data/samples/retail_item/left/left000022.png filter=lfs diff=lfs merge=lfs -text 15 | data/samples/retail_item/left/left000031.png filter=lfs diff=lfs merge=lfs -text 16 | data/samples/retail_item/left/left000001.png filter=lfs diff=lfs merge=lfs -text 17 | data/samples/retail_item/left/left000010.png filter=lfs diff=lfs merge=lfs -text 18 | data/samples/retail_item/left/left000017.png filter=lfs diff=lfs merge=lfs -text 19 | data/samples/retail_item/left/left000030.png filter=lfs diff=lfs merge=lfs -text 20 | data/samples/retail_item/left/left000032.png filter=lfs diff=lfs merge=lfs -text 21 | data/samples/retail_item/left/left000000.png filter=lfs diff=lfs merge=lfs -text 22 | 
data/samples/retail_item/left/left000013.png filter=lfs diff=lfs merge=lfs -text 23 | data/samples/retail_item/left/left000021.png filter=lfs diff=lfs merge=lfs -text 24 | data/samples/retail_item/left/left000023.png filter=lfs diff=lfs merge=lfs -text 25 | data/samples/retail_item/left/left000025.png filter=lfs diff=lfs merge=lfs -text 26 | data/samples/retail_item/left/left000027.png filter=lfs diff=lfs merge=lfs -text 27 | data/samples/retail_item/left/left000011.png filter=lfs diff=lfs merge=lfs -text 28 | data/samples/retail_item/left/left000024.png filter=lfs diff=lfs merge=lfs -text 29 | data/samples/retail_item/left/left000009.png filter=lfs diff=lfs merge=lfs -text 30 | data/samples/retail_item/left/left000020.png filter=lfs diff=lfs merge=lfs -text 31 | data/samples/retail_item/left/left000004.png filter=lfs diff=lfs merge=lfs -text 32 | data/samples/retail_item/left/left000008.png filter=lfs diff=lfs merge=lfs -text 33 | data/samples/retail_item/left/left000012.png filter=lfs diff=lfs merge=lfs -text 34 | data/samples/retail_item/left/left000016.png filter=lfs diff=lfs merge=lfs -text 35 | data/samples/retail_item/left/left000019.png filter=lfs diff=lfs merge=lfs -text 36 | data/samples/retail_item/left/left000029.png filter=lfs diff=lfs merge=lfs -text 37 | data/samples/retail_item/left/left000033.png filter=lfs diff=lfs merge=lfs -text 38 | data/samples/retail_item/left/left000036.png filter=lfs diff=lfs merge=lfs -text 39 | data/samples/retail_item/right/right000013.png filter=lfs diff=lfs merge=lfs -text 40 | data/samples/retail_item/right/right000014.png filter=lfs diff=lfs merge=lfs -text 41 | data/samples/retail_item/right/right000020.png filter=lfs diff=lfs merge=lfs -text 42 | data/samples/retail_item/right/right000023.png filter=lfs diff=lfs merge=lfs -text 43 | data/samples/retail_item/right/right000031.png filter=lfs diff=lfs merge=lfs -text 44 | data/samples/retail_item/right/right000005.png filter=lfs diff=lfs merge=lfs -text 45 | data/samples/retail_item/right/right000010.png filter=lfs diff=lfs merge=lfs -text 46 | data/samples/retail_item/right/right000019.png filter=lfs diff=lfs merge=lfs -text 47 | data/samples/retail_item/right/right000029.png filter=lfs diff=lfs merge=lfs -text 48 | data/samples/retail_item/right/right000033.png filter=lfs diff=lfs merge=lfs -text 49 | data/samples/retail_item/right/right000034.png filter=lfs diff=lfs merge=lfs -text 50 | data/samples/retail_item/right/right000000.png filter=lfs diff=lfs merge=lfs -text 51 | data/samples/retail_item/right/right000002.png filter=lfs diff=lfs merge=lfs -text 52 | data/samples/retail_item/right/right000027.png filter=lfs diff=lfs merge=lfs -text 53 | data/samples/retail_item/right/right000036.png filter=lfs diff=lfs merge=lfs -text 54 | data/samples/retail_item/right/right000008.png filter=lfs diff=lfs merge=lfs -text 55 | data/samples/retail_item/right/right000024.png filter=lfs diff=lfs merge=lfs -text 56 | data/samples/retail_item/right/right000032.png filter=lfs diff=lfs merge=lfs -text 57 | data/samples/retail_item/right/right000035.png filter=lfs diff=lfs merge=lfs -text 58 | data/samples/retail_item/right/right000001.png filter=lfs diff=lfs merge=lfs -text 59 | data/samples/retail_item/right/right000016.png filter=lfs diff=lfs merge=lfs -text 60 | data/samples/retail_item/right/right000012.png filter=lfs diff=lfs merge=lfs -text 61 | data/samples/retail_item/right/right000028.png filter=lfs diff=lfs merge=lfs -text 62 | data/samples/retail_item/right/right000006.png filter=lfs diff=lfs 
merge=lfs -text 63 | data/samples/retail_item/right/right000009.png filter=lfs diff=lfs merge=lfs -text 64 | data/samples/retail_item/right/right000018.png filter=lfs diff=lfs merge=lfs -text 65 | data/samples/retail_item/right/right000030.png filter=lfs diff=lfs merge=lfs -text 66 | data/samples/retail_item/right/right000007.png filter=lfs diff=lfs merge=lfs -text 67 | data/samples/retail_item/right/right000015.png filter=lfs diff=lfs merge=lfs -text 68 | data/samples/retail_item/right/right000017.png filter=lfs diff=lfs merge=lfs -text 69 | data/samples/retail_item/right/right000021.png filter=lfs diff=lfs merge=lfs -text 70 | data/samples/retail_item/right/right000022.png filter=lfs diff=lfs merge=lfs -text 71 | data/samples/retail_item/right/right000004.png filter=lfs diff=lfs merge=lfs -text 72 | data/samples/retail_item/right/right000011.png filter=lfs diff=lfs merge=lfs -text 73 | data/samples/retail_item/right/right000026.png filter=lfs diff=lfs merge=lfs -text 74 | data/samples/retail_item/right/right000003.png filter=lfs diff=lfs merge=lfs -text 75 | data/samples/retail_item/right/right000025.png filter=lfs diff=lfs merge=lfs -text 76 | -------------------------------------------------------------------------------- /data/LICENSE: -------------------------------------------------------------------------------- 1 | NVIDIA ASSET LICENSE 2 | 3 | 4 | 5 | IMPORTANT NOTICE – PLEASE READ AND AGREE BEFORE USING THE ASSET. 6 | 7 | 8 | 9 | This license agreement (“Agreement”) is a legal agreement between you, whether an individual or entity ("you”) and NVIDIA Corporation ("NVIDIA") and governs your use of the NVIDIA Sample Images for Demonstrating the 3D Object Reconstruction Workflow, provided under this Agreement (the “Asset”). 10 | 11 | 12 | 13 | This Agreement can be accepted only by an adult of legal age of majority in the country in which the Asset is used. If you do not have the required age or authority to accept this Agreement or if you don’t accept all the terms and conditions of this Agreement, do not use the Asset. 14 | 15 | 16 | 17 | You agree to use the Asset only for purposes that are permitted by this Agreement and any applicable law or regulation in the relevant jurisdictions. 18 | 19 | 20 | 21 | 1. License. Subject to the terms of this Agreement, NVIDIA grants you a limited, non-exclusive, revocable, non-transferable, non-sublicensable license to (a) use and reproduce the Asset solely for research or evaluation purposes, with NVIDIA software or hardware and consistent with the limitations in Section 2 below. 22 | 23 | 24 | 25 | 2. Limitations. Your license to use the Asset and Derivative Works is restricted. Except as expressly permitted in Section 1 above, you may not: (a) change or remove copyright or other proprietary notices in the Asset and Derivative Works; (b) sell, rent, sublicense, transfer, distribute, or otherwise make the Asset and Derivative Works available to others; (c) train or test AI models using the Asset; (d) offer or distribute the Asset on a stand-alone basis; (e) use the Asset or assist or facilitate using the Asset in any manner inconsistent with NVIDIA’s Trustworthy AI Terms available at https://www.nvidia.com/en-us/agreements/trustworthy-ai/terms/. 26 | 27 | 28 | 29 | 3. Ownership. The Asset, including all intellectual property rights, are and will remain the sole and exclusive property of NVIDIA or its licensors. 
Except as expressly granted in this Agreement, (a) NVIDIA reserves all rights, interests, and remedies in connection with the Asset, and (b) no other license or right is granted to you by implication, estoppel or otherwise. 30 | 31 | 32 | 33 | 4. Feedback. You may, but you are not obligated to, provide suggestions, requests, fixes, modifications, enhancements, or other feedback regarding the Asset (collectively, “Feedback”). Feedback, even if designated as confidential by you, will not create any confidentiality obligation for NVIDIA or its affiliates. If you provide Feedback, you hereby grant NVIDIA, its affiliates and its designees a non-exclusive, perpetual, irrevocable, sublicensable, worldwide, royalty-free, fully paid-up and transferable license, under your intellectual property rights, to publicly perform, publicly display, reproduce, use, make, have made, sell, offer for sale, distribute (through multiple tiers of distribution), import, create derivative works of and otherwise commercialize and exploit the Feedback at NVIDIA’s discretion. 34 | 35 | 36 | 37 | 5. Term and Termination. This Agreement will automatically terminate without notice from NVIDIA if you fail to comply with any of the terms in this Agreement or if you commence or participate in any legal proceeding against NVIDIA with respect to the Asset. Additionally, NVIDIA may terminate this Agreement at any time with prior written notice. Upon any termination, you must immediately stop using and destroy all copies of the Asset. Upon written request, you will certify in writing that you have complied with your commitments under this section. All provisions will survive termination, except for the licenses granted to you. 38 | 39 | 40 | 41 | 6. Disclaimer of Warranties. THE ASSET IS PROVIDED BY NVIDIA AS-IS AND WITH ALL FAULTS. TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, NVIDIA DISCLAIMS ALL WARRANTIES AND REPRESENTATIONS OF ANY KIND, WHETHER EXPRESS, IMPLIED OR STATUTORY, RELATING TO OR ARISING UNDER THIS AGREEMENT, INCLUDING, WITHOUT LIMITATION, THE WARRANTIES OF TITLE, NONINFRINGEMENT, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, USAGE OF TRADE AND COURSE OF DEALING.  42 | 43 | 44 | 45 | 7. Limitations of Liability. TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, WILL NVIDIA BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY TYPE ARISING OUT OF OR AS A RESULT OF THIS AGREEMENT OR THE USE OR INABILITY TO USE THE ASSET (INCLUDING BUT NOT LIMITED TO DAMAGES FOR LOSS OF GOODWILL, WORK STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER DAMAGES OR LOSSES), EVEN IF NVIDIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 46 | 47 | 48 | 49 | 8. Indemnity. You will indemnify and hold harmless NVIDIA, its affiliates, their employees, officers, directors and agents (“Indemnified Parties”) and, at NVIDIA’s election, defend the Indemnified Parties from all third-party claims or lawsuits, costs, damages, expenses, liabilities, including attorney’s fees, arising out of or in connection with your use of the Asset. 50 | 51 | 52 | 53 | 10. Governing Law and Jurisdiction. This Agreement will be governed in all respects by the laws of the United States and the laws of the State of Delaware, without regard to conflict of laws principles or the United Nations Convention on Contracts for the International Sale of Goods. 
The state and federal courts residing in Santa Clara County, California will have exclusive jurisdiction over any dispute or claim arising out of or related to this Agreement, and the parties irrevocably consent to personal jurisdiction and venue in those courts; except that either party may apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction.  54 | 55 | 56 | 57 | 11. No Assignment. NVIDIA may assign, delegate or transfer its rights or obligations under this Agreement by any means or operation of law. You may not, without NVIDIA's prior written consent, assign, delegate or transfer any of your rights or obligations under this Agreement by any means or operation of law, and any attempt to do so is null and void. 58 | 59 | 60 | 61 | 12. Export. The Asset is subject to United States export laws and regulations. You agree to comply with all applicable export, import, trade and economic sanctions laws and regulations, including the Export Administration Regulations and Office of Foreign Assets Control regulations. These laws include restrictions on destinations, end-users and end-use.  62 | 63 | 64 | 65 | 13. Entire Agreement. Regarding the subject matter of this Agreement, the parties agree that this Agreement constitutes the entire and exclusive agreement between the parties and supersedes all prior and contemporaneous communications. If a court of competent jurisdiction rules that a provision of this Agreement is unenforceable, that provision will be deemed modified to the extent necessary to make it enforceable and the remainder of this Agreement will continue in full force and effect. Any amendment to this Agreement must be in writing and signed by authorized representatives of both parties. 66 | -------------------------------------------------------------------------------- /src/nvidia/objectreconstruction/utils/preprocessing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import sys 4 | import yaml 5 | import logging 6 | 7 | import numpy as np 8 | import open3d as o3d 9 | import torch.nn.functional as F 10 | from tqdm import tqdm 11 | 12 | from omegaconf import OmegaConf 13 | from pathlib import Path 14 | from typing import Dict, Any 15 | 16 | from ..networks.foundationstereo import FoundationStereoProcessor 17 | from ..networks.sam2infer import Sam2Infer 18 | 19 | logger = logging.getLogger("preprocessing") 20 | 21 | def load_config(config_path: str) -> Dict[str, Any]: 22 | """ 23 | Load and validate configuration file. 24 | 25 | Args: 26 | config_path: Path to the configuration YAML file. 27 | 28 | Returns: 29 | Dict containing configuration parameters. 30 | 31 | Raises: 32 | SystemExit: If the config file is missing or cannot be parsed 33 | (the error is logged and the process exits with status 1). 34 | """ 35 | try: 36 | with open(config_path, 'r') as f: 37 | return yaml.safe_load(f)  # yaml.load() without a Loader fails on PyYAML >= 6 38 | except FileNotFoundError: 39 | logger.error(f"Configuration file not found: {config_path}") 40 | sys.exit(1) 41 | except yaml.YAMLError as e: 42 | logger.error(f"Error parsing configuration file: {e}") 43 | sys.exit(1) 44 | 45 | def setup_experiment_directory(config: Dict[str, Any]) -> tuple[Path, Path]: 46 | """ 47 | Create and validate experiment directory structure. 48 | 49 | Args: 50 | config: Configuration dictionary.
51 | 52 | Returns: 53 | Tuple of (experiment_path, rgb_path) 54 | """ 55 | exp_path = Path(config['base_path']['base_folder']) 56 | exp_path.mkdir(exist_ok=True) 57 | rgb_path = Path(config['base_path']['image_folder']) 58 | rgb_path.mkdir(exist_ok=True) 59 | logger.info(f"Using experiment directory: {exp_path}") 60 | 61 | return exp_path, rgb_path 62 | 63 | def process_video_frames(config: Dict[str, Any], exp_path: Path, rgb_path: Path) -> None: 64 | """ 65 | Extract frames from input video if not already processed. 66 | 67 | Args: 68 | config: Configuration dictionary. 69 | exp_path: Path to experiment directory. 70 | rgb_path: Path to RGB frames directory. 71 | """ 72 | if rgb_path.exists() and any(rgb_path.iterdir()): 73 | logger.info("RGB frames already extracted") 74 | return 75 | 76 | rgb_path.mkdir(exist_ok=True) 77 | logger.info("Extracting video frames...") 78 | read_video(config['video']['input_path'], str(exp_path), config) 79 | 80 | 81 | def depth2xyzmap(depth:np.ndarray, K, uvs:np.ndarray=None, zmin=0.1): 82 | invalid_mask = (depth < zmin) 83 | H, W = depth.shape[:2] 84 | if uvs is None: 85 | vs, us = np.meshgrid(np.arange(0, H), np.arange(0, W), sparse=False, indexing='ij') 86 | vs = vs.reshape(-1) 87 | us = us.reshape(-1) 88 | else: 89 | uvs = uvs.round().astype(int) 90 | us = uvs[:, 0] 91 | vs = uvs[:, 1] 92 | zs = depth[vs, us] 93 | xs = (us - K[0, 2]) * zs / K[0, 0] 94 | ys = (vs - K[1, 2]) * zs / K[1, 1] 95 | pts = np.stack((xs.reshape(-1), ys.reshape(-1), zs.reshape(-1)), 1) 96 | xyz_map = np.zeros((H, W, 3), dtype=np.float32) 97 | xyz_map[vs, us] = pts 98 | if invalid_mask.any(): 99 | xyz_map[invalid_mask] = 0 100 | return xyz_map 101 | 102 | def toOpen3dCloud(points, colors=None, normals=None): 103 | cloud = o3d.geometry.PointCloud() 104 | cloud.points = o3d.utility.Vector3dVector(points.astype(np.float64)) 105 | 106 | if colors is not None: 107 | if colors.max() > 1: 108 | colors = colors / 255.0 109 | cloud.colors = o3d.utility.Vector3dVector(colors.astype(np.float64)) 110 | if normals is not None: 111 | cloud.normals = o3d.utility.Vector3dVector(normals.astype(np.float64)) 112 | return cloud 113 | 114 | 115 | def read_video(input_video_path, base_folder, config=None): 116 | """ 117 | Read stereo video and split it into left and right frames 118 | Save the frames in the respective folders 119 | """ 120 | 121 | # Validate input video path 122 | if not os.path.exists(input_video_path): 123 | logger.error(f"Input video file not found: {input_video_path}") 124 | return False 125 | 126 | # Try different backends in order of preference 127 | cap = None 128 | backends_to_try = [ 129 | (cv2.CAP_FFMPEG, "FFmpeg"), 130 | (cv2.CAP_GSTREAMER, "GStreamer"), 131 | (cv2.CAP_ANY, "Default") 132 | ] 133 | 134 | for backend_id, backend_name in backends_to_try: 135 | logger.info(f"Trying {backend_name} backend...") 136 | cap = cv2.VideoCapture(input_video_path, backend_id) 137 | if cap.isOpened(): 138 | logger.info(f"Successfully opened with {backend_name} backend") 139 | break 140 | else: 141 | logger.warning(f"{backend_name} backend failed") 142 | if cap: 143 | cap.release() 144 | 145 | # Check if any backend worked 146 | if not cap or not cap.isOpened(): 147 | logger.error(f"Failed to open video file with any backend: {input_video_path}") 148 | logger.error("This could be due to missing codec support or corrupted file") 149 | return False 150 | 151 | # Get video properties for validation 152 | total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) 153 | fps = 
cap.get(cv2.CAP_PROP_FPS) 154 | width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) 155 | height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 156 | 157 | logger.info(f"Video properties: {total_frames} frames, {fps:.2f} FPS, {width}x{height}") 158 | 159 | if total_frames == 0: 160 | logger.error("Video contains no frames or frame count could not be determined") 161 | cap.release() 162 | return False 163 | 164 | # Set default step if config not provided 165 | step = 1 166 | if config and 'video' in config and 'step' in config['video']: 167 | step = config['video']['step'] 168 | 169 | logger.info(f"Processing every {step} frame(s)") 170 | 171 | # Read the video and split it into frames 172 | ret = True 173 | count = 0 174 | frames_saved = 0 175 | 176 | left_path = os.path.join(base_folder, 'left') 177 | if not os.path.exists(left_path): 178 | os.makedirs(left_path) 179 | 180 | right_path = os.path.join(base_folder, 'right') 181 | if not os.path.exists(right_path): 182 | os.makedirs(right_path) 183 | 184 | # Create progress bar 185 | pbar = tqdm(total=total_frames, desc="Processing frames", unit="frames") 186 | 187 | while ret: 188 | ret, image = cap.read() 189 | 190 | if not ret: 191 | break 192 | 193 | if image is None: 194 | logger.warning(f"Frame {count} is None, skipping") 195 | count += 1 196 | pbar.update(1) 197 | continue 198 | 199 | h, w = image.shape[:2] 200 | 201 | # Validate that we have a stereo image (should be twice as wide) 202 | if w < 100: # Minimum reasonable width 203 | logger.error(f"Image width {w} is too small for stereo video") 204 | break 205 | 206 | # separate the stereo video into left and right 207 | left = image[:, :w//2] 208 | right = image[:, w//2:] 209 | 210 | # Save frames based on step interval 211 | if count % step == 0: 212 | left_save_path = os.path.join(left_path, '{}.png'.format(str(frames_saved).zfill(6))) 213 | right_save_path = os.path.join(right_path, '{}.png'.format(str(frames_saved).zfill(6))) 214 | 215 | success_left = cv2.imwrite(left_save_path, left) 216 | success_right = cv2.imwrite(right_save_path, right) 217 | 218 | if success_left and success_right: 219 | frames_saved += 1 220 | else: 221 | logger.error(f"Failed to save frame {count}") 222 | 223 | count += 1 224 | 225 | # Update progress bar with current status 226 | pbar.set_postfix({ 227 | 'saved': frames_saved, 228 | 'step': f'1/{step}' if step > 1 else 'all' 229 | }) 230 | pbar.update(1) 231 | 232 | pbar.close() 233 | 234 | cap.release() 235 | cv2.destroyAllWindows() 236 | 237 | if frames_saved > 0: 238 | logger.info(f"Successfully saved {frames_saved} frame pairs from {count} total frames") 239 | return True 240 | else: 241 | logger.error(f"No frames were saved! 
Processed {count} frames but none could be saved.") 242 | return False -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/deepstream:7.1-triton-multiarch 2 | 3 | ARG CMAKE_VERSION_MAJOR=3 4 | ARG CMAKE_VERSION_MINOR=25 5 | ARG CMAKE_VERSION_PATCH=3 6 | 7 | ARG EIGEN_VERSION_MAJOR=3 8 | ARG EIGEN_VERSION_MINOR=4 9 | ARG EIGEN_VERSION_PATCH=0 10 | 11 | ARG OPENCV_VERSION_MAJOR=4 12 | ARG OPENCV_VERSION_MINOR=11 13 | ARG OPENCV_VERSION_PATCH=0 14 | 15 | ARG PCL_VERSION_MAJOR=1 16 | ARG PCL_VERSION_MINOR=10 17 | ARG PCL_VERSION_PATCH=0 18 | 19 | ARG PYBIND11_VERSION_MAJOR=2 20 | ARG PYBIND11_VERSION_MINOR=13 21 | ARG PYBIND11_VERSION_PATCH=0 22 | 23 | ARG YAML_CPP_VERSION_MAJOR=0 24 | ARG YAML_CPP_VERSION_MINOR=8 25 | ARG YAML_CPP_VERSION_PATCH=0 26 | 27 | # Install dependencies 28 | RUN apt-get update && apt-get install -y --no-install-recommends \ 29 | python3-pip \ 30 | python3-dev \ 31 | libglib2.0-0 \ 32 | libsm6 \ 33 | libxext6 \ 34 | libxrender-dev \ 35 | libblas-dev \ 36 | libssl-dev \ 37 | liblapack-dev \ 38 | gfortran \ 39 | gnupg \ 40 | software-properties-common \ 41 | libflann-dev \ 42 | libboost-filesystem-dev \ 43 | libboost-date-time-dev \ 44 | libboost-iostreams-dev \ 45 | libboost-system-dev \ 46 | libboost-program-options-dev \ 47 | libzmq3-dev \ 48 | ffmpeg \ 49 | && rm -rf /var/lib/apt/lists/* 50 | 51 | # Install CMake 52 | RUN cd / &&\ 53 | wget https://www.cmake.org/files/v${CMAKE_VERSION_MAJOR}.${CMAKE_VERSION_MINOR}/cmake-${CMAKE_VERSION_MAJOR}.${CMAKE_VERSION_MINOR}.${CMAKE_VERSION_PATCH}.tar.gz &&\ 54 | tar xf cmake-${CMAKE_VERSION_MAJOR}.${CMAKE_VERSION_MINOR}.${CMAKE_VERSION_PATCH}.tar.gz &&\ 55 | cd cmake-${CMAKE_VERSION_MAJOR}.${CMAKE_VERSION_MINOR}.${CMAKE_VERSION_PATCH} &&\ 56 | ./configure &&\ 57 | make &&\ 58 | make install 59 | 60 | # Install Eigen 61 | RUN cd / && \ 62 | wget https://gitlab.com/libeigen/eigen/-/archive/${EIGEN_VERSION_MAJOR}.${EIGEN_VERSION_MINOR}.${EIGEN_VERSION_PATCH}/eigen-${EIGEN_VERSION_MAJOR}.${EIGEN_VERSION_MINOR}.${EIGEN_VERSION_PATCH}.tar.gz && \ 63 | tar xf eigen-${EIGEN_VERSION_MAJOR}.${EIGEN_VERSION_MINOR}.${EIGEN_VERSION_PATCH}.tar.gz && \ 64 | cd eigen-${EIGEN_VERSION_MAJOR}.${EIGEN_VERSION_MINOR}.${EIGEN_VERSION_PATCH} && \ 65 | mkdir build && \ 66 | cd build && \ 67 | cmake .. && \ 68 | make install && \ 69 | cd / && \ 70 | rm -rf eigen-${EIGEN_VERSION_MAJOR}.${EIGEN_VERSION_MINOR}.${EIGEN_VERSION_PATCH}.tar.gz eigen-${EIGEN_VERSION_MAJOR}.${EIGEN_VERSION_MINOR}.${EIGEN_VERSION_PATCH} 71 | 72 | RUN pip3 install -U torch==2.6.0 torchvision==0.21.0 73 | 74 | # Install OpenCV 75 | RUN cd / && \ 76 | git clone --depth 1 --branch ${OPENCV_VERSION_MAJOR}.${OPENCV_VERSION_MINOR}.${OPENCV_VERSION_PATCH} https://github.com/opencv/opencv && \ 77 | git clone --depth 1 --branch ${OPENCV_VERSION_MAJOR}.${OPENCV_VERSION_MINOR}.${OPENCV_VERSION_PATCH} https://github.com/opencv/opencv_contrib && \ 78 | mkdir -p /opencv/build && \ 79 | cd /opencv/build && \ 80 | cmake ..
-DCMAKE_BUILD_TYPE=Release \ 81 | -DBUILD_CUDA_STUBS=OFF \ 82 | -DBUILD_DOCS=OFF \ 83 | -DWITH_MATLAB=OFF \ 84 | -Dopencv_dnn_BUILD_TORCH_IMPORTER=OFF \ 85 | -DCUDA_FAST_MATH=ON \ 86 | -DMKL_WITH_OPENMP=ON \ 87 | -DOPENCV_ENABLE_NONFREE=ON \ 88 | -DWITH_OPENMP=ON \ 89 | -DWITH_QT=ON \ 90 | -DWITH_OPENEXR=ON \ 91 | -DENABLE_PRECOMPILED_HEADERS=OFF \ 92 | -DBUILD_opencv_cudacodec=OFF \ 93 | -DINSTALL_PYTHON_EXAMPLES=OFF \ 94 | -DWITH_TIFF=OFF \ 95 | -DWITH_WEBP=OFF \ 96 | -DWITH_FFMPEG=ON \ 97 | -DOPENCV_EXTRA_MODULES_PATH=../../opencv_contrib/modules \ 98 | -DCMAKE_CXX_FLAGS=-std=c++17 \ 99 | -DENABLE_CXX11=OFF \ 100 | -DBUILD_opencv_xfeatures2d=OFF \ 101 | -DOPENCV_DNN_OPENCL=OFF \ 102 | -DWITH_CUDA=ON \ 103 | -DWITH_OPENCL=OFF \ 104 | -DBUILD_opencv_wechat_qrcode=OFF \ 105 | -DCMAKE_CXX_STANDARD=17 \ 106 | -DCMAKE_CXX_STANDARD_REQUIRED=ON \ 107 | -DOPENCV_CUDA_OPTIONS_opencv_test_cudev=-std=c++17 \ 108 | -DCUDA_ARCH_BIN="7.0 7.5 8.0 8.6 9.0" \ 109 | -DCMAKE_INSTALL_PREFIX=/usr/local \ 110 | -DCMAKE_INSTALL_LIBDIR=lib \ 111 | -DINSTALL_PKGCONFIG=ON \ 112 | -DOPENCV_GENERATE_PKGCONFIG=ON \ 113 | -DPKG_CONFIG_PATH=/usr/local/lib/pkgconfig \ 115 | -DINSTALL_C_EXAMPLES=OFF && \ 116 | make -j$(nproc) && \ 117 | make install && \ 118 | cd / && \ 119 | rm -rf /opencv /opencv_contrib 120 | 121 | # Install PCL 122 | RUN cd / && \ 123 | git clone --depth 1 --branch pcl-${PCL_VERSION_MAJOR}.${PCL_VERSION_MINOR}.${PCL_VERSION_PATCH} https://github.com/PointCloudLibrary/pcl && \ 124 | mkdir -p /pcl/build && \ 125 | cd /pcl/build && \ 126 | cmake .. \ 127 | -DCMAKE_BUILD_TYPE=Release \ 128 | -DBUILD_apps=OFF \ 129 | -DBUILD_GPU=OFF \ 130 | -DBUILD_CUDA=OFF \ 131 | -DBUILD_examples=OFF \ 132 | -DBUILD_global_tests=OFF \ 133 | -DBUILD_simulation=OFF \ 134 | -DCUDA_BUILD_EMULATION=OFF \ 135 | -DCMAKE_CXX_FLAGS=-std=c++17 \ 136 | -DPCL_ENABLE_SSE=ON \ 137 | -DPCL_SHARED_LIBS=ON \ 138 | -DWITH_VTK=OFF \ 139 | -DPCL_ONLY_CORE_POINT_TYPES=ON \ 140 | -DPCL_COMMON_WARNINGS=OFF && \ 141 | make -j$(nproc) && \ 142 | make install && \ 143 | cd / && \ 144 | rm -rf /pcl 145 | 146 | # Install Pybind11 147 | RUN cd / && \ 148 | git clone --depth 1 --branch v${PYBIND11_VERSION_MAJOR}.${PYBIND11_VERSION_MINOR}.${PYBIND11_VERSION_PATCH} https://github.com/pybind/pybind11 && \ 149 | mkdir -p /pybind11/build && \ 150 | cd /pybind11/build && \ 151 | cmake .. -DCMAKE_BUILD_TYPE=Release -DPYBIND11_INSTALL=ON -DPYBIND11_TEST=OFF && \ 152 | make -j$(nproc) && \ 153 | make install && \ 154 | cd / && \ 155 | rm -rf /pybind11 156 | 157 | # Install YAML-CPP 158 | RUN cd / && \ 159 | git clone --depth 1 --branch ${YAML_CPP_VERSION_MAJOR}.${YAML_CPP_VERSION_MINOR}.${YAML_CPP_VERSION_PATCH} https://github.com/jbeder/yaml-cpp && \ 160 | mkdir -p /yaml-cpp/build && \ 161 | cd /yaml-cpp/build && \ 162 | cmake .. 
\ 163 | -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \ 164 | -DBUILD_TESTING=OFF \ 165 | -DCMAKE_BUILD_TYPE=Release \ 166 | -DINSTALL_GTEST=OFF \ 167 | -DYAML_CPP_BUILD_TESTS=OFF \ 168 | -DYAML_BUILD_SHARED_LIBS=ON && \ 169 | make -j$(nproc) && \ 170 | make install && \ 171 | cd / && \ 172 | rm -rf /yaml-cpp 173 | 174 | # Create workspace directory 175 | WORKDIR /workspace 176 | 177 | COPY src/requirements.txt /workspace/ 178 | # Install Python dependencies 179 | RUN pip3 install --no-cache-dir -r /workspace/requirements.txt 180 | 181 | # Install additional Python dependencies 182 | RUN pip3 install -U pip && pip3 install --no-cache-dir \ 183 | scikit-learn scikit-image --force-reinstall \ 184 | && pip3 install --no-cache-dir --ignore-installed open3d \ 185 | && pip3 install --no-cache-dir kaolin==0.17.0 -f https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.5.1_cu124.html \ 186 | && pip3 install --no-cache-dir --no-index pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/py310_cu121_pyt241/download.html \ 187 | && pip3 install --no-cache-dir numpy==1.26.4 scipy joblib scikit-learn scikit-image --force-reinstall \ 188 | && pip3 install --no-cache-dir pyrender \ 189 | && pip3 install --no-cache-dir jupyter jupyterlab notebook 190 | 191 | # Set up environment 192 | ENV PATH="/bin/python3:${PATH}" 193 | RUN alias python="/bin/python3" 194 | RUN echo 'alias python="/bin/python3"' >> /etc/bash.bashrc && \ 195 | update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \ 196 | update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \ 197 | echo "export PYTHONPATH=/usr/local/lib/python3.10/dist-packages:\${PYTHONPATH}" >> /etc/bash.bashrc 198 | 199 | RUN cd / && git clone https://github.com/NVlabs/BundleSDF.git 200 | 201 | RUN cp -r /BundleSDF/mycuda /customize_cuda 202 | RUN cd /customize_cuda && \ 203 | TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0" FORCE_CUDA=1 pip install . --no-build-isolation 204 | 205 | # Set up OpenMPI 206 | RUN mkdir -p /opt/hpcx/ompi/lib/x86_64-linux-gnu 207 | RUN ln -s /opt/hpcx/ompi /opt/hpcx/ompi/lib/x86_64-linux-gnu/openmpi 208 | RUN apt remove -y libvtk9-dev || true 209 | 210 | # Reinstall VTK package for OpenCV compatibility 211 | RUN apt-get update && apt-get install -y libvtk9-dev && \ 212 | rm -rf /var/lib/apt/lists/* 213 | 214 | # Build and install BundleTrack (copy only necessary artifacts) 215 | RUN cp -r /BundleSDF/BundleTrack /tmp/BundleTrack 216 | RUN cd /tmp/BundleTrack && \ 217 | mkdir -p build && \ 218 | cd build && \ 219 | cmake .. -DCMAKE_BUILD_TYPE=Release && \ 220 | make -j$(nproc) && \ 221 | # Copy only the essential built artifacts to standard locations \ 222 | cp my_cpp*.so /usr/local/lib/python3.10/dist-packages/ && \ 223 | cp libBundleTrack.so /usr/local/lib/ && \ 224 | cp libMY_CUDA_LIB.so /usr/local/lib/ && \ 225 | # Update library cache \ 226 | ldconfig && \ 227 | # Remove all source code and build artifacts \ 228 | cd / && \ 229 | rm -rf /tmp/BundleTrack 230 | 231 | RUN cd / && \ 232 | git clone https://github.com/NVlabs/FoundationStereo.git 233 | 234 | ENV PYTHONPATH=/FoundationStereo/core:$PYTHONPATH 235 | 236 | # Install sam2 and roma libraries 237 | RUN cd / && git clone https://github.com/facebookresearch/sam2.git &&\ 238 | cd sam2 &&\ 239 | SAM2_BUILD_CUDA=0 pip install -e ".[notebooks]" && \ 240 | python3 setup.py build_ext --inplace 241 | 242 | RUN cd / && git clone https://github.com/Parskatt/RoMa.git &&\ 243 | cd RoMa &&\ 244 | pip3 install . 
&&\ 245 | cd / && rm -rf /RoMa 246 | 247 | # Final cleanup and ldconfig 248 | RUN ldconfig && \ 249 | apt-get autoremove -y && \ 250 | apt-get clean && \ 251 | rm -rf /var/lib/apt/lists/* && \ 252 | rm -rf /root/.cache/pip && \ 253 | # Remove any remaining temporary files \ 254 | rm -rf /tmp/* 255 | 256 | WORKDIR /workspace 257 | 258 | # Copy the entire package structure for proper installation 259 | COPY src /workspace/3d-object-reconstruction/src 260 | COPY README.md /workspace/3d-object-reconstruction/ 261 | COPY notebooks /workspace/3d-object-reconstruction/notebooks 262 | COPY data /workspace/3d-object-reconstruction/data 263 | 264 | # Install the package in editable mode (package files are now under src/) 265 | WORKDIR /workspace/3d-object-reconstruction 266 | RUN pip3 install -e src/ 267 | 268 | # Create Jupyter configuration 269 | RUN jupyter notebook --generate-config && \ 270 | echo "c.NotebookApp.ip = '0.0.0.0'" >> ~/.jupyter/jupyter_notebook_config.py && \ 271 | echo "c.NotebookApp.port = 8888" >> ~/.jupyter/jupyter_notebook_config.py && \ 272 | echo "c.NotebookApp.open_browser = False" >> ~/.jupyter/jupyter_notebook_config.py && \ 273 | echo "c.NotebookApp.allow_root = True" >> ~/.jupyter/jupyter_notebook_config.py && \ 274 | echo "c.NotebookApp.token = ''" >> ~/.jupyter/jupyter_notebook_config.py && \ 275 | echo "c.NotebookApp.password = ''" >> ~/.jupyter/jupyter_notebook_config.py 276 | 277 | # Expose Jupyter port 278 | EXPOSE 8888 279 | 280 | # Set the default command to start Jupyter notebook 281 | CMD ["jupyter", "notebook", "--notebook-dir=/workspace/3d-object-reconstruction", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root", "--NotebookApp.token=''", "--NotebookApp.password=''"] 282 | 283 | -------------------------------------------------------------------------------- /src/nvidia/objectreconstruction/configs/schema.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import List, Optional, Tuple 3 | from omegaconf import MISSING 4 | 5 | """BundleTrack and NVBundleSDF config schemas""" 6 | 7 | @dataclass 8 | class RoMaConfig: 9 | """RoMa configuration.""" 10 | coarse_res: int = 560 11 | upsample_res: Tuple[int, int] = (864, 864) 12 | device: str = "cuda" 13 | weights: str = "/workspace/3d-object-reconstruction/data/weights/roma/roma_outdoor.pth" 14 | dinov2_weights: str = "/workspace/3d-object-reconstruction/data/weights/roma/dinov2_vitl14_pretrain.pth" 15 | 16 | 17 | @dataclass 18 | class CameraConfig: 19 | """Camera configuration.""" 20 | step: int = field(default=1, metadata={"validate": lambda x: x > 0}) 21 | intrinsic: List[float] = field(default_factory=lambda: [3.0796e+03, 0, 2.0000e+03, 0, 3.0751e+03, 1.50001e+03, 0, 0, 1]) 22 | 23 | @dataclass 24 | class FoundationStereoConfig: 25 | """Foundation Stereo configuration.""" 26 | # inference_uri: str = MISSING 27 | pth_path: str = '/workspace/3d-object-reconstruction/data/weights/foundationstereo/model_best_bp2.pth' 28 | cfg_path: str = '/workspace/3d-object-reconstruction/data/weights/foundationstereo/cfg.yaml' 29 | vit_size: str = 'vitl' 30 | scale: float = 0.3 31 | hiera: int = 0 32 | z_far: float = 10 33 | remove_invisible: bool = True 34 | intrinsic: List[float] = field(default_factory=lambda: [3.0796e+03, 0, 2.0000e+03, 0, 3.0751e+03, 1.50001e+03, 0, 0, 1]) 35 | baseline: float = 0.0657696127 36 | 37 | @dataclass 38 | class SAM2Config: 39 | """SAM2 configuration.""" 40 | checkpoint_path: str = 
"/workspace/3d-object-reconstruction/data/weights/sam2/sam2.1_hiera_large.pt" 41 | model_config: str = "/workspace/3d-object-reconstruction/data/weights/sam2/sam2.1_hiera_l.yaml" 42 | bbox: List[int] = field(default_factory=lambda: [1144, 627, 2227, 2232]) 43 | device: str = "cuda" 44 | 45 | @dataclass 46 | class TextureBakeConfig: 47 | """Texture baking configuration.""" 48 | downscale: float = 1.0 49 | texture_res: int = 2048 50 | 51 | @dataclass 52 | class SegmentationConfig: 53 | """Segmentation configuration.""" 54 | ob_scales: List[float] = field(default_factory=lambda: [0.3, 0.3, 0.3]) 55 | tolerance: float = 0.03 56 | 57 | @dataclass 58 | class DepthProcessingConfig: 59 | """Depth processing configuration.""" 60 | zfar: float = 1.0 61 | 62 | @dataclass 63 | class ErodeConfig: 64 | radius: int = 1 65 | diff: float = 0.001 66 | ratio: float = 0.8 # If ratio larger than this, depth set to 0 67 | 68 | erode: ErodeConfig = field(default_factory=ErodeConfig) 69 | 70 | @dataclass 71 | class BilateralFilterConfig: 72 | radius: int = 2 73 | sigma_D: int = 2 74 | sigma_R: int = 100000 75 | 76 | bilateral_filter: BilateralFilterConfig = field(default_factory=BilateralFilterConfig) 77 | 78 | @dataclass 79 | class OutlierRemovalConfig: 80 | num: int = 30 81 | std_mul: int = 3 82 | 83 | outlier_removal: OutlierRemovalConfig = field(default_factory=OutlierRemovalConfig) 84 | 85 | edge_normal_thres: int = 10 # Deg between normal and ray 86 | denoise_cloud: bool = False 87 | percentile: int = 95 88 | 89 | @dataclass 90 | class BundleConfig: 91 | num_iter_outter: int = 7 92 | num_iter_inner: int = 5 93 | window_size: int = 5 # Exclude keyframes, include new frame 94 | max_BA_frames: int = 10 95 | subset_selection_method: str = "normal_orientation_nearest" 96 | depth_association_radius: int = 5 # Used for depth point association 97 | non_neighbor_max_rot: int = 90 98 | non_neighbor_min_visible: float = 0.1 # Ratio of pixel visible 99 | icp_pose_rot_thres: int = 60 # Rotation larger than XX deg is ignored for icp 100 | w_rpi: int = 0 101 | w_p2p: int = 1 # Used in loss.cpp 102 | w_fm: int = 1 103 | w_sdf: int = 0 104 | w_pm: int = 0 105 | robust_delta: float = 0.005 106 | min_fm_edges_newframe: int = 15 107 | image_downscale: List[int] = field(default_factory=lambda: [4]) 108 | feature_edge_dist_thres: float = 0.01 109 | feature_edge_normal_thres: int = 30 # Normal angle should be within this range 110 | max_optimized_feature_loss: float = 0.03 111 | 112 | @dataclass 113 | class KeyframeConfig: 114 | min_interval: int = 1 115 | min_feat_num: int = 0 116 | min_trans: int = 0 117 | min_rot: int = 5 118 | min_visible: int = 1 119 | 120 | @dataclass 121 | class SiftConfig: 122 | scales: List[int] = field(default_factory=lambda: [2, 4, 8]) 123 | max_match_per_query: int = 5 124 | nOctaveLayers: int = 3 125 | contrastThreshold: float = 0.01 126 | edgeThreshold: int = 50 127 | sigma: float = 1.6 128 | 129 | @dataclass 130 | class FeatureCorresConfig: 131 | mutual: bool = True 132 | map_points: bool = True 133 | max_dist_no_neighbor: float = 0.01 134 | max_normal_no_neighbor: int = 20 135 | max_dist_neighbor: float = 0.02 136 | max_normal_neighbor: int = 30 137 | suppression_patch_size: int = 5 138 | max_view_normal_angle: int = 180 139 | min_match_with_ref: int = 5 140 | resize: int = 800 141 | rematch_after_nerf: bool = False 142 | 143 | @dataclass 144 | class RansacConfig: 145 | max_iter: int = 2000 146 | num_sample: int = 3 147 | inlier_dist: float = 0.01 148 | inlier_normal_angle: int = 20 149 | 
desired_succ_rate: float = 0.99 150 | max_trans_neighbor: float = 0.02 # ransac model estimated pose shouldnt be too far 151 | max_rot_deg_neighbor: int = 30 152 | max_trans_no_neighbor: float = 0.01 153 | max_rot_no_neighbor: int = 10 154 | epipolar_thres: int = 1 155 | min_match_after_ransac: int = 5 156 | 157 | @dataclass 158 | class P2PConfig: 159 | projective: bool = False 160 | max_dist: float = 0.02 161 | max_normal_angle: int = 45 162 | 163 | @dataclass 164 | class SDFEdgeConfig: 165 | max_dist: float = 0.02 166 | 167 | @dataclass 168 | class ShapeConfig: 169 | res: float = 0.005 170 | xrange: Tuple[float, float] = (-0.2, 0.2) 171 | yrange: Tuple[float, float] = (-0.2, 0.2) 172 | zrange: Tuple[float, float] = (-0.2, 0.2) 173 | max_weight: int = 100 174 | 175 | @dataclass 176 | class BundleTrackConfig: 177 | debug_dir: str = MISSING 178 | SPDLOG: int = 2 179 | USE_GRAY: bool = False 180 | port: str = "5555" 181 | nerf_port: str = "9999" 182 | downscale: float = 1.0 183 | erode_mask: int = 3 184 | visible_angle: int = 70 # Angle between normal and point to camera origin within XXX is regarded as visible 185 | 186 | segmentation: SegmentationConfig = field(default_factory=SegmentationConfig) 187 | depth_processing: DepthProcessingConfig = field(default_factory=DepthProcessingConfig) 188 | bundle: BundleConfig = field(default_factory=BundleConfig) 189 | keyframe: KeyframeConfig = field(default_factory=KeyframeConfig) 190 | sift: SiftConfig = field(default_factory=SiftConfig) 191 | feature_corres: FeatureCorresConfig = field(default_factory=FeatureCorresConfig) 192 | ransac: RansacConfig = field(default_factory=RansacConfig) 193 | p2p: P2PConfig = field(default_factory=P2PConfig) 194 | sdf_edge: SDFEdgeConfig = field(default_factory=SDFEdgeConfig) 195 | shape: ShapeConfig = field(default_factory=ShapeConfig) 196 | 197 | 198 | @dataclass 199 | class NeRFConfig: 200 | """NeRF configuration.""" 201 | batch_size: int = 32 202 | downscale: float = 0.5 203 | n_step: int = 2000 204 | save_dir: str = MISSING 205 | 206 | # Network architecture 207 | netdepth: int = 8 208 | netwidth: int = 256 209 | netdepth_fine: int = 8 210 | netwidth_fine: int = 256 211 | 212 | # Training parameters 213 | N_rand: int = 2048 214 | lrate: float = 0.01 215 | lrate_pose: float = 0.01 216 | decay_rate: float = 0.1 217 | chunk: int = 99999999999 218 | netchunk: int = 6553600 219 | no_batching: int = 0 220 | amp: bool = True 221 | 222 | # Sampling parameters 223 | N_samples: int = 64 224 | N_samples_around_depth: int = 256 225 | N_importance: int = 0 226 | perturb: int = 1 227 | use_viewdirs: int = 1 228 | 229 | # Embedding parameters 230 | i_embed: int = 1 231 | i_embed_views: int = 2 232 | multires: int = 8 233 | multires_views: int = 3 234 | feature_grid_dim: int = 2 235 | raw_noise_std: int = 0 236 | 237 | # Logging options 238 | i_img: int = 99999 239 | i_weights: int = 999999 240 | i_mesh: int = 999999 241 | i_pose: int = 999999 242 | i_print: int = 999999 243 | 244 | # Hash encoding parameters 245 | finest_res: int = 256 246 | base_res: int = 16 247 | num_levels: int = 16 248 | log2_hashmap_size: int = 22 249 | 250 | # Octree parameters 251 | use_octree: int = 1 252 | first_frame_weight: int = 1 253 | denoise_depth_use_octree_cloud: bool = True 254 | octree_embed_base_voxel_size: float = 0.02 255 | octree_smallest_voxel_size: float = 0.02 256 | octree_raytracing_voxel_size: float = 0.02 257 | octree_dilate_size: float = 0.02 258 | down_scale_ratio: int = 1 259 | 260 | # Scene parameters 261 | bounding_box: 
List[List[float]] = field(default_factory=lambda: [[-1, -1, -1], [1, 1, 1]]) 262 | use_mask: int = 1 263 | dilate_mask_size: int = 0 264 | rays_valid_depth_only: bool = True 265 | near: float = 0.1 266 | far: float = 1.0 267 | 268 | # Loss weights 269 | rgb_weight: int = 10 270 | depth_weight: int = 0 271 | sdf_lambda: int = 5 272 | trunc: float = 0.002 273 | trunc_start: float = 0.002 274 | neg_trunc_ratio: int = 1 275 | trunc_decay_type: str = "" 276 | fs_weight: int = 100 277 | empty_weight: int = 2 278 | fs_rgb_weight: int = 0 279 | fs_sdf: float = 0.1 280 | trunc_weight: int = 6000 281 | tv_loss_weight: int = 0 282 | frame_features: int = 2 283 | optimize_poses: int = 0 284 | pose_reg_weight: int = 0 285 | feature_reg_weight: float = 0.1 286 | share_coarse_fine: int = 1 287 | eikonal_weight: int = 0 288 | 289 | # Rendering mode and mesh extraction 290 | mode: str = "sdf" 291 | mesh_resolution: float = 0.002 292 | max_trans: float = 0.02 293 | max_rot: int = 20 294 | 295 | 296 | @dataclass 297 | class MeshSmoothingConfig: 298 | enabled: bool = True 299 | iterations: int = 2 300 | lambda_: float = 0.5 301 | use_taubin: bool = True 302 | 303 | mesh_smoothing: MeshSmoothingConfig = field(default_factory=MeshSmoothingConfig) 304 | save_octree_clouds: bool = True 305 | 306 | @dataclass 307 | class BasePathConfig: 308 | """Base path configuration.""" 309 | base_folder: str = MISSING 310 | image_folder: str = MISSING 311 | save_dir: str = MISSING 312 | 313 | @dataclass 314 | class NVBundleSDFConfig: 315 | """NVBundleSDF configuration.""" 316 | data_path: str = MISSING 317 | workdir: str = MISSING 318 | downscale: float = 1.0 319 | camera_config: CameraConfig = field(default_factory=CameraConfig) 320 | bundletrack: BundleTrackConfig = field(default_factory=BundleTrackConfig) 321 | foundation_stereo: FoundationStereoConfig = field(default_factory=FoundationStereoConfig) 322 | sam2: SAM2Config = field(default_factory=SAM2Config) 323 | nerf: NeRFConfig = field(default_factory=NeRFConfig) 324 | texture_bake: TextureBakeConfig = field(default_factory=TextureBakeConfig) 325 | roma: RoMaConfig = field(default_factory=RoMaConfig) 326 | base_path: BasePathConfig = field(default_factory=BasePathConfig) 327 | -------------------------------------------------------------------------------- /src/nvidia/objectreconstruction/cli/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Command-line interface for NVIDIA 3D Object Reconstruction. 3 | 4 | This module provides the main entry point for the CLI tool. 
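In order, it runs SAM2 mask extraction, FoundationStereo depth estimation, NVBundleSDF tracking and SDF training, and texture baking, writing the results and timing information to the output directory.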
5 | """ 6 | 7 | import argparse 8 | import logging 9 | import sys 10 | import os 11 | import shutil 12 | from pathlib import Path 13 | import uuid 14 | import yaml 15 | import time 16 | import torch 17 | 18 | from nvidia.objectreconstruction.networks import NVBundleSDF 19 | from nvidia.objectreconstruction.dataloader import ReconstructionDataLoader 20 | from nvidia.objectreconstruction.utils.structures import dataclass_to_dict 21 | from nvidia.objectreconstruction.networks.foundationstereo import run_depth_estimation 22 | from nvidia.objectreconstruction.networks.sam2infer import run_mask_extraction 23 | from nvidia.objectreconstruction.utils.preprocessing import setup_experiment_directory 24 | 25 | 26 | def setup_logging(verbose: bool = False): 27 | """Setup logging configuration.""" 28 | level = logging.DEBUG if verbose else logging.INFO 29 | logging.basicConfig( 30 | level=level, 31 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 32 | ) 33 | 34 | def validate_config_file(config_path: str) -> dict: 35 | """ 36 | Load and validate configuration file. 37 | 38 | Args: 39 | config_path: Path to configuration file 40 | 41 | Returns: 42 | dict: Loaded configuration 43 | 44 | Raises: 45 | FileNotFoundError: If config file doesn't exist 46 | yaml.YAMLError: If config file is invalid 47 | ValueError: If config is missing required sections 48 | """ 49 | if not os.path.exists(config_path): 50 | raise FileNotFoundError(f"Configuration file not found: {config_path}") 51 | 52 | try: 53 | with open(config_path, 'r') as f: 54 | config = yaml.safe_load(f) 55 | except yaml.YAMLError as e: 56 | raise yaml.YAMLError(f"Invalid YAML in config file {config_path}: {e}") 57 | 58 | if not isinstance(config, dict): 59 | raise ValueError(f"Configuration file must contain a dictionary, got {type(config)}") 60 | 61 | # Validate required sections 62 | required_sections = ['bundletrack', 'nerf', 'roma', 'sam2', 'foundation_stereo', 'texture_bake'] 63 | missing_sections = [section for section in required_sections if section not in config] 64 | if missing_sections: 65 | raise ValueError(f"Configuration missing required sections: {missing_sections}") 66 | 67 | return config 68 | 69 | def validate_data_path(data_path: str) -> Path: 70 | """ 71 | Validate and return data path. 
72 | 73 | Args: 74 | data_path: Path to data directory 75 | 76 | Returns: 77 | Path: Validated path object 78 | 79 | Raises: 80 | FileNotFoundError: If data path doesn't exist 81 | NotADirectoryError: If data path is not a directory 82 | """ 83 | path = Path(data_path) 84 | 85 | if not path.exists(): 86 | raise FileNotFoundError(f"Data path does not exist: {data_path}") 87 | 88 | if not path.is_dir(): 89 | raise NotADirectoryError(f"Data path is not a directory: {data_path}") 90 | 91 | return path 92 | 93 | def main(): 94 | """Main CLI entry point with comprehensive error handling.""" 95 | parser = argparse.ArgumentParser( 96 | description="NVIDIA 3D Object Reconstruction Framework", 97 | formatter_class=argparse.RawDescriptionHelpFormatter, 98 | epilog=""" 99 | Examples: 100 | nvidia-3d-reconstruct --help 101 | nvidia-3d-reconstruct --config config.yaml --data-path /path/to/data 102 | """ 103 | ) 104 | 105 | parser.add_argument( 106 | "--config", 107 | type=str, 108 | default="/workspace/3d-object-reconstruction/data/configs/base.yaml", 109 | help="Path to configuration file" 110 | ) 111 | parser.add_argument( 112 | "--data-path", 113 | type=str, 114 | default="/workspace/3d-object-reconstruction/data/samples/retail_item/", 115 | help="Path to input data directory" 116 | ) 117 | parser.add_argument( 118 | "--output-path", 119 | type=str, 120 | default=f"/workspace/3d-object-reconstruction/data/output/{uuid.uuid4()}", 121 | help="Path to output directory for reconstruction results" 122 | ) 123 | parser.add_argument( 124 | "--verbose", "-v", 125 | action="store_true", 126 | help="Enable verbose logging" 127 | ) 128 | parser.add_argument( 129 | "--version", 130 | action="version", 131 | version="nvidia-3d-object-reconstruction 0.1.0" 132 | ) 133 | 134 | # Parse arguments with error handling 135 | try: 136 | args = parser.parse_args() 137 | except SystemExit as e: 138 | # argparse calls sys.exit on error, catch and re-raise 139 | return e.code if e.code is not None else 1 140 | 141 | # Setup logging 142 | setup_logging(args.verbose) 143 | logger = logging.getLogger(__name__) 144 | 145 | try: 146 | start_total = time.time() 147 | logger.info("NVIDIA 3D Object Reconstruction CLI") 148 | 149 | # Validate inputs 150 | logger.info("Validating configuration and inputs...") 151 | config = validate_config_file(args.config) 152 | exp_path = validate_data_path(args.data_path) 153 | 154 | # Create output directory 155 | output_path = Path(args.output_path) 156 | os.makedirs(output_path, exist_ok=True) 157 | logger.info(f"Output directory: {output_path}") 158 | 159 | # Setup configuration paths 160 | config['workdir'] = output_path 161 | config['bundletrack']['debug_dir'] = output_path / "bundletrack" 162 | config['nerf']['save_dir'] = output_path 163 | 164 | # Extract configuration sections 165 | bundletrack_config = config['bundletrack'] 166 | nerf_config = config['nerf'] 167 | roma_config = config['roma'] 168 | sam2_config = config['sam2'] 169 | foundation_stereo_config = config['foundation_stereo'] 170 | texture_config = config['texture_bake'] 171 | 172 | logger.info(f"Starting reconstruction pipeline for: {exp_path}") 173 | 174 | # Copy contents of input data path to output folder 175 | logger.info("Copying input data to output folder...") 176 | for item in exp_path.iterdir(): 177 | if item.is_dir(): 178 | shutil.copytree(item, output_path / item.name, dirs_exist_ok=True) 179 | else: 180 | shutil.copy2(item, output_path) 181 | logger.info("Input data copied successfully") 182 | 183 | # Step 1: Mask 
extraction 184 | logger.info("Step 1/4: Running mask extraction...") 185 | try: 186 | start_mask = time.time() 187 | run_mask_extraction(sam2_config, output_path, output_path / 'left', mask_path=output_path / 'masks') 188 | logger.info("Mask extraction completed successfully") 189 | time_mask = time.time() - start_mask 190 | except Exception as e: 191 | logger.error(f"Mask extraction failed: {e}") 192 | raise RuntimeError(f"Mask extraction step failed: {e}") 193 | 194 | 195 | # Step 2: Depth estimation 196 | logger.info("Step 2/4: Running depth estimation...") 197 | try: 198 | start_depth = time.time() 199 | response = run_depth_estimation(foundation_stereo_config, output_path, output_path / 'left', depth_path=output_path / 'depth') 200 | if not response: 201 | raise RuntimeError("Depth estimation failed") 202 | logger.info("Depth estimation completed successfully") 203 | time_depth = time.time() - start_depth 204 | except Exception as e: 205 | logger.error(f"Depth estimation failed: {e}") 206 | raise RuntimeError(f"Depth estimation step failed: {e}") 207 | 208 | # Step 3: Initialize tracker and datasets 209 | logger.info("Step 3/4: Initializing reconstruction components...") 210 | try: 211 | start_pipeline = time.time() 212 | tracker = NVBundleSDF(nerf_config, bundletrack_config, roma_config, texture_config, logger=logger) 213 | 214 | track_dataset = ReconstructionDataLoader( 215 | str(output_path), 216 | config, 217 | downscale=bundletrack_config['downscale'], 218 | min_resolution=bundletrack_config['min_resolution'] 219 | ) 220 | nerf_dataset = ReconstructionDataLoader( 221 | str(output_path), 222 | config, 223 | downscale=nerf_config['downscale'], 224 | min_resolution=nerf_config['min_resolution'] 225 | ) 226 | texture_dataset = ReconstructionDataLoader( 227 | str(output_path), 228 | config, 229 | downscale=texture_config['downscale'], 230 | min_resolution=texture_config['min_resolution'] 231 | ) 232 | logger.info("Components initialized successfully") 233 | except Exception as e: 234 | logger.error(f"Component initialization failed: {e}") 235 | raise RuntimeError(f"Failed to initialize reconstruction components: {e}") 236 | 237 | # Step 4: Run reconstruction pipeline 238 | logger.info("Step 4/4: Running reconstruction pipeline...") 239 | 240 | # Object tracking 241 | logger.info(" 4a. Running object tracking...") 242 | try: 243 | start_track = time.time() 244 | tracker.run_track(track_dataset) 245 | logger.info(" Object tracking completed") 246 | time_track = time.time() - start_track 247 | except Exception as e: 248 | logger.error(f" Object tracking failed: {e}") 249 | raise RuntimeError(f"Object tracking failed: {e}") 250 | 251 | # SDF training 252 | logger.info(" 4b. Running SDF training...") 253 | try: 254 | start_sdf = time.time() 255 | tracker.run_global_sdf(nerf_dataset) 256 | logger.info(" SDF training completed") 257 | time_sdf = time.time() - start_sdf 258 | except Exception as e: 259 | logger.error(f" SDF training failed: {e}") 260 | raise RuntimeError(f"SDF training failed: {e}") 261 | 262 | # Texture baking 263 | logger.info(" 4c. 
Running texture baking...") 264 | try: 265 | start_texture = time.time() 266 | tracker.run_texture_bake(texture_dataset) 267 | logger.info(" Texture baking completed") 268 | time_texture = time.time() - start_texture 269 | except Exception as e: 270 | logger.error(f" Texture baking failed: {e}") 271 | raise RuntimeError(f"Texture baking failed: {e}") 272 | 273 | logger.info(f"Reconstruction completed successfully for {output_path}") 274 | time_pipeline = time.time() - start_pipeline 275 | times = { 276 | "total": time.time() - start_total, 277 | "mask": time_mask, 278 | "depth": time_depth, 279 | "pipeline": time_pipeline, 280 | "track": time_track, 281 | "sdf": time_sdf, 282 | "texture": time_texture, 283 | "gpu_name": torch.cuda.get_device_name(0), 284 | } 285 | with open(output_path / "run_time.yaml", "w") as f: 286 | yaml.dump(times, f) 287 | return 0 288 | 289 | except KeyboardInterrupt: 290 | logger.warning("Reconstruction interrupted by user (Ctrl+C)") 291 | return 130 # Standard exit code for SIGINT 292 | 293 | except FileNotFoundError as e: 294 | logger.error(f"File not found: {e}") 295 | return 2 296 | 297 | except NotADirectoryError as e: 298 | logger.error(f"Invalid directory: {e}") 299 | return 2 300 | 301 | except yaml.YAMLError as e: 302 | logger.error(f"Configuration file error: {e}") 303 | return 3 304 | 305 | except ValueError as e: 306 | logger.error(f"Configuration validation error: {e}") 307 | return 3 308 | 309 | except RuntimeError as e: 310 | logger.error(f"Processing error: {e}") 311 | return 4 312 | 313 | except MemoryError: 314 | logger.error("Out of memory - try reducing batch size or image resolution") 315 | return 5 316 | 317 | except Exception as e: 318 | logger.error(f"Unexpected error: {e}") 319 | logger.debug("Full traceback:", exc_info=True) 320 | return 1 321 | 322 | if __name__ == "__main__": 323 | sys.exit(main()) -------------------------------------------------------------------------------- /src/nvidia/objectreconstruction/dataloader/reconstruction_dataloader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import numpy as np 4 | import json 5 | class ReconstructionDataLoader: 6 | """ 7 | Data loader for multi-view object reconstruction using Bundle-SDF approach. 8 | 9 | This class manages loading and preprocessing of data for Bundle-SDF reconstruction, 10 | including RGB images, depth maps, and segmentation masks. 11 | 12 | Args: 13 | image_dir (str): Directory containing the dataset with subdirectories: 14 | - left/: RGB images 15 | - depth/: Corresponding depth maps 16 | - masks/: Segmentation masks 17 | - poses/: Camera poses 18 | config (Dict): Configuration dictionary containing: 19 | - camera_config: Camera parameters dictionary with: 20 | - intrinsic: Camera intrinsic matrix (flattened) 21 | downscale (float, optional): Scale factor to resize inputs (1.0 = original size). 22 | Defaults to 1.0. 23 | version (int, optional): Version of the dataloader implementation: 24 | - 1: Original implementation, returns (color, depth, mask) 25 | - 2: Enhanced implementation, returns (left, right, depth, mask, pose, id_str) 26 | where right and pose are None if not available 27 | Defaults to 1. 
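        min_resolution (int, optional): Lower bound on the shorter image side;
            the effective downscale factor is raised if needed so that
            min(H, W) does not drop below roughly this value, while images
            already smaller than it are kept at full size. Defaults to 300.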
28 | 29 | Raises: 30 | ValueError: If directory structure is invalid or files cannot be found 31 | IOError: If image files cannot be read properly 32 | """ 33 | def __init__(self, image_dir, config, downscale=1, version=1, min_resolution=300): 34 | if not os.path.exists(image_dir): 35 | raise ValueError(f"Image directory not found: {image_dir}") 36 | 37 | self.image_dir = image_dir 38 | self.downscale = downscale 39 | self.version = version 40 | 41 | if self.version not in [1, 2]: 42 | raise ValueError(f"Invalid version {version}. Must be 1 or 2.") 43 | 44 | # Validate and process camera intrinsics 45 | if 'camera_config' not in config or 'intrinsic' not in config['camera_config']: 46 | raise ValueError("Config must contain 'camera_config' with 'intrinsic' parameter") 47 | 48 | self.K = np.array(config['camera_config']['intrinsic']).reshape(3, 3) 49 | self.time_step = config['camera_config']['step'] 50 | 51 | # Find and sort frame files 52 | left_dir = os.path.join(self.image_dir, 'left/') 53 | if not os.path.exists(left_dir): 54 | raise ValueError(f"Left image directory not found: {left_dir}") 55 | 56 | # Check for optional directories 57 | right_dir = os.path.join(self.image_dir, 'right/') 58 | depth_dir = os.path.join(self.image_dir, 'depth/') 59 | mask_dir = os.path.join(self.image_dir, 'masks/') 60 | pose_dir = os.path.join(self.image_dir, 'poses/') 61 | 62 | # Track which features are available 63 | self.has_right_images = os.path.exists(right_dir) 64 | self.has_depth_maps = os.path.exists(depth_dir) 65 | self.has_masks = os.path.exists(mask_dir) 66 | self.has_poses = os.path.exists(pose_dir) 67 | 68 | # Validate required directories based on version 69 | if self.version == 2: 70 | missing_dirs = [] 71 | if not self.has_depth_maps: 72 | missing_dirs.append("depth/") 73 | if not self.has_masks: 74 | missing_dirs.append("masks/") 75 | 76 | if missing_dirs: 77 | print(f"Warning: Required directories missing for version 2: {', '.join(missing_dirs)}") 78 | print("Returning None for missing data fields") 79 | 80 | frame_names = [ 81 | p for p in os.listdir(left_dir) 82 | if os.path.splitext(p)[-1].lower() in [".jpg", ".jpeg", ".png"] 83 | ] 84 | 85 | if not frame_names: 86 | raise ValueError(f"No valid image files found in {left_dir}") 87 | 88 | frame_names.sort(key=lambda p: int(os.path.splitext(p)[0][4:])) 89 | 90 | self.color_files = [os.path.join(left_dir, file_name) for file_name in frame_names] 91 | 92 | # Extract frame IDs 93 | self.id_strs = [] 94 | for color_file in self.color_files: 95 | id_str = os.path.basename(color_file)[:-4] 96 | self.id_strs.append(id_str) 97 | 98 | # Get image dimensions from first frame and apply downscaling 99 | first_img_path = self.color_files[0] 100 | first_img = cv2.imread(first_img_path) 101 | if first_img is None: 102 | raise IOError(f"Could not read image: {first_img_path}") 103 | 104 | self.H, self.W = first_img.shape[:2] 105 | if self.H < min_resolution or self.W < min_resolution: 106 | self.downscale = 1.0 107 | else: 108 | scale = min_resolution / min(self.H, self.W) 109 | self.downscale = max(self.downscale, scale) 110 | self.H = int(self.H * self.downscale) 111 | self.W = int(self.W * self.downscale) 112 | 113 | # Scale intrinsics according to downscale factor 114 | self.K[:2] *= self.downscale 115 | self.far = config['nerf']['far'] 116 | 117 | def __len__(self): 118 | """Return the number of frames in the dataset.""" 119 | return len(self.color_files) 120 | 121 | def get_color(self, idx): 122 | """ 123 | Load and preprocess RGB image for 
the specified index. 124 | 125 | Args: 126 | idx (int): Index of the frame to retrieve 127 | 128 | Returns: 129 | np.ndarray: RGB image as np.uint8 with shape (H, W, 3) 130 | 131 | Raises: 132 | IndexError: If idx is out of range 133 | IOError: If image file cannot be read 134 | """ 135 | if idx < 0 or idx >= len(self): 136 | raise IndexError(f"Index {idx} out of range for dataset of length {len(self)}") 137 | 138 | color_path = self.color_files[idx] 139 | color = cv2.imread(color_path) 140 | 141 | if color is None: 142 | raise IOError(f"Failed to load image: {color_path}") 143 | 144 | color = cv2.resize(color, (self.W, self.H), interpolation=cv2.INTER_LINEAR) 145 | return color 146 | 147 | def get_right(self, idx): 148 | """ 149 | Load and preprocess right RGB image for the specified index (if available). 150 | 151 | Args: 152 | idx (int): Index of the frame to retrieve 153 | 154 | Returns: 155 | np.ndarray or None: Right RGB image as np.uint8 with shape (H, W, 3), 156 | or None if not available 157 | 158 | Raises: 159 | IndexError: If idx is out of range 160 | """ 161 | if not self.has_right_images: 162 | return None 163 | 164 | if idx < 0 or idx >= len(self): 165 | raise IndexError(f"Index {idx} out of range for dataset of length {len(self)}") 166 | 167 | right_path = self.color_files[idx].replace('left/', 'right/') 168 | 169 | if not os.path.exists(right_path): 170 | return None 171 | 172 | right = cv2.imread(right_path) 173 | if right is None: 174 | return None 175 | 176 | right = cv2.resize(right, (self.W, self.H), interpolation=cv2.INTER_LINEAR) 177 | return right 178 | 179 | def get_depth(self, idx): 180 | """ 181 | Load and preprocess depth map for the specified index. 182 | 183 | Args: 184 | idx (int): Index of the frame to retrieve 185 | 186 | Returns: 187 | np.ndarray or None: Depth map as np.float32 with shape (H, W), 188 | or None if not available 189 | 190 | Raises: 191 | IndexError: If idx is out of range 192 | """ 193 | if not self.has_depth_maps: 194 | return None 195 | 196 | if idx < 0 or idx >= len(self): 197 | raise IndexError(f"Index {idx} out of range for dataset of length {len(self)}") 198 | 199 | depth_file = self.color_files[idx].replace('left/', 'depth/') 200 | 201 | # if not os.path.exists(depth_file): 202 | # # check if npy file exists 203 | # if os.path.exists(depth_file.replace('.png', '.npy')): 204 | # depth_file = depth_file.replace('.png', '.npy') 205 | # else: 206 | # return None 207 | 208 | if os.path.exists(depth_file.replace('.png', '.npy')): 209 | depth_file = depth_file.replace('.png', '.npy') 210 | 211 | # Support multiple depth formats 212 | try: 213 | if os.path.splitext(depth_file)[1].lower() == '.npy': 214 | depth = np.load(depth_file) 215 | else: 216 | depth = cv2.imread(depth_file, cv2.IMREAD_UNCHANGED) 217 | if depth is None: 218 | return None 219 | depth = depth.astype(np.float32) / 1000.0 220 | 221 | depth = cv2.resize(depth, (self.W, self.H), interpolation=cv2.INTER_NEAREST) 222 | return depth 223 | except Exception as e: 224 | print(f"Warning: Failed to load depth map {depth_file}: {e}") 225 | return None 226 | 227 | def get_mask(self, idx): 228 | """ 229 | Load and preprocess segmentation mask for the specified index. 
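        The raw mask is binarized when multi-channel, resized with
        nearest-neighbor interpolation, then cleaned with a 5x5 erosion
        followed by dilation (a morphological opening) to suppress boundary
        speckle.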
230 | 231 | Args: 232 | idx (int): Index of the frame to retrieve 233 | 234 | Returns: 235 | np.ndarray or None: Binary mask as np.uint8 with shape (H, W), 236 | or None if not available 237 | 238 | Raises: 239 | IndexError: If idx is out of range 240 | """ 241 | if not self.has_masks: 242 | return None 243 | 244 | if idx < 0 or idx >= len(self): 245 | raise IndexError(f"Index {idx} out of range for dataset of length {len(self)}") 246 | 247 | mask_file = self.color_files[idx].replace('left/', 'masks/') 248 | 249 | if not os.path.exists(mask_file): 250 | return None 251 | 252 | try: 253 | mask = cv2.imread(mask_file, -1) 254 | 255 | if mask is None: 256 | return None 257 | 258 | # Ensure binary format 259 | if len(mask.shape) == 3: 260 | mask = (mask.sum(axis=-1) > 0).astype(np.uint8) 261 | 262 | mask = cv2.resize(mask, (self.W, self.H), interpolation=cv2.INTER_NEAREST) 263 | mask = cv2.erode(mask, np.ones((5, 5), np.uint8)) 264 | mask = cv2.dilate(mask, np.ones((5, 5), np.uint8)) #add denoising and smoothing to masks 265 | return mask 266 | except Exception as e: 267 | print(f"Warning: Failed to load mask {mask_file}: {e}") 268 | return None 269 | 270 | def get_pose(self, idx): 271 | """ 272 | Load camera pose for the specified index (if available). 273 | 274 | Args: 275 | idx (int): Index of the frame to retrieve 276 | 277 | Returns: 278 | np.ndarray or None: Camera pose matrix, or None if not available 279 | 280 | Raises: 281 | IndexError: If idx is out of range 282 | """ 283 | if not self.has_poses: 284 | return None 285 | 286 | if idx < 0 or idx >= len(self): 287 | raise IndexError(f"Index {idx} out of range for dataset of length {len(self)}") 288 | 289 | pose_file = self.color_files[idx].replace('left/', 'poses/').replace( 290 | os.path.splitext(self.color_files[idx])[1], '.json') 291 | 292 | if not os.path.exists(pose_file): 293 | return None 294 | 295 | try: 296 | with open(pose_file, 'r') as f: 297 | pose_data = json.load(f) 298 | 299 | # Assuming pose data is a flat list or nested list that can be converted to a matrix 300 | pose = np.array(pose_data) 301 | return pose 302 | except Exception as e: 303 | print(f"Warning: Failed to load pose {pose_file}: {e}") 304 | return None 305 | 306 | def __getitem__(self, idx): 307 | """ 308 | Get preprocessed data for the specified index. 
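        Illustrative version-1 usage: ``color, depth, mask = loader[0]``.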
309 | 310 | Args: 311 | idx (int): Index of the frame to retrieve 312 | 313 | Returns: 314 | If version=1: 315 | Tuple containing: 316 | - RGB image (H, W, 3) as np.uint8 317 | - Depth map (H, W) as np.float32, or None if not available 318 | - Binary mask (H, W) as np.uint8, or None if not available 319 | 320 | If version=2: 321 | Tuple containing: 322 | - left: RGB image (H, W, 3) as np.uint8 323 | - right: RGB image (H, W, 3) as np.uint8, or None if not available 324 | - depth: Depth map (H, W) as np.float32, or None if not available 325 | - mask: Binary mask (H, W) as np.uint8, or None if not available 326 | - pose: Camera pose matrix as np.ndarray, or None if not available 327 | - id_str: ID string of the frame 328 | 329 | Raises: 330 | IndexError: If idx is out of range 331 | """ 332 | if idx < 0 or idx >= len(self): 333 | raise IndexError(f"Index {idx} out of range for dataset of length {len(self)}") 334 | 335 | color = self.get_color(idx) 336 | depth = self.get_depth(idx) 337 | mask = self.get_mask(idx) 338 | id_str = self.id_strs[idx] 339 | 340 | if self.version == 1: 341 | return color, depth, mask 342 | elif self.version == 2: 343 | # Version 2 returns data in a format compatible with ReconstructionDataloader 344 | right = self.get_right(idx) 345 | pose = self.get_pose(idx) 346 | return color, right, depth, mask, pose, id_str 347 | 348 | def get_camera_intrinsics(self): 349 | """ 350 | Get the camera intrinsics matrix adjusted for current downscale factor. 351 | 352 | Returns: 353 | np.ndarray: 3x3 camera intrinsics matrix 354 | """ 355 | return self.K.copy() 356 | 357 | def get_image_dimensions(self): 358 | """ 359 | Get the current image dimensions after downscaling. 360 | 361 | Returns: 362 | Tuple[int, int]: Height and width of the images 363 | """ 364 | return self.H, self.W 365 | 366 | def get_frame_id(self, idx): 367 | """ 368 | Get the ID string for the frame at the specified index. 369 | 370 | Args: 371 | idx (int): Index of the frame 372 | 373 | Returns: 374 | str: ID string of the frame 375 | 376 | Raises: 377 | IndexError: If idx is out of range 378 | """ 379 | if idx < 0 or idx >= len(self): 380 | raise IndexError(f"Index {idx} out of range for dataset of length {len(self)}") 381 | 382 | return self.id_strs[idx] -------------------------------------------------------------------------------- /src/nvidia/objectreconstruction/networks/foundationstereo.py: -------------------------------------------------------------------------------- 1 | """ 2 | FoundationStereo Network Implementation for 3D Object Reconstruction. 3 | 4 | This module provides a wrapper around the FoundationStereo model for stereo 5 | depth estimation. It includes preprocessing utilities, model initialization, 6 | and a high-level processor for batch depth map generation. 
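Metric depth is recovered from the predicted disparity as
depth = fx * baseline / disparity, using the intrinsics and baseline
supplied in the configuration.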
7 | 8 | Classes: 9 | InputPadder: Utility class for padding images to required dimensions 10 | FoundationStereoNet: Wrapper for the FoundationStereo model 11 | FoundationStereoProcessor: High-level processor for stereo depth estimation 12 | 13 | Functions: 14 | run_depth_estimation: Main entry point for depth estimation pipeline 15 | """ 16 | 17 | import cv2 18 | import torch 19 | import imageio 20 | import numpy as np 21 | import torch.nn.functional as F 22 | import sys 23 | sys.path.append('/FoundationStereo/core') 24 | from foundation_stereo import FoundationStereo 25 | from tqdm import tqdm 26 | from pathlib import Path 27 | from typing import Dict, Any, List, Tuple, Union, Optional 28 | from loguru import logger 29 | from omegaconf import OmegaConf 30 | 31 | 32 | class InputPadder: 33 | """ 34 | Utility class for padding images to dimensions divisible by a given factor. 35 | 36 | This class ensures that input images have dimensions that are compatible 37 | with neural network architectures that require specific divisibility 38 | constraints (e.g., divisible by 8 or 32). 39 | 40 | Attributes: 41 | ht (int): Original image height 42 | wd (int): Original image width 43 | _pad (List[int]): Padding values [left, right, top, bottom] 44 | """ 45 | 46 | def __init__( 47 | self, 48 | dims: Tuple[int, ...], 49 | mode: str = 'sintel', 50 | divis_by: int = 8, 51 | force_square: bool = False 52 | ) -> None: 53 | """ 54 | Initialize the InputPadder. 55 | 56 | Args: 57 | dims: Image dimensions tuple (..., H, W) 58 | mode: Padding mode, either 'sintel' or other 59 | divis_by: Factor by which dimensions should be divisible 60 | force_square: If True, pad to make image square 61 | 62 | Example: 63 | >>> padder = InputPadder((1, 3, 480, 640), divis_by=32) 64 | >>> padded_imgs = padder.pad(img1, img2) 65 | """ 66 | self.ht, self.wd = dims[-2:] 67 | 68 | if force_square: 69 | max_side = max(self.ht, self.wd) 70 | pad_ht = ((max_side // divis_by) + 1) * divis_by - self.ht 71 | pad_wd = ((max_side // divis_by) + 1) * divis_by - self.wd 72 | else: 73 | pad_ht = (((self.ht // divis_by) + 1) * divis_by - self.ht) % divis_by 74 | pad_wd = (((self.wd // divis_by) + 1) * divis_by - self.wd) % divis_by 75 | 76 | if mode == 'sintel': 77 | self._pad = [ 78 | pad_wd // 2, pad_wd - pad_wd // 2, 79 | pad_ht // 2, pad_ht - pad_ht // 2 80 | ] 81 | else: 82 | self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, 0, pad_ht] 83 | 84 | def pad(self, *inputs: torch.Tensor) -> List[torch.Tensor]: 85 | """ 86 | Apply padding to input tensors. 87 | 88 | Args: 89 | *inputs: Variable number of 4D tensors to pad 90 | 91 | Returns: 92 | List of padded tensors with same order as inputs 93 | 94 | Raises: 95 | AssertionError: If any input tensor is not 4-dimensional 96 | """ 97 | assert all((x.ndim == 4) for x in inputs), \ 98 | "All inputs must be 4-dimensional tensors" 99 | return [F.pad(x, self._pad, mode='replicate') for x in inputs] 100 | 101 | def unpad(self, x: torch.Tensor) -> torch.Tensor: 102 | """ 103 | Remove padding from a tensor. 
104 | 105 | Args: 106 | x: 4D tensor to unpad 107 | 108 | Returns: 109 | Tensor with padding removed 110 | 111 | Raises: 112 | AssertionError: If input tensor is not 4-dimensional 113 | """ 114 | assert x.ndim == 4, "Input must be a 4-dimensional tensor" 115 | ht, wd = x.shape[-2:] 116 | c = [ 117 | self._pad[2], ht - self._pad[3], 118 | self._pad[0], wd - self._pad[1] 119 | ] 120 | return x[..., c[0]:c[1], c[2]:c[3]] 121 | 122 | 123 | class FoundationStereoNet(FoundationStereo): 124 | """ 125 | Wrapper class for FoundationStereo network. 126 | 127 | This class extends the base FoundationStereo class with additional 128 | functionality for configuration management, weight loading, and 129 | simplified inference interface for stereo depth estimation. 130 | 131 | Attributes: 132 | config (Dict[str, Any]): Model configuration parameters 133 | """ 134 | 135 | def __init__(self, config: Dict[str, Any]) -> None: 136 | """ 137 | Initialize the FoundationStereo network. 138 | 139 | Args: 140 | config: Configuration dictionary containing model parameters 141 | including architecture settings and hyperparameters 142 | 143 | Example: 144 | >>> config = {'hidden_dims': [128, 128], 'corr_levels': 4} 145 | >>> model = FoundationStereoNet(config) 146 | """ 147 | super().__init__(config) 148 | self.config = config 149 | 150 | def load_weights(self) -> None: 151 | """ 152 | Load pre-trained weights from checkpoint file. 153 | 154 | The checkpoint file path should be specified in config['pth_path']. 155 | The checkpoint is expected to contain a 'model' key with the 156 | state dictionary. 157 | 158 | Raises: 159 | FileNotFoundError: If checkpoint file doesn't exist 160 | KeyError: If checkpoint doesn't contain 'model' key 161 | RuntimeError: If state dict loading fails 162 | """ 163 | try: 164 | ckpt = torch.load(self.config['pth_path'], weights_only=False) 165 | self.load_state_dict(ckpt['model']) 166 | logger.info(f"Loaded weights from {self.config['pth_path']}") 167 | except FileNotFoundError as e: 168 | logger.error(f"Checkpoint file not found: {self.config['pth_path']}") 169 | raise e 170 | except KeyError as e: 171 | logger.error(f"Checkpoint missing 'model' key: {e}") 172 | raise e 173 | 174 | def forward( 175 | self, 176 | left: torch.Tensor, 177 | right: torch.Tensor 178 | ) -> torch.Tensor: 179 | """ 180 | Perform forward pass for stereo depth estimation. 181 | 182 | Args: 183 | left: Left stereo image tensor of shape [B, C, H, W] 184 | right: Right stereo image tensor of shape [B, C, H, W] 185 | 186 | Returns: 187 | Disparity map tensor of shape [B, 1, H, W] representing 188 | pixel disparities between left and right images 189 | """ 190 | return super().forward(left, right, iters=32, test_mode=True) 191 | 192 | 193 | class FoundationStereoProcessor: 194 | """ 195 | High-level processor for stereo depth estimation. 196 | 197 | This class manages the complete pipeline from loading stereo image pairs 198 | to generating depth maps. It handles image preprocessing, network inference, 199 | and depth conversion with configurable camera parameters. 
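    The processor assumes the stereo layout used elsewhere in this package:
    left images in a left/ directory and right images in a sibling right/
    directory whose filenames substitute 'right' for 'left'.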
200 | 201 | Attributes: 202 | config (Dict[str, Any]): Configuration parameters 203 | net (FoundationStereoNet): The stereo network model 204 | rgb_path (Path): Path to input RGB images 205 | output_path (Path): Path for output depth maps 206 | left_images (List[Path]): List of left stereo image paths 207 | intrinsic (np.ndarray): Camera intrinsic matrix (3x3) 208 | baseline (float): Baseline distance between cameras 209 | """ 210 | 211 | def __init__( 212 | self, 213 | config: Dict[str, Any], 214 | rgb_path: Path, 215 | output_path: Path 216 | ) -> None: 217 | """ 218 | Initialize the stereo depth estimation processor. 219 | 220 | Args: 221 | config: Configuration dictionary containing: 222 | - pth_path: Path to model weights 223 | - intrinsic: Camera intrinsics matrix (3x3) 224 | - baseline: Baseline distance between cameras 225 | - scale: Resize scale factor for images 226 | rgb_path: Path to directory containing left stereo images 227 | Supports png, jpg, jpeg formats 228 | output_path: Directory path where depth maps will be saved 229 | as .npy files 230 | 231 | Raises: 232 | RuntimeError: If CUDA is not available 233 | FileNotFoundError: If rgb_path doesn't exist 234 | """ 235 | self.config = config 236 | 237 | # Initialize and setup the stereo network 238 | self.net = FoundationStereoNet(config) 239 | self.net.load_weights() 240 | 241 | if not torch.cuda.is_available(): 242 | raise RuntimeError("CUDA is required but not available") 243 | 244 | self.net.cuda() # Move model to GPU 245 | self.net.eval() # Set to evaluation mode 246 | 247 | self.rgb_path = Path(rgb_path) 248 | self.output_path = Path(output_path) 249 | 250 | if not self.rgb_path.exists(): 251 | raise FileNotFoundError(f"RGB path does not exist: {rgb_path}") 252 | 253 | # Discover and sort left stereo images 254 | self._discover_images() 255 | 256 | # Extract camera parameters from configuration 257 | self._setup_camera_params() 258 | 259 | def _discover_images(self) -> None: 260 | """Discover and sort left stereo images from the input directory.""" 261 | left_images = [] 262 | supported_formats = ['*.png', '*.jpg', '*.jpeg'] 263 | 264 | for ext in supported_formats: 265 | left_images.extend(self.rgb_path.glob(ext)) 266 | 267 | self.left_images = sorted(left_images) 268 | 269 | if not self.left_images: 270 | logger.warning(f"No images found in {self.rgb_path}") 271 | 272 | logger.info(f"Found {len(self.left_images)} left images") 273 | 274 | def _setup_camera_params(self) -> None: 275 | """Extract and setup camera parameters from configuration.""" 276 | self.intrinsic = np.array(self.config['intrinsic']).reshape(3, 3) 277 | # Scale intrinsics to match resized images 278 | self.intrinsic[:2] *= self.config['scale'] 279 | self.baseline = self.config['baseline'] 280 | 281 | logger.info(f"Camera baseline: {self.baseline}") 282 | logger.info(f"Image scale factor: {self.config['scale']}") 283 | 284 | def infer( 285 | self, 286 | left_input: Union[str, Path, np.ndarray], 287 | right_input: Union[str, Path, np.ndarray], 288 | return_disparity: bool = False 289 | ) -> np.ndarray: 290 | """ 291 | Perform stereo depth inference on a single pair of images. 
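        Illustrative call (paths are examples only):
        ``depth = processor.infer('left/left000000.png', 'right/right000000.png')``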
292 | 293 | Args: 294 | left_input: Path to left stereo image or numpy array 295 | right_input: Path to right stereo image or numpy array 296 | return_disparity: If True, returns disparity map instead of depth 297 | 298 | Returns: 299 | Depth map or disparity map as numpy array of shape [H, W] 300 | 301 | Raises: 302 | ValueError: If inputs are invalid or incompatible 303 | RuntimeError: If inference fails 304 | """ 305 | try: 306 | # Load images - handle both file paths and numpy arrays 307 | if isinstance(left_input, (str, Path)): 308 | left = imageio.imread(str(left_input)) 309 | right = imageio.imread(str(right_input)) 310 | else: 311 | # Assume numpy arrays passed directly 312 | left = left_input 313 | right = right_input 314 | 315 | # Validate image shapes 316 | if left.shape != right.shape: 317 | raise ValueError( 318 | f"Image shapes don't match: {left.shape} vs {right.shape}" 319 | ) 320 | 321 | # Resize images according to configuration scale 322 | scale = self.config['scale'] 323 | left = cv2.resize( 324 | left, fx=scale, fy=scale, dsize=None, 325 | interpolation=cv2.INTER_LINEAR 326 | ) 327 | right = cv2.resize( 328 | right, fx=scale, fy=scale, dsize=None, 329 | interpolation=cv2.INTER_LINEAR 330 | ) 331 | H, W = left.shape[:2] 332 | 333 | # Convert images to PyTorch tensors and move to GPU 334 | img0 = torch.as_tensor(left).cuda().float()[None].permute(0, 3, 1, 2) 335 | img1 = torch.as_tensor(right).cuda().float()[None].permute(0, 3, 1, 2) 336 | 337 | # Pad images to be divisible by 32 for network processing 338 | padder = InputPadder(img0.shape, divis_by=32, force_square=False) 339 | img0, img1 = padder.pad(img0, img1) 340 | 341 | # Run stereo matching inference 342 | with torch.no_grad(): 343 | disp = self.net(img0, img1) 344 | 345 | # Remove padding and convert to numpy 346 | disp = padder.unpad(disp.float()) 347 | disp = disp.data.cpu().numpy().reshape(H, W) 348 | 349 | if return_disparity: 350 | return disp 351 | 352 | # Convert disparity to metric depth using camera parameters 353 | # Depth = (focal_length * baseline) / disparity 354 | # Avoid division by zero 355 | disp_safe = np.where(disp > 0, disp, np.inf) 356 | depth = self.intrinsic[0, 0] * self.baseline / disp_safe 357 | 358 | return depth 359 | 360 | except Exception as e: 361 | logger.error(f"Inference failed: {e}") 362 | raise RuntimeError(f"Stereo inference failed: {e}") from e 363 | 364 | def run(self) -> None: 365 | """ 366 | Process all stereo image pairs to generate depth maps. 367 | 368 | Main processing loop that: 369 | 1. Loads left/right stereo image pairs 370 | 2. Uses the infer() method for consistent processing 371 | 3. Saves depth maps as numpy arrays 372 | 373 | For each left image, expects corresponding right image with 'left' 374 | replaced by 'right' in the filename. 375 | 376 | Output depth maps are saved as {image_name}.npy in the output directory. 
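        For example, left/left000012.png is paired with right/right000012.png
        and produces left000012.npy.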
377 | 378 | Raises: 379 | FileNotFoundError: If corresponding right image is not found 380 | RuntimeError: If processing fails 381 | """ 382 | if not self.left_images: 383 | logger.warning("No left images found to process") 384 | return 385 | 386 | # Ensure output directory exists 387 | self.output_path.mkdir(parents=True, exist_ok=True) 388 | 389 | successful_count = 0 390 | 391 | for left_path in tqdm(self.left_images, desc="Processing stereo pairs"): 392 | try: 393 | base_name = left_path.stem 394 | 395 | # Construct right image path 396 | right_path = left_path.parent.parent / 'right' / left_path.name.replace('left', 'right') 397 | 398 | if not right_path.exists(): 399 | logger.warning(f"Right image not found: {right_path}") 400 | continue 401 | 402 | # Use the infer method for consistent processing 403 | depth = self.infer( 404 | left_path, right_path, return_disparity=False 405 | ) 406 | 407 | # Save depth map as numpy array 408 | output_file = self.output_path / f"{base_name}.npy" 409 | np.save(output_file, depth) 410 | successful_count += 1 411 | 412 | except Exception as e: 413 | logger.error(f"Failed to process {left_path}: {e}") 414 | continue 415 | 416 | logger.info( 417 | f"Successfully processed {successful_count}/{len(self.left_images)} " 418 | f"stereo pairs" 419 | ) 420 | 421 | 422 | def run_depth_estimation( 423 | config: Dict[str, Any], 424 | exp_path: Path, 425 | rgb_path: Path, 426 | depth_path: Optional[Path] = None 427 | ) -> Optional[bool]: 428 | """ 429 | Set up and run depth estimation pipeline. 430 | 431 | This function orchestrates the complete depth estimation process: 432 | 1. Sets up output directory structure 433 | 2. Checks if depth maps already exist 434 | 3. Runs FoundationStereo processing if needed 435 | 4. Returns success status 436 | 437 | Args: 438 | config: Configuration dictionary containing model and camera parameters 439 | exp_path: Path to experiment directory 440 | rgb_path: Path to RGB frames directory containing left/right images 441 | depth_path: Optional custom path for depth output (defaults to exp_path/depth) 442 | 443 | Returns: 444 | True if successful, False/None if failed 445 | 446 | Example: 447 | >>> config = { 448 | ... 'cfg_path': 'model_config.yaml', 449 | ... 'pth_path': 'weights.pth', 450 | ... 'intrinsic': [[fx, 0, cx], [0, fy, cy], [0, 0, 1]], 451 | ... 'baseline': 0.1, 452 | ... 'scale': 0.5 453 | ... 
422 | def run_depth_estimation(
423 |     config: Dict[str, Any],
424 |     exp_path: Path,
425 |     rgb_path: Path,
426 |     depth_path: Optional[Path] = None
427 | ) -> bool:
428 |     """
429 |     Set up and run depth estimation pipeline.
430 | 
431 |     This function orchestrates the complete depth estimation process:
432 |     1. Sets up the output directory structure
433 |     2. Checks if depth maps already exist
434 |     3. Runs FoundationStereo processing if needed
435 |     4. Returns success status
436 | 
437 |     Args:
438 |         config: Configuration dictionary containing model and camera parameters
439 |         exp_path: Path to experiment directory
440 |         rgb_path: Path to RGB frames directory containing left/right images
441 |         depth_path: Optional custom path for depth output (defaults to exp_path/depth)
442 | 
443 |     Returns:
444 |         True if depth maps are available, False if processing failed
445 | 
446 |     Example:
447 |         >>> config = {
448 |         ...     'cfg_path': 'model_config.yaml',
449 |         ...     'pth_path': 'weights.pth',
450 |         ...     'intrinsic': [[fx, 0, cx], [0, fy, cy], [0, 0, 1]],
451 |         ...     'baseline': 0.1,
452 |         ...     'scale': 0.5
453 |         ... }
454 |         >>> success = run_depth_estimation(config, exp_path, rgb_path)
455 |     """
456 |     # Setup depth output directory
457 |     if depth_path is None:
458 |         depth_path = exp_path / 'depth'
459 |     depth_path.mkdir(parents=True, exist_ok=True)
460 |     logger.info(f"Depth estimation directory: {depth_path}")
461 | 
462 |     try:
463 |         # Check if depth images already exist (either all .npy or all .png)
464 |         depth_images_npy = list(depth_path.glob('*.npy'))
465 |         depth_images_png = list(depth_path.glob('*.png'))
466 |         rgb_images = list(rgb_path.glob('*.png'))
467 | 
468 |         # Check if we have sufficient depth images in either format
469 |         if (depth_images_npy and len(depth_images_npy) >= len(rgb_images)) or \
470 |             (depth_images_png and len(depth_images_png) >= len(rgb_images)):
471 |             logger.info("Depth images already exist, skipping depth estimation")
472 |             return True
473 | 
474 |         # Run depth estimation
475 |         logger.info("Running depth estimation...")
476 | 
477 |         # Load additional model configuration and merge it with the pipeline config
478 |         cfg_model = OmegaConf.load(config['cfg_path'])
479 |         args = OmegaConf.merge(OmegaConf.create(config), cfg_model)
480 | 
481 |         # Initialize and run processor
482 |         processor = FoundationStereoProcessor(args, rgb_path, depth_path)
483 |         processor.run()
484 | 
485 |         logger.info("Depth estimation completed successfully")
486 |         return True
487 | 
488 |     except Exception as e:
489 |         logger.error(f"Error running depth estimation: {e}")
490 |         return False
491 | 
--------------------------------------------------------------------------------
/data/configs/base.yaml:
--------------------------------------------------------------------------------
1 | # Main data paths for input and output
2 | data_path: /workspace/3d-object-reconstruction/data/samples/retail_item/ # Path to input data folder containing images and masks
3 | workdir: /workspace/3d-object-reconstruction/data/output/retail_item/ # Path to output directory for reconstruction results
4 | downscale: 1.0 # Currently unused; each pipeline stage specifies its own downscale factor
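# NOTE: each `intrinsic` list in this file is the row-major flattening of the
# 3x3 camera matrix [[fx, 0, cx], [0, fy, cy], [0, 0, 1]]; for the sample
# Qoocam capture, fx = 3079.6, cx = 2000.0, fy = 3075.1, cy = 1500.01.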
5 | # Camera intrinsic parameters in 3x3 matrix format
6 | camera_config:
7 |   step: 4
8 |   intrinsic: # Camera (Qoocam) intrinsic matrix; replace with your camera's calibration values
9 |   - 3079.6 # fx
10 |   - 0
11 |   - 2000.0 # cx
12 |   - 0
13 |   - 3075.1 # fy
14 |   - 1500.01 # cy
15 |   - 0
16 |   - 0
17 |   - 1
18 | 
19 | # BundleTrack configuration for camera pose estimation and tracking
20 | bundletrack:
21 |   debug_dir: /workspace/3d-object-reconstruction/data/output/retail_item/ # Directory for debug outputs
22 |   SPDLOG: 1 # Logging level
23 |   USE_GRAY: false
24 |   port: '5555'
25 |   nerf_port: '9999'
26 |   downscale: 1.0 # Image downscale factor for tracking
27 |   min_resolution: 300 # Minimum resolution for tracking
28 |   erode_mask: 3 # Mask erosion size to remove boundary artifacts
29 |   visible_angle: 70
30 | 
31 | # Object segmentation parameters, unused for now since SAM2 handles segmentation
32 | segmentation:
33 |   ob_scales:
34 |   - 0.3
35 |   - 0.3
36 |   - 0.3
37 |   tolerance: 0.03
38 | 
39 | # Depth map processing parameters
40 | depth_processing:
41 |   zfar: 1.0 # Depth max bound (same as NeRF far); may need adjustment depending on scene scale
42 |   erode:
43 |     radius: 1
44 |     diff: 0.001
45 |     ratio: 0.8
46 |   bilateral_filter:
47 |     radius: 2
48 |     sigma_D: 2
49 |     sigma_R: 100000
50 |   outlier_removal:
51 |     num: 30
52 |     std_mul: 3
53 |   edge_normal_thres: 10
54 |   denoise_cloud: false
55 |   percentile: 95 # Percentile for depth truncation
56 | 
57 | # Bundle adjustment parameters
58 | bundle:
59 |   num_iter_outter: 7
60 |   num_iter_inner: 5
61 |   window_size: 5 # Window size for non-keyframe saving
62 |   max_BA_frames: 10 # Maximum frames used in bundle adjustment
63 |   subset_selection_method: normal_orientation_nearest
64 |   depth_association_radius: 5
65 |   non_neighbor_max_rot: 90 # Maximum rotation difference between two frames
66 |   non_neighbor_min_visible: 0.1 # Minimum covisibility
67 |   icp_pose_rot_thres: 60
68 |   w_rpi: 0 # Not used
69 |   w_p2p: 1 # Not used
70 |   w_fm: 1 # Not used
71 |   w_sdf: 0 # Not used
72 |   w_pm: 0 # Not used
73 |   robust_delta: 0.005 # Delta scale for the Huber loss
74 |   min_fm_edges_newframe: 15
75 |   image_downscale: # Image downscale factor
76 |   - 4
77 |   feature_edge_dist_thres: 0.01 # Sparse feature edge distance threshold, not used
78 |   feature_edge_normal_thres: 30 # Sparse feature edge normal threshold, not used
79 |   max_optimized_feature_loss: 0.03 # Max optimized feature loss, not used
80 | 
81 | # Keyframe selection parameters
82 | keyframe:
83 |   min_interval: 1
84 |   min_feat_num: 0
85 |   min_trans: 0
86 |   min_rot: 5
87 |   min_visible: 1
88 | 
89 | # SIFT feature detection parameters
90 | sift:
91 |   scales:
92 |   - 2
93 |   - 4
94 |   - 8
95 |   max_match_per_query: 5
96 |   nOctaveLayers: 3
97 |   contrastThreshold: 0.01
98 |   edgeThreshold: 50
99 |   sigma: 1.6
100 | 
101 | # Feature correspondence parameters
102 | feature_corres:
103 |   mutual: true
104 |   map_points: true # Use 3D map points
105 |   max_dist_no_neighbor: 0.01 # Maximum distance for non-neighbors
106 |   max_normal_no_neighbor: 20 # Maximum normal angle for non-neighbors
107 |   max_dist_neighbor: 0.02 # Maximum distance for neighbors
108 |   max_normal_neighbor: 30 # Maximum normal angle for neighbors
109 |   suppression_patch_size: 5
110 |   max_view_normal_angle: 180
111 |   min_match_with_ref: 5 # Minimum matches with reference
112 |   resize: 800
113 |   rematch_after_nerf: false
114 | 
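# Usage sketch (illustrative, not part of the config): this file is consumed
# via OmegaConf, so individual values can be overridden in code before the
# pipeline runs, e.g.
#   cfg = OmegaConf.load('data/configs/base.yaml')
#   cfg.bundle.max_BA_frames = 20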
115 | # RANSAC parameters for robust estimation (possibly unused)
116 | ransac:
117 |   max_iter: 2000 # Maximum RANSAC iterations
118 |   num_sample: 3 # Number of samples per iteration
119 |   inlier_dist: 0.01 # Inlier distance threshold
120 |   inlier_normal_angle: 20 # Inlier normal angle threshold
121 |   desired_succ_rate: 0.99 # Desired success rate
122 |   max_trans_neighbor: 0.02 # Maximum translation for neighbors
123 |   max_rot_deg_neighbor: 30 # Maximum rotation for neighbors
124 |   max_trans_no_neighbor: 0.01 # Maximum translation for non-neighbors
125 |   max_rot_no_neighbor: 10 # Maximum rotation for non-neighbors
126 |   epipolar_thres: 1 # Epipolar constraint threshold
127 |   min_match_after_ransac: 5 # Minimum matches after RANSAC
128 | 
129 | # Point-to-point ICP parameters
130 | p2p:
131 |   projective: false # Use projective ICP
132 |   max_dist: 0.02 # Maximum correspondence distance
133 |   max_normal_angle: 45 # Maximum normal angle difference
134 | 
135 | # SDF edge parameters, not in use
136 | sdf_edge:
137 |   max_dist: 0.02
138 | 
139 | # Shape reconstruction parameters, not in use
140 | shape:
141 |   res: 0.005 # Voxel resolution
142 |   xrange: # X range for reconstruction
143 |   - -0.2
144 |   - 0.2
145 |   yrange: # Y range for reconstruction
146 |   - -0.2
147 |   - 0.2
148 |   zrange: # Z range for reconstruction
149 |   - -0.2
150 |   - 0.2
151 |   max_weight: 100 # Maximum TSDF weight
152 | 
153 | # Foundation Stereo parameters for depth estimation
154 | foundation_stereo:
155 |   pth_path: /workspace/3d-object-reconstruction/data/weights/foundationstereo/model_best_bp2.pth
156 |   cfg_path: /workspace/3d-object-reconstruction/data/weights/foundationstereo/cfg.yaml
157 |   dinov2_path: /workspace/3d-object-reconstruction/data/weights/roma/dinov2_vitl14_pretrain.pth
158 |   vit_size: vitl
159 |   scale: 0.3 # Image scale factor
160 |   hiera: 0
161 |   z_far: 10
162 |   remove_invisible: true
163 |   intrinsic: # Default camera (Qoocam) intrinsic matrix; replace with your camera's calibration values
164 |   - 3079.6
165 |   - 0
166 |   - 2000.0
167 |   - 0
168 |   - 3075.1
169 |   - 1500.01
170 |   - 0
171 |   - 0
172 |   - 1
173 |   baseline: 0.0657696127 # Stereo baseline in meters (Qoocam); replace to match your camera
174 | 
175 | # SAM2 parameters for segmentation
176 | sam2:
177 |   checkpoint_path: /workspace/3d-object-reconstruction/data/weights/sam2/sam2.1_hiera_large.pt
178 |   model_config: /workspace/3d-object-reconstruction/data/weights/sam2/sam2.1_hiera_l.yaml
179 |   bbox: # Bounding box for segmentation
180 |   - 1144
181 |   - 627
182 |   - 2227
183 |   - 2232
184 |   device: cuda
185 | 
186 | # NeRF parameters for neural rendering
187 | nerf:
188 |   batch_size: 32 # Training image batch size; reduce if GPU memory is limited
189 |   downscale: 0.2 # Image downscale factor
190 |   min_resolution: 300 # Minimum resolution for training
191 |   n_step: 3000 # Number of training steps
192 |   save_dir: ??? # Directory for saving models
193 |   netdepth: 8
194 |   netwidth: 256
195 |   netdepth_fine: 8
196 |   netwidth_fine: 256
197 |   N_rand: 2048 # Training rays per batch
198 |   lrate: 0.01 # Learning rate
199 |   lrate_pose: 0.01 # Pose learning rate
200 |   decay_rate: 0.1 # Learning rate decay
201 |   chunk: 99999999999 # Chunk size for inference
202 |   netchunk: 6553600 # Network chunk size
203 |   no_batching: 0 # Disable batching
204 |   amp: false # Use mixed precision
205 |   N_samples: 64 # Number of coarse samples
206 |   N_samples_around_depth: 256 # Samples around depth
207 |   N_importance: 0 # Number of fine samples
208 |   perturb: 1 # Random sampling
209 |   use_viewdirs: 1 # Use view directions
210 |   i_embed: 1 # Position embedding type
211 |   i_embed_views: 2 # View direction embedding
212 |   multires: 8 # Position embedding levels
213 |   multires_views: 3 # View direction embedding levels
214 |   feature_grid_dim: 2 # Feature grid dimension
215 |   raw_noise_std: 0 # Noise standard deviation
216 |   # Logging
217 |   i_img: 99999 # Image save interval
218 |   i_weights: 99999 # Weight save interval
219 |   i_mesh: 99999 # Mesh save interval
220 |   i_pose: 999999 # Pose save interval
221 |   i_print: 99999 # Print interval
222 |   # Hash embedding config
223 |   finest_res: 256 # Finest hash resolution
224 |   base_res: 16 # Base hash resolution
225 |   num_levels: 16 # Number of hash levels
226 |   log2_hashmap_size: 22 # Hash table size
227 |   # Octree config
228 |   use_octree: 1 # Use octree acceleration
229 |   first_frame_weight: 1 # First frame weight
230 |   denoise_depth_use_octree_cloud: true # Use octree for depth denoising
231 |   octree_embed_base_voxel_size: 0.02 # Base octree voxel size
232 |   octree_smallest_voxel_size: 0.02 # Smallest octree voxel
233 |   octree_raytracing_voxel_size: 0.02 # Raytracing voxel size
234 |   octree_dilate_size: 0.02 # Octree dilation size
235 |   down_scale_ratio: 1 # Downscaling ratio
236 |   bounding_box: # Scene bounding box
237 |   - - -1
238 |     - -1
239 |     - -1
240 |   - - 1
241 |     - 1
242 |     - 1
243 |   use_mask: 1 # Use segmentation masks
244 |   dilate_mask_size: 0 # Mask dilation size
245 |   rays_valid_depth_only: true # Only use valid depth rays
246 |   near: 0.1 # Near plane
247 |   far: 1.0 # Far plane
248 |   # Loss weights
249 |   rgb_weight: 10 # RGB loss weight
250 |   depth_weight: 0 # Depth loss, not in use
251 |   sdf_lambda: 5 # SDF weight
252 | 
253 |   neg_trunc_ratio: 1 # Negative truncation ratio
254 | 
255 |   fs_weight: 100 # Free space weight
256 |   empty_weight: 2 # Empty space weight
257 |   fs_rgb_weight: 0
258 |   fs_sdf: 0.1 # Free space threshold
259 |   trunc_weight: 6000 # SDF loss weight; regularizes depth
260 |   tv_loss_weight: 0
261 |   frame_features: 2 # Per-frame feature dimension
262 |   optimize_poses: 0 # Optimize camera poses (1 to enable)
263 |   pose_reg_weight: 0 # Pose regularization
264 |   feature_reg_weight: 0.1 # Feature regularization
265 |   share_coarse_fine: 1 # Share coarse and fine networks; enabled when N_importance > 0
266 |   eikonal_weight: 0 # Eikonal regularization on normals
267 |   mode: sdf # Reconstruction mode
268 | 
269 |   # Mesh quality related parameters
270 |   trunc: 0.004 # TSDF truncation; could be set larger (0.01) if the mesh has holes
271 |   trunc_start: 0.004 # Initial truncation
272 |   trunc_decay_type: '' # Truncation decay type
273 |   mesh_resolution: 0.002 # Grid voxel size for mesh extraction; recommended equal to or smaller than trunc
274 |   max_trans: 0.02 # Maximum translation
275 |   max_rot: 20 # Maximum rotation
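  # NOTE (added guidance): mesh quality is governed jointly by `trunc` and
  # `mesh_resolution` above; e.g. trunc = 0.004 with mesh_resolution = 0.002
  # extracts the mesh on a 2 mm grid inside a roughly +/-4 mm TSDF
  # truncation band.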
276 |   mesh_smoothing: # Mesh smoothing parameters
277 |     enabled: true # Enable smoothing
278 |     iterations: 2 # Number of iterations
279 |     lambda_: 0.5 # Smoothing strength
280 |     use_taubin: true # Use Taubin smoothing
281 |   save_octree_clouds: true # Save octree point clouds
282 | 
283 | # Texture baking parameters
284 | texture_bake:
285 |   downscale: 1.0 # Texture image scale
286 |   min_resolution: 300 # Minimum resolution for texture baking
287 |   texture_res: 2048 # Texture resolution
288 | 
289 | # ROMA feature matching parameters
290 | roma:
291 |   coarse_res: 560 # Coarse resolution
292 |   upsample_res: # Upsampling resolution
293 |   - 864
294 |   - 864
295 |   device: cuda # Device for inference
296 |   weights: /workspace/3d-object-reconstruction/data/weights/roma/roma_outdoor.pth # ROMA weights path
297 |   dinov2_weights: /workspace/3d-object-reconstruction/data/weights/roma/dinov2_vitl14_pretrain.pth # DINOv2 weights path
298 | 
299 | # Base path configuration
300 | base_path:
301 |   base_folder: /workspace/3d-object-reconstruction/data/samples/retail_item/ # Base data folder
302 |   image_folder: /workspace/3d-object-reconstruction/data/samples/retail_item/left/ # Input image folder
303 |   save_dir: /workspace/3d-object-reconstruction/data/output/retail_item/ # Output directory
--------------------------------------------------------------------------------
/src/nvidia/objectreconstruction/networks/sam2infer.py:
--------------------------------------------------------------------------------
1 | """
2 | SAM2 Inference Module for 3D Object Reconstruction.
3 | 
4 | This module provides functionality for running SAM2 (Segment Anything Model 2)
5 | inference on image sequences for mask generation. It includes utilities for
6 | processing single images, directories of images, and video sequences.
7 | 
8 | The module handles PNG image formats and provides compatibility with the
9 | original SAM2 video processing pipeline.
10 | """
11 | 
12 | import os
13 | import glob
14 | import logging
15 | import numpy as np
16 | import torch
17 | import warnings
18 | from tqdm import tqdm
19 | from PIL import Image
20 | from pathlib import Path
21 | from typing import Dict, Any, Tuple, Optional, Union
22 | 
23 | from sam2.build_sam import build_sam2_video_predictor
24 | from sam2.utils import misc
25 | 
26 | # Add monkey patch for torch.load to ensure compatibility with older checkpoints
27 | original_torch_load = torch.load
28 | 
29 | 
30 | def patched_torch_load(*args, **kwargs):
31 |     """
32 |     Patch torch.load to handle compatibility issues with older checkpoints.
33 | 
34 |     This function overrides weights_only=True to False so that older
35 |     checkpoints containing pickled Python objects can still be loaded.
36 | 37 | Args: 38 | *args: Positional arguments passed to torch.load 39 | **kwargs: Keyword arguments passed to torch.load 40 | 41 | Returns: 42 | The result of torch.load with modified parameters 43 | """ 44 | if 'weights_only' in kwargs and kwargs['weights_only'] is True: 45 | kwargs['weights_only'] = False 46 | return original_torch_load(*args, **kwargs) 47 | 48 | 49 | torch.load = patched_torch_load 50 | 51 | # Configure logging 52 | logger = logging.getLogger(__name__) 53 | 54 | # Select the device for computation 55 | if torch.cuda.is_available(): 56 | device = torch.device("cuda") 57 | elif torch.backends.mps.is_available(): 58 | device = torch.device("mps") 59 | else: 60 | device = torch.device("cpu") 61 | 62 | print(f"using device: {device}") 63 | 64 | # Configure device-specific settings 65 | if device.type == "cuda": 66 | # Use bfloat16 for better performance on CUDA 67 | torch.autocast("cuda", dtype=torch.bfloat16).__enter__() 68 | # Enable tfloat32 for Ampere GPUs 69 | if torch.cuda.get_device_properties(0).major >= 8: 70 | torch.backends.cuda.matmul.allow_tf32 = True 71 | torch.backends.cudnn.allow_tf32 = True 72 | elif device.type == "mps": 73 | print( 74 | "\nSupport for MPS devices is preliminary. SAM 2 is trained with CUDA " 75 | "and might give numerically different outputs and sometimes degraded " 76 | "performance on MPS. See e.g. " 77 | "https://github.com/pytorch/pytorch/issues/84936 for a discussion." 78 | ) 79 | 80 | 81 | def png_compatible_load_video_frames( 82 | video_path: str, 83 | image_size: int = 1024, 84 | offload_video_to_cpu: bool = False, 85 | img_mean: Tuple[float, float, float] = (0.485, 0.456, 0.406), 86 | img_std: Tuple[float, float, float] = (0.229, 0.224, 0.225), 87 | async_loading_frames: bool = False, 88 | compute_device: torch.device = torch.device("cuda"), 89 | ) -> Tuple[torch.Tensor, int, int]: 90 | """ 91 | Load video frames from a directory of image files (JPEG and PNG). 92 | 93 | This is a drop-in replacement for misc.load_video_frames that supports 94 | PNG format images in addition to JPEG. 95 | 96 | Args: 97 | video_path: Path to the directory containing image files 98 | image_size: Target size for resizing images 99 | offload_video_to_cpu: Whether to keep images on CPU 100 | img_mean: RGB mean values for normalization 101 | img_std: RGB standard deviation values for normalization 102 | async_loading_frames: Whether to load frames asynchronously (unused) 103 | compute_device: Device to load images to 104 | 105 | Returns: 106 | Tuple containing: 107 | - images: Tensor of shape (N, 3, H, W) containing loaded images 108 | - video_height: Original height of the video frames 109 | - video_width: Original width of the video frames 110 | 111 | Raises: 112 | FileNotFoundError: If video_path doesn't exist 113 | RuntimeError: If no images found or unsupported image format 114 | NotImplementedError: If video_path is not a directory 115 | """ 116 | if not os.path.exists(video_path): 117 | raise FileNotFoundError(f"Video file or folder not found: {video_path}") 118 | 119 | if not os.path.isdir(video_path): 120 | warnings.warn( 121 | "This implementation only supports directories of image files, " 122 | "not video files." 123 | ) 124 | raise NotImplementedError( 125 | "Only image frames are supported. For video files, " 126 | "extract frames to a directory first." 
127 | ) 128 | 129 | img_folder = video_path 130 | # Get all supported image files 131 | frame_names = [] 132 | for ext in [".jpg", ".jpeg", ".JPG", ".JPEG", ".png", ".PNG"]: 133 | frame_names.extend([ 134 | p for p in os.listdir(img_folder) if p.endswith(ext) 135 | ]) 136 | 137 | if not frame_names: 138 | raise RuntimeError(f"No images found in {img_folder}") 139 | 140 | # Sort the filenames 141 | try: 142 | # Try to sort based on filename pattern (assuming frame_xxxx format) 143 | frame_names.sort(key=lambda p: int(os.path.splitext(p)[0][4:])) 144 | except (ValueError, IndexError): 145 | # Fallback to regular sorting 146 | frame_names.sort() 147 | 148 | # Load the images 149 | img_paths = [os.path.join(img_folder, frame_name) for frame_name in frame_names] 150 | img_mean_tensor = torch.tensor(img_mean, dtype=torch.float32)[:, None, None] 151 | img_std_tensor = torch.tensor(img_std, dtype=torch.float32)[:, None, None] 152 | 153 | # Load the first image to get dimensions 154 | first_img = Image.open(img_paths[0]) 155 | video_width, video_height = first_img.size 156 | 157 | # Load all images 158 | num_frames = len(img_paths) 159 | images = torch.zeros( 160 | num_frames, 3, image_size, image_size, dtype=torch.float32 161 | ) 162 | 163 | for n, img_path in enumerate(tqdm(img_paths, desc="Loading frames")): 164 | img_pil = Image.open(img_path).convert("RGB").resize( 165 | (image_size, image_size) 166 | ) 167 | img_np = np.array(img_pil) 168 | if img_np.dtype == np.uint8: 169 | img_np = img_np / 255.0 170 | else: 171 | raise RuntimeError(f"Unknown image dtype: {img_np.dtype} on {img_path}") 172 | images[n] = torch.from_numpy(img_np).permute(2, 0, 1) 173 | 174 | if not offload_video_to_cpu: 175 | images = images.to(compute_device) 176 | img_mean_tensor = img_mean_tensor.to(compute_device) 177 | img_std_tensor = img_std_tensor.to(compute_device) 178 | 179 | # Normalize by mean and std 180 | images -= img_mean_tensor 181 | images /= img_std_tensor 182 | 183 | return images, video_height, video_width 184 | 185 | 186 | def preprocess_single_image( 187 | image_path: str, 188 | image_size: int = 1024, 189 | img_mean: Tuple[float, float, float] = (0.485, 0.456, 0.406), 190 | img_std: Tuple[float, float, float] = (0.229, 0.224, 0.225), 191 | compute_device: torch.device = torch.device("cuda"), 192 | ) -> Tuple[torch.Tensor, int, int]: 193 | """ 194 | Preprocess a single image for SAM2 inference. 
195 | 196 | Args: 197 | image_path: Path to the image file 198 | image_size: Size to resize the image to 199 | img_mean: RGB mean values for normalization 200 | img_std: RGB standard deviation values for normalization 201 | compute_device: Device to load the image to 202 | 203 | Returns: 204 | Tuple containing: 205 | - preprocessed_image: Preprocessed image tensor of shape (1, 3, H, W) 206 | - height: Original image height 207 | - width: Original image width 208 | 209 | Raises: 210 | FileNotFoundError: If image file doesn't exist 211 | RuntimeError: If image has unsupported dtype 212 | """ 213 | if not os.path.exists(image_path): 214 | raise FileNotFoundError(f"Image file not found: {image_path}") 215 | 216 | # Load and preprocess the image 217 | img_pil = Image.open(image_path).convert("RGB") 218 | width, height = img_pil.size 219 | 220 | # Resize and convert to numpy array 221 | img_pil = img_pil.resize((image_size, image_size)) 222 | img_np = np.array(img_pil) 223 | 224 | if img_np.dtype == np.uint8: 225 | img_np = img_np / 255.0 226 | else: 227 | raise RuntimeError(f"Unknown image dtype: {img_np.dtype} on {image_path}") 228 | 229 | # Convert to tensor and normalize 230 | img_tensor = torch.from_numpy(img_np).permute(2, 0, 1).float() 231 | img_tensor = img_tensor.unsqueeze(0) # Add batch dimension 232 | 233 | # Create normalization tensors 234 | img_mean_tensor = torch.tensor(img_mean, dtype=torch.float32)[:, None, None] 235 | img_std_tensor = torch.tensor(img_std, dtype=torch.float32)[:, None, None] 236 | 237 | # Move to device 238 | img_tensor = img_tensor.to(compute_device) 239 | img_mean_tensor = img_mean_tensor.to(compute_device) 240 | img_std_tensor = img_std_tensor.to(compute_device) 241 | 242 | # Normalize 243 | img_tensor -= img_mean_tensor 244 | img_tensor /= img_std_tensor 245 | 246 | return img_tensor, height, width 247 | 248 | 249 | def segment_image_with_bbox( 250 | image_path: str, 251 | bbox: Union[list, np.ndarray], 252 | checkpoint_path: str = "/sam2/checkpoints/sam2.1_hiera_large.pt", 253 | model_config: str = "configs/sam2.1/sam2.1_hiera_l.yaml", 254 | output_path: Optional[str] = None, 255 | image_size: int = 1024, 256 | device: torch.device = torch.device("cuda") 257 | ) -> np.ndarray: 258 | """ 259 | Segment an object in a single image using a 2D bounding box. 
260 | 
261 |     Args:
262 |         image_path: Path to the input image
263 |         bbox: 2D bounding box in format [x1, y1, x2, y2]
264 |         checkpoint_path: Path to the SAM2 checkpoint
265 |         model_config: Path to the model configuration file
266 |         output_path: Optional path to save the output mask
267 |         image_size: Size to resize the image to during processing
268 |         device: Device to run inference on
269 | 
270 |     Returns:
271 |         Binary mask of the segmented object as numpy array
272 | 
273 |     Raises:
274 |         Various exceptions from SAM2 model initialization and inference
275 |     """
276 |     # Build the SAM2 predictor
277 |     predictor = build_sam2_video_predictor(
278 |         model_config, checkpoint_path, device=device
279 |     )
280 | 
281 |     # Preprocess the image
282 |     image_tensor, original_height, original_width = preprocess_single_image(
283 |         image_path,
284 |         image_size=image_size,
285 |         compute_device=device
286 |     )
287 | 
288 |     # Convert bbox to numpy array if it's not already
289 |     bbox = np.array(bbox, dtype=np.float32)
290 | 
291 |     # Calculate the center point of the bounding box for positive click
292 |     center_point = np.array([
293 |         [bbox[0] + (bbox[2] - bbox[0]) / 2, bbox[1] + (bbox[3] - bbox[1]) / 2]
294 |     ], dtype=np.float32)
295 | 
296 |     # Set positive label
297 |     labels = np.array([1], np.int32)
298 | 
299 |     with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
300 |         # SAM2's video predictor expects a frame directory, so initialize the
301 |         # inference state on the image's parent folder
302 |         tmp_dir = os.path.dirname(image_path)
303 |         inference_state = predictor.init_state(video_path=tmp_dir)
304 |         predictor.reset_state(inference_state)
305 | 
306 |         # Set frame index and object ID
307 |         frame_idx = 0
308 |         obj_id = 1
309 | 
310 |         # Add the bounding box
311 |         _, out_obj_ids, out_mask_logits = predictor.add_new_points_or_box(
312 |             inference_state=inference_state,
313 |             frame_idx=frame_idx,
314 |             obj_id=obj_id,
315 |             box=bbox,
316 |             labels=labels,
317 |             points=center_point,
318 |         )
319 | 
320 |         # Get the mask
321 |         mask = (out_mask_logits[0] > 0.0).cpu().numpy()
322 | 
323 |     # Save the mask if output path is provided
324 |     if output_path:
325 |         mask_image = mask.astype(np.uint8) * 255
326 |         mask_image = Image.fromarray(mask_image[0])
327 |         mask_image.save(output_path)
328 |         logger.debug(f"Saved mask to {output_path}")
329 | 
330 |     return mask
331 | 
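# Usage sketch for segment_image_with_bbox (illustrative values; the bbox and
# input path mirror the retail_item sample, and `output_path` is hypothetical):
#   mask = segment_image_with_bbox(
#       image_path="data/samples/retail_item/left/left000000.png",
#       bbox=[1144, 627, 2227, 2232],
#       output_path="mask000000.png",
#   )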
332 | def process_directory_masks(
333 |     rgb_path: str,
334 |     mask_path: str,
335 |     bbox: Optional[Union[list, np.ndarray]] = None,
336 |     checkpoint_path: str = "/sam2/checkpoints/sam2.1_hiera_large.pt",
337 |     model_config: str = "configs/sam2.1/sam2.1_hiera_l.yaml",
338 |     device: torch.device = torch.device("cuda")
339 | ) -> None:
340 |     """
341 |     Process all images in a directory and generate masks using a bounding box.
342 | 
343 |     The bounding box is applied to the first frame and then propagated through
344 |     all subsequent frames using SAM2's video tracking capabilities.
345 | 
346 |     Args:
347 |         rgb_path: Path to the directory containing RGB images
348 |         mask_path: Path to save the generated masks
349 |         bbox: Bounding box in format [x1, y1, x2, y2]. If None, defaults to
350 |             the central 80% of the first frame
351 |         checkpoint_path: Path to the SAM2 checkpoint
352 |         model_config: Path to the model configuration file
353 |         device: Device to run inference on
354 | 
355 |     Raises:
356 |         Various exceptions from image loading and SAM2 inference
357 |     """
358 |     # Create mask directory if it doesn't exist
359 |     os.makedirs(mask_path, exist_ok=True)
360 | 
361 |     # Skip processing if masks were already generated
362 |     if any(os.listdir(mask_path)):
363 |         logger.info("Masks already extracted")
364 |         return
365 | 
366 |     # Get all image files in the directory
367 |     image_files = sorted(glob.glob(os.path.join(rgb_path, "*.png")))
368 |     if not image_files:
369 |         image_files = sorted(glob.glob(os.path.join(rgb_path, "*.jpg")))
370 | 
371 |     if not image_files:
372 |         logger.error("No image files found in RGB frames directory")
373 |         return
374 | 
375 |     # If bbox is None, create a default one
376 |     if bbox is None:
377 |         import cv2
378 | 
379 |         # Read the first image to get dimensions
380 |         first_image = cv2.imread(image_files[0])
381 |         height, width = first_image.shape[:2]
382 | 
383 |         # Create a default bounding box (central 80% of the image)
384 |         margin_x = int(width * 0.1)
385 |         margin_y = int(height * 0.1)
386 |         bbox = [margin_x, margin_y, width - margin_x, height - margin_y]
387 | 
388 |         logger.info(f"Using default bounding box: {bbox}")
389 | 
390 |     # Convert bbox to numpy array
391 |     bbox = np.array(bbox, dtype=np.float32)
392 | 
393 |     # Build the SAM2 predictor
394 |     predictor = build_sam2_video_predictor(
395 |         model_config, checkpoint_path, device=device
396 |     )
397 | 
398 |     logger.info(f"Processing {len(image_files)} frames for mask extraction...")
399 | 
400 |     # Process all frames with the same bounding box
401 |     with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
402 |         # Initialize the inference state
403 |         inference_state = predictor.init_state(video_path=rgb_path)
404 |         predictor.reset_state(inference_state)
405 | 
406 |         # Set frame index and object ID
407 |         ann_frame_idx = 0  # the frame index we interact with
408 |         ann_obj_id = 1  # give a unique id to each object
409 | 
410 |         # Calculate the center point of the bounding box for positive click
411 |         points = np.array([
412 |             [bbox[0] + (bbox[2] - bbox[0]) / 2, bbox[1] + (bbox[3] - bbox[1]) / 2]
413 |         ], dtype=np.float32)
414 | 
415 |         # Set positive label
416 |         labels = np.array([1], np.int32)
417 | 
418 |         # Add the bounding box
419 |         _, out_obj_ids, out_mask_logits = predictor.add_new_points_or_box(
420 |             inference_state=inference_state,
421 |             frame_idx=ann_frame_idx,
422 |             obj_id=ann_obj_id,
423 |             box=bbox,
424 |             labels=labels,
425 |             points=points,
426 |         )
427 | 
428 |         # Run propagation throughout the video and collect the results
429 |         video_segments = {}
430 |         for out_frame_idx, out_obj_ids, out_mask_logits in (
431 |             predictor.propagate_in_video(inference_state)
432 |         ):
433 |             video_segments[out_frame_idx] = {
434 |                 out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
435 |                 for i, out_obj_id in enumerate(out_obj_ids)
436 |             }
437 | 
438 |     # Get frame names
439 |     frame_names = [os.path.basename(image_file) for image_file in image_files]
440 | 
441 |     # Render the segmentation results
442 |     for out_frame_idx in range(len(frame_names)):
443 |         for out_obj_id, out_mask in video_segments[out_frame_idx].items():
444 |             mask_image = out_mask.astype(np.uint8) * 255
445 |             mask_image = Image.fromarray(mask_image[0])
446 |             mask_output_path = os.path.join(
447 |                 mask_path, frame_names[out_frame_idx]
448 |             )
449 |             mask_image.save(mask_output_path)
450 |             logger.debug(f"Saved mask for frame {out_frame_idx}")
451 | 
452 |     logger.info(f"Mask extraction completed. Masks saved to {mask_path}")
453 | 
454 | 
455 | # Replace the original frame loader with the PNG-compatible version
456 | misc.load_video_frames = png_compatible_load_video_frames
457 | 
458 | 
459 | class Sam2Infer:
460 |     """
461 |     SAM2 inference wrapper class.
462 | 
463 |     This class provides a simple interface for running SAM2 mask extraction
464 |     on directories of images using configuration parameters.
465 |     """
466 | 
467 |     def __init__(self, config: Dict[str, Any]) -> None:
468 |         """
469 |         Initialize the SAM2 inference wrapper.
470 | 
471 |         Args:
472 |             config: Configuration dictionary containing:
473 |                 - checkpoint_path: Path to SAM2 model checkpoint
474 |                 - model_config: Path to model configuration file
475 |                 - bbox: Bounding box for segmentation [x1, y1, x2, y2]
476 |                 - device: Device to run inference on
477 |         """
478 |         logger.debug(f"SAM2 config: {config}")
479 |         self.checkpoint_path = config['checkpoint_path']
480 |         self.model_config = config['model_config']
481 |         self.bbox = config['bbox']
482 |         self.device = config['device']
483 | 
484 |     def run(self, rgb_path: str, mask_path: str) -> None:
485 |         """
486 |         Run mask extraction on a directory of images.
487 | 
488 |         Args:
489 |             rgb_path: Path to directory containing RGB images
490 |             mask_path: Path to directory where masks will be saved
491 |         """
492 |         process_directory_masks(
493 |             rgb_path=rgb_path,
494 |             mask_path=mask_path,
495 |             bbox=self.bbox,
496 |             checkpoint_path=self.checkpoint_path,
497 |             model_config=self.model_config,
498 |             device=torch.device(self.device)
499 |         )
500 | 
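# Usage sketch for Sam2Infer (illustrative; paths follow the sample layout in
# data/configs/base.yaml and assume the SAM2 weights have been downloaded):
#   cfg = {
#       'checkpoint_path': '/workspace/3d-object-reconstruction/data/weights/sam2/sam2.1_hiera_large.pt',
#       'model_config': '/workspace/3d-object-reconstruction/data/weights/sam2/sam2.1_hiera_l.yaml',
#       'bbox': [1144, 627, 2227, 2232],
#       'device': 'cuda',
#   }
#   Sam2Infer(cfg).run(
#       rgb_path='/workspace/3d-object-reconstruction/data/samples/retail_item/left/',
#       mask_path='/workspace/3d-object-reconstruction/data/samples/retail_item/masks/',
#   )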
501 | 
502 | def run_mask_extraction(
503 |     config: Dict[str, Any],
504 |     exp_path: Path,
505 |     rgb_path: Path,
506 |     mask_path: Path
507 | ) -> bool:
508 |     """
509 |     Set up and run mask extraction with error handling.
510 | 
511 |     Args:
512 |         config: Configuration dictionary for SAM2 inference
513 |         exp_path: Path to experiment directory (currently unused)
514 |         rgb_path: Path to RGB frames directory
515 |         mask_path: Path where masks will be saved
516 | 
517 |     Returns:
518 |         True if mask extraction was successful, False otherwise
519 |     """
520 |     # Create the mask directory (and any missing parents)
521 |     mask_path.mkdir(parents=True, exist_ok=True)
522 |     logger.info(f"Mask extraction directory: {mask_path}")
523 | 
524 |     # Check if mask images already exist
525 |     mask_images = list(mask_path.glob('*.png'))
526 |     rgb_images = list(rgb_path.glob('*.png'))
527 |     if mask_images and len(mask_images) == len(rgb_images):
528 |         logger.info("Mask images already exist, skipping mask extraction")
529 |         return True
530 | 
531 |     # Run mask extraction
532 |     logger.info("Running mask extraction...")
533 |     sam2_infer = Sam2Infer(config)
534 | 
535 |     try:
536 |         sam2_infer.run(str(rgb_path), str(mask_path))
537 |         logger.info("Mask extraction completed successfully")
538 |         return True
539 |     except Exception as e:
540 |         logger.error(f"Error running mask extraction: {e}")
541 |         return False
542 | 
543 | 
544 | if __name__ == "__main__":
545 |     """Example usage of the SAM2 inference module."""
546 |     sam2_checkpoint = (
547 |         "/workspace/3d-object-reconstruction/data/weights/sam2/"
548 |         "sam2.1_hiera_large.pt"
549 |     )
550 |     model_cfg = (
551 |         "/workspace/3d-object-reconstruction/data/weights/sam2/"
552 |         "sam2.1_hiera_l.yaml"
553 |     )
554 | 
555 |     # Example for processing a video directory with bounding box
556 |     video_dir = "/workspace/3d-object-reconstruction/data/samples/retail_item/left/"
557 |     output_dir = "/workspace/3d-object-reconstruction/data/samples/retail_item/masks/"
558 | 
559 |     # Define a bounding box [x1, y1, x2, y2]
560 |     bbox = [1144, 627, 2227, 2232]
561 | 
562 |     # Process the directory
563 |     process_directory_masks(
564 |         rgb_path=video_dir,
565 |         mask_path=output_dir,
566 |         bbox=bbox,
567 |         checkpoint_path=sam2_checkpoint,
568 |         model_config=model_cfg
569 |     )
570 | 
--------------------------------------------------------------------------------