├── core
│   ├── __init__.py
│   ├── __pycache__
│   ├── optim.py
│   ├── meters.py
│   ├── config.py
│   ├── data_util.py
│   └── runner_utils.py
├── models
│   ├── __init__.py
│   ├── PGBP
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   ├── phraseEncoder.py
│   │   ├── gauss.py
│   │   ├── operation.py
│   │   ├── slidewindow.py
│   │   ├── span_utils.py
│   │   ├── matcher.py
│   │   ├── triplet_loss.py
│   │   ├── layers.py
│   │   ├── encoder.py
│   │   ├── fusion.py
│   │   ├── decoder.py
│   │   ├── attention.py
│   │   └── PGBP.py
│   └── __pycache__
├── datasets
│   ├── __pycache__
│   ├── charades_len.py
│   ├── charades_mom.py
│   ├── activitynet.py
│   ├── charades.py
│   ├── tacos.py
│   ├── __init__.py
│   └── BaseDataset.py
├── conda.sh
├── requirements.txt
├── experiments
│   ├── charades
│   │   └── PGBP.yaml
│   ├── tacos
│   │   └── PGBP.yaml
│   ├── activitynet
│   │   └── PGBP.yaml
│   ├── charades_len
│   │   └── PGBP.yaml
│   └── charades_mom
│       └── PGBP.yaml
└── README.md

--------------------------------------------------------------------------------
/core/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
from .PGBP import PGBP

--------------------------------------------------------------------------------
/models/PGBP/__init__.py:
--------------------------------------------------------------------------------
from .PGBP import PGBP
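A minimal sketch (an assumption, not shown anywhere in this listing) of how these re-exports can be consumed: the model class can be resolved from the MODEL.NAME string set in the experiment YAML.

import models

# models/__init__.py and models/PGBP/__init__.py both re-export the PGBP class,
# so it can be looked up by name; its constructor arguments are defined in
# models/PGBP/PGBP.py, which is not reproduced in this listing.
ModelClass = getattr(models, "PGBP")  # the class defined in models/PGBP/PGBP.py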
--------------------------------------------------------------------------------
/conda.sh:
--------------------------------------------------------------------------------
conda create --name PGBP python=3
conda activate PGBP
conda install pytorch torchvision cudatoolkit -c pytorch
pip install easydict torchtext h5py nltk prettytable black transformers tensorboard

--------------------------------------------------------------------------------
/models/PGBP/phraseEncoder.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn


class PhraseEncodeNet(nn.Module):

    def __init__(self, dim):
        super(PhraseEncodeNet, self).__init__()
        self.unigram_conv = nn.Conv1d(dim, dim, 1, stride=1, padding=0)
        self.bigram_conv = nn.Conv1d(dim, dim, 2, stride=1, padding=1, dilation=2)
        self.trigram_conv = nn.Conv1d(dim, dim, 3, stride=1, padding=2, dilation=2)
        self.txt_linear = nn.Linear(dim * 3, dim)
        # padding and dilation are chosen so that the sequence length L is unchanged

    def forward(self, x):
        bs, _, dimc = x.size()
        words = x.transpose(-1, -2)  # B, C, L
        unigrams = self.unigram_conv(words)
        bigrams = self.bigram_conv(words)  # B, C, L
        trigrams = self.trigram_conv(words)
        phrase = torch.cat((unigrams, bigrams, trigrams), dim=1)
        phrase = phrase.transpose(-1, -2).view(bs, -1, dimc * 3)
        phrase = self.txt_linear(phrase)
        return phrase
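A minimal shape-check sketch for PhraseEncodeNet (not part of the repository), assuming the repository root is on PYTHONPATH and the pinned dependencies are installed; the tensor sizes are illustrative.

import torch
from models.PGBP.phraseEncoder import PhraseEncodeNet

words = torch.randn(2, 20, 512)   # (batch, num_words, dim), made-up sizes
encoder = PhraseEncodeNet(dim=512)
phrase = encoder(words)
print(phrase.shape)               # torch.Size([2, 20, 512]); L is preserved by the padding/dilation choices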
--------------------------------------------------------------------------------
/models/PGBP/gauss.py:
--------------------------------------------------------------------------------
import torch


def generate_gaussian_tensor(inter_label, key_frames, hp_sigma):
    """
    Generate a tensor in which each batch entry is a Gaussian sequence.

    :param inter_label: Tensor of shape (B, L) with binary interval labels;
                        the per-sample sum (interval length) scales the variance.
    :param key_frames: Tensor of shape (B,) containing the key-frame positions.
    :param hp_sigma: Scalar hyper-parameter scaling the per-sample variance.
    :return: Tensor of shape (B, L) containing Gaussian sequences.
    """
    # Generate a range of values from 0 to L-1
    B, L = inter_label.shape
    variances = hp_sigma * torch.sum(inter_label, dim=1)
    x_values = torch.arange(0, L, 1).float().cuda()

    # Repeat key_frames and variances for each batch
    key_frames = key_frames.view(-1, 1).repeat(1, L)
    variances = variances.view(-1, 1).repeat(1, L)

    # Calculate the (unnormalized) Gaussian values
    gaussian_values = torch.exp(-(x_values - key_frames)**2 / (2 * variances**2))
    return gaussian_values

--------------------------------------------------------------------------------
/core/optim.py:
--------------------------------------------------------------------------------
from transformers import AdamW, get_linear_schedule_with_warmup

from .config import config


def build_optimizer_and_scheduler(model, lr, num_train_steps, warmup_proportion):
    # no weight decay for the parameters of layer norms and biases
    no_decay = ['bias', 'layer_norm', 'LayerNorm']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': config.TRAIN.WEIGHT_DECAY
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_train_steps * warmup_proportion, num_train_steps)
    return optimizer, scheduler
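A sketch of the parameter-grouping logic above on a toy model (not part of the repository). It mirrors build_optimizer_and_scheduler but uses torch.optim.AdamW and stand-in values for config.TRAIN.WEIGHT_DECAY, the learning rate, the step budget, and the warmup proportion, since the experiment config is loaded elsewhere.

import torch
import torch.nn as nn
from transformers import get_linear_schedule_with_warmup


class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(16, 16)
        self.layer_norm = nn.LayerNorm(16)

    def forward(self, x):
        return self.layer_norm(self.proj(x))


model = ToyModel()
no_decay = ['bias', 'layer_norm', 'LayerNorm']
groups = [
    {'params': [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},   # stand-in for config.TRAIN.WEIGHT_DECAY
    {'params': [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = torch.optim.AdamW(groups, lr=5e-5)   # lr value is illustrative
num_train_steps = 1000                           # placeholder step budget
scheduler = get_linear_schedule_with_warmup(
    optimizer, int(0.01 * num_train_steps), num_train_steps)  # 1% warmup, illustrative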
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
absl-py==2.0.0
black==23.10.1
certifi==2023.7.22
charset-normalizer==3.3.2
click==8.1.7
cmake==3.25.0
contourpy==1.1.1
cycler==0.12.1
easydict==1.11
einops==0.7.0
filelock==3.13.1
fonttools==4.44.3
fsspec==2023.10.0
grpcio==1.59.2
h5py==3.10.0
huggingface-hub==0.17.3
idna==3.4
importlib-metadata==6.8.0
importlib-resources==6.1.1
Jinja2==3.1.2
joblib==1.3.2
jstyleson==0.0.2
kiwisolver==1.4.5
lit==15.0.7
Markdown==3.5.1
MarkupSafe==2.1.3
mpmath==1.3.0
mypy-extensions==1.0.0
networkx==3.0
nltk==3.8.1
numpy==1.18.5
packaging==23.2
pandas==1.1.5
pathspec==0.11.2
Pillow==10.1.0
platformdirs==3.11.0
prettytable==3.9.0
protobuf==3.20.0
pyparsing==3.1.1
python-dateutil==2.8.2
pytz==2023.3.post1
PyYAML==6.0.1
regex==2023.10.3
requests==2.31.0
safetensors==0.4.0
six==1.16.0
sympy==1.12
tensorboard==1.15.0
tokenizers==0.14.1
tomli==2.0.1
torch==2.0.0+cu118
torchaudio==2.0.1+cu118
torchdata==0.6.0
torchtext==0.15.1
torchvision==0.15.1+cu118
tqdm==4.66.1
transformers==4.35.0
triton==2.0.0
typing_extensions==4.8.0
urllib3==2.0.7
vpdb==1.0.0
wcwidth==0.2.9
Werkzeug==3.0.1
zipp==3.17.0

--------------------------------------------------------------------------------
/models/PGBP/operation.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math


def cw2se(cw, fix_out_of_bound=False):
    # Create an all-zero tensor se with the same shape as the input tensor cw
    se = torch.zeros_like(cw)

    # Compute the start coordinate
    se[..., 0] = cw[..., 0] - cw[..., 1] / 2

    # Compute the end coordinate
    se[..., 1] = cw[..., 0] + cw[..., 1] / 2

    # Optionally clamp out-of-bound coordinates
    if fix_out_of_bound:
        # Clamp start coordinates below 0.0 to 0.0
        se[..., 0][se[..., 0] < 0.0] = 0.0
        # Clamp end coordinates above 1.0 to 1.0
        se[..., 1][se[..., 1] > 1.0] = 1.0
    return se


def mask_logits(inputs, mask, mask_value=-1e30):
    mask = mask.type(torch.float32)
    return inputs + (1.0 - mask) * mask_value


class Conv1D(nn.Module):
    def __init__(self,
                 in_dim,
                 out_dim,
                 kernel_size=1,
                 stride=1,
                 padding=0,
                 bias=True):
        super(Conv1D, self).__init__()
        self.conv1d = nn.Conv1d(in_channels=in_dim,
                                out_channels=out_dim,
                                kernel_size=kernel_size,
                                padding=padding,
                                stride=stride,
                                bias=bias)

    def forward(self, x):
        # assume the input has shape (batch_size, seq_len, dim)
        x = x.transpose(1, 2)  # (batch_size, dim, seq_len)
        x = self.conv1d(x)
        return x.transpose(1, 2)  # (batch_size, seq_len, dim)
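A small worked example of cw2se (not part of the repository), assuming the repository root is on PYTHONPATH; the span values are illustrative.

import torch
from models.PGBP.operation import cw2se

# Two spans given as (center, width), normalized to [0, 1].
cw = torch.tensor([[0.50, 0.20],    # -> (0.40, 0.60)
                   [0.05, 0.30]])   # -> (-0.10, 0.20) without clamping
print(cw2se(cw))
print(cw2se(cw, fix_out_of_bound=True))  # start clamped to 0.0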
--------------------------------------------------------------------------------
/datasets/charades_len.py:
--------------------------------------------------------------------------------
""" Dataset loader for the Charades-STA dataset """
import os
import csv

import h5py
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import torch.utils.data as data
import torchtext
import json

from . import average_to_fixed_length
from .BaseDataset import BaseDataset
from core.config import config


class Charades_len(BaseDataset):
    def __init__(self, split):
        # statistics for all video length
        # min:12 max:390 mean: 62, std:18
        # max sentence length:train->10, test->10
        super(Charades_len, self).__init__(split)

    def __len__(self):
        return len(self.annotations)

    def get_annotation(self):
        anno_file = open(
            os.path.join(self.anno_dirs['Charades'],
                         "{}_len_80.jsonl".format(self.split)), 'r')
        annotations = []
        # max_sentence_length = 0
        for line in anno_file:
            line_obj = json.loads(line.strip())
            sent = line_obj["query"]
            vid = line_obj["vid"]
            times = line_obj["relevant_windows"][0]
            duration = line_obj["duration"]
            annotations.append({
                'video': vid,
                'times': times,
                'description': sent,
                'duration': duration,
                'dataset': 'Charades_len'
            })
        anno_file.close()
        # print("charade max sentence length: ", max_sentence_length)
        return annotations

--------------------------------------------------------------------------------
/datasets/charades_mom.py:
--------------------------------------------------------------------------------
""" Dataset loader for the Charades-STA dataset """
import os
import csv

import h5py
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import torch.utils.data as data
import torchtext
import json

from . import average_to_fixed_length
from .BaseDataset import BaseDataset
from core.config import config


class Charades_mom(BaseDataset):
    def __init__(self, split):
        # statistics for all video length
        # min:12 max:390 mean: 62, std:18
        # max sentence length:train->10, test->10
        super(Charades_mom, self).__init__(split)

    def __len__(self):
        return len(self.annotations)

    def get_annotation(self):
        anno_file = open(
            os.path.join(self.anno_dirs['Charades'],
                         "{}_mom_80.jsonl".format(self.split)), 'r')
        annotations = []
        # max_sentence_length = 0
        for line in anno_file:
            line_obj = json.loads(line.strip())
            sent = line_obj["query"]
            vid = line_obj["vid"]
            times = line_obj["relevant_windows"][0]
            duration = line_obj["duration"]
            annotations.append({
                'video': vid,
                'times': times,
                'description': sent,
                'duration': duration,
                'dataset': 'Charades_mom'
            })
        anno_file.close()
        # print("charade max sentence length: ", max_sentence_length)
        return annotations

--------------------------------------------------------------------------------
/datasets/activitynet.py:
--------------------------------------------------------------------------------
""" Dataset loader for the ActivityNet Captions dataset """
import os
import json

import h5py
import torch
from torch import nn
import torch.nn.functional as F
import torch.utils.data as data
import torchtext

from .BaseDataset import BaseDataset
from . import average_to_fixed_length
from core.config import config


class ActivityNet(BaseDataset):
    def __init__(self, split):
        # statistics for all video length
        # min:2 medium: max:1415 mean: 204, std:120
        # max sentence length:train-->73, test-->73
        super(ActivityNet, self).__init__(split)

    def __len__(self):
        return len(self.annotations)

    def get_annotation(self):
        with open(
                os.path.join(self.anno_dirs['ActivityNet'],
                             '{}_data.json'.format(self.split)), 'r') as f:
            annotations = json.load(f)
        anno_pairs = []
        for video_anno in annotations:
            vid = video_anno[0]
            duration = video_anno[1]
            timestamp = video_anno[2]
            sentence = video_anno[3]

            if timestamp[0] < timestamp[1]:
                anno_pairs.append({
                    'video': vid,
                    'duration': duration,
                    'times': [max(timestamp[0], 0),
                              min(timestamp[1], duration)],
                    'description': sentence,
                    'dataset': 'ActivityNet'
                })
        # print("activitynet max sentence length: ", max_sentence_length)
        return anno_pairs
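The two Charades variants above read JSONL annotations (one JSON object per line). A sketch (not part of the repository) of how one such line maps onto the annotation dict, using the field names from get_annotation; the values are made up.

import json

line = '{"query": "person opens the door.", "vid": "AO8RW", "duration": 30.0, "relevant_windows": [[2.5, 8.1]]}'
obj = json.loads(line)
annotation = {
    'video': obj["vid"],
    'times': obj["relevant_windows"][0],
    'description': obj["query"],
    'duration': obj["duration"],
    'dataset': 'Charades_len',
}
print(annotation)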
import average_to_fixed_length 14 | from .BaseDataset import BaseDataset 15 | from core.config import config 16 | 17 | 18 | class Charades(BaseDataset): 19 | def __init__(self, split): 20 | # statistics for all video length 21 | # min:12 max:390 mean: 62, std:18 22 | # max sentence length:train->10, test->10 23 | super(Charades, self).__init__(split) 24 | 25 | def __len__(self): 26 | return len(self.annotations) 27 | 28 | def get_annotation(self): 29 | self.durations = {} 30 | with open( 31 | os.path.join(self.anno_dirs['Charades'], 32 | 'Charades_v1_{}.csv'.format(self.split))) as f: 33 | reader = csv.DictReader(f) 34 | for row in reader: 35 | self.durations[row['id']] = float(row['length']) 36 | 37 | anno_file = open( 38 | os.path.join(self.anno_dirs['Charades'], 39 | "charades_sta_{}.txt".format(self.split)), 'r') 40 | annotations = [] 41 | # max_sentence_length = 0 42 | for line in anno_file: 43 | anno, sent = line.split("##") 44 | sent = sent.split('.\n')[0] 45 | vid, s_time, e_time = anno.split(" ") 46 | s_time = float(s_time) 47 | e_time = min(float(e_time), self.durations[vid]) 48 | if s_time < e_time: 49 | annotations.append({ 50 | 'video': vid, 51 | 'times': [s_time, e_time], 52 | 'description': sent, 53 | 'duration': self.durations[vid], 54 | 'dataset': 'Charades' 55 | }) 56 | anno_file.close() 57 | # print("charade max sentence length: ", max_sentence_length) 58 | return annotations -------------------------------------------------------------------------------- /datasets/tacos.py: -------------------------------------------------------------------------------- 1 | """ Dataset loader for the TACoS dataset """ 2 | import os 3 | import json 4 | 5 | import h5py 6 | import torch 7 | from torch import nn 8 | import torch.nn.functional as F 9 | import torch.utils.data as data 10 | import torchtext 11 | 12 | from . 
import average_to_fixed_length 13 | from .BaseDataset import BaseDataset 14 | from core.config import config 15 | 16 | 17 | class TACoS(BaseDataset): 18 | def __init__(self, split): 19 | # statistics for all video length 20 | # min:90 max:2578 mean: 528, std:436 21 | # max sentence length:train-->46, test-->50 22 | super(TACoS, self).__init__(split) 23 | 24 | def __len__(self): 25 | return len(self.annotations) 26 | 27 | def get_annotation(self): 28 | # val_1.json is renamed as val.json, val_2.json is renamed as test.json 29 | with open( 30 | os.path.join(self.anno_dirs['TACoS'], 31 | '{}.json'.format(self.split)), 'r') as f: 32 | annotations = json.load(f) 33 | anno_pairs = [] 34 | # max_sentence_length = 0 35 | for vid, video_anno in annotations.items(): 36 | duration = video_anno['num_frames'] / video_anno['fps'] 37 | for timestamp, sentence in zip(video_anno['timestamps'], 38 | video_anno['sentences']): 39 | if timestamp[0] < timestamp[1]: 40 | anno_pairs.append({ 41 | 'video': 42 | vid, 43 | # vid[:-4], 44 | 'duration': 45 | duration, 46 | 'times': [ 47 | max(timestamp[0] / video_anno['fps'], 0), 48 | min(timestamp[1] / video_anno['fps'], duration) 49 | ], 50 | 'description': 51 | sentence, 52 | 'dataset': 53 | 'TACoS' 54 | }) 55 | # print("tacos max sentence length: ", max_sentence_length) 56 | return anno_pairs -------------------------------------------------------------------------------- /experiments/charades/PGBP.yaml: -------------------------------------------------------------------------------- 1 | WORKERS: 4 2 | 3 | SEED: 328 4 | 5 | DATASET: 6 | NAME: Charades 7 | NO_VAL: True 8 | NORMALIZE: False 9 | num_pairs: 15 10 | num_clips: 256 11 | 12 | MODEL: 13 | NAME: PGBP 14 | CHECKPOINT: /media/Harddisk/zzb/work1/202404/results/Charades/EAMATDETR901/checkpoints/EAMAT1_26.t7 15 | PARAMS: 16 | aligned_len: True 17 | sementic_fu: True 18 | use_keyword: True 19 | cost_class: 4.0 20 | cost_span: 10.0 21 | cost_giou: 1.0 22 | eos_coef: 0.1 23 | content_prior: "learned" 24 | neg: True 25 | top_k: 6 26 | pos: True 27 | top_k0: 1 28 | fusion_attention: True 29 | num_queries: 10 30 | windowsize: 10 31 | video_feature_dim: 1024 32 | query_feature_dim: 300 33 | max_len_query_tag: 8 34 | dim: 512 35 | query_position: False 36 | video_position: True 37 | query_attention_layers: 1 38 | video_attention_layers: 1 39 | query_attention: "MultiLSTMAttention" 40 | video_attention: "MultiHeadAttention" 41 | early_attention: "MultiHeadAttention" 42 | detr_attention: "DETR_Decoder" 43 | detr_layers: 5 44 | early_attention_layers: 1 45 | post_attention_layers: 2 46 | post_attention: "MultiLSTMAttention" # choice of [MultiHeadAttention, DaMultiHeadAttention, MultiLSTMAttention, MultiConvAttention] 47 | early_fusion_module: "CQFusion" # choice of [CQFusion, InteractorFusion, CosineFusion] 48 | fusion_module: "multiscale_CQFusion" #multiscale_Fusion, CQFusion 49 | beta: 2 50 | MULTI_SCALE: True 51 | MULTI_SCALE_LEN: 1 52 | num_heads: 8 53 | num_layers: 1 54 | num_step: 3 55 | bi_direction: True 56 | kernels: [3, 5, 7] 57 | drop_rate: 0.5 58 | DEBUG: False 59 | 60 | modulate_t_attn: True 61 | bbox_embed_diff_each_layer: False 62 | query_scale_type: 'cond_elewise' #'cond_elewise', 'cond_scalar', 'fix_elewise' 63 | query_dim: 2 64 | return_intermediate: True 65 | feedforward: 1024 66 | dropout: 0.1 67 | activation: "relu" 68 | normalize_before: False 69 | keep_query_pos: False 70 | rm_self_attn_decoder: False 71 | 72 | TRAIN: 73 | BATCH_SIZE: 16 74 | LR: 0.00005 75 | WEIGHT_DECAY: 0.01 76 | MAX_EPOCH: 50 77 
| MILE_STONE: [80] 78 | GAMMA: 0.1 79 | CONTINUE: False 80 | 81 | LOSS: 82 | LOCALIZATION: 10.0 83 | MATCH: 10.0 84 | DISTANCE: 0.0 85 | KL: 100.0 86 | EARLY: 1.0 87 | contrast: 1.0 88 | cont: 1.0 89 | hy_sigma: 1.0 90 | contrast_weight: True 91 | bce: 4.0 92 | iou: 1.0 93 | 94 | TEST: 95 | BATCH_SIZE: 32 96 | EVAL_TRAIN: True 97 | -------------------------------------------------------------------------------- /experiments/tacos/PGBP.yaml: -------------------------------------------------------------------------------- 1 | WORKERS: 2 2 | 3 | SEED: 12345 4 | 5 | DATASET: 6 | NAME: TACoS 7 | NO_VAL: True 8 | NORMALIZE: False 9 | num_pairs: 15 10 | num_clips: 500 11 | 12 | MODEL: 13 | NAME: PGBP 14 | CHECKPOINT: /media/Harddisk/zzb/work1/202404/results/Charades/EAMATDETR901/checkpoints/EAMAT1_26.t7 15 | PARAMS: 16 | aligned_len: True 17 | sementic_fu: True 18 | use_keyword: True 19 | cost_class: 4.0 20 | cost_span: 10.0 21 | cost_giou: 1.0 22 | eos_coef: 0.1 23 | content_prior: "learned" 24 | neg: True 25 | top_k: 5 26 | pos: True 27 | top_k0: 3 28 | fusion_attention: True 29 | num_queries: 10 30 | windowsize: 10 31 | video_feature_dim: 4096 32 | query_feature_dim: 300 33 | max_len_query_tag: 8 34 | dim: 512 35 | query_position: False 36 | video_position: True 37 | query_attention_layers: 1 38 | video_attention_layers: 1 39 | query_attention: "MultiLSTMAttention" 40 | video_attention: "MultiHeadAttention" 41 | early_attention: "MultiHeadAttention" 42 | detr_attention: "DETR_Decoder" 43 | detr_layers: 5 44 | early_attention_layers: 1 45 | post_attention_layers: 3 46 | post_attention: "MultiLSTMAttention" # choice of [MultiHeadAttention, DaMultiHeadAttention, MultiLSTMAttention, MultiConvAttention] 47 | early_fusion_module: "CQFusion" # choice of [CQFusion, InteractorFusion, CosineFusion] 48 | fusion_module: "multiscale_CQFusion" #multiscale_Fusion, CQFusion 49 | beta: 2 50 | MULTI_SCALE: True 51 | MULTI_SCALE_LEN: 1 52 | num_heads: 8 53 | num_layers: 1 54 | num_step: 3 55 | bi_direction: True 56 | kernels: [3, 5, 7] 57 | drop_rate: 0.5 58 | DEBUG: False 59 | 60 | modulate_t_attn: True 61 | bbox_embed_diff_each_layer: False 62 | query_scale_type: 'cond_elewise' #'cond_elewise', 'cond_scalar', 'fix_elewise' 63 | query_dim: 2 64 | return_intermediate: True 65 | feedforward: 1024 66 | dropout: 0.1 67 | activation: "relu" 68 | normalize_before: False 69 | keep_query_pos: False 70 | rm_self_attn_decoder: False 71 | 72 | TRAIN: 73 | BATCH_SIZE: 16 74 | LR: 0.00005 75 | WEIGHT_DECAY: 0.01 76 | MAX_EPOCH: 50 77 | MILE_STONE: [80] 78 | GAMMA: 0.1 79 | CONTINUE: False 80 | 81 | LOSS: 82 | LOCALIZATION: 10.0 83 | MATCH: 10.0 84 | DISTANCE: 0.0 85 | KL: 100.0 86 | EARLY: 1.0 87 | contrast: 1.0 88 | cont: 1.0 89 | hy_sigma: 1.0 90 | contrast_weight: True 91 | bce: 4.0 92 | iou: 1.0 93 | 94 | TEST: 95 | BATCH_SIZE: 32 96 | EVAL_TRAIN: True 97 | -------------------------------------------------------------------------------- /experiments/activitynet/PGBP.yaml: -------------------------------------------------------------------------------- 1 | WORKERS: 4 2 | 3 | SEED: 12345 4 | 5 | DATASET: 6 | NAME: ActivityNet 7 | NO_VAL: True 8 | NORMALIZE: True 9 | num_pairs: 15 10 | num_clips: 256 11 | 12 | MODEL: 13 | NAME: PGBP 14 | CHECKPOINT: /media/Harddisk/zzb/work1/202404/results/Charades/EAMATDETR901/checkpoints/EAMAT1_26.t7 15 | PARAMS: 16 | aligned_len: True 17 | sementic_fu: True 18 | use_keyword: True 19 | cost_class: 4.0 20 | cost_span: 10.0 21 | cost_giou: 1.0 22 | eos_coef: 0.1 23 | content_prior: 
"learned" 24 | neg: True 25 | top_k: 5 26 | pos: True 27 | top_k0: 1 28 | fusion_attention: True 29 | num_queries: 10 30 | windowsize: 10 31 | video_feature_dim: 1024 32 | query_feature_dim: 300 33 | max_len_query_tag: 8 34 | dim: 512 35 | query_position: False 36 | video_position: True 37 | query_attention_layers: 1 38 | video_attention_layers: 1 39 | query_attention: "MultiLSTMAttention" 40 | video_attention: "MultiHeadAttention" 41 | early_attention: "MultiHeadAttention" 42 | detr_attention: "DETR_Decoder" 43 | detr_layers: 5 44 | early_attention_layers: 1 45 | post_attention_layers: 2 46 | post_attention: "MultiLSTMAttention" # choice of [MultiHeadAttention, DaMultiHeadAttention, MultiLSTMAttention, MultiConvAttention] 47 | early_fusion_module: "CQFusion" # choice of [CQFusion, InteractorFusion, CosineFusion] 48 | fusion_module: "multiscale_CQFusion" #multiscale_Fusion, CQFusion 49 | beta: 2 50 | MULTI_SCALE: True 51 | MULTI_SCALE_LEN: 1 52 | num_heads: 8 53 | num_layers: 1 54 | num_step: 3 55 | bi_direction: True 56 | kernels: [3, 5, 7] 57 | drop_rate: 0.5 58 | DEBUG: False 59 | 60 | modulate_t_attn: True 61 | bbox_embed_diff_each_layer: False 62 | query_scale_type: 'cond_elewise' #'cond_elewise', 'cond_scalar', 'fix_elewise' 63 | query_dim: 2 64 | return_intermediate: True 65 | feedforward: 1024 66 | dropout: 0.1 67 | activation: "relu" 68 | normalize_before: False 69 | keep_query_pos: False 70 | rm_self_attn_decoder: False 71 | 72 | TRAIN: 73 | BATCH_SIZE: 64 74 | LR: 0.00005 75 | WEIGHT_DECAY: 0.01 76 | MAX_EPOCH: 50 77 | MILE_STONE: [80] 78 | GAMMA: 0.1 79 | CONTINUE: False 80 | 81 | LOSS: 82 | LOCALIZATION: 10.0 83 | MATCH: 10.0 84 | DISTANCE: 0.0 85 | KL: 100.0 86 | EARLY: 1.0 87 | contrast: 1.0 88 | cont: 1.0 89 | hy_sigma: 1.0 90 | contrast_weight: True 91 | bce: 4.0 92 | iou: 1.0 93 | 94 | TEST: 95 | BATCH_SIZE: 64 96 | EVAL_TRAIN: True 97 | -------------------------------------------------------------------------------- /experiments/charades_len/PGBP.yaml: -------------------------------------------------------------------------------- 1 | WORKERS: 4 2 | 3 | SEED: 328 4 | 5 | DATASET: 6 | NAME: Charades_len 7 | NO_VAL: True 8 | NORMALIZE: True 9 | num_pairs: 15 10 | num_clips: 256 11 | 12 | MODEL: 13 | NAME: PGBP 14 | CHECKPOINT: /media/Harddisk/zzb/work1/202404/results/Charades/EAMATDETR901/checkpoints/EAMAT1_26.t7 15 | PARAMS: 16 | aligned_len: True 17 | sementic_fu: True 18 | use_keyword: True 19 | cost_class: 4.0 20 | cost_span: 10.0 21 | cost_giou: 1.0 22 | eos_coef: 0.1 23 | content_prior: "learned" 24 | neg: True 25 | top_k: 6 26 | pos: True 27 | top_k0: 1 28 | fusion_attention: True 29 | num_queries: 10 30 | windowsize: 10 31 | video_feature_dim: 4096 32 | query_feature_dim: 300 33 | max_len_query_tag: 8 34 | dim: 512 35 | query_position: False 36 | video_position: True 37 | query_attention_layers: 1 38 | video_attention_layers: 1 39 | query_attention: "MultiLSTMAttention" 40 | video_attention: "MultiHeadAttention" 41 | early_attention: "MultiHeadAttention" 42 | detr_attention: "DETR_Decoder" 43 | detr_layers: 5 44 | early_attention_layers: 1 45 | post_attention_layers: 2 46 | post_attention: "MultiLSTMAttention" # choice of [MultiHeadAttention, DaMultiHeadAttention, MultiLSTMAttention, MultiConvAttention] 47 | early_fusion_module: "CQFusion" # choice of [CQFusion, InteractorFusion, CosineFusion] 48 | fusion_module: "multiscale_CQFusion" #multiscale_Fusion, CQFusion 49 | beta: 2 50 | MULTI_SCALE: True 51 | MULTI_SCALE_LEN: 1 52 | num_heads: 8 53 | num_layers: 1 54 | 
num_step: 3 55 | bi_direction: True 56 | kernels: [3, 5, 7] 57 | drop_rate: 0.5 58 | DEBUG: False 59 | 60 | modulate_t_attn: True 61 | bbox_embed_diff_each_layer: False 62 | query_scale_type: 'cond_elewise' #'cond_elewise', 'cond_scalar', 'fix_elewise' 63 | query_dim: 2 64 | return_intermediate: True 65 | feedforward: 1024 66 | dropout: 0.1 67 | activation: "relu" 68 | normalize_before: False 69 | keep_query_pos: False 70 | rm_self_attn_decoder: False 71 | 72 | TRAIN: 73 | BATCH_SIZE: 16 74 | LR: 0.00005 75 | WEIGHT_DECAY: 0.01 76 | MAX_EPOCH: 50 77 | MILE_STONE: [80] 78 | GAMMA: 0.1 79 | CONTINUE: False 80 | 81 | LOSS: 82 | LOCALIZATION: 10.0 83 | MATCH: 10.0 84 | DISTANCE: 0.0 85 | KL: 100.0 86 | EARLY: 1.0 87 | contrast: 1.0 88 | cont: 1.0 89 | hy_sigma: 1.0 90 | contrast_weight: True 91 | bce: 4.0 92 | iou: 1.0 93 | 94 | TEST: 95 | BATCH_SIZE: 32 96 | EVAL_TRAIN: True 97 | -------------------------------------------------------------------------------- /experiments/charades_mom/PGBP.yaml: -------------------------------------------------------------------------------- 1 | WORKERS: 4 2 | 3 | SEED: 328 4 | 5 | DATASET: 6 | NAME: Charades_mom 7 | NO_VAL: True 8 | NORMALIZE: True 9 | num_pairs: 15 10 | num_clips: 256 11 | 12 | MODEL: 13 | NAME: PGBP 14 | CHECKPOINT: /media/Harddisk/zzb/work1/202404/results/Charades/EAMATDETR901/checkpoints/EAMAT1_26.t7 15 | PARAMS: 16 | aligned_len: True 17 | sementic_fu: True 18 | use_keyword: True 19 | cost_class: 4.0 20 | cost_span: 10.0 21 | cost_giou: 1.0 22 | eos_coef: 0.1 23 | content_prior: "learned" 24 | neg: True 25 | top_k: 6 26 | pos: True 27 | top_k0: 1 28 | fusion_attention: True 29 | num_queries: 10 30 | windowsize: 10 31 | video_feature_dim: 4096 32 | query_feature_dim: 300 33 | max_len_query_tag: 8 34 | dim: 512 35 | query_position: False 36 | video_position: True 37 | query_attention_layers: 1 38 | video_attention_layers: 1 39 | query_attention: "MultiLSTMAttention" 40 | video_attention: "MultiHeadAttention" 41 | early_attention: "MultiHeadAttention" 42 | detr_attention: "DETR_Decoder" 43 | detr_layers: 5 44 | early_attention_layers: 1 45 | post_attention_layers: 2 46 | post_attention: "MultiLSTMAttention" # choice of [MultiHeadAttention, DaMultiHeadAttention, MultiLSTMAttention, MultiConvAttention] 47 | early_fusion_module: "CQFusion" # choice of [CQFusion, InteractorFusion, CosineFusion] 48 | fusion_module: "multiscale_CQFusion" #multiscale_Fusion, CQFusion 49 | beta: 2 50 | MULTI_SCALE: True 51 | MULTI_SCALE_LEN: 1 52 | num_heads: 8 53 | num_layers: 1 54 | num_step: 3 55 | bi_direction: True 56 | kernels: [3, 5, 7] 57 | drop_rate: 0.5 58 | DEBUG: False 59 | 60 | modulate_t_attn: True 61 | bbox_embed_diff_each_layer: False 62 | query_scale_type: 'cond_elewise' #'cond_elewise', 'cond_scalar', 'fix_elewise' 63 | query_dim: 2 64 | return_intermediate: True 65 | feedforward: 1024 66 | dropout: 0.1 67 | activation: "relu" 68 | normalize_before: False 69 | keep_query_pos: False 70 | rm_self_attn_decoder: False 71 | 72 | TRAIN: 73 | BATCH_SIZE: 16 74 | LR: 0.00005 75 | WEIGHT_DECAY: 0.01 76 | MAX_EPOCH: 50 77 | MILE_STONE: [80] 78 | GAMMA: 0.1 79 | CONTINUE: False 80 | 81 | LOSS: 82 | LOCALIZATION: 10.0 83 | MATCH: 10.0 84 | DISTANCE: 0.0 85 | KL: 100.0 86 | EARLY: 1.0 87 | contrast: 1.0 88 | cont: 1.0 89 | hy_sigma: 1.0 90 | contrast_weight: True 91 | bce: 4.0 92 | iou: 1.0 93 | 94 | TEST: 95 | BATCH_SIZE: 32 96 | EVAL_TRAIN: True 97 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # Aggregate and Discriminate: Pseudo Clips-Guided Boundary Perception for Video Moment Retrieval 2 | 3 | This is the implementation of the paper "Aggregate and Discriminate: Pseudo Clips-Guided Boundary Perception for Video Moment Retrieval" (**TMM 2024**) 4 | 5 | ```shell 6 | # prepare the environment 7 | bash conda.sh 8 | ``` 9 | ## Introduction 10 | Video moment retrieval (VMR) aims to localize a video segment in an untrimmed video that is semantically relevant to a language query. The challenge of this task lies in effectively aligning the intricate and information-dense video modality with the succinctly summarized textual modality, and further localizing the starting and ending timestamps of the target moments. Previous works have attempted to achieve multi-granularity alignment of video and query in a coarse-to-fine manner, yet these efforts still fall short in addressing the inherent disparities in representation and information density between videos and queries, leading to modal misalignment. In this paper, we propose a progressive video moment retrieval framework that first retrieves the video clips most relevant and most irrelevant to the query as semantic guidance, thereby bridging the semantic gap between the video and language modalities. Furthermore, we introduce a pseudo-clips-guided aggregation module that pulls densely relevant moment clips closer together, and propose a discriminative boundary-enhanced decoder that, guided by the pseudo clips, pushes semantically confusing proposals away. Extensive experiments on the Charades-STA, ActivityNet Captions and TACoS datasets demonstrate that our method outperforms existing methods. 11 |
12 | *(figure)* 13 |
14 | 15 | ## Dataset Preparation 16 | We use [VSLNet's](https://github.com/IsaacChanghau/VSLNet) data. The visual features can be downloaded [here](https://app.box.com/s/h0sxa5klco6qve5ahnz50ly2nksmuedw); for Charades-STA we use the "new" folder, and for TACoS we use the "old" folder. Annotations and other details can be found [here](https://github.com/IsaacChanghau/VSLNet/tree/master/prepare), 17 | then modify lines 81~91 of "datasets/BaseDataset.py" to point to your own paths. 18 | 19 | ## Quick Start 20 | **Train** 21 | ```shell script 22 | python main.py --cfg experiments/activitynet/PGBP.yaml --mode train 23 | python main.py --cfg experiments/charades/PGBP.yaml --mode train 24 | python main.py --cfg experiments/tacos/PGBP.yaml --mode train 25 | 26 | python main.py --cfg experiments/charades_len/PGBP.yaml --mode train 27 | python main.py --cfg experiments/charades_mom/PGBP.yaml --mode train 28 | ``` 29 | A new folder "results" will be created. 30 | 31 | ## Citation 32 | If you find this project helpful to your research, please cite our work. 33 | -------------------------------------------------------------------------------- /core/meters.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | class CatMeter: 6 | ''' 7 | Concatenate Meter for torch.Tensor 8 | ''' 9 | def __init__(self): 10 | self.reset() 11 | 12 | def reset(self): 13 | self.val = None 14 | 15 | def update(self, val): 16 | if self.val is None: 17 | self.val = val 18 | else: 19 | self.val = torch.cat([self.val, val], dim=0) 20 | 21 | def get_val(self): 22 | return self.val 23 | 24 | def get_val_numpy(self): 25 | return self.val.data.cpu().numpy() 26 | 27 | 28 | class MultiItemAverageMeter: 29 | def __init__(self): 30 | self.reset() 31 | 32 | def reset(self): 33 | self.content = {} 34 | 35 | def update(self, val): 36 | ''' 37 | :param val: dict, keys are strs, values are torch.Tensor or np.array 38 | ''' 39 | for key in list(val.keys()): 40 | value = val[key] 41 | if key not in list(self.content.keys()): 42 | self.content[key] = {'avg': value, 'sum': value, 'count': 1.0} 43 | else: 44 | self.content[key]['sum'] += value 45 | self.content[key]['count'] += 1.0 46 | self.content[key]['avg'] = self.content[key]['sum'] / \ 47 | self.content[key]['count'] 48 | 49 | def get_val(self): 50 | keys = list(self.content.keys()) 51 | values = [] 52 | for key in keys: 53 | val = self.content[key]['avg'] 54 | if isinstance(val, torch.Tensor): 55 | val = val.data.cpu().numpy() 56 | values.append(val) 57 | return keys, values 58 | 59 | def get_str(self): 60 | 61 | result = '' 62 | keys, values = self.get_val() 63 | 64 | for key, value in zip(keys, values): 65 | result += key 66 | result += ': ' 67 | if isinstance(value, np.ndarray): 68 | value = np.round(value, 5) 69 | result += str(value) 70 | result += '; ' 71 | 72 | return result 73 | 74 | 75 | # class AverageMeter: 76 | # """ 77 | # Average Meter 78 | # """ 79 | 80 | # def __init__(self): 81 | # self.reset() 82 | 83 | # def reset(self): 84 | # self.sum = 0 85 | # self.count = 0 86 | 87 | # def update(self, val): 88 | # self.sum += val 89 | # self.count += 1 90 | 91 | # def get_val(self): 92 | # return self.sum / self.count 93 | 94 | 95 | class AverageMeter(object): 96 | """Computes and stores the average and current value""" 97 | def __init__(self): 98 | self.reset() 99 | 100 | def reset(self): 101 | self.val = 0 102 | self.avg = 0 103 | self.sum = 0 104 | self.count = 0 105 | 106 | def update(self, val, n=1): 107 | self.val
= val 108 | self.sum += val * n 109 | self.count += n 110 | self.avg = self.sum / self.count 111 | 112 | def get_val(self): 113 | return self.sum / self.count 114 | -------------------------------------------------------------------------------- /core/config.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import yaml 6 | from easydict import EasyDict as edict 7 | 8 | config = edict() 9 | 10 | config.WORKERS = 16 11 | config.SEED = 328 12 | config.LOG_DIR = '' 13 | config.MODEL_DIR = '' 14 | config.VERBOSE = False 15 | config.TAG = None 16 | 17 | # common params for NETWORK 18 | config.MODEL = edict() 19 | config.MODEL.NAME = '' 20 | config.MODEL.CHECKPOINT = '' # The checkpoint for the best performance 21 | config.MODEL.PARAMS = None 22 | 23 | # DATASET related params 24 | config.DATASET = edict() 25 | config.DATASET.NAME = '' 26 | config.DATASET.DATASETS = [] 27 | config.DATASET.NO_VAL = True 28 | config.DATASET.NUM_SAMPLE_CLIPS = 128 29 | config.DATASET.SPLIT = '' 30 | config.DATASET.NORMALIZE = False 31 | config.DATASET.EXTEND_INNRE = 0.0 # extend the inner action label 32 | config.DATASET.EXTEND_TIME = False # extend TIME length of the input for bias 33 | config.DATASET.FLIP_TIME = False # flip the input in time direction 34 | config.DATASET.num_pairs = 10 35 | config.DATASET.num_clips = 256 36 | # train 37 | config.TRAIN = edict() 38 | config.TRAIN.LR = 0.001 39 | config.TRAIN.WEIGHT_DECAY = 0.0001 40 | config.TRAIN.FACTOR = 0.8 41 | config.TRAIN.PATIENCE = 5 42 | config.TRAIN.GAMMA = 0.5 43 | config.TRAIN.MILE_STONE = [10, 15] 44 | config.TRAIN.MAX_EPOCH = 20 45 | config.TRAIN.BATCH_SIZE = 4 46 | config.TRAIN.PER_NEGATIVE_PAIRS_INBATCH = 3 47 | config.TRAIN.SHUFFLE = True 48 | config.TRAIN.CONTINUE = False 49 | config.TRAIN.MILESTONES = [7,15] 50 | 51 | config.LOSS = edict() 52 | config.LOSS.NAME = 'bce_loss' 53 | config.LOSS.MATCH = 1.0 54 | config.LOSS.DISTANCE = 1.0 55 | config.LOSS.KL = 1.0 56 | config.LOSS.EARLY = 1.0 57 | config.LOSS.LOCALIZATION = 1.0 58 | config.LOSS.CLIP_NORM = 1.0 59 | config.LOSS.DCOR = 1.0 60 | config.LOSS.contrast = 1.0 61 | config.LOSS.cont = 1.0 62 | config.LOSS.iou = 1.0 63 | config.LOSS.saliency_margin = 0.2 64 | config.LOSS.hy_sigma = 1.0 65 | config.LOSS.contrast_weight = True 66 | config.LOSS.PARAMS = None 67 | config.LOSS.bce = 1.0 68 | # test 69 | config.TEST = edict() 70 | config.TEST.RECALL = [] 71 | config.TEST.TIOU = [] 72 | config.TEST.NMS_THRESH = 0.4 73 | config.TEST.INTERVAL = 1 74 | config.TEST.EVAL_TRAIN = False 75 | config.TEST.BATCH_SIZE = 1 76 | config.TEST.TOP_K = 10 77 | config.TEST.SHUFFLE_VIDEO_FRAME = False 78 | 79 | 80 | def _update_dict(cfg, value): 81 | for k, v in value.items(): 82 | if k in cfg: 83 | if k == 'PARAMS': 84 | cfg[k] = v 85 | elif isinstance(v, dict): 86 | _update_dict(cfg[k], v) 87 | else: 88 | cfg[k] = v 89 | else: 90 | raise ValueError("{} not exist in config.py".format(k)) 91 | 92 | 93 | def update_config(config_file): 94 | with open(config_file) as f: 95 | exp_config = edict(yaml.load(f, Loader=yaml.FullLoader)) 96 | for k, v in exp_config.items(): 97 | if k in config: 98 | if isinstance(v, dict): 99 | _update_dict(config[k], v) 100 | else: 101 | config[k] = v 102 | else: 103 | raise ValueError("{} not exist in config.py".format(k)) 104 | -------------------------------------------------------------------------------- 
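A short note on how the configuration in `core/config.py` above is consumed. The sketch below is a minimal usage example (not a file from this repository), assuming the repository root is on `PYTHONPATH` and the experiment YAMLs exist as listed under `experiments/`:

```python
# Minimal sketch: merge an experiment YAML into the global default config.
# update_config() walks the YAML and overwrites matching keys in the EasyDict
# `config`; unknown top-level keys raise ValueError, and MODEL.PARAMS is
# copied verbatim from the YAML.
from core.config import config, update_config

update_config("experiments/charades/PGBP.yaml")

print(config.MODEL.NAME)                # "PGBP"
print(config.TRAIN.BATCH_SIZE)          # 16 in the Charades experiment file
print(config.MODEL.PARAMS.num_queries)  # 10, taken directly from MODEL.PARAMS in the YAML
```

The `main.py --cfg experiments/.../PGBP.yaml` commands in the README pass these same YAML files, presumably through this loading path.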
/models/PGBP/slidewindow.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def find_most_relevant_frame(probabilities, mask, window_size): 4 | """ 5 | This function finds the most relevant frame in a batch of videos based on the probabilities of each frame 6 | being relevant to the text. It uses a sliding window approach to find a continuous sequence of frames 7 | with the highest average probability. The mask ensures that only valid values are considered. 8 | 9 | :param probabilities: Batched tensor of probabilities (shape: [B, L]). 10 | :param mask: Batched tensor of masks (shape: [B, L]) where 1 indicates a valid value and 0 indicates invalid. 11 | :param window_size: Size of the sliding window. 12 | :return: The index of the frame with the highest probability for each batch. 13 | """ 14 | batch_size, L = probabilities.shape 15 | 16 | # Initialize arrays to store results 17 | indices_of_max_frames = torch.zeros(batch_size, dtype=int).cuda() 18 | visual_len = torch.sum(mask,dim=1).long() 19 | for batch_index in range(batch_size): 20 | # Slide the window across the valid probabilities 21 | max_avg_probability = 0 22 | index_of_max_frame = 0 23 | probability = probabilities[batch_index] 24 | if visual_len[batch_index] < window_size: 25 | index_of_max_frame = torch.max(probability[0:visual_len[batch_index]],dim = 0)[1] 26 | else: 27 | for start_index in range(visual_len[batch_index] - window_size + 1): 28 | # Compute the average probability for the current window 29 | window_avg = torch.mean(probability[start_index:start_index + window_size]) 30 | 31 | # If the current window's average probability is greater than the max found so far, update max 32 | if window_avg > max_avg_probability: 33 | max_avg_probability = window_avg 34 | index_of_max_frame = torch.max(probability[start_index:start_index + window_size],dim = 0)[1] 35 | index_of_max_frame = index_of_max_frame + start_index 36 | indices_of_max_frames[batch_index] = index_of_max_frame 37 | if (indices_of_max_frames >= visual_len).any(): 38 | print("indices_of_max_frames out of boundary") 39 | return indices_of_max_frames 40 | 41 | # Example usage: 42 | 43 | # torch.manual_seed(42) # For reproducibility 44 | # B = 3 # Batch size 45 | # L = 100 # Length of each video in frames 46 | # probabilities_batched = torch.rand(B, L) # Random probabilities 47 | # mask_batched = torch.randint(0, 2, size=(B, L)) # Random binary mask 48 | # # Define a window size, e.g., corresponding to 1 second of video at 30 fps 49 | # window_size = 30 50 | 51 | # # Find the index of the most relevant frame for each batch 52 | # index_of_max_frames_batched = find_most_relevant_frame_batched(probabilities_batched, mask_batched, window_size) 53 | 54 | # index_of_max_frames_batched 55 | def get_neg_sample(pos_ind,mask,pred): 56 | B,L = mask.shape 57 | mask1 = mask.clone() 58 | for i in range(B): 59 | mask1[i, pos_ind[i]:] = 0.0 60 | mask2 = mask-mask1 61 | neg1_value,neg1 = torch.min(pred.masked_fill(~mask1.bool(), float('1.0')), dim=1) 62 | neg2_value,neg2 = torch.min(pred.masked_fill(~mask2.bool(), float('1.0')), dim=1) 63 | condition1 = (neg1_value == 1.0) 64 | neg1 = torch.where(condition1, neg2, neg1) 65 | condition2 = (neg2_value == 1.0) 66 | neg2 = torch.where(condition2, neg1, neg2) 67 | return neg1,neg2 -------------------------------------------------------------------------------- /models/PGBP/span_utils.py: -------------------------------------------------------------------------------- 1 | import 
torch 2 | 3 | 4 | def span_xx_to_cxw(xx_spans): 5 | """ 6 | Args: 7 | xx_spans: tensor, (#windows, 2) or (..., 2), each row is a window of format (st, ed) 8 | 9 | Returns: 10 | cxw_spans: tensor, (#windows, 2), each row is a window of format (center=(st+ed)/2, width=(ed-st)) 11 | >>> spans = torch.Tensor([[0, 1], [0.2, 0.4]]) 12 | >>> span_xx_to_cxw(spans) 13 | tensor([[0.5000, 1.0000], 14 | [0.3000, 0.2000]]) 15 | >>> spans = torch.Tensor([[[0, 1], [0.2, 0.4]]]) 16 | >>> span_xx_to_cxw(spans) 17 | tensor([[[0.5000, 1.0000], 18 | [0.3000, 0.2000]]]) 19 | """ 20 | center = xx_spans.sum(-1) * 0.5 21 | width = xx_spans[..., 1] - xx_spans[..., 0] 22 | return torch.stack([center, width], dim=-1) 23 | 24 | 25 | def span_cxw_to_xx(cxw_spans): 26 | """ 27 | Args: 28 | cxw_spans: tensor, (#windows, 2) or (..., 2), the last dim is a row denoting a window of format (center, width) 29 | 30 | >>> spans = torch.Tensor([[0.5000, 1.0000], [0.3000, 0.2000]]) 31 | >>> span_cxw_to_xx(spans) 32 | tensor([[0.0000, 1.0000], 33 | [0.2000, 0.4000]]) 34 | >>> spans = torch.Tensor([[[0.5000, 1.0000], [0.3000, 0.2000]]]) 35 | >>> span_cxw_to_xx(spans) 36 | tensor([[[0.0000, 1.0000], 37 | [0.2000, 0.4000]]]) 38 | """ 39 | x1 = cxw_spans[..., 0] - 0.5 * cxw_spans[..., 1] 40 | x2 = cxw_spans[..., 0] + 0.5 * cxw_spans[..., 1] 41 | return torch.stack([x1, x2], dim=-1) 42 | 43 | 44 | def temporal_iou(spans1, spans2): 45 | """ 46 | Args: 47 | spans1: (N, 2) torch.Tensor, each row defines a span [st, ed] 48 | spans2: (M, 2) torch.Tensor, ... 49 | 50 | Returns: 51 | iou: (N, M) torch.Tensor 52 | union: (N, M) torch.Tensor 53 | >>> test_spans1 = torch.Tensor([[0, 0.2], [0.5, 1.0]]) 54 | >>> test_spans2 = torch.Tensor([[0, 0.3], [0., 1.0]]) 55 | >>> temporal_iou(test_spans1, test_spans2) 56 | (tensor([[0.6667, 0.2000], 57 | [0.0000, 0.5000]]), 58 | tensor([[0.3000, 1.0000], 59 | [0.8000, 1.0000]])) 60 | """ 61 | areas1 = spans1[:, 1] - spans1[:, 0] # (N, ) 62 | areas2 = spans2[:, 1] - spans2[:, 0] # (M, ) 63 | 64 | left = torch.max(spans1[:, None, 0], spans2[:, 0]) # (N, M) 65 | right = torch.min(spans1[:, None, 1], spans2[:, 1]) # (N, M) 66 | 67 | inter = (right - left).clamp(min=0) # (N, M) 68 | union = areas1[:, None] + areas2 - inter # (N, M) 69 | 70 | iou = inter / union 71 | return iou, union 72 | 73 | 74 | def temporal_intersection_over_pred(gt_spans, pred_spans): 75 | """ intersection over the second input spans 76 | Args: 77 | gt_spans: (N, 2), 78 | pred_spans: (M, 2) 79 | 80 | Returns: 81 | 82 | """ 83 | left = torch.max(gt_spans[:, None, 0], pred_spans[:, 0]) 84 | right = torch.min(gt_spans[:, None, 1], pred_spans[:, 1]) 85 | 86 | inter = (right - left).clamp(min=0) # (N, M) 87 | inter_over_pred = inter / (pred_spans[:, 1] - pred_spans[:, 0]) 88 | return inter_over_pred 89 | 90 | 91 | def generalized_temporal_iou(spans1, spans2): 92 | """ 93 | Generalized IoU from https://giou.stanford.edu/ 94 | Also reference to DETR implementation of generalized_box_iou 95 | https://github.com/facebookresearch/detr/blob/master/util/box_ops.py#L40 96 | 97 | Args: 98 | spans1: (N, 2) torch.Tensor, each row defines a span in xx format [st, ed] 99 | spans2: (M, 2) torch.Tensor, ... 
100 | 101 | Returns: 102 | giou: (N, M) torch.Tensor 103 | 104 | >>> test_spans1 = torch.Tensor([[0, 0.2], [0.5, 1.0]]) 105 | >>> test_spans2 = torch.Tensor([[0, 0.3], [0., 1.0]]) 106 | >>> generalized_temporal_iou(test_spans1, test_spans2) 107 | tensor([[ 0.6667, 0.2000], 108 | [-0.2000, 0.5000]]) 109 | """ 110 | spans1 = spans1.float() 111 | spans2 = spans2.float() 112 | assert (spans1[:, 1] >= spans1[:, 0]).all() 113 | assert (spans2[:, 1] >= spans2[:, 0]).all() 114 | iou, union = temporal_iou(spans1, spans2) 115 | 116 | left = torch.min(spans1[:, None, 0], spans2[:, 0]) # (N, M) 117 | right = torch.max(spans1[:, None, 1], spans2[:, 1]) # (N, M) 118 | enclosing_area = (right - left).clamp(min=0) # (N, M) 119 | 120 | return iou - (enclosing_area - union) / enclosing_area 121 | 122 | 123 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn.utils import rnn 4 | from core.config import config 5 | import numpy as np 6 | 7 | 8 | def collate_fn(batch): 9 | batch_word_vectors = [b[0]['word_vectors'] for b in batch] 10 | # batch_pos_tags = [b[0]['pos_tags'] for b in batch] 11 | batch_txt_mask = [b[0]['txt_mask'] for b in batch] 12 | batch_vis_feats = [b[0]['visual_input'] for b in batch] 13 | batch_vis_mask = [b[0]['vis_mask'] for b in batch] 14 | batch_start_label = [b[0]['start_label'] for b in batch] 15 | batch_end_label = [b[0]['end_label'] for b in batch] 16 | batch_start_frame = [b[0]['start_frame'] for b in batch] 17 | batch_end_frame = [b[0]['end_frame'] for b in batch] 18 | batch_internel_label = [b[0]['internel_label'] for b in batch] 19 | batch_extend_pre = [b[0]['extend_pre'] for b in batch] 20 | batch_extend_suf = [b[0]['extend_suf'] for b in batch] 21 | batch_keyword_mask = [b[0]['keyword_mask'] for b in batch] 22 | batch_negative_indices =np.array([b[0]['selected_negative_indices'] for b in batch]) 23 | batch_positive_indices = np.array([b[0]['selected_positive_indices'] for b in batch]) 24 | batch_time = [b[1]["times"] for b in batch] 25 | batch_duration = [b[1]["duration"] for b in batch] 26 | annotations = [b[1] for b in batch] 27 | visual_len = torch.from_numpy(np.array([b[0]['visual_len'] for b in batch])).float() 28 | visual_scale = visual_len / torch.max(visual_len) 29 | batch_data = { 30 | 'batch_word_vectors': 31 | nn.utils.rnn.pad_sequence(batch_word_vectors, batch_first=True), 32 | # 'batch_pos_tags': 33 | # rnn.pad_sequence(batch_pos_tags, batch_first=True), 34 | 'batch_txt_mask': 35 | nn.utils.rnn.pad_sequence(batch_txt_mask, batch_first=True), 36 | 'batch_vis_feats': 37 | nn.utils.rnn.pad_sequence(batch_vis_feats, batch_first=True).float(), 38 | 'batch_vis_mask': 39 | nn.utils.rnn.pad_sequence(batch_vis_mask, batch_first=True).float(), 40 | 'batch_start_label': 41 | nn.utils.rnn.pad_sequence(batch_start_label, batch_first=True).float(), 42 | 'batch_end_label': 43 | nn.utils.rnn.pad_sequence(batch_end_label, batch_first=True).float(), 44 | 'batch_internel_label': 45 | nn.utils.rnn.pad_sequence(batch_internel_label, 46 | batch_first=True).float(), 47 | 'batch_start_frame': 48 | torch.tensor(batch_start_frame).long(), 49 | 'batch_end_frame': 50 | torch.tensor(batch_end_frame).long(), 51 | 'batch_extend_pre': 52 | torch.tensor(batch_extend_pre).long(), 53 | 'batch_extend_suf': 54 | torch.tensor(batch_extend_suf).long(), 55 | "batch_keyword_mask": 56 | 
nn.utils.rnn.pad_sequence(batch_keyword_mask, 57 | batch_first=True).float(), 58 | "batch_negative_indices": 59 | torch.from_numpy(batch_negative_indices).long(), 60 | "batch_positive_indices": 61 | torch.from_numpy(batch_positive_indices).long(), 62 | "batch_start_time": 63 | torch.tensor(batch_time).float()[:,0], 64 | "batch_end_time": 65 | torch.tensor(batch_time).float()[:,1], 66 | "batch_duration": 67 | torch.tensor(batch_duration).float(), 68 | "visual_scale": 69 | visual_scale 70 | 71 | } 72 | 73 | return batch_data, annotations 74 | 75 | 76 | def average_to_fixed_length(visual_input, num_sample_clips=0): 77 | if num_sample_clips == 0: 78 | num_sample_clips = config.DATASET.NUM_SAMPLE_CLIPS 79 | num_clips = visual_input.shape[0] 80 | idxs = torch.arange(0, num_sample_clips + 1, 81 | 1.0) / num_sample_clips * num_clips 82 | idxs = torch.min(torch.round(idxs).long(), torch.tensor(num_clips - 1)) 83 | new_visual_input = [] 84 | for i in range(num_sample_clips): 85 | s_idx, e_idx = idxs[i].item(), idxs[i + 1].item() 86 | if s_idx < e_idx: 87 | new_visual_input.append( 88 | torch.mean(visual_input[s_idx:e_idx], dim=0)) 89 | else: 90 | new_visual_input.append(visual_input[s_idx]) 91 | new_visual_input = torch.stack(new_visual_input, dim=0) 92 | return new_visual_input 93 | 94 | 95 | from datasets.activitynet import ActivityNet 96 | from datasets.charades import Charades 97 | from datasets.charades_len import Charades_len 98 | from datasets.charades_mom import Charades_mom 99 | from datasets.tacos import TACoS 100 | -------------------------------------------------------------------------------- /core/data_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import json 4 | import pickle 5 | import numpy as np 6 | from tqdm import tqdm 7 | 8 | 9 | def load_json(filename): 10 | with open(filename, mode='r', encoding='utf-8') as f: 11 | data = json.load(f) 12 | return data 13 | 14 | 15 | def save_json(data, filename, save_pretty=False, sort_keys=False): 16 | with open(filename, mode='w', encoding='utf-8') as f: 17 | if save_pretty: 18 | f.write(json.dumps(data, indent=4, sort_keys=sort_keys)) 19 | else: 20 | json.dump(data, f) 21 | 22 | 23 | def load_lines(filename): 24 | with open(filename, mode='r', encoding='utf-8') as f: 25 | return [e.strip("\n") for e in f.readlines()] 26 | 27 | 28 | def save_lines(data, filename): 29 | with open(filename, mode='w', encoding='utf-8') as f: 30 | f.write("\n".join(data)) 31 | 32 | 33 | def load_pickle(filename): 34 | with open(filename, mode='rb') as handle: 35 | data = pickle.load(handle) 36 | return data 37 | 38 | 39 | def save_pickle(data, filename): 40 | with open(filename, mode='wb') as handle: 41 | pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL) 42 | 43 | 44 | def compute_overlap(pred, gt): 45 | # check format 46 | assert isinstance(pred, list) and isinstance(gt, list) 47 | pred_is_list = isinstance(pred[0], list) 48 | gt_is_list = isinstance(gt[0], list) 49 | pred = pred if pred_is_list else [pred] 50 | gt = gt if gt_is_list else [gt] 51 | # compute overlap 52 | pred, gt = np.array(pred), np.array(gt) 53 | inter_left = np.maximum(pred[:, 0, None], gt[None, :, 0]) 54 | inter_right = np.minimum(pred[:, 1, None], gt[None, :, 1]) 55 | inter = np.maximum(0.0, inter_right - inter_left) 56 | union_left = np.minimum(pred[:, 0, None], gt[None, :, 0]) 57 | union_right = np.maximum(pred[:, 1, None], gt[None, :, 1]) 58 | union = np.maximum(1e-12, union_right - union_left) 59 | 
overlap = 1.0 * inter / union 60 | # reformat output 61 | overlap = overlap if gt_is_list else overlap[:, 0] 62 | overlap = overlap if pred_is_list else overlap[0] 63 | return overlap 64 | 65 | 66 | # def time_to_index(start_time, end_time, num_units, duration): 67 | # s_times = np.arange(0, num_units).astype( 68 | # np.float32) / float(num_units) * duration 69 | # e_times = np.arange(1, num_units + 1).astype( 70 | # np.float32) / float(num_units) * duration 71 | # candidates = np.stack([ 72 | # np.repeat(s_times[:, None], repeats=num_units, axis=1), 73 | # np.repeat(e_times[None, :], repeats=num_units, axis=0) 74 | # ], 75 | # axis=2).reshape((-1, 2)) 76 | # overlaps = compute_overlap(candidates.tolist(), 77 | # [start_time, end_time]).reshape( 78 | # num_units, num_units) 79 | # start_index = np.argmax(overlaps) // num_units 80 | # end_index = np.argmax(overlaps) % num_units 81 | # return start_index, end_index, overlaps 82 | 83 | 84 | # def index_to_time(start_index, end_index, num_units, extend_pre, extend_suf, 85 | # duration): 86 | # if start_index <= extend_pre: 87 | # start_index = extend_pre 88 | # if end_index <= extend_pre: 89 | # end_index = extend_pre 90 | # s_times = np.arange(0, num_units).astype( 91 | # np.float32) * duration / float(num_units) 92 | # e_times = np.arange(1, num_units + 1).astype( 93 | # np.float32) * duration / float(num_units) 94 | # start_time = s_times[start_index - extend_pre] 95 | # end_time = e_times[end_index - extend_pre] 96 | # return start_time, end_time 97 | 98 | def index_to_time(start_index, end_index, num_units, extend_pre, extend_suf, 99 | duration,pos_index): 100 | p_times = np.arange(0, num_units).astype( 101 | np.float32) * duration / float(num_units) 102 | pos_time = p_times[pos_index - extend_pre] 103 | start_time = start_index * duration 104 | end_time = end_index * duration 105 | return start_time, end_time,pos_time 106 | 107 | def index_to_time1(num_units, extend_pre, extend_suf, 108 | duration,pos_index): 109 | p_times = np.arange(0, num_units).astype( 110 | np.float32) * duration / float(num_units) 111 | pos_time = p_times[pos_index - extend_pre] 112 | return pos_time 113 | 114 | def index_to_time2(start_index, end_index, num_units, extend_pre, extend_suf, 115 | duration): 116 | start_time = start_index * duration 117 | end_time = end_index * duration 118 | return start_time, end_time 119 | -------------------------------------------------------------------------------- /models/PGBP/matcher.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Modules to compute the matching cost and solve the corresponding LSAP. 4 | """ 5 | import torch 6 | from scipy.optimize import linear_sum_assignment 7 | from torch import nn 8 | import torch.nn.functional as F 9 | from .span_utils import generalized_temporal_iou, span_cxw_to_xx 10 | 11 | class HungarianMatcher(nn.Module): 12 | """This class computes an assignment between the targets and the predictions of the network 13 | 14 | For efficiency reasons, the targets don't include the no_object. Because of this, in general, 15 | there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, 16 | while the others are un-matched (and thus treated as non-objects). 
17 | """ 18 | def __init__(self, cost_class: float = 1, cost_span: float = 1, cost_giou: float = 1, 19 | span_loss_type: str = "l1"): 20 | """Creates the matcher 21 | 22 | Params: 23 | cost_span: This is the relative weight of the L1 error of the span coordinates in the matching cost 24 | cost_giou: This is the relative weight of the giou loss of the spans in the matching cost 25 | """ 26 | super().__init__() 27 | self.cost_class = cost_class 28 | self.cost_span = cost_span 29 | self.cost_giou = cost_giou 30 | self.span_loss_type = span_loss_type 31 | self.foreground_label = 0 32 | assert cost_class != 0 or cost_span != 0 or cost_giou != 0, "all costs cant be 0" 33 | 34 | @torch.no_grad() 35 | def forward(self, pred_logits,pred_spans, tgt_spans): 36 | """ Performs the matching 37 | 38 | Params: 39 | outputs: This is a dict that contains at least these entries: 40 | "pred_spans": Tensor of dim [batch_size, num_queries, 2] with the predicted span coordinates, 41 | in normalized (cx, w) format 42 | ""pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits 43 | 44 | targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: 45 | "spans": Tensor of dim [num_target_spans, 2] containing the target span coordinates. The spans are 46 | in normalized (cx, w) format 47 | 48 | Returns: 49 | A list of size batch_size, containing tuples of (index_i, index_j) where: 50 | - index_i is the indices of the selected predictions (in order) 51 | - index_j is the indices of the corresponding selected targets (in order) 52 | For each batch element, it holds: 53 | len(index_i) = len(index_j) = min(num_queries, num_target_spans) 54 | """ 55 | bs, num_queries = pred_spans.shape[:2] 56 | # Also concat the target labels and spans 57 | out_prob = pred_logits.flatten(0, 1).softmax(-1) # [batch_size * num_queries, num_classes] 58 | tgt_ids = torch.full([len(tgt_spans)], self.foreground_label) # [total #spans in the batch] 59 | # Compute the classification cost. Contrary to the loss, we don't use the NLL, 60 | # but approximate it in 1 - prob[target class]. 61 | # The 1 is a constant that doesn't change the matching, it can be omitted. 
62 | cost_class = -out_prob[:, tgt_ids] # [batch_size * num_queries, total #spans in the batch] 63 | 64 | if self.span_loss_type == "l1": 65 | # We flatten to compute the cost matrices in a batch 66 | out_spans = pred_spans.flatten(0, 1) # [batch_size * num_queries, 2] 67 | 68 | # Compute the L1 cost between spans 69 | cost_span = torch.cdist(out_spans, tgt_spans, p=1) # [batch_size * num_queries, total #spans in the batch] 70 | 71 | # Compute the giou cost between spans 72 | # [batch_size * num_queries, total #spans in the batch] 73 | cost_giou = - generalized_temporal_iou(out_spans,tgt_spans) 74 | # Final cost matrix 75 | # import ipdb; ipdb.set_trace() 76 | C = self.cost_span * cost_span + self.cost_giou * cost_giou + self.cost_class * cost_class 77 | C = C.view(bs, num_queries, -1).cpu() 78 | 79 | sizes = [1]*bs 80 | indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))] 81 | return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] 82 | 83 | 84 | def build_matcher(args): 85 | return HungarianMatcher( 86 | cost_span=args.set_cost_span, cost_giou=args.set_cost_giou, 87 | cost_class=args.set_cost_class, span_loss_type=args.span_loss_type, max_v_l=args.max_v_l 88 | ) 89 | -------------------------------------------------------------------------------- /models/PGBP/triplet_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def pairwise_distances(embeddings, squared=False): 5 | """ 6 | ||a-b||^2 = |a|^2 - 2* + |b|^2 7 | """ 8 | # get dot product (batch_size, batch_size) 9 | dot_product = embeddings.mm(embeddings.t()) 10 | 11 | # a vector 12 | square_sum = dot_product.diag() 13 | 14 | distances = square_sum.unsqueeze( 15 | 1) - 2 * dot_product + square_sum.unsqueeze(0) 16 | 17 | distances = distances.clamp(min=1e-16) 18 | 19 | if not squared: 20 | epsilon = 1e-16 21 | mask = torch.eq(distances, 0).float() 22 | distances += (mask + epsilon) 23 | distances = torch.sqrt(distances) 24 | distances *= (1 - mask) 25 | 26 | return distances 27 | 28 | 29 | def get_valid_positive_mask(labels): 30 | """ 31 | To be a valid positive pair (a,p), 32 | - a and p are different embeddings 33 | - a and p have the same label 34 | """ 35 | indices_equal = torch.eye(labels.size(0)).byte() 36 | indices_not_equal = ~indices_equal 37 | 38 | label_equal = torch.eq(labels.unsqueeze(1), labels.unsqueeze(0)) 39 | 40 | mask = indices_not_equal & label_equal 41 | return mask 42 | 43 | 44 | def get_valid_negative_mask(labels): 45 | """ 46 | To be a valid negative pair (a,n), 47 | - a and n are different embeddings 48 | - a and n have the different label 49 | """ 50 | indices_equal = torch.eye(labels.size(0)).byte() 51 | indices_not_equal = ~indices_equal 52 | 53 | label_not_equal = torch.ne(labels.unsqueeze(1), labels.unsqueeze(0)) 54 | 55 | mask = indices_not_equal & label_not_equal 56 | return mask 57 | 58 | 59 | def get_valid_triplets_mask(labels): 60 | """ 61 | To be valid, a triplet (a,p,n) has to satisfy: 62 | - a,p,n are distinct embeddings 63 | - a and p have the same label, while a and n have different label 64 | """ 65 | indices_equal = torch.eye(labels.size(0)).byte().to(labels.device) 66 | indices_not_equal = ~indices_equal 67 | i_ne_j = indices_not_equal.unsqueeze(2) 68 | i_ne_k = indices_not_equal.unsqueeze(1) 69 | j_ne_k = indices_not_equal.unsqueeze(0) 70 | distinct_indices = i_ne_j & i_ne_k & j_ne_k 71 | 72 | label_equal = torch.eq(labels.unsqueeze(1), 
labels.unsqueeze(0)) 73 | i_eq_j = label_equal.unsqueeze(2) 74 | i_eq_k = label_equal.unsqueeze(1) 75 | i_ne_k = ~i_eq_k 76 | valid_labels = i_eq_j & i_ne_k 77 | 78 | mask = distinct_indices & valid_labels 79 | return mask 80 | 81 | 82 | def batch_all_triplet_loss(labels, embeddings, margin, squared=False): 83 | """ 84 | get triplet loss for all valid triplets and average over those triplets whose loss is positive. 85 | """ 86 | 87 | distances = pairwise_distances(embeddings, squared=squared) 88 | 89 | anchor_positive_dist = distances.unsqueeze(2) 90 | anchor_negative_dist = distances.unsqueeze(1) 91 | triplet_loss = anchor_positive_dist - anchor_negative_dist + margin 92 | 93 | # get a 3D mask to filter out invalid triplets 94 | mask = get_valid_triplets_mask(labels) 95 | 96 | triplet_loss = triplet_loss * mask.float() 97 | triplet_loss.clamp_(min=0) 98 | 99 | # count the number of positive triplets 100 | epsilon = 1e-16 101 | num_positive_triplets = (triplet_loss > 0).float().sum() 102 | num_valid_triplets = mask.float().sum() 103 | fraction_positive_triplets = num_positive_triplets / (num_valid_triplets + 104 | epsilon) 105 | 106 | triplet_loss = triplet_loss.sum() / (num_positive_triplets + epsilon) 107 | 108 | return triplet_loss, fraction_positive_triplets 109 | 110 | 111 | def batch_hard_triplet_loss(labels, embeddings, margin, squared=False): 112 | """ 113 | - compute distance matrix 114 | - for each anchor a0, find the (a0,p0) pair with greatest distance s.t. a0 and p0 have the same label 115 | - for each anchor a0, find the (a0,n0) pair with smallest distance s.t. a0 and n0 have different label 116 | - compute triplet loss for each triplet (a0, p0, n0), average them 117 | """ 118 | distances = pairwise_distances(embeddings, squared=squared) 119 | 120 | mask_positive = get_valid_positive_mask(labels) 121 | hardest_positive_dist = (distances * mask_positive.float()).max(dim=1)[0] 122 | 123 | mask_negative = get_valid_negative_mask(labels) 124 | max_negative_dist = distances.max(dim=1, keepdim=True)[0] 125 | distances = distances + max_negative_dist * (~mask_negative).float() 126 | hardest_negative_dist = distances.min(dim=1)[0] 127 | 128 | triplet_loss = (hardest_positive_dist - hardest_negative_dist + 129 | margin).clamp(min=0) 130 | triplet_loss = triplet_loss.mean() 131 | 132 | return triplet_loss 133 | -------------------------------------------------------------------------------- /models/PGBP/layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import math 6 | 7 | from .operation import Conv1D, mask_logits 8 | 9 | 10 | class PositionEmbeddingSine(nn.Module): 11 | """ 12 | This is a more standard version of the position embedding, very similar to the one 13 | used by the Attention is all you need paper, generalized to work on images. 
(To 1D sequences) 14 | """ 15 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 16 | super().__init__() 17 | self.num_pos_feats = num_pos_feats 18 | self.temperature = temperature 19 | self.normalize = normalize 20 | if scale is not None and normalize is False: 21 | raise ValueError("normalize should be True if scale is passed") 22 | if scale is None: 23 | scale = 2 * math.pi 24 | self.scale = scale 25 | 26 | def forward(self, x, mask): 27 | """ 28 | Args: 29 | x: torch.tensor, (batch_size, L, d) 30 | mask: torch.tensor, (batch_size, L), with 1 as valid 31 | 32 | Returns: 33 | 34 | """ 35 | assert mask is not None 36 | x_embed = mask.cumsum(1, dtype=torch.float32) # (bsz, L) 37 | if self.normalize: 38 | eps = 1e-6 39 | x_embed = x_embed / (x_embed[:, -1:] + eps) * self.scale 40 | 41 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 42 | # dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 43 | dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode='trunc') / self.num_pos_feats) 44 | pos_x = x_embed[:, :, None] / dim_t # (bsz, L, num_pos_feats) 45 | pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) # (bsz, L, num_pos_feats*2) 46 | # import ipdb; ipdb.set_trace() 47 | return pos_x # .permute(0, 2, 1) # (bsz, num_pos_feats*2, L) 48 | 49 | class TransformerPositionalEmbedding(nn.Module): 50 | def __init__(self, dim, max_len=512): 51 | super().__init__() 52 | 53 | # Compute the positional encodings once in log space. 54 | pe = torch.zeros(max_len, dim).float() 55 | pe.require_grad = False 56 | 57 | position = torch.arange(0, max_len).float().unsqueeze(1) 58 | div_term = (torch.arange(0, dim, 2).float() * 59 | -(math.log(10000.0) / dim)).exp() 60 | 61 | pe[:, 0::2] = torch.sin(position * div_term) 62 | pe[:, 1::2] = torch.cos(position * div_term) 63 | 64 | pe = pe.unsqueeze(0) 65 | self.register_buffer('pe', pe) 66 | 67 | def forward(self, x): 68 | return self.pe[:, :x.size(1)] 69 | 70 | 71 | class PositionalEmbedding(nn.Module): 72 | """Construct the embeddings from word, position and token_type embeddings.""" 73 | def __init__(self, embedding_dim, num_embeddings): 74 | super(PositionalEmbedding, self).__init__() 75 | self.position_embeddings = nn.Embedding(num_embeddings, embedding_dim) 76 | 77 | def forward(self, inputs): 78 | bsz, seq_length = inputs.shape[:2] 79 | position_ids = torch.arange(seq_length, 80 | dtype=torch.long, 81 | device=inputs.device) 82 | position_ids = position_ids.unsqueeze(0).repeat(bsz, 1) # (N, L) 83 | position_embeddings = self.position_embeddings(position_ids) 84 | return position_embeddings 85 | 86 | 87 | class Projection(nn.Module): 88 | def __init__(self, in_dim, dim, drop_rate=0.0): 89 | super(Projection, self).__init__() 90 | self.drop = nn.Dropout(p=drop_rate) 91 | self.projection = Conv1D(in_dim=in_dim, 92 | out_dim=dim, 93 | kernel_size=1, 94 | stride=1, 95 | bias=True, 96 | padding=0) 97 | self.layer_norm = nn.LayerNorm(dim, eps=1e-6) 98 | 99 | def forward(self, input_features): 100 | # the input feature with shape (batch_size, seq_len, in_dim) 101 | input_features = self.drop(input_features) 102 | output = self.projection(input_features) # (batch_size, seq_len, dim) 103 | output = self.layer_norm(output) 104 | return output 105 | 106 | 107 | class Prediction(nn.Module): 108 | def __init__(self, in_dim, hidden_dim, out_dim, drop_rate=0.): 109 | super(Prediction, self).__init__() 110 | self.fc1 = Conv1D(in_dim=in_dim, 111 
| out_dim=hidden_dim, 112 | kernel_size=1, 113 | stride=1, 114 | padding=0, 115 | bias=True) 116 | self.dropout = nn.Dropout(p=drop_rate) 117 | self.fc2 = Conv1D(in_dim=hidden_dim, 118 | out_dim=out_dim, 119 | kernel_size=1, 120 | stride=1, 121 | padding=0, 122 | bias=True) 123 | 124 | def forward(self, input_feature): 125 | output = self.fc1(input_feature) 126 | output = F.gelu(output) 127 | output = self.dropout(output) 128 | output = self.fc2(output) 129 | return output 130 | 131 | class MLP(nn.Module): 132 | 133 | def __init__(self, dims, dropout=0.1) -> None: 134 | super().__init__() 135 | # assert num_layers > 1, "this class is intended for multiple linear layers" 136 | # dims = dims 137 | num_layers = len(dims) - 1 138 | self.layers = nn.ModuleList([nn.Linear(dims[i], dims[i + 1]) for i in range(num_layers)]) 139 | self.do = nn.Dropout(dropout) 140 | 141 | def forward(self, x): 142 | for idx, layer in enumerate(self.layers): 143 | x = layer(x) 144 | if idx != len(self.layers) - 1: 145 | x = F.gelu(x) 146 | x = self.do(x) 147 | return x -------------------------------------------------------------------------------- /models/PGBP/encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from .operation import Conv1D 5 | 6 | 7 | class LSTMEncoder(nn.Module): 8 | def __init__(self, 9 | in_dim, 10 | out_dim, 11 | num_layers, 12 | bi_direction=False, 13 | drop_rate=0.0): 14 | super(LSTMEncoder, self).__init__() 15 | 16 | self.layers_norm1 = nn.LayerNorm(in_dim, eps=1e-6) 17 | self.layers_norm2 = nn.LayerNorm(out_dim, eps=1e-6) 18 | 19 | self.dropout = nn.Dropout(p=drop_rate) 20 | self.encoder = nn.LSTM(in_dim, 21 | out_dim // 2 if bi_direction else out_dim, 22 | num_layers=num_layers, 23 | bidirectional=bi_direction, 24 | dropout=drop_rate, 25 | batch_first=True) 26 | 27 | self.linear = Conv1D(in_dim=out_dim, 28 | out_dim=out_dim, 29 | kernel_size=1, 30 | stride=1, 31 | bias=True, 32 | padding=0) 33 | 34 | def forward(self, input_feature): 35 | input_feature = self.layers_norm1(input_feature) 36 | output, _ = self.encoder(input_feature) 37 | output = self.layers_norm2(output) 38 | output = self.dropout(output) 39 | output = self.linear(output) 40 | return output 41 | 42 | 43 | class MultiStepLSTMEncoder(nn.Module): 44 | def __init__(self, 45 | in_dim, 46 | out_dim, 47 | num_layers, 48 | num_step=1, 49 | bi_direction=False, 50 | drop_rate=0.0): 51 | super(MultiStepLSTMEncoder, self).__init__() 52 | 53 | self.num_step = num_step 54 | self.out_dim = out_dim 55 | self.layers_norm = nn.LayerNorm(in_dim, eps=1e-6) 56 | 57 | self.dropout = nn.Dropout(p=drop_rate) 58 | 59 | self.encoder = nn.ModuleList([ 60 | nn.LSTM(in_dim, 61 | out_dim // 2 if bi_direction else out_dim, 62 | num_layers=num_layers, 63 | bidirectional=bi_direction, 64 | dropout=drop_rate, 65 | batch_first=True) for _ in range(num_step) 66 | ]) 67 | self.linear = Conv1D(in_dim=int(num_step * out_dim), 68 | out_dim=out_dim, 69 | kernel_size=1, 70 | stride=1, 71 | bias=True, 72 | padding=0) 73 | 74 | def forward(self, input_feature): 75 | input_feature = self.layers_norm(input_feature) 76 | B, seq_len, _ = input_feature.shape 77 | # assert seq_len // self.num_step == 0, "length of sequence({}) must be devided by num_step({})".format( 78 | # seq_len, self.num_step) 79 | output = [] 80 | for i in range(self.num_step): 81 | encoder_i = self.encoder[i] 82 | output_i = input_feature.new_zeros([B, seq_len, self.out_dim]) 83 | input_i_len = (seq_len // (i 
+ 1)) * (i + 1) 84 | for j in range(i + 1): 85 | input_j = input_feature[:, j:input_i_len:(i + 1), :] 86 | output_j, _ = encoder_i(input_j) 87 | output_i[:, j:input_i_len:(i + 1), :] = output_j 88 | output_i = self.dropout(output_i) 89 | output.append(output_i) 90 | output = torch.cat(output, dim=2) 91 | output = self.linear(output) 92 | return output 93 | 94 | class TemporalContextModule(nn.Module): 95 | def __init__(self, in_dim, out_dim, kernels=[3], drop_rate=0.): 96 | super(TemporalContextModule, self).__init__() 97 | self.dropout = nn.Dropout(p=drop_rate) 98 | self.temporal_convs = nn.ModuleList([ 99 | Conv1D(in_dim=in_dim, 100 | out_dim=out_dim, 101 | kernel_size=s, 102 | stride=1, 103 | padding=s // 2, 104 | bias=True) for s in kernels 105 | ]) 106 | self.out_layer = Conv1D(in_dim=out_dim * len(kernels), 107 | out_dim=out_dim, 108 | kernel_size=1, 109 | stride=1, 110 | padding=0, 111 | bias=True) 112 | 113 | def forward(self, input_feature): 114 | intermediate = [] 115 | for layer in self.temporal_convs: 116 | intermediate.append(layer(input_feature)) 117 | intermediate = torch.cat(intermediate, dim=-1) 118 | out = self.out_layer(intermediate) 119 | return out 120 | 121 | 122 | class MultiStepGRUEncoder(nn.Module): 123 | def __init__(self, 124 | in_dim, 125 | out_dim, 126 | num_layers, 127 | num_step=1, 128 | bi_direction=False, 129 | drop_rate=0.0): 130 | super(MultiStepGRUEncoder, self).__init__() 131 | 132 | self.num_step = num_step 133 | self.out_dim = out_dim 134 | self.layers_norm = nn.LayerNorm(in_dim, eps=1e-6) 135 | 136 | self.dropout = nn.Dropout(p=drop_rate) 137 | 138 | self.encoder = nn.ModuleList([ 139 | nn.GRU(in_dim, 140 | out_dim // 2 if bi_direction else out_dim, 141 | num_layers=num_layers, 142 | bidirectional=bi_direction, 143 | dropout=drop_rate, 144 | batch_first=True) for _ in range(num_step) 145 | ]) 146 | self.linear = Conv1D(in_dim=int(num_step * out_dim), 147 | out_dim=out_dim, 148 | kernel_size=1, 149 | stride=1, 150 | bias=True, 151 | padding=0) 152 | 153 | def forward(self, input_feature): 154 | input_feature = self.layers_norm(input_feature) 155 | B, seq_len, _ = input_feature.shape 156 | # assert seq_len // self.num_step == 0, "length of sequence({}) must be devided by num_step({})".format( 157 | # seq_len, self.num_step) 158 | output = [] 159 | for i in range(self.num_step): 160 | encoder_i = self.encoder[i] 161 | output_i = input_feature.new_zeros([B, seq_len, self.out_dim]) 162 | input_i_len = (seq_len // (i + 1)) * (i + 1) 163 | for j in range(i + 1): 164 | input_j = input_feature[:, j:input_i_len:(i + 1), :] 165 | output_j, _ = encoder_i(input_j) 166 | output_i[:, j:input_i_len:(i + 1), :] = output_j 167 | output_i = self.dropout(output_i) 168 | output.append(output_i) 169 | output = torch.cat(output, dim=2) 170 | output = self.linear(output) 171 | return output -------------------------------------------------------------------------------- /core/runner_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import random 4 | import numpy as np 5 | import torch 6 | from torch.cuda.profiler import start 7 | import torch.utils.data 8 | import torch.backends.cudnn 9 | from tqdm import tqdm 10 | from prettytable import PrettyTable 11 | 12 | from .data_util import index_to_time,index_to_time1,index_to_time2 13 | 14 | 15 | def set_th_config(seed): 16 | random.seed(seed) 17 | np.random.seed(seed) 18 | torch.manual_seed(seed) 19 | torch.cuda.manual_seed(seed) 20 | 
torch.cuda.manual_seed_all(seed) 21 | torch.backends.cudnn.benchmark = False 22 | torch.backends.cudnn.deterministic = True 23 | torch.backends.cudnn.enabled = True 24 | 25 | 26 | def dcor(x, y): 27 | m, _ = x.shape 28 | assert len(x.shape) == 2 29 | assert len(y.shape) == 2 30 | 31 | dx = pairwise_dist(x) 32 | dy = pairwise_dist(y) 33 | 34 | dx_m = dx - dx.mean(dim=0)[None, :] - dx.mean(dim=1)[:, None] + dx.mean() 35 | dy_m = dy - dy.mean(dim=0)[None, :] - dy.mean(dim=1)[:, None] + dy.mean() 36 | 37 | dcov2_xy = (dx_m * dy_m).sum() / float(m * m) 38 | dcov2_xx = (dx_m * dx_m).sum() / float(m * m) 39 | dcov2_yy = (dy_m * dy_m).sum() / float(m * m) 40 | 41 | dcor = torch.sqrt(dcov2_xy) / torch.sqrt( 42 | (torch.sqrt(dcov2_xx) * torch.sqrt(dcov2_yy)).clamp(min=0) + 1e-10) 43 | 44 | return dcor 45 | 46 | 47 | def pairwise_dist(x): 48 | #x should be two dimensional 49 | instances_norm = torch.sum(x**2, -1).reshape((-1, 1)) 50 | output = -2 * torch.mm(x, x.t()) + instances_norm + instances_norm.t() 51 | return torch.sqrt(output.clamp(min=0) + 1e-10) 52 | 53 | 54 | def filter_checkpoints(model_dir, suffix='t7', max_to_keep=5): 55 | model_paths = glob.glob(os.path.join(model_dir, '*.{}'.format(suffix))) 56 | if len(model_paths) > max_to_keep: 57 | model_file_dict = dict() 58 | suffix_len = len(suffix) + 1 59 | for model_path in model_paths: 60 | step = int( 61 | os.path.basename(model_path).split('_')[1][0:-suffix_len]) 62 | model_file_dict[step] = model_path 63 | sorted_tuples = sorted(model_file_dict.items()) 64 | unused_tuples = sorted_tuples[0:-max_to_keep] 65 | for _, model_path in unused_tuples: 66 | os.remove(model_path) 67 | 68 | 69 | def get_last_checkpoint(model_dir, suffix='t7'): 70 | model_filenames = glob.glob(os.path.join(model_dir, '*.{}'.format(suffix))) 71 | model_file_dict = dict() 72 | suffix_len = len(suffix) + 1 73 | for model_filename in model_filenames: 74 | step = int( 75 | os.path.basename(model_filename).split('_')[1][0:-suffix_len]) 76 | model_file_dict[step] = model_filename 77 | sorted_tuples = sorted(model_file_dict.items()) 78 | last_checkpoint = sorted_tuples[-1] 79 | return last_checkpoint[1] 80 | 81 | 82 | def convert_length_to_mask(lengths): 83 | max_len = lengths.max().item() 84 | mask = torch.arange(max_len, device=lengths.device).expand( 85 | lengths.size()[0], max_len) < lengths.unsqueeze(1) 86 | mask = mask.float() 87 | return mask 88 | 89 | 90 | def calculate_iou_accuracy(ious, threshold): 91 | total_size = float(len(ious)) 92 | count = 0 93 | for iou in ious: 94 | if iou >= threshold: 95 | count += 1 96 | return float(count) / total_size * 100.0 97 | 98 | 99 | def calculate_iou(i0, i1): 100 | union = (min(i0[0], i1[0]), max(i0[1], i1[1])) 101 | inter = (max(i0[0], i1[0]), min(i0[1], i1[1])) 102 | iou = 1.0 * (inter[1] - inter[0]) / (union[1] - union[0]) 103 | return max(0.0,iou) 104 | 105 | 106 | def cal_statistics(preds, durations): 107 | start_fre = [0] * 10 108 | end_fre = [0] * 10 109 | duration_fre = [0] * 10 110 | start_end_fre = [[0] * 10 for _ in range(10)] 111 | tb = PrettyTable() 112 | tb.field_names = [ 113 | "type", "0.1", "0.2", "0.3", "0.4", "0.5", "0.6", "0.7", "0.8", "0.9", 114 | "1.0" 115 | ] 116 | for pred, duration in zip(preds, durations): 117 | start_f = int(pred[0] / duration * 10) 118 | end_f = min(int(pred[1] / duration * 10), 9) 119 | duration_f = min(int((pred[1] - pred[0]) / duration * 10), 9) 120 | start_fre[start_f] += 1 121 | end_fre[end_f] += 1 122 | duration_fre[duration_f] += 1 123 | start_end_fre[start_f][end_f] += 1 124 
| assert len(preds) == len(durations) 125 | all_len = len(durations) 126 | for i in range(10): 127 | start_fre[i] /= all_len 128 | end_fre[i] /= all_len 129 | duration_fre[i] /= all_len 130 | for j in range(10): 131 | start_end_fre[i][j] /= all_len 132 | start_end_fre[i][j] = "{:.6f}".format(start_end_fre[i][j]) 133 | start_fre = ["{:.6f}".format(s) for s in start_fre] 134 | end_fre = ["{:.6f}".format(s) for s in end_fre] 135 | duration_fre = ["{:.6f}".format(s) for s in duration_fre] 136 | tb.add_row(["start_fre"] + start_fre) 137 | tb.add_row(["end_fre"] + end_fre) 138 | tb.add_row(["duration_fre"] + duration_fre) 139 | tb.add_row(["--"] * 11) 140 | for i in range(10): 141 | tb.add_row([str((i + 1) / 10)] + start_end_fre[i]) 142 | return tb.get_string() 143 | 144 | 145 | def eval_test(model, 146 | data_loader, 147 | device, 148 | mode='test', 149 | epoch=None, 150 | global_step=None): 151 | ious = [] 152 | with torch.no_grad(): 153 | for idx, batch_data in tqdm(enumerate(data_loader), 154 | total=len(data_loader), 155 | desc='evaluate {}'.format(mode)): 156 | data, annos = batch_data 157 | batch_word_vectors = data['batch_word_vectors'].to(device) 158 | batch_txt_mask = data['batch_txt_mask'].squeeze().to(device) 159 | batch_vis_feats = data['batch_vis_feats'].to(device) 160 | batch_vis_mask = data['batch_vis_mask'].squeeze().to(device) 161 | 162 | # compute predicted results 163 | _, start_logits, end_logits = model(batch_word_vectors, 164 | batch_txt_mask, 165 | batch_vis_feats, 166 | batch_vis_mask) 167 | start_indices, end_indices = model.extract_index( 168 | start_logits, end_logits) 169 | start_indices = start_indices.cpu().numpy() 170 | end_indices = end_indices.cpu().numpy() 171 | batch_vis_mask = batch_vis_mask.cpu().numpy() 172 | for vis_mask, start_index, end_index, anno in zip( 173 | batch_vis_mask, start_indices, end_indices, annos): 174 | start_time, end_time = index_to_time(start_index, end_index, 175 | vis_mask.sum(), 176 | anno["duration"]) 177 | iou = calculate_iou(i0=[start_time, end_time], 178 | i1=anno['times']) 179 | ious.append(iou) 180 | r1i3 = calculate_iou_accuracy(ious, threshold=0.3) 181 | r1i5 = calculate_iou_accuracy(ious, threshold=0.5) 182 | r1i7 = calculate_iou_accuracy(ious, threshold=0.7) 183 | mi = np.mean(ious) * 100.0 184 | # write the scores 185 | score_str = "Epoch {}, Step {}:\n".format(epoch, global_step) 186 | score_str += "Rank@1, IoU=0.3: {:.2f}\t".format(r1i3) 187 | score_str += "Rank@1, IoU=0.5: {:.2f}\t".format(r1i5) 188 | score_str += "Rank@1, IoU=0.7: {:.2f}\t".format(r1i7) 189 | score_str += "mean IoU: {:.2f}\n".format(mi) 190 | return r1i3, r1i5, r1i7, mi, score_str -------------------------------------------------------------------------------- /models/PGBP/fusion.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.nn.modules.container import ModuleList 5 | import numpy as np 6 | import math 7 | from .attention import TemporalMaxer,Cross_Attention,MultiHeadAttention 8 | from .operation import Conv1D, mask_logits 9 | 10 | 11 | class CQFusion(nn.Module): 12 | def __init__(self, configs, drop_rate=0.0): 13 | dim = configs.dim 14 | super(CQFusion, self).__init__() 15 | w4C = torch.empty(dim, 1) 16 | w4Q = torch.empty(dim, 1) 17 | w4mlu = torch.empty(1, 1, dim) 18 | nn.init.xavier_uniform_(w4C) 19 | nn.init.xavier_uniform_(w4Q) 20 | nn.init.xavier_uniform_(w4mlu) 21 | self.w4C = nn.Parameter(w4C, requires_grad=True) 
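# w4C, w4Q and w4mlu are the learnable weights of the trilinear similarity used in trilinear_attention below: score[b, i, j] = c_i·w4C + q_j·w4Q + (w4mlu ⊙ c_i)·q_j.
# A minimal shape sketch (hypothetical sizes, assuming dim = 128):
#   context (B, c_seq_len, 128) and query (B, q_seq_len, 128) -> score (B, c_seq_len, q_seq_len).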
22 | self.w4Q = nn.Parameter(w4Q, requires_grad=True) 23 | self.w4mlu = nn.Parameter(w4mlu, requires_grad=True) 24 | self.dropout = nn.Dropout(p=drop_rate) 25 | self.cqa_linear = Conv1D(in_dim=4 * dim, 26 | out_dim=dim, 27 | kernel_size=1, 28 | stride=1, 29 | padding=0, 30 | bias=True) 31 | 32 | def forward(self, context, query, c_mask, q_mask): 33 | score = self.trilinear_attention( 34 | context, query) # (batch_size, c_seq_len, q_seq_len) 35 | score_ = torch.softmax(mask_logits(score, q_mask.unsqueeze(1)), 36 | dim=2) # (batch_size, c_seq_len, q_seq_len) 37 | score_t = torch.softmax(mask_logits(score, c_mask.unsqueeze(2)), 38 | dim=1) # (batch_size, c_seq_len, q_seq_len) 39 | score_t = score_t.transpose(1, 2) # (batch_size, q_seq_len, c_seq_len) 40 | c2q = torch.matmul(score_, query) # (batch_size, c_seq_len, dim) 41 | q2c = torch.matmul(torch.matmul(score_, score_t), 42 | context) # (batch_size, c_seq_len, dim) 43 | output = torch.cat( 44 | [context, c2q, 45 | torch.mul(context, c2q), 46 | torch.mul(context, q2c)], 47 | dim=2) 48 | output = self.cqa_linear(output) # (batch_size, c_seq_len, dim) 49 | return output * c_mask.unsqueeze(2) 50 | 51 | def trilinear_attention(self, context, query): 52 | batch_size, c_seq_len, dim = context.shape 53 | batch_size, q_seq_len, dim = query.shape 54 | context = self.dropout(context) 55 | query = self.dropout(query) 56 | subres0 = torch.matmul(context, self.w4C).expand( 57 | [-1, -1, q_seq_len]) # (batch_size, c_seq_len, q_seq_len) 58 | subres1 = torch.matmul(query, self.w4Q).transpose(1, 2).expand( 59 | [-1, c_seq_len, -1]) 60 | subres2 = torch.matmul(context * self.w4mlu, query.transpose(1, 2)) 61 | res = subres0 + subres1 + subres2 # (batch_size, c_seq_len, q_seq_len) 62 | return res 63 | 64 | class multiscale_Fusion(nn.Module): 65 | def __init__(self, configs): 66 | super(multiscale_Fusion, self).__init__() 67 | self.branch = nn.ModuleList() 68 | self.fusion = nn.ModuleList() 69 | self.fusion.append(Cross_Attention(configs)) 70 | self.MULTI_SCALE = configs.MULTI_SCALE 71 | if configs.MULTI_SCALE == True: 72 | for idx in range(configs.MULTI_SCALE_LEN): 73 | self.branch.append(TemporalMaxer(kernel_size=3, 74 | stride=2, 75 | padding=1, 76 | n_embd=configs.dim)) 77 | self.fusion.append(Cross_Attention(configs)) 78 | self.attention = MultiHeadAttention(configs) 79 | 80 | def forward(self, context, query, c_mask, q_mask): 81 | b,l,d = context.shape 82 | fusion = self.fusion[0](context,query,c_mask,q_mask) 83 | if self.MULTI_SCALE == True: 84 | for i in range(len(self.branch)): 85 | if i == 0: 86 | multi_feature,multi_feature_mask = self.branch[i](context,c_mask) 87 | else: 88 | multi_feature,multi_feature_mask = self.branch[i](multi_feature,multi_feature_mask) 89 | multi_fusion = self.fusion[i+1](multi_feature,query,multi_feature_mask,q_mask) 90 | fusion = torch.cat((fusion,multi_fusion),dim = 1) 91 | c_mask = torch.cat((c_mask,multi_feature_mask),dim = 1) 92 | fusion = self.attention(fusion,c_mask) 93 | fusion = fusion[:,:l,:] 94 | c_mask = c_mask[:,:l] 95 | return fusion 96 | 97 | 98 | class multiscale_CQFusion(nn.Module): 99 | def __init__(self, configs): 100 | super(multiscale_CQFusion, self).__init__() 101 | self.branch = nn.ModuleList() 102 | self.fusion = nn.ModuleList() 103 | self.fusion.append(CQFusion(configs)) 104 | self.MULTI_SCALE = configs.MULTI_SCALE 105 | if configs.MULTI_SCALE == True: 106 | for idx in range(configs.MULTI_SCALE_LEN): 107 | self.branch.append(TemporalMaxer(kernel_size=3, 108 | stride=2, 109 | padding=1, 110 | 
n_embd=configs.dim)) 111 | self.fusion.append(CQFusion(configs)) 112 | self.attention = MultiHeadAttention(configs) 113 | 114 | def forward(self, context, query, c_mask, q_mask): 115 | b,l,d = context.shape 116 | fusion = self.fusion[0](context,query,c_mask,q_mask) 117 | if self.MULTI_SCALE == True: 118 | for i in range(len(self.branch)): 119 | if i == 0: 120 | multi_feature,multi_feature_mask = self.branch[i](context,c_mask) 121 | else: 122 | multi_feature,multi_feature_mask = self.branch[i](multi_feature,multi_feature_mask) 123 | multi_fusion = self.fusion[i+1](multi_feature,query,multi_feature_mask,q_mask) 124 | # modified 125 | # fusion = self.muti_fuse[i](fusion,multi_fusion,multi_feature_mask) 126 | fusion = torch.cat((fusion,multi_fusion),dim = 1) 127 | c_mask = torch.cat((c_mask,multi_feature_mask),dim = 1) 128 | fusion = self.attention(fusion,c_mask) 129 | fusion = fusion[:,:l,:] 130 | c_mask = c_mask[:,:l] 131 | fusion = fusion * c_mask.unsqueeze(2) 132 | return fusion 133 | 134 | 135 | class multiscale_CQFusion1(nn.Module): 136 | def __init__(self, configs): 137 | super(multiscale_CQFusion1, self).__init__() 138 | self.branch = nn.ModuleList() 139 | self.fusion = nn.ModuleList() 140 | self.muti_fuse = nn.ModuleList() 141 | self.fusion.append(CQFusion(configs)) 142 | self.MULTI_SCALE = configs.MULTI_SCALE 143 | self.fusion_attention = configs.fusion_attention 144 | if configs.MULTI_SCALE == True: 145 | for idx in range(configs.MULTI_SCALE_LEN): 146 | self.branch.append(TemporalMaxer(kernel_size=3, 147 | stride=2, 148 | padding=1, 149 | n_embd=configs.dim)) 150 | self.fusion.append(CQFusion(configs)) 151 | self.muti_fuse.append(MutiFuse(configs)) 152 | self.attention = MultiHeadAttention(configs) 153 | 154 | def forward(self, context, query, c_mask, q_mask): 155 | b,l,d = context.shape 156 | fusion = self.fusion[0](context,query,c_mask,q_mask) 157 | if self.fusion_attention is True: 158 | fusion = self.attention(fusion,c_mask) 159 | if self.MULTI_SCALE == True: 160 | for i in range(len(self.branch)): 161 | if i == 0: 162 | multi_feature,multi_feature_mask = self.branch[i](context,c_mask) 163 | else: 164 | multi_feature,multi_feature_mask = self.branch[i](multi_feature,multi_feature_mask) 165 | multi_fusion = self.fusion[i+1](multi_feature,query,multi_feature_mask,q_mask) 166 | # modified 167 | fusion = self.muti_fuse[i](fusion,multi_fusion,multi_feature_mask) 168 | fusion = fusion * c_mask.unsqueeze(2) 169 | return fusion 170 | 171 | class MutiFuse(nn.Module): 172 | def __init__(self, cfg): 173 | super(MutiFuse, self).__init__() 174 | self.txt_softmax = nn.Softmax(1) 175 | self.txt_linear1 = nn.Linear(cfg.dim, 1) 176 | self.layernorm = nn.LayerNorm(cfg.dim, eps=1e-6) 177 | 178 | def forward(self, vis_encoded, txt_encoded,txt_mask): 179 | # vis_encoded: B, C, T 180 | # txt_encoded: B, L, C 181 | vis_encoded = vis_encoded.permute(0,2,1) 182 | txt_attn = self.txt_softmax(self.txt_linear1(txt_encoded)) # B, L, 1 183 | txt_attn = txt_attn * txt_mask.unsqueeze(2) 184 | txt_pool = torch.sum(txt_attn * txt_encoded, dim=1)[:,:,None] # B, C, 1 185 | # first compute the attention weights and sum over the word dimension; the result is 2-D, so a third dimension is added to simplify the later computation 186 | vis_fused =self.layernorm((txt_pool * vis_encoded).permute(0,2,1)) + vis_encoded.permute(0,2,1) # B, C, T 187 | return vis_fused -------------------------------------------------------------------------------- /datasets/BaseDataset.py: -------------------------------------------------------------------------------- 1 | """ Base dataset loader shared by the Charades-STA, ActivityNet Captions and TACoS datasets """ 2 | import os 3 | import
json 4 | import h5py 5 | from nltk.tag import pos_tag 6 | import torch 7 | from torch import nn 8 | from torch._C import _resolve_type_from_object 9 | import torch.nn.functional as F 10 | import torch.utils.data as data 11 | import torchtext 12 | import numpy as np 13 | 14 | from . import average_to_fixed_length 15 | from core.config import config 16 | import nltk 17 | 18 | try: 19 | nltk.data.find('taggers/averaged_perceptron_tagger') 20 | except LookupError: # download the tagger data on the first run 21 | nltk.download("averaged_perceptron_tagger") 22 | 23 | 24 | class BaseDataset(data.Dataset): 25 | vocab = torchtext.vocab.pretrained_aliases["glove.840B.300d"]() 26 | vocab.itos.extend(['<unk>']) 27 | vocab.stoi['<unk>'] = vocab.vectors.shape[0] 28 | vocab.vectors = torch.cat([vocab.vectors, torch.zeros(1, vocab.dim)], dim=0) 29 | word_embedding = nn.Embedding.from_pretrained(vocab.vectors) 30 | 31 | # CC coordinating conjunction 0 NNS plural noun 1 UH interjection 2 32 | # CD cardinal number 3 NNP proper noun 1 VB verb, base form 4 33 | # DT determiner 5 NNPS proper noun, plural 1 VBD verb, past tense 4 34 | # EX existential "there" 6 PDT predeterminer 5 VBG gerund or present participle 4 35 | # FW foreign word 7 POS possessive ending 8 VBN verb, past participle 4 36 | # IN preposition or subordinating conjunction 9 PRP personal pronoun 10 VBP verb, non-3rd person singular present 4 37 | # JJ adjective 11 PRP$ possessive pronoun 17 VBZ verb, 3rd person singular present 4 38 | # JJR adjective, comparative 11 RB adverb 12 WDT wh-determiner 18 39 | # JJS adjective, superlative 11 RBR adverb, comparative 12 WP wh-pronoun 19 40 | # LS list item marker 13 RBS adverb, superlative 12 WP$ possessive wh-pronoun 20 41 | # MD modal verb 4 RP particle 14 WRB wh-adverb 21 42 | # NN noun, singular 1 SYM symbol 15 TO to 16 43 | # ',': 22, '.': 23, 44 | 45 | # POS-tag frequency (%) on the three datasets: Charades, ActivityNet, TACoS 46 | # 1: 38.6, 25.8, 27.8 || 4: 20, 18, 16 || 5: 22, 16, 20 || 9: 11.4, 15.6, 11 47 | # 10: 2, 3.5, 5.7 || 12: 0.6, 2.8, 1.5 || 14: 2.1, 1.1, 3.3 || 16: 0.9, 2, 1.1 48 | # 17: 1.3, 1.3, 0.35 || 22: 0.03, 1 , 1.2 || 23: 0, 7.1, 8.3 49 | 50 | pos_tags = { 51 | "NNS": 0, 52 | "NNP": 0, 53 | "NN": 0, 54 | "VB": 1, 55 | "VBD": 1, 56 | "VBN": 1, 57 | "VBP": 1, 58 | "VBG": 1, 59 | "VBZ": 1, 60 | "MD": 1, 61 | "IN": 2, 62 | "JJ": 0, 63 | "PRP": 0, 64 | "JJR": 7, 65 | "JJS": 7, 66 | "RB": 1, 67 | "RBR": 1, 68 | "RBS": 1, 69 | "LS": 7, 70 | "RP": 0, 71 | "SYM": 7, 72 | "TO": 5, 73 | "PRP$": 0, 74 | "WDT": 5, 75 | "WP": 3, 76 | "WP$": 3, 77 | "WRB": 1, 78 | } 79 | 80 | def __init__(self, split): 81 | super(BaseDataset, self).__init__() 82 | 83 | self.anno_dirs = {} 84 | self.anno_dirs["Charades"] = "/media/HardDisk_A/users/zzb/dataset/Charades-STA" 85 | self.anno_dirs["Charades_len"] = "/media/HardDisk_A/users/zzb/dataset/Charades-STA" 86 | self.anno_dirs["Charades_mom"] = "/media/HardDisk_A/users/zzb/dataset/Charades-STA" 87 | self.anno_dirs["ActivityNet"] = "/media/HardDisk_A/users/zzb/dataset/ActivityNet" 88 | self.anno_dirs["TACoS"] = "/media/HardDisk_A/users/zzb/dataset/TACoS" 89 | self.feature_dirs = {} 90 | self.feature_dirs["Charades"] = "/media/HardDisk_A/users/zzb/dataset/Charades-STA" 91 | self.feature_dirs["Charades_mom"] = "/media/HardDisk_A/users/zzb/dataset/Charades-STA" 92 | self.feature_dirs["ActivityNet"] = "/media/HardDisk_A/users/zzb/dataset/ActivityNet" 93 | self.feature_dirs["TACoS"] = "/media/HardDisk_A/users/zzb/dataset/TACoS" 94 | self.feature_dirs["Charades_len"] = "/media/HardDisk_A/users/zzb/dataset/Charades-STA" 95 | self.input_type = {} 96 | self.input_type["Charades"] = "i3d_adam_epoch16_25fps" 97 | self.input_type["Charades_len"] = "vgg_rgb_features" 98 | self.input_type["Charades_mom"] = "vgg_rgb_features" 99 | self.input_type["ActivityNet"] = "cmcs_features" 100 | self.input_type["TACoS"] = "tall_c3d_features" 101 | self.split = split 102 |
self.num_pairs = config.DATASET.num_pairs 103 | self.annotations = self.get_annotation() 104 | self.num_clips = config.DATASET.num_clips 105 | 106 | self.epsilon = 1e-10 107 | 108 | def __getitem__(self, index): 109 | video_id = self.annotations[index]["video"] 110 | gt_s_time, gt_e_time = self.annotations[index]["times"] 111 | sentence = self.annotations[index]["description"] 112 | duration = self.annotations[index]["duration"] 113 | dataset = self.annotations[index]["dataset"] 114 | # words = sentence.split() 115 | # tokenize the sentence with NLTK 116 | words = nltk.word_tokenize(sentence) 117 | if len(words) >= 30: 118 | words = words[:30] 119 | words_tags = nltk.pos_tag(words) 120 | word_idxs, pos_tags,keyword_mask,keyword_idxs = [], [],[],[] 121 | # print(sentence) 122 | for keyword, tag in words_tags: 123 | if tag in self.pos_tags.keys(): 124 | keyword_idxs.append(self.vocab.stoi.get(keyword.lower(), 400000)) 125 | pos_tags.append(self.pos_tags[tag] + 1) 126 | # print(word, self.pos_tags[tag] + 1) 127 | keyword_idxs = torch.tensor(keyword_idxs, dtype=torch.long) 128 | # print(sentence) 129 | for word in words: 130 | word_idxs.append(self.vocab.stoi.get(word.lower(), 400000)) 131 | word_idxs = torch.tensor(word_idxs, dtype=torch.long) 132 | word_vectors = self.word_embedding(word_idxs) 133 | keyword_mask = [1 if v in keyword_idxs else 0 for v in word_idxs] 134 | 135 | ( 136 | visual_input, 137 | visual_mask, 138 | extend_pre, 139 | extend_suf, 140 | flip_in_time_direction, 141 | ) = self.get_video_features(video_id, dataset) 142 | 143 | feat_length = visual_input.shape[0] 144 | ori_feat_length = feat_length - extend_pre - extend_suf 145 | fps = ori_feat_length / duration 146 | start_frame = int(fps * gt_s_time) 147 | end_frame = int(fps * gt_e_time) 148 | if end_frame >= ori_feat_length: 149 | end_frame = ori_feat_length - 1 150 | if start_frame > end_frame: 151 | start_frame = end_frame 152 | 153 | if flip_in_time_direction: 154 | start_frame, end_frame = ( 155 | ori_feat_length - 1 - end_frame, 156 | ori_feat_length - 1 - start_frame, 157 | ) 158 | assert start_frame <= end_frame 159 | assert 0 <= start_frame < ori_feat_length 160 | assert 0 <= end_frame < ori_feat_length 161 | start_frame += extend_pre 162 | end_frame += extend_pre 163 | 164 | start_label = np.ones(feat_length, dtype=np.float32) * self.epsilon 165 | end_label = np.ones(feat_length, dtype=np.float32) * self.epsilon 166 | 167 | y = (1 - (ori_feat_length - 3) * self.epsilon - 0.5) / 2 168 | 169 | if start_frame > 0: 170 | start_label[start_frame - 1] = y 171 | if start_frame < feat_length - 1: 172 | start_label[start_frame + 1] = y 173 | start_label[start_frame] = 0.5 174 | 175 | if end_frame > 0: 176 | end_label[end_frame - 1] = y 177 | if end_frame < feat_length - 1: 178 | end_label[end_frame + 1] = y 179 | end_label[end_frame] = 0.5 180 | # ---- above part is for ACRM use only------ 181 | 182 | internel_label = np.zeros(feat_length, dtype=np.float32) 183 | extend_inner_len = round( 184 | config.DATASET.EXTEND_INNRE * float(end_frame - start_frame + 1) 185 | ) 186 | if extend_inner_len > 0: 187 | st_ = max(0, start_frame - extend_inner_len) 188 | et_ = min(end_frame + extend_inner_len, feat_length - 1) 189 | internel_label[st_ : (et_ + 1)] = 1.0 190 | else: 191 | internel_label[start_frame:(end_frame+1)] = 1.0 192 | 193 | if np.all(internel_label==1.0): 194 | choice = np.random.choice([0, -1]) 195 | internel_label[choice] = 0.0 196 | neg_label = 1.0 - internel_label 197 | if len(internel_label) ==1: 198 | internel_label[0] = neg_label[0] = 1.0
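# A small worked example of the pair sampling below (hypothetical values, assuming num_pairs = 4):
#   internel_label = [0, 1, 1, 1, 0, 0] -> positive indices {1, 2, 3}, negative indices {0, 4, 5};
#   only three positives exist, so one positive index is re-drawn at random until exactly
#   num_pairs positive and num_pairs negative frame indices are returned.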
199 | positive_indices = np.nonzero(internel_label)[0] # indices of the positive (foreground) clips 200 | if len(positive_indices) == 0: 201 | print("wrong") 202 | positive_indices = positive_indices.tolist() 203 | np.random.shuffle(positive_indices) 204 | if len(positive_indices) >= self.num_pairs: 205 | selected_positive_indices = positive_indices[:self.num_pairs] # randomly pick num_pairs positive indices 206 | else: 207 | selected_positive_indices = positive_indices 208 | while len(selected_positive_indices) < self.num_pairs: 209 | random_positive_indices = np.random.choice(positive_indices) 210 | selected_positive_indices = np.hstack((selected_positive_indices, random_positive_indices)) 211 | 212 | # randomly pick the corresponding negative indices 213 | negative_indices = np.nonzero(neg_label)[0] # indices of the negative (background) clips 214 | if len(negative_indices) == 0: 215 | print("wrong") 216 | negative_indices = negative_indices.tolist() 217 | np.random.shuffle(negative_indices) 218 | if len(negative_indices) >=self.num_pairs: 219 | selected_negative_indices = negative_indices[:self.num_pairs] # randomly pick num_pairs negative indices 220 | else: 221 | selected_negative_indices = negative_indices 222 | while len(selected_negative_indices) < self.num_pairs: 223 | random_negative_indices = np.random.choice(negative_indices) 224 | selected_negative_indices = np.hstack((selected_negative_indices, random_negative_indices)) 225 | 226 | start_frame = np.array(start_frame) 227 | end_frame = np.array(end_frame) 228 | extend_pre = np.array(extend_pre) 229 | extend_suf = np.array(extend_suf) 230 | item = { 231 | "visual_input": visual_input, 232 | "vis_mask": visual_mask, 233 | "word_vectors": word_vectors, 234 | # "pos_tags": pos_tags, 235 | "txt_mask": torch.ones(word_vectors.shape[0], 1), 236 | "start_label": torch.from_numpy(start_label), 237 | "end_label": torch.from_numpy(end_label), 238 | "internel_label": torch.from_numpy(internel_label), 239 | "start_frame": torch.from_numpy(start_frame), 240 | "end_frame": torch.from_numpy(end_frame), 241 | "extend_pre": torch.from_numpy(extend_pre), 242 | "extend_suf": torch.from_numpy(extend_suf), 243 | "keyword_mask":torch.tensor(keyword_mask), 244 | "selected_positive_indices":np.array(selected_positive_indices), 245 | "selected_negative_indices":np.array(selected_negative_indices), 246 | "visual_len": len(visual_input) 247 | 248 | } 249 | return item, self.annotations[index] 250 | 251 | def __len__(self): 252 | return len(self.annotations) 253 | 254 | def get_video_features(self, vid, dataset): 255 | with h5py.File(os.path.join(self.feature_dirs[dataset], '{}.hdf5'.format(self.input_type[dataset])), 'r') as f: 256 | if dataset == "ActivityNet" and self.input_type["ActivityNet"]=="sub_activitynet_v1-3.c3d": 257 | features = torch.from_numpy(f[vid]['c3d_features'][:]) 258 | else: 259 | features = torch.from_numpy(f[vid][:]) 260 | if dataset != "Charades": 261 | if features.shape[0] > self.num_clips: 262 | features = average_to_fixed_length(features, num_sample_clips=self.num_clips) 263 | frame_rate = 1 264 | features = features[list(range(0, features.shape[0], frame_rate))] 265 | if config.DATASET.NORMALIZE: 266 | features = F.normalize(features, dim=1) 267 | 268 | # flip the input in time direction 269 | flip_in_time_direction = False # use for start/end label flip 270 | if ( 271 | self.split == "train" 272 | and config.DATASET.FLIP_TIME 273 | and np.random.random() < 0.5 274 | ): 275 | features = torch.flip(features, dims=[0]) 276 | flip_in_time_direction = True 277 | 278 | length = features.shape[0] 279 | prefix, suffix = 0, 0 280 | # lengthen the video in time by padding extra features in front of
the original clip; by default features sampled from another video are prepended 281 | if ( 282 | self.split == "train" 283 | and config.DATASET.EXTEND_TIME 284 | and np.random.random() < 0.7 285 | ): 286 | # mean_feature = torch.mean(features, dim=0) 287 | # extend_feature = mean_feature.unsqueeze(0).repeat((prefix, 1)) # add mean feature 288 | # extend_feature = torch.zeros((prefix, features.shape[1])) # add zeros feature 289 | # --->add another_features start<--- 290 | index = np.random.randint(len(self.annotations)) # another_video 291 | video_id = self.annotations[index]["video"] 292 | while video_id == vid: 293 | index = np.random.randint(len(self.annotations)) # another_video 294 | video_id = self.annotations[index]["video"] 295 | featurePath = os.path.join(self.feature_dirs[dataset], video_id + ".npy") 296 | another_features = np.load(featurePath) 297 | another_features = np.squeeze(another_features) 298 | another_features = torch.from_numpy(another_features).float() 299 | # cap the feature length at 1500 300 | if another_features.shape[0] > 1500: 301 | another_features = average_to_fixed_length( 302 | another_features, num_sample_clips=1500 303 | ) 304 | another_features = another_features[ 305 | list(range(0, another_features.shape[0], frame_rate)) 306 | ] 307 | prefix = round(np.random.random() * another_features.shape[0]) 308 | extend_feature = another_features[:prefix] 309 | assert extend_feature.shape[0] == prefix 310 | # --->add another_features end<--- 311 | features = torch.cat([extend_feature, features], dim=0) 312 | vis_mask = torch.ones((features.shape[0], 1)) 313 | 314 | return features, vis_mask, prefix, suffix, flip_in_time_direction 315 | 316 | def get_annotation(self): 317 | raise NotImplementedError 318 | -------------------------------------------------------------------------------- /models/PGBP/decoder.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Optional 3 | import torch 4 | import torch.nn.functional as F 5 | from torch import nn, Tensor 6 | import math 7 | from .operation import Conv1D, mask_logits 8 | from torchvision.ops import RoIAlign 9 | from .layers import Prediction 10 | 11 | class MultiheadAttention(nn.Module): 12 | def __init__(self, dim,num_heads,dropout,dim_v): 13 | super(MultiheadAttention, self).__init__() 14 | assert dim % num_heads == 0, 'The channels (%d) is not a multiple of attention heads (%d)' % ( 15 | dim, num_heads) 16 | self.head_size, self.num_heads, self.dim = int( 17 | dim / num_heads), num_heads, dim 18 | self.head_size_v = int(dim_v/num_heads) 19 | self.dim_v = dim_v 20 | self.dropout = nn.Dropout(p=dropout) 21 | def transpose_for_scores(self, x): 22 | new_x_shape = x.size()[:-1] + (self.num_heads, self.head_size) 23 | x = x.view(*new_x_shape) 24 | return x.permute(0, 2, 1, 25 | 3) # (batch_size, num_heads, w_seq_len, head_size) 26 | 27 | def transpose_for_scores_v(self, x): 28 | new_x_shape = x.size()[:-1] + (self.num_heads, self.head_size_v) 29 | x = x.view(*new_x_shape) 30 | return x.permute(0, 2, 1, 31 | 3) # (batch_size, num_heads, w_seq_len, head_size) 32 | @staticmethod 33 | def combine_last_two_dim(x): 34 | old_shape = list(x.size()) 35 | new_shape = old_shape[:-2] + [old_shape[-2] * old_shape[-1]] 36 | return x.reshape(shape=new_shape) 37 | 38 | def forward(self, q,k,v, mask=None): 39 | query = self.transpose_for_scores( 40 | q.permute(1, 0, 2)) # (batch_size, num_heads, seq_len, head_size) 41 | key = self.transpose_for_scores(k.permute(1, 0, 2)) 42 | value =
self.transpose_for_scores_v(v.permute(1, 0, 2)) 43 | attention_scores = torch.matmul(query, key.transpose( 44 | -1, -2)) # (batch_size, num_heads, seq_len, seq_len) 45 | attention_scores = attention_scores / math.sqrt(self.head_size) 46 | if mask is not None: # masking 47 | mask = mask.unsqueeze(1).unsqueeze( 48 | 2) # (batch_size, 1, 1, seq_len) 49 | attention_scores = mask_logits(attention_scores, mask) 50 | attention_probs = torch.softmax( 51 | attention_scores, 52 | dim=-1) # (batch_size, num_heads, seq_len, seq_len) 53 | attention_probs = self.dropout(attention_probs) 54 | value = torch.matmul( 55 | attention_probs, 56 | value) # (batch_size, num_heads, seq_len, head_size) 57 | value = self.combine_last_two_dim(value.permute( 58 | 0, 2, 1, 3)) # (batch_size, seq_len, dim) 59 | # intermediate layer 60 | return value.permute(1, 0, 2) 61 | 62 | 63 | class TransformerDecoder(nn.Module): 64 | 65 | def __init__(self, decoder_layer, configs, norm=None): 66 | super().__init__() 67 | self.layers = _get_clones(decoder_layer, configs.detr_layers) 68 | self.detr_layers = configs.detr_layers 69 | self.norm = norm 70 | self.return_intermediate = configs.return_intermediate 71 | assert configs.return_intermediate 72 | self.query_dim = configs.query_dim 73 | self.dim = configs.dim 74 | self.norm1 = nn.LayerNorm(configs.dim) 75 | self.norm2 = nn.LayerNorm(configs.dim) 76 | assert configs.query_scale_type in ['cond_elewise', 'cond_scalar', 'fix_elewise'] 77 | self.query_scale_type = configs.query_scale_type 78 | if configs.query_scale_type == 'cond_elewise': 79 | self.query_scale = MLP(configs.dim, configs.dim, configs.dim, 2) 80 | elif configs.query_scale_type == 'cond_scalar': 81 | self.query_scale = MLP(configs.dim, configs.dim, 1, 2) 82 | elif configs.query_scale_type == 'fix_elewise': 83 | self.query_scale = nn.Embedding(configs.detr_layers, configs.dim) 84 | else: 85 | raise NotImplementedError("Unknown query_scale_type: {}".format(configs.query_scale_type)) 86 | 87 | self.ref_point_head = MLP(configs.dim, configs.dim, configs.dim, 2) 88 | 89 | # self.bbox_embed = None 90 | # for DAB-deter 91 | if configs.bbox_embed_diff_each_layer: 92 | self.bbox_embed = nn.ModuleList([MLP(configs.dim, configs.dim, 2, 3) for i in range(configs.detr_layers)]) 93 | else: 94 | self.bbox_embed = MLP(configs.dim, configs.dim, 2, 3) 95 | # init bbox_embed 96 | if configs.bbox_embed_diff_each_layer: 97 | for bbox_embed in self.bbox_embed: 98 | nn.init.constant_(bbox_embed.layers[-1].weight.data, 0) 99 | nn.init.constant_(bbox_embed.layers[-1].bias.data, 0) 100 | else: 101 | nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) 102 | nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) 103 | self.d_model =configs.dim 104 | self.modulate_t_attn = configs.modulate_t_attn 105 | self.bbox_embed_diff_each_layer = configs.bbox_embed_diff_each_layer 106 | 107 | if configs.modulate_t_attn: 108 | self.ref_anchor_head = MLP(configs.dim, configs.dim, 1, 2) 109 | 110 | if not configs.keep_query_pos: 111 | for layer_id in range(configs.detr_layers - 1): 112 | self.layers[layer_id + 1].ca_qpos_proj = None 113 | 114 | def forward(self,pos_feature,scale,tgt, memory, 115 | tgt_mask: Optional[Tensor] = None, 116 | memory_mask: Optional[Tensor] = None, 117 | tgt_key_padding_mask: Optional[Tensor] = None, 118 | memory_key_padding_mask: Optional[Tensor] = None, 119 | pos: Optional[Tensor] = None, 120 | refpoints_unsigmoid: Optional[Tensor] = None, # num_queries, bs, 2 121 | ): 122 | output =self.norm1(tgt) #torch.Size([10, 32, 
256]) 123 | memory = self.norm2(memory) 124 | intermediate = [] 125 | reference_points = refpoints_unsigmoid.sigmoid() 126 | ref_points = [reference_points] 127 | # import ipdb; ipdb.set_trace() 128 | 129 | for layer_id, layer in enumerate(self.layers): #rence_points torch.Size([10, 32, 2]) 130 | obj_center = reference_points[..., :self.query_dim]#torch.Size([10, 32, 2]) 131 | # get sine embedding for the query vector 132 | query_sine_embed = gen_sineembed_for_position(obj_center,self.dim//2) 133 | # print('line230', query_sine_embed.shape) 134 | query_pos = self.ref_point_head(query_sine_embed) #torch.Size([10, 32, 256]) 135 | # print('line232',query_sine_embed.shape) 136 | # For the first decoder layer, we do not apply transformation over p_s 137 | if self.query_scale_type != 'fix_elewise': 138 | if layer_id == 0: 139 | pos_transformation = 1 140 | else: 141 | pos_transformation = self.query_scale(output) 142 | else: 143 | pos_transformation = self.query_scale.weight[layer_id] 144 | 145 | # apply transformation 146 | # print(query_sine_embed.shape) # 10 32 512 147 | query_sine_embed = query_sine_embed * pos_transformation 148 | 149 | # modulated HW attentions 150 | if self.modulate_t_attn: 151 | reft_cond = self.ref_anchor_head(output).sigmoid() # nq, bs, 1 152 | # print(reft_cond.shape, reft_cond[..., 0].shape) # 10 32 1, 10 32 153 | # print(obj_center.shape, obj_center[..., 1].shape) # 10 32 2, 10 32 154 | # print(query_sine_embed.shape) # 10 32 256 155 | 156 | query_sine_embed *= (reft_cond[..., 0] / obj_center[..., 1]).unsqueeze(-1) 157 | 158 | output = layer(pos_feature,scale,reference_points,output, memory, tgt_mask=tgt_mask, 159 | memory_mask=memory_mask, 160 | tgt_key_padding_mask=tgt_key_padding_mask, 161 | memory_key_padding_mask=memory_key_padding_mask, 162 | pos=pos, query_pos=query_pos, query_sine_embed=query_sine_embed, 163 | is_first=(layer_id == 0)) #torch.Size([10, 32, 256]) 164 | 165 | # iter update 166 | if self.bbox_embed is not None: 167 | if self.bbox_embed_diff_each_layer: 168 | tmp = self.bbox_embed[layer_id](output) 169 | else: 170 | tmp = self.bbox_embed(output) 171 | # import ipdb; ipdb.set_trace() 172 | tmp[..., :self.query_dim] += inverse_sigmoid(reference_points) 173 | new_reference_points = tmp[..., :self.query_dim].sigmoid() 174 | if layer_id != self.detr_layers - 1: 175 | ref_points.append(new_reference_points) 176 | reference_points = new_reference_points.detach() #torch.Size([10, 32, 2]) 177 | 178 | if self.return_intermediate: 179 | intermediate.append(self.norm(output)) 180 | 181 | if self.norm is not None: 182 | output = self.norm(output) 183 | if self.return_intermediate: 184 | intermediate.pop() 185 | intermediate.append(output) 186 | 187 | if self.return_intermediate: 188 | if self.bbox_embed is not None: 189 | return [ 190 | torch.stack(intermediate).transpose(1, 2), 191 | torch.stack(ref_points).transpose(1, 2), 192 | ] 193 | else: 194 | return [ 195 | torch.stack(intermediate).transpose(1, 2), 196 | reference_points.unsqueeze(0).transpose(1, 2) 197 | ] 198 | 199 | return output.unsqueeze(0) 200 | 201 | class TransformerDecoderLayer(nn.Module): 202 | 203 | def __init__(self, configs): 204 | super().__init__() 205 | # Decoder Self-Attention 206 | d_model = configs.dim 207 | nhead =configs.num_heads 208 | rm_self_attn_decoder = configs.rm_self_attn_decoder 209 | dropout = configs.dropout 210 | dim_feedforward = configs.feedforward 211 | beta = configs.beta 212 | self.sementic_fu = configs.sementic_fu 213 | self.aligned_len = configs.aligned_len 
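# The rest of __init__ builds the three sub-blocks of this conditional-DETR style decoder layer:
# (1) query self-attention with separate content/position projections (skipped when rm_self_attn_decoder is set),
# (2) cross-attention from the queries to the video memory, in which the content part and the sinusoidal position part are concatenated per head, and
# (3) a position-wise feed-forward network, each followed by dropout and LayerNorm.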
214 | 215 | if not rm_self_attn_decoder: 216 | self.sa_qcontent_proj = nn.Linear(d_model, d_model) 217 | self.sa_qpos_proj = nn.Linear(d_model, d_model) 218 | self.sa_kcontent_proj = nn.Linear(d_model, d_model) 219 | self.sa_kpos_proj = nn.Linear(d_model, d_model) 220 | self.sa_v_proj = nn.Linear(d_model, d_model) 221 | self.self_attn = MultiheadAttention(d_model, nhead,dropout,dim_v=d_model) 222 | self.norm1 = nn.LayerNorm(d_model) 223 | self.dropout1 = nn.Dropout(dropout) 224 | 225 | # Decoder Cross-Attention 226 | self.ca_qcontent_proj = nn.Linear(d_model, d_model) 227 | self.ca_qpos_proj = nn.Linear(d_model, d_model) 228 | self.ca_kcontent_proj = nn.Linear(d_model, d_model) 229 | self.ca_kpos_proj = nn.Linear(d_model, d_model) 230 | self.ca_v_proj = nn.Linear(d_model, d_model) 231 | self.ca_qpos_sine_proj = nn.Linear(d_model, d_model) 232 | self.cross_attn = MultiheadAttention(d_model * 2, nhead,dropout, dim_v=d_model) 233 | 234 | self.nhead = nhead 235 | self.rm_self_attn_decoder = rm_self_attn_decoder 236 | 237 | # Implementation of Feedforward model 238 | self.linear1 = nn.Linear(d_model, dim_feedforward) 239 | self.dropout = nn.Dropout(dropout) 240 | self.linear2 = nn.Linear(dim_feedforward, d_model) 241 | 242 | self.norm2 = nn.LayerNorm(d_model) 243 | self.norm3 = nn.LayerNorm(d_model) 244 | self.norm4 = nn.LayerNorm(d_model) 245 | self.dropout2 = nn.Dropout(dropout) 246 | self.dropout3 = nn.Dropout(dropout) 247 | self.dropout4 = nn.Dropout(dropout) 248 | 249 | self.activation = _get_activation_fn(configs.activation) 250 | self.normalize_before = configs.normalize_before 251 | self.keep_query_pos = configs.keep_query_pos 252 | if self.sementic_fu is True: 253 | self.sementic_fusion = semantic_align(d_model,dropout,beta,self.aligned_len) 254 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 255 | return tensor if pos is None else tensor + pos 256 | 257 | def forward(self, pos_feature,scale,ref_points,tgt, memory, 258 | tgt_mask: Optional[Tensor] = None, 259 | memory_mask: Optional[Tensor] = None, 260 | tgt_key_padding_mask: Optional[Tensor] = None, 261 | memory_key_padding_mask: Optional[Tensor] = None, 262 | pos: Optional[Tensor] = None, 263 | query_pos: Optional[Tensor] = None, 264 | query_sine_embed=None, 265 | is_first=False, 266 | ): 267 | 268 | # ========== Begin of Self-Attention ============= 269 | if not self.rm_self_attn_decoder: 270 | # Apply projections here 271 | # shape: num_queries x batch_size x 256 272 | q_content = self.sa_qcontent_proj(tgt) # target is the input of the first decoder layer. zero by default. 
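# The content stream (tgt) and the positional stream (query_pos) get their own linear
# projections and are summed before attention, following the conditional/DAB-DETR decoder
# design; all tensors here are laid out as (num_queries, batch_size, d_model).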
273 | q_pos = self.sa_qpos_proj(query_pos) 274 | k_content = self.sa_kcontent_proj(tgt) 275 | k_pos = self.sa_kpos_proj(query_pos) 276 | v = self.sa_v_proj(tgt) 277 | 278 | num_queries, bs, n_model = q_content.shape 279 | hw, _, _ = k_content.shape 280 | 281 | q = q_content + q_pos 282 | k = k_content + k_pos 283 | 284 | tgt2 = self.self_attn(q, k, v, mask=tgt_key_padding_mask) 285 | # ========== End of Self-Attention ============= 286 | box = ref_points.transpose(0,1) * scale.unsqueeze(1) 287 | tgt = tgt + self.dropout1(tgt2) 288 | tgt = self.norm1(tgt) 289 | if self.sementic_fu is True: 290 | tgt3 = self.sementic_fusion(memory.transpose(0,1),box,tgt.transpose(0,1),pos_feature) 291 | tgt3 =tgt + self.dropout4(tgt3) 292 | tgt = self.norm4(tgt3) 293 | # ========== Begin of Cross-Attention ============= 294 | # Apply projections here 295 | # shape: num_queries x batch_size x 256 296 | q_content = self.ca_qcontent_proj(tgt) 297 | k_content = self.ca_kcontent_proj(memory) 298 | v = self.ca_v_proj(memory) 299 | 300 | num_queries, bs, n_model = q_content.shape 301 | hw, _, _ = k_content.shape 302 | 303 | k_pos = self.ca_kpos_proj(pos) 304 | 305 | # For the first decoder layer, we concatenate the positional embedding predicted from 306 | # the object query (the positional embedding) into the original query (key) in DETR. 307 | if is_first or self.keep_query_pos: 308 | q_pos = self.ca_qpos_proj(query_pos) 309 | q = q_content + q_pos 310 | k = k_content + k_pos 311 | else: 312 | q = q_content 313 | k = k_content 314 | 315 | q = q.view(num_queries, bs, self.nhead, n_model // self.nhead) 316 | query_sine_embed = self.ca_qpos_sine_proj(query_sine_embed) 317 | query_sine_embed = query_sine_embed.view(num_queries, bs, self.nhead, n_model // self.nhead) 318 | q = torch.cat([q, query_sine_embed], dim=3).view(num_queries, bs, n_model * 2) 319 | k = k.view(hw, bs, self.nhead, n_model // self.nhead) 320 | k_pos = k_pos.view(hw, bs, self.nhead, n_model // self.nhead) 321 | k = torch.cat([k, k_pos], dim=3).view(hw, bs, n_model * 2) 322 | 323 | tgt2 = self.cross_attn(q,k,v,mask=memory_key_padding_mask) 324 | # ========== End of Cross-Attention ============= 325 | 326 | tgt = tgt + self.dropout2(tgt2) 327 | tgt = self.norm2(tgt) 328 | tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) 329 | tgt = tgt + self.dropout3(tgt2) 330 | tgt = self.norm3(tgt) 331 | return tgt 332 | 333 | def _get_activation_fn(activation): 334 | """Return an activation function given a string""" 335 | if activation == "relu": 336 | return F.relu 337 | if activation == "gelu": 338 | return F.gelu 339 | if activation == "glu": 340 | return F.glu 341 | if activation == "prelu": 342 | return nn.PReLU() 343 | if activation == "selu": 344 | return F.selu 345 | raise RuntimeError(F"activation should be relu/gelu, not {activation}.") 346 | 347 | class MLP(nn.Module): 348 | """ Very simple multi-layer perceptron (also called FFN)""" 349 | 350 | def __init__(self, input_dim, hidden_dim, output_dim, detr_layers): 351 | super().__init__() 352 | self.detr_layers = detr_layers 353 | h = [hidden_dim] * (detr_layers - 1) 354 | self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) 355 | 356 | def forward(self, x): 357 | for i, layer in enumerate(self.layers): 358 | x = F.relu(layer(x)) if i < self.detr_layers - 1 else layer(x) 359 | return x 360 | 361 | def inverse_sigmoid(x, eps=1e-3): 362 | x = x.clamp(min=0, max=1) 363 | x1 = x.clamp(min=eps) 364 | x2 = (1 - x).clamp(min=eps) 365 | return 
torch.log(x1/x2) 366 | 367 | def gen_sineembed_for_position(pos_tensor,dim): 368 | # n_query, bs, _ = pos_tensor.size() 369 | # sineembed_tensor = torch.zeros(n_query, bs, 256) 370 | scale = 2 * math.pi 371 | dim_t = torch.arange(dim, dtype=torch.float32, device=pos_tensor.device) 372 | # dim_t = 10000 ** (2 * (dim_t // 2) / dim) 373 | dim_t = 10000 ** (2 * torch.div(dim_t, 2, rounding_mode='trunc') / dim) 374 | center_embed = pos_tensor[:, :, 0] * scale 375 | pos_x = center_embed[:, :, None] / dim_t 376 | pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) 377 | 378 | span_embed = pos_tensor[:, :, 1] * scale 379 | pos_w = span_embed[:, :, None] / dim_t 380 | pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2) 381 | 382 | pos = torch.cat((pos_x, pos_w), dim=2) 383 | return pos 384 | 385 | def _get_clones(module, N): 386 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) 387 | 388 | class semantic_align(nn.Module): 389 | def __init__(self, dim, dropout,beta,aligned_len): 390 | super().__init__() 391 | self.aligned_len = aligned_len 392 | self.gate = Prediction(in_dim= 2*dim, hidden_dim= dim, out_dim=2,drop_rate=dropout) 393 | self.softmax = nn.Softmax(2) 394 | self.contrast1 = ContrastBlock(dim,beta) 395 | self.contrast2 = ContrastBlock(dim,beta) 396 | def forward(self,features,quires_box,quires_features,pos_feature): 397 | B, L1, _ = quires_box.shape 398 | _,L,C = features.shape 399 | batch_feature = [] 400 | roi_start = torch.round(((quires_box[..., 0] - quires_box[..., 1] / 2)*L).clamp(0, L-1)).long() 401 | roi_end = torch.round(((quires_box[..., 0] + quires_box[..., 1] / 2)*L).clamp(0, L-1)).long() 402 | start_features = torch.gather(features, dim=1, index=roi_start.unsqueeze(-1).expand(-1, -1, C)) 403 | start_features = self.contrast1(start_features,pos_feature).unsqueeze(-2) 404 | end_features = torch.gather(features, dim=1, index=roi_end.unsqueeze(-1).expand(-1, -1, C)) 405 | end_features = self.contrast2(end_features,pos_feature).unsqueeze(-2) 406 | boundary_features = torch.cat((start_features,end_features),dim = -2) 407 | if self.aligned_len: 408 | pool_boundary_features = torch.mean(boundary_features, dim=2, keepdim=False) 409 | else: 410 | pool_boundary_features,_ = torch.max(boundary_features, dim=2, keepdim=False) 411 | x = torch.cat([pool_boundary_features ,quires_features],dim = -1) 412 | gate =self.softmax(self.gate(x)) 413 | x = pool_boundary_features*gate[...,0:1] + quires_features*gate[...,1:2] 414 | return x.transpose(0,1) 415 | 416 | 417 | class ContrastBlock(nn.Module): 418 | def __init__(self, dim, beta): 419 | super(ContrastBlock, self).__init__() 420 | self.conv1 = nn.Conv1d(in_channels=dim, 421 | out_channels=dim//beta, 422 | kernel_size=1, 423 | stride=1, 424 | padding=0, 425 | bias=True) 426 | self.conv2 = nn.Conv1d(in_channels=dim//beta, 427 | out_channels=dim, 428 | kernel_size=1, 429 | stride=1, 430 | padding=0, 431 | bias=True) 432 | self.activation = nn.ReLU() 433 | self.layer_norm1 = nn.LayerNorm(dim, eps=1e-6) 434 | 435 | def forward(self,v_1,v_2): 436 | v_1 = v_1.transpose(1, 2) 437 | v_2 = v_2.transpose(1, 2) 438 | v = v_1 * v_2 439 | v = self.conv1(v) 440 | v = self.activation(v) 441 | v = torch.sigmoid(self.layer_norm1(self.conv2(v).transpose(1, 2))) 442 | v = v * v_1.transpose(1, 2) 443 | return v 444 | # class semantic_align(nn.Module): 445 | # def __init__(self, dim,beta,aligned_len): 446 | # super().__init__() 447 | # self.aligned_len = aligned_len 448 | # 
self.gate = nn.Linear(in_features=2*dim,out_features=2) 449 | # self.softmax = nn.Softmax(2) 450 | # self.contrast = ContrastBlock(dim,beta) 451 | # def forward(self, features,quires_box,quires_features,pos_feature): 452 | # B, L1, _ = quires_box.shape 453 | # _,L,C = features.shape 454 | # batch_feature = [] 455 | # roi_start = torch.round(((quires_box[..., 0] - quires_box[..., 1] / 2)*L).clamp(0, L-1)).long() 456 | # roi_end = torch.round(((quires_box[..., 0] + quires_box[..., 1] / 2)*L).clamp(0, L-1)).long() 457 | # start_features = torch.gather(features, dim=1, index=roi_start.unsqueeze(-1).expand(-1, -1, C)).unsqueeze(-2) 458 | # end_features = torch.gather(features, dim=1, index=roi_end.unsqueeze(-1).expand(-1, -1, C)).unsqueeze(-2) 459 | # boundary_features = torch.cat((start_features,end_features),dim = -2) 460 | # boundary_features = self.contrast(boundary_features,pos_feature) 461 | # if self.aligned_len: 462 | # pool_boundary_features = torch.mean(boundary_features, dim=2, keepdim=False) 463 | # else: 464 | # pool_boundary_features,_ = torch.max(boundary_features, dim=2, keepdim=False) 465 | # x = torch.cat([pool_boundary_features ,quires_features],dim = -1) 466 | # gate =self.softmax(self.gate(x)) 467 | # x = pool_boundary_features*gate[...,0:1] + quires_features*gate[...,1:2] 468 | # return x.transpose(0,1) 469 | 470 | 471 | # class ContrastBlock(nn.Module): 472 | # def __init__(self, dim, beta): 473 | # super(ContrastBlock, self).__init__() 474 | # self.conv1 = nn.Conv1d(in_channels=dim, 475 | # out_channels=dim//beta, 476 | # kernel_size=1, 477 | # stride=1, 478 | # padding=0, 479 | # bias=True) 480 | # self.conv2 = nn.Conv1d(in_channels=dim//beta, 481 | # out_channels=dim, 482 | # kernel_size=1, 483 | # stride=1, 484 | # padding=0, 485 | # bias=True) 486 | # self.activation = nn.ReLU() 487 | # self.layer_norm1 = nn.LayerNorm(dim, eps=1e-6) 488 | 489 | # def forward(self,v_1,v_2): 490 | # v_1 = v_1.transpose(1, 2) 491 | # v_2 = v_2.transpose(1, 2) 492 | # v = v_1 * v_2 493 | # v = self.conv1(v) 494 | # v = self.activation(v) 495 | # v = torch.sigmoid(self.layer_norm1(self.conv2(v).transpose(1, 2))) 496 | # v = v * v_1.transpose(1, 2) 497 | # return v -------------------------------------------------------------------------------- /models/PGBP/attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | import torch.nn.functional as F 5 | from .operation import Conv1D, mask_logits 6 | from .encoder import MultiStepLSTMEncoder, TemporalContextModule 7 | from .phraseEncoder import PhraseEncodeNet 8 | 9 | class TemporalMaxer(nn.Module): 10 | def __init__( 11 | self, 12 | kernel_size, 13 | stride, 14 | padding, 15 | n_embd): 16 | super().__init__() 17 | self.ds_pooling = nn.MaxPool1d( 18 | kernel_size, stride=stride, padding=padding) 19 | 20 | self.stride = stride 21 | 22 | def forward(self, x, mask): 23 | 24 | # out, out_mask = self.channel_att(x, mask) 25 | x = x.permute(0,2,1) 26 | mask = mask.unsqueeze(1) 27 | if self.stride > 1: 28 | # downsample the mask using nearest neighbor 29 | out_mask = F.interpolate( 30 | mask.to(x.dtype), size=(x.size(-1)+self.stride-1)//self.stride, mode='nearest') 31 | else: 32 | # masking out the features 33 | out_mask = mask 34 | 35 | out = self.ds_pooling(x) * out_mask.to(x.dtype) 36 | out = out.permute(0,2,1) 37 | return out, out_mask.squeeze(1) 38 | 39 | class DETR_Decoder(nn.Module): 40 | def __init__(self, configs): 41 | super(DETR_Decoder, 
self).__init__() 42 | dim = configs.dim 43 | num_heads = configs.num_heads 44 | drop_rate = configs.drop_rate 45 | assert dim % num_heads == 0, 'The channels (%d) is not a multiple of attention heads (%d)' % ( 46 | dim, num_heads) 47 | self.head_size, self.num_heads, self.dim = int( 48 | dim / num_heads), num_heads, dim 49 | self.attention = Cross_Attention(configs) 50 | self.dropout = nn.Dropout(p=drop_rate) 51 | self.query = Conv1D(in_dim=dim, 52 | out_dim=dim, 53 | kernel_size=1, 54 | stride=1, 55 | padding=0, 56 | bias=True) 57 | self.key = Conv1D(in_dim=dim, 58 | out_dim=dim, 59 | kernel_size=1, 60 | stride=1, 61 | padding=0, 62 | bias=True) 63 | self.value = Conv1D(in_dim=dim, 64 | out_dim=dim, 65 | kernel_size=1, 66 | stride=1, 67 | padding=0, 68 | bias=True) 69 | # self.value_visual = None 70 | self.layer_norm1 = nn.LayerNorm(dim, eps=1e-6) 71 | self.layer_norm2 = nn.LayerNorm(dim, eps=1e-6) 72 | self.out_layer1 = Conv1D(in_dim=dim, 73 | out_dim=dim, 74 | kernel_size=1, 75 | stride=1, 76 | padding=0, 77 | bias=True) 78 | self.output_activation = nn.GELU() 79 | self.out_layer2 = Conv1D(in_dim=dim, 80 | out_dim=dim, 81 | kernel_size=1, 82 | stride=1, 83 | padding=0, 84 | bias=True) 85 | 86 | def transpose_for_scores(self, x): 87 | new_x_shape = x.size()[:-1] + (self.num_heads, self.head_size) 88 | x = x.view(*new_x_shape) 89 | return x.permute(0, 2, 1, 90 | 3) # (batch_size, num_heads, w_seq_len, head_size) 91 | 92 | @staticmethod 93 | def combine_last_two_dim(x): 94 | old_shape = list(x.size()) 95 | new_shape = old_shape[:-2] + [old_shape[-2] * old_shape[-1]] 96 | return x.reshape(shape=new_shape) 97 | 98 | def forward(self, memory, x,mask = None): 99 | output = self.layer_norm1(memory) 100 | query = self.transpose_for_scores( 101 | self.query(output)) # (batch_size, num_heads, seq_len, head_size) 102 | key = self.transpose_for_scores(self.key(output)) 103 | value = self.transpose_for_scores(self.value(output)) 104 | attention_scores = torch.matmul(query, key.transpose( 105 | -1, -2)) # (batch_size, num_heads, seq_len, seq_len) 106 | attention_scores = attention_scores / math.sqrt(self.head_size) 107 | attention_probs = torch.softmax( 108 | attention_scores, 109 | dim=-1) # (batch_size, num_heads, seq_len, seq_len) 110 | attention_probs = self.dropout(attention_probs) 111 | value = torch.matmul( 112 | attention_probs, 113 | value) # (batch_size, num_heads, seq_len, head_size) 114 | value = self.combine_last_two_dim(value.permute( 115 | 0, 2, 1, 3)) # (batch_size, seq_len, dim) 116 | # intermediate layer 117 | output = self.dropout(value) 118 | residual = output + memory 119 | residual = self.layer_norm2(residual) 120 | output = self.attention(residual,x,mask) 121 | return output 122 | 123 | class Cross_Attention(nn.Module): 124 | def __init__(self, configs): 125 | super(Cross_Attention, self).__init__() 126 | dim = configs.dim 127 | num_heads = configs.num_heads 128 | drop_rate = configs.drop_rate 129 | assert dim % num_heads == 0, 'The channels (%d) is not a multiple of attention heads (%d)' % ( 130 | dim, num_heads) 131 | self.head_size, self.num_heads, self.dim = int( 132 | dim / num_heads), num_heads, dim 133 | self.dropout = nn.Dropout(p=drop_rate) 134 | self.query = Conv1D(in_dim=dim, 135 | out_dim=dim, 136 | kernel_size=1, 137 | stride=1, 138 | padding=0, 139 | bias=True) 140 | self.key = Conv1D(in_dim=dim, 141 | out_dim=dim, 142 | kernel_size=1, 143 | stride=1, 144 | padding=0, 145 | bias=True) 146 | self.value = Conv1D(in_dim=dim, 147 | out_dim=dim, 148 | kernel_size=1, 149 
| stride=1, 150 | padding=0, 151 | bias=True) 152 | # self.value_visual = None 153 | self.layer_norm1 = nn.LayerNorm(dim, eps=1e-6) 154 | self.layer_norm2 = nn.LayerNorm(dim, eps=1e-6) 155 | self.layer_norm3 = nn.LayerNorm(dim, eps=1e-6) 156 | self.out_layer1 = Conv1D(in_dim=dim, 157 | out_dim=dim, 158 | kernel_size=1, 159 | stride=1, 160 | padding=0, 161 | bias=True) 162 | self.output_activation = nn.GELU() 163 | self.out_layer2 = Conv1D(in_dim=dim, 164 | out_dim=dim, 165 | kernel_size=1, 166 | stride=1, 167 | padding=0, 168 | bias=True) 169 | 170 | def transpose_for_scores(self, x): 171 | new_x_shape = x.size()[:-1] + (self.num_heads, self.head_size) 172 | x = x.view(*new_x_shape) 173 | return x.permute(0, 2, 1, 174 | 3) # (batch_size, num_heads, w_seq_len, head_size) 175 | 176 | @staticmethod 177 | def combine_last_two_dim(x): 178 | old_shape = list(x.size()) 179 | new_shape = old_shape[:-2] + [old_shape[-2] * old_shape[-1]] 180 | return x.reshape(shape=new_shape) 181 | 182 | def forward(self, memory,x,mask = None): 183 | output = self.layer_norm1(memory) 184 | x = self.layer_norm3(x) 185 | query = self.transpose_for_scores( 186 | self.query(output)) # (batch_size, num_heads, seq_len, head_size) 187 | key = self.transpose_for_scores(self.key(x)) 188 | value = self.transpose_for_scores(self.value(x)) 189 | attention_scores = torch.matmul(query, key.transpose( 190 | -1, -2)) # (batch_size, num_heads, seq_len, seq_len) 191 | attention_scores = attention_scores / math.sqrt(self.head_size) 192 | if mask is not None: # masking 193 | mask = mask.unsqueeze(1).unsqueeze( 194 | 2) # (batch_size, 1, 1, seq_len) 195 | attention_scores = mask_logits(attention_scores, mask) 196 | attention_probs = torch.softmax( 197 | attention_scores, 198 | dim=-1) # (batch_size, num_heads, seq_len, seq_len) 199 | attention_probs = self.dropout(attention_probs) 200 | value = torch.matmul( 201 | attention_probs, 202 | value) # (batch_size, num_heads, seq_len, head_size) 203 | value = self.combine_last_two_dim(value.permute( 204 | 0, 2, 1, 3)) # (batch_size, seq_len, dim) 205 | # intermediate layer 206 | output = self.dropout(value) 207 | residual = output + memory 208 | output = self.layer_norm2(residual) 209 | output = self.out_layer1(output) 210 | output = self.output_activation(output) 211 | output = self.dropout(output) 212 | output = self.out_layer2(output) + residual 213 | return output 214 | 215 | class MultiHeadAttention(nn.Module): 216 | def __init__(self, configs): 217 | super(MultiHeadAttention, self).__init__() 218 | dim = configs.dim 219 | num_heads = configs.num_heads 220 | drop_rate = configs.drop_rate 221 | assert dim % num_heads == 0, 'The channels (%d) is not a multiple of attention heads (%d)' % ( 222 | dim, num_heads) 223 | self.head_size, self.num_heads, self.dim = int( 224 | dim / num_heads), num_heads, dim 225 | self.dropout = nn.Dropout(p=drop_rate) 226 | self.query = Conv1D(in_dim=dim, 227 | out_dim=dim, 228 | kernel_size=1, 229 | stride=1, 230 | padding=0, 231 | bias=True) 232 | self.key = Conv1D(in_dim=dim, 233 | out_dim=dim, 234 | kernel_size=1, 235 | stride=1, 236 | padding=0, 237 | bias=True) 238 | self.value = Conv1D(in_dim=dim, 239 | out_dim=dim, 240 | kernel_size=1, 241 | stride=1, 242 | padding=0, 243 | bias=True) 244 | # self.value_visual = None 245 | self.layer_norm1 = nn.LayerNorm(dim, eps=1e-6) 246 | self.layer_norm2 = nn.LayerNorm(dim, eps=1e-6) 247 | self.out_layer1 = Conv1D(in_dim=dim, 248 | out_dim=dim, 249 | kernel_size=1, 250 | stride=1, 251 | padding=0, 252 | bias=True) 
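# out_layer1 -> GELU -> dropout -> out_layer2 is the position-wise output projection that forward() adds back onto the attention residual.
# A minimal usage sketch (hypothetical `configs` with dim=256, num_heads=8, drop_rate=0.1 is assumed):
#   attn = MultiHeadAttention(configs)
#   out = attn(torch.randn(2, 64, 256), mask=torch.ones(2, 64))  # -> (2, 64, 256)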
253 | self.output_activation = nn.GELU() 254 | self.out_layer2 = Conv1D(in_dim=dim, 255 | out_dim=dim, 256 | kernel_size=1, 257 | stride=1, 258 | padding=0, 259 | bias=True) 260 | 261 | def transpose_for_scores(self, x): 262 | new_x_shape = x.size()[:-1] + (self.num_heads, self.head_size) 263 | x = x.view(*new_x_shape) 264 | return x.permute(0, 2, 1, 265 | 3) # (batch_size, num_heads, w_seq_len, head_size) 266 | 267 | @staticmethod 268 | def combine_last_two_dim(x): 269 | old_shape = list(x.size()) 270 | new_shape = old_shape[:-2] + [old_shape[-2] * old_shape[-1]] 271 | return x.reshape(shape=new_shape) 272 | 273 | def forward(self, x, mask=None): 274 | output = self.layer_norm1(x) # (batch_size, seq_len, dim) 275 | # output = self.dropout(output) 276 | # multi-head attention layer 277 | query = self.transpose_for_scores( 278 | self.query(output)) # (batch_size, num_heads, seq_len, head_size) 279 | key = self.transpose_for_scores(self.key(output)) 280 | value = self.transpose_for_scores(self.value(output)) 281 | attention_scores = torch.matmul(query, key.transpose( 282 | -1, -2)) # (batch_size, num_heads, seq_len, seq_len) 283 | attention_scores = attention_scores / math.sqrt(self.head_size) 284 | if mask is not None: # masking 285 | mask = mask.unsqueeze(1).unsqueeze( 286 | 2) # (batch_size, 1, 1, seq_len) 287 | attention_scores = mask_logits(attention_scores, mask) 288 | attention_probs = torch.softmax( 289 | attention_scores, 290 | dim=-1) # (batch_size, num_heads, seq_len, seq_len) 291 | attention_probs = self.dropout(attention_probs) 292 | value = torch.matmul( 293 | attention_probs, 294 | value) # (batch_size, num_heads, seq_len, head_size) 295 | value = self.combine_last_two_dim(value.permute( 296 | 0, 2, 1, 3)) # (batch_size, seq_len, dim) 297 | # intermediate layer 298 | output = self.dropout(value) 299 | residual = x + output 300 | output = self.layer_norm2(residual) 301 | output = self.out_layer1(output) 302 | output = self.output_activation(output) 303 | output = self.dropout(output) 304 | output = self.out_layer2(output) + residual 305 | return output 306 | 307 | 308 | class MultiLSTMAttention(nn.Module): 309 | def __init__(self, configs): 310 | super(MultiLSTMAttention, self).__init__() 311 | dim = configs.dim 312 | num_heads = configs.num_heads 313 | drop_rate = configs.drop_rate 314 | num_layers = configs.num_layers 315 | num_step = configs.num_step 316 | bi_direction = configs.bi_direction 317 | 318 | assert dim % num_heads == 0, 'The channels (%d) is not a multiple of attention heads (%d)' % ( 319 | dim, num_heads) 320 | self.head_size, self.num_heads, self.dim = int( 321 | dim / num_heads), num_heads, dim 322 | self.dropout = nn.Dropout(p=drop_rate) 323 | self.query = MultiStepLSTMEncoder(in_dim=dim, 324 | out_dim=dim, 325 | num_layers=num_layers, 326 | num_step=num_step, 327 | bi_direction=bi_direction, 328 | drop_rate=drop_rate) 329 | self.key = MultiStepLSTMEncoder(in_dim=dim, 330 | out_dim=dim, 331 | num_layers=num_layers, 332 | num_step=num_step, 333 | bi_direction=bi_direction, 334 | drop_rate=drop_rate) 335 | self.value = MultiStepLSTMEncoder(in_dim=dim, 336 | out_dim=dim, 337 | num_layers=num_layers, 338 | num_step=num_step, 339 | bi_direction=bi_direction, 340 | drop_rate=drop_rate) 341 | # self.value_visual = None 342 | self.layer_norm1 = nn.LayerNorm(dim, eps=1e-6) 343 | self.layer_norm2 = nn.LayerNorm(dim, eps=1e-6) 344 | self.out_layer1 = Conv1D(in_dim=dim, 345 | out_dim=dim, 346 | kernel_size=1, 347 | stride=1, 348 | padding=0, 349 | bias=True) 350 | 
self.output_activation = nn.GELU() 351 | self.out_layer2 = Conv1D(in_dim=dim, 352 | out_dim=dim, 353 | kernel_size=1, 354 | stride=1, 355 | padding=0, 356 | bias=True) 357 | 358 | def transpose_for_scores(self, x): 359 | new_x_shape = x.size()[:-1] + (self.num_heads, self.head_size) 360 | x = x.view(*new_x_shape) 361 | return x.permute(0, 2, 1, 362 | 3) # (batch_size, num_heads, w_seq_len, head_size) 363 | 364 | @staticmethod 365 | def combine_last_two_dim(x): 366 | old_shape = list(x.size()) 367 | new_shape = old_shape[:-2] + [old_shape[-2] * old_shape[-1]] 368 | return x.reshape(shape=new_shape) 369 | 370 | def forward(self, x, mask=None): 371 | output = self.layer_norm1(x) # (batch_size, seq_len, dim) 372 | # output = self.dropout(output) 373 | # multi-head attention layer 374 | query = self.transpose_for_scores( 375 | self.query(output)) # (batch_size, num_heads, seq_len, head_size) 376 | key = self.transpose_for_scores(self.key(output)) 377 | value = self.transpose_for_scores(self.value(output)) 378 | attention_scores = torch.matmul(query, key.transpose( 379 | -1, -2)) # (batch_size, num_heads, seq_len, seq_len) 380 | attention_scores = attention_scores / math.sqrt(self.head_size) 381 | if mask is not None: # masking 382 | mask = mask.unsqueeze(1).unsqueeze( 383 | 2) # (batch_size, 1, 1, seq_len) 384 | attention_scores = mask_logits(attention_scores, mask) 385 | attention_probs = torch.softmax( 386 | attention_scores, 387 | dim=-1) # (batch_size, num_heads, seq_len, seq_len) 388 | attention_probs = self.dropout(attention_probs) 389 | value = torch.matmul( 390 | attention_probs, 391 | value) # (batch_size, num_heads, seq_len, head_size) 392 | value = self.combine_last_two_dim(value.permute( 393 | 0, 2, 1, 3)) # (batch_size, seq_len, dim) 394 | # intermediate layer 395 | output = self.dropout(value) 396 | residual = x + output 397 | output = self.layer_norm2(residual) 398 | output = self.out_layer1(output) 399 | output = self.output_activation(output) 400 | output = self.dropout(output) 401 | output = self.out_layer2(output) + residual 402 | return output 403 | 404 | 405 | class MultiConvAttention(nn.Module): 406 | def __init__(self, configs): 407 | super(MultiConvAttention, self).__init__() 408 | dim = configs.dim 409 | num_heads = configs.num_heads 410 | drop_rate = configs.drop_rate 411 | kernels = configs.kernels 412 | 413 | assert dim % num_heads == 0, 'The channels (%d) is not a multiple of attention heads (%d)' % ( 414 | dim, num_heads) 415 | self.head_size, self.num_heads, self.dim = int( 416 | dim / num_heads), num_heads, dim 417 | self.dropout = nn.Dropout(p=drop_rate) 418 | self.query = TemporalContextModule(in_dim=dim, 419 | out_dim=dim, 420 | kernels=kernels, 421 | drop_rate=drop_rate) 422 | self.key = TemporalContextModule(in_dim=dim, 423 | out_dim=dim, 424 | kernels=kernels, 425 | drop_rate=drop_rate) 426 | self.value = TemporalContextModule(in_dim=dim, 427 | out_dim=dim, 428 | kernels=kernels, 429 | drop_rate=drop_rate) 430 | # self.value_visual = None 431 | self.layer_norm1 = nn.LayerNorm(dim, eps=1e-6) 432 | self.layer_norm2 = nn.LayerNorm(dim, eps=1e-6) 433 | self.out_layer1 = Conv1D(in_dim=dim, 434 | out_dim=dim, 435 | kernel_size=1, 436 | stride=1, 437 | padding=0, 438 | bias=True) 439 | self.output_activation = nn.GELU() 440 | self.out_layer2 = Conv1D(in_dim=dim, 441 | out_dim=dim, 442 | kernel_size=1, 443 | stride=1, 444 | padding=0, 445 | bias=True) 446 | 447 | def transpose_for_scores(self, x): 448 | new_x_shape = x.size()[:-1] + (self.num_heads, self.head_size) 
449 | x = x.view(*new_x_shape) 450 | return x.permute(0, 2, 1, 451 | 3) # (batch_size, num_heads, w_seq_len, head_size) 452 | 453 | @staticmethod 454 | def combine_last_two_dim(x): 455 | old_shape = list(x.size()) 456 | new_shape = old_shape[:-2] + [old_shape[-2] * old_shape[-1]] 457 | return x.reshape(shape=new_shape) 458 | 459 | def forward(self, x, mask=None): 460 | output = self.layer_norm1(x) # (batch_size, seq_len, dim) 461 | # output = self.dropout(output) 462 | # multi-head attention layer 463 | query = self.transpose_for_scores( 464 | self.query(output)) # (batch_size, num_heads, seq_len, head_size) 465 | key = self.transpose_for_scores(self.key(output)) 466 | value = self.transpose_for_scores(self.value(output)) 467 | attention_scores = torch.matmul(query, key.transpose( 468 | -1, -2)) # (batch_size, num_heads, seq_len, seq_len) 469 | attention_scores = attention_scores / math.sqrt(self.head_size) 470 | if mask is not None: # masking 471 | mask = mask.unsqueeze(1).unsqueeze( 472 | 2) # (batch_size, 1, 1, seq_len) 473 | attention_scores = mask_logits(attention_scores, mask) 474 | attention_probs = torch.softmax( 475 | attention_scores, 476 | dim=-1) # (batch_size, num_heads, seq_len, seq_len) 477 | attention_probs = self.dropout(attention_probs) 478 | value = torch.matmul( 479 | attention_probs, 480 | value) # (batch_size, num_heads, seq_len, head_size) 481 | value = self.combine_last_two_dim(value.permute( 482 | 0, 2, 1, 3)) # (batch_size, seq_len, dim) 483 | # intermediate layer 484 | output = self.dropout(value) 485 | residual = x + output 486 | output = self.layer_norm2(residual) 487 | output = self.out_layer1(output) 488 | output = self.output_activation(output) 489 | output = self.dropout(output) 490 | output = self.out_layer2(output) + residual 491 | return output 492 | 493 | class ConvMultiAttention(nn.Module): 494 | def __init__(self, configs): 495 | super(ConvMultiAttention, self).__init__() 496 | self.attention = MultiHeadAttention(configs) 497 | self.multi_grain = PhraseEncodeNet(configs.dim) 498 | 499 | def forward(self, x, mask=None): 500 | x = self.attention(x,mask) 501 | x = self.multi_grain(x) 502 | return x * mask.unsqueeze(2) 503 | 504 | class ContrastBlock(nn.Module): 505 | def __init__(self, dim, beta): 506 | super(ContrastBlock, self).__init__() 507 | self.conv1 = nn.Conv1d(in_channels=dim, 508 | out_channels=dim//beta, 509 | kernel_size=1, 510 | stride=1, 511 | padding=0, 512 | bias=True) 513 | self.conv2 = nn.Conv1d(in_channels=dim//beta, 514 | out_channels=dim, 515 | kernel_size=1, 516 | stride=1, 517 | padding=0, 518 | bias=True) 519 | self.activation = nn.ReLU() 520 | self.layer_norm1 = nn.LayerNorm(dim, eps=1e-6) 521 | 522 | def forward(self,v_1,v_2): 523 | v_1 = v_1.transpose(1, 2) 524 | v_2 = v_2.transpose(1, 2) 525 | v = v_1 * v_2 526 | v = self.conv1(v) 527 | v = self.activation(v) 528 | v = torch.sigmoid(self.layer_norm1(self.conv2(v).transpose(1, 2))) 529 | v = v * v_1.transpose(1, 2) 530 | return v 531 | 532 | class MLP(nn.Module): 533 | """ Very simple multi-layer perceptron (also called FFN)""" 534 | 535 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 536 | super().__init__() 537 | self.num_layers = num_layers 538 | h = [hidden_dim] * (num_layers - 1) 539 | self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) 540 | 541 | def forward(self, x): 542 | for i, layer in enumerate(self.layers): 543 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 544 | return x 545 | 546 | 
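Every attention variant in attention.py above (the memory/cross block, Cross_Attention, MultiHeadAttention, MultiLSTMAttention, MultiConvAttention) shares one skeleton: pre-LayerNorm, Q/K/V projections through 1x1 Conv1D layers (or an LSTM / temporal-convolution encoder in the LSTM and Conv variants), scaled dot-product attention with an optional key-padding mask applied via mask_logits, attention dropout, a residual connection, and a GELU feed-forward with a second residual. The sketch below is a minimal, self-contained rendering of that shared skeleton for reference only; it substitutes nn.Linear for the repository's Conv1D wrapper and defines its own mask_logits helper, so the names, defaults, and the toy usage at the bottom are illustrative assumptions rather than the repo's exact API.

# Minimal sketch of the shared block structure (illustrative, not part of the repo):
# pre-LayerNorm -> Q/K/V projections -> scaled dot-product attention with optional
# key-padding mask -> residual -> LayerNorm -> position-wise feed-forward -> residual.
import math
import torch
import torch.nn as nn


def mask_logits(inputs, mask, value=-1e30):
    # mask == 1 keeps a position; mask == 0 pushes its score to ~-inf before softmax
    return inputs + (1.0 - mask) * value


class SimpleSelfAttentionBlock(nn.Module):
    def __init__(self, dim=128, num_heads=8, drop_rate=0.1):
        super().__init__()
        assert dim % num_heads == 0, 'dim must be divisible by num_heads'
        self.num_heads, self.head_size = num_heads, dim // num_heads
        # nn.Linear stands in for the repo's Conv1D(kernel_size=1) projection
        self.query, self.key, self.value = (nn.Linear(dim, dim) for _ in range(3))
        self.norm1, self.norm2 = nn.LayerNorm(dim, eps=1e-6), nn.LayerNorm(dim, eps=1e-6)
        self.ffn = nn.Sequential(nn.Linear(dim, dim), nn.GELU(),
                                 nn.Dropout(drop_rate), nn.Linear(dim, dim))
        self.dropout = nn.Dropout(drop_rate)

    def split_heads(self, x):
        b, t, _ = x.shape  # -> (batch, num_heads, seq_len, head_size)
        return x.view(b, t, self.num_heads, self.head_size).permute(0, 2, 1, 3)

    def forward(self, x, mask=None):
        h = self.norm1(x)
        q, k, v = map(self.split_heads, (self.query(h), self.key(h), self.value(h)))
        scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.head_size)
        if mask is not None:  # (batch, seq_len) -> (batch, 1, 1, seq_len)
            scores = mask_logits(scores, mask.unsqueeze(1).unsqueeze(2))
        probs = self.dropout(torch.softmax(scores, dim=-1))
        ctx = torch.matmul(probs, v).permute(0, 2, 1, 3).reshape(x.shape)
        residual = x + self.dropout(ctx)
        return residual + self.ffn(self.norm2(residual))


# Toy usage: a padded batch of 2 sequences, feature dim 128
if __name__ == '__main__':
    block = SimpleSelfAttentionBlock(dim=128, num_heads=8)
    feats = torch.randn(2, 10, 128)
    mask = torch.ones(2, 10)
    mask[1, 6:] = 0  # second sequence has only 6 valid frames
    print(block(feats, mask).shape)  # torch.Size([2, 10, 128])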
-------------------------------------------------------------------------------- /models/PGBP/PGBP.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | from tqdm import tqdm 7 | import copy 8 | from core.config import config 9 | from core.runner_utils import index_to_time2, calculate_iou, calculate_iou_accuracy, cal_statistics 10 | from . import attention 11 | from .encoder import LSTMEncoder, MultiStepLSTMEncoder, TemporalContextModule 12 | from . import fusion 13 | from .layers import Projection, Prediction, PositionalEmbedding, PositionEmbeddingSine 14 | from .operation import Conv1D, mask_logits,cw2se 15 | from .triplet_loss import batch_all_triplet_loss, pairwise_distances 16 | import random 17 | from .slidewindow import find_most_relevant_frame 18 | from .gauss import generate_gaussian_tensor 19 | from einops import repeat,rearrange 20 | from .decoder import TransformerDecoder,TransformerDecoderLayer 21 | from torchvision.ops import sigmoid_focal_loss 22 | from .matcher import HungarianMatcher 23 | # torch.set_printoptions(profile="full", linewidth=1000, precision=2) 24 | 25 | def inverse_sigmoid(x, eps=1e-3): 26 | x = x.clamp(min=0, max=1) 27 | x1 = x.clamp(min=eps) 28 | x2 = (1 - x).clamp(min=eps) 29 | return torch.log(x1/x2) 30 | 31 | 32 | class PGBP(nn.Module): 33 | def __init__(self): 34 | super(PGBP, self).__init__() 35 | configs = config.MODEL.PARAMS 36 | self.use_keyword = configs.use_keyword 37 | self.windowsize = configs.windowsize 38 | self.debug_print = configs.DEBUG 39 | self.top_k = configs.top_k 40 | self.top_k0=configs.top_k0 41 | self.neg = configs.neg 42 | self.pos =configs.pos 43 | self.detr_layers = configs.detr_layers 44 | self.content_prior = configs.content_prior 45 | self.match = HungarianMatcher(configs.cost_class,configs.cost_span,configs.cost_giou) 46 | empty_weight = torch.ones(2) 47 | empty_weight[-1] = configs.eos_coef # lower weight for background (index 1, foreground index 0) 48 | self.register_buffer('empty_weight', empty_weight) 49 | self.video_affine = Projection(in_dim=configs.video_feature_dim, 50 | dim=configs.dim, 51 | drop_rate=configs.drop_rate) 52 | 53 | self.query_affine = Projection(in_dim=configs.query_feature_dim, 54 | dim=configs.dim, 55 | drop_rate=configs.drop_rate) 56 | self.query_position = configs.query_position 57 | self.video_position = configs.video_position 58 | if self.query_position: 59 | self.q_pos_embedding = PositionalEmbedding(configs.dim, 30) 60 | if self.video_position: 61 | self.v_pos_embedding = PositionEmbeddingSine(configs.dim, normalize=True) 62 | if configs.content_prior == "learned": 63 | self.pattern = nn.Embedding(configs.num_queries, configs.dim) 64 | # self.pos_embedding = TransformerPositionalEmbedding(configs.dim, 500,drop_rate=configs.drop_rate) 65 | self.query_embeddings = nn.Embedding(configs.num_queries, 2) 66 | query_attention_layer = getattr(attention, 67 | configs.query_attention)(configs) 68 | video_attention_layer = getattr(attention, 69 | configs.video_attention)(configs) 70 | decoder_layer = TransformerDecoderLayer(configs) 71 | decoder_norm = nn.LayerNorm(configs.dim) 72 | self.detr_decoder = TransformerDecoder(decoder_layer,configs,decoder_norm) 73 | self.query_encoder = nn.Sequential(*[ 74 | copy.deepcopy(query_attention_layer) 75 | for _ in range(configs.query_attention_layers) 76 | ]) 77 | self.video_encoder = nn.Sequential(*[ 78 | 
copy.deepcopy(video_attention_layer) 79 | for _ in range(configs.video_attention_layers) 80 | ]) 81 | early_attention_layer = getattr(attention, 82 | configs.early_attention)(configs) 83 | self.early_encoder = nn.Sequential(*[ 84 | copy.deepcopy(early_attention_layer) 85 | for _ in range(configs.early_attention_layers) 86 | ]) 87 | self.contrastlayer = copy.deepcopy(video_attention_layer) 88 | self.fg_prediction_layer = Prediction(in_dim=configs.dim, 89 | hidden_dim=configs.dim // 2, 90 | out_dim=1, 91 | drop_rate=configs.drop_rate) 92 | self.early_fusion_layer = getattr(fusion, 93 | configs.early_fusion_module)(configs) 94 | 95 | self.fusion_layer = getattr(fusion, configs.fusion_module)(configs) 96 | 97 | post_attention_layer = getattr(attention, 98 | configs.post_attention)(configs) 99 | self.post_attention_layer = nn.Sequential(*[ 100 | copy.deepcopy(post_attention_layer) 101 | for _ in range(configs.post_attention_layers) 102 | ]) 103 | self.video_encoder2 = nn.Sequential(*[ 104 | copy.deepcopy(post_attention_layer) 105 | for _ in range(configs.video_attention_layers) 106 | ]) 107 | self.linear = nn.Linear(in_features=2*configs.dim,out_features=configs.dim,bias= True) 108 | cw_pred = Prediction(in_dim=configs.dim, 109 | hidden_dim=configs.dim // 2, 110 | out_dim=2, 111 | drop_rate=configs.drop_rate) 112 | self.cw_pred = nn.Sequential(*[ 113 | copy.deepcopy(cw_pred) 114 | for _ in range(configs.detr_layers) 115 | ]) 116 | pred_results = Prediction(in_dim=configs.dim, 117 | hidden_dim=configs.dim // 2, 118 | out_dim=2, 119 | drop_rate=configs.drop_rate) 120 | self.pred_results = nn.Sequential(*[ 121 | copy.deepcopy(pred_results) 122 | for _ in range(configs.detr_layers) 123 | ]) 124 | self.intering = Prediction(in_dim=configs.dim, 125 | hidden_dim=configs.dim // 2, 126 | out_dim=1, 127 | drop_rate=configs.drop_rate) 128 | self.pos_fused_layer =attention.ContrastBlock(configs.dim,configs.beta) 129 | self.neg_fused_layer =attention.ContrastBlock(configs.dim,configs.beta) 130 | self.pn_fused_layer =attention.ContrastBlock(configs.dim,configs.beta) 131 | 132 | def forward(self, batch_visual_scale,batch_word_vectors, batch_keyword_mask, batch_txt_mask, 133 | batch_vis_feats, batch_vis_mask): 134 | batch_vis_feats = self.video_affine(batch_vis_feats) 135 | batch_vis_feats = batch_vis_feats * batch_vis_mask.unsqueeze(2) 136 | for i, module in enumerate(self.video_encoder): 137 | if i == 0: 138 | video_features = module(batch_vis_feats, batch_vis_mask) 139 | else: 140 | video_features = module(video_features, batch_vis_mask) 141 | for i, module in enumerate(self.video_encoder2): 142 | if i == 0: 143 | video_features2 = module(batch_vis_feats, batch_vis_mask) 144 | else: 145 | video_features2 = module(video_features2, batch_vis_mask) 146 | 147 | batch_word_vectors = self.query_affine(batch_word_vectors) 148 | if self.query_position: 149 | batch_word_vectors = batch_word_vectors + self.q_pos_embedding( 150 | batch_word_vectors) 151 | batch_word_vectors = batch_word_vectors * batch_txt_mask.unsqueeze(2) 152 | for i, module in enumerate(self.query_encoder): 153 | if i == 0: 154 | query_features = module(batch_word_vectors, batch_txt_mask) 155 | else: 156 | query_features = module(query_features, batch_txt_mask) 157 | if self.use_keyword: 158 | entity_features = batch_word_vectors * batch_keyword_mask.unsqueeze(2) 159 | entity_features = query_features + entity_features 160 | else: 161 | entity_features = query_features 162 | # First stage 163 | entity_video_fused = 
self.early_fusion_layer(video_features, 164 | entity_features, 165 | batch_vis_mask, 166 | batch_txt_mask) 167 | for i, module in enumerate(self.early_encoder): 168 | entity_video_fused = module(entity_video_fused, batch_vis_mask) 169 | fg_prob = self.fg_prediction_layer(entity_video_fused) 170 | 171 | fg_prob1 =torch.sigmoid(fg_prob.squeeze(2)) 172 | 173 | pos_values, pos_indices = torch.topk(fg_prob1.masked_fill(~batch_vis_mask.bool(), float('0.0')), k=self.top_k0, dim=1, largest=True) 174 | neg_values, neg_indices = torch.topk(fg_prob1.masked_fill(~batch_vis_mask.bool(), float('1.0')), k=self.top_k, dim=1, largest=False) 175 | B,l,c = entity_video_fused.shape 176 | if self.top_k0>1: 177 | pos=torch.gather(entity_video_fused, dim=1, index=pos_indices.unsqueeze(-1).expand(-1, -1, c)) 178 | pos=F.max_pool1d(pos.transpose(1,2),kernel_size=self.top_k0).transpose(1,2) 179 | else: 180 | pos = torch.gather(entity_video_fused, 1, pos_indices.view(-1, 1).expand(-1, c).unsqueeze(1)) 181 | neg = torch.gather(entity_video_fused, dim=1, index=neg_indices.unsqueeze(-1).expand(-1, -1, c)) 182 | if not self.training and self.debug_print: 183 | print('fg_prob', torch.sigmoid(fg_prob)) 184 | fg_vis_feature = (video_features2 + 185 | video_features) * torch.sigmoid(fg_prob) 186 | fused_pos_feature = self.pos_fused_layer(fg_vis_feature,pos) 187 | contrast_feature = self.contrastlayer(fg_vis_feature,batch_vis_mask) 188 | if self.pos is True: 189 | contrast_feature = contrast_feature + fused_pos_feature 190 | if self.neg is True: 191 | fused_neg_feature = torch.mean(self.neg_fused_layer(neg,pos),dim= 1).unsqueeze(1) 192 | fused_pn_feature = fg_vis_feature - self.pn_fused_layer(fg_vis_feature,fused_neg_feature) 193 | contrast_feature =contrast_feature + fused_pn_feature 194 | fg_vis_feature = torch.cat((fg_vis_feature,contrast_feature),dim=2) 195 | fg_vis_feature = self.linear(fg_vis_feature) 196 | fused_action_feature = self.fusion_layer(fg_vis_feature, 197 | entity_features, 198 | batch_vis_mask, 199 | batch_txt_mask) 200 | for i, module in enumerate(self.post_attention_layer): 201 | fused_action_feature = module(fused_action_feature, batch_vis_mask) 202 | query_embeddings = self.query_embeddings.weight 203 | refpoint_embed = repeat(query_embeddings, "nq d -> b nq d", b=B).transpose(0,1) 204 | if self.content_prior == "learned": 205 | pattern = self.pattern.weight 206 | tgt = repeat(pattern, "nq d -> b nq d", b=B).transpose(0,1) 207 | else: 208 | tgt = torch.zeros(refpoint_embed.shape[0],B,c).cuda() 209 | pred_start = [] 210 | pred_end = [] 211 | results = [] 212 | memory_local = fused_action_feature.permute(1, 0, 2) 213 | pos_embed_local = self.v_pos_embedding(fused_action_feature,batch_vis_mask).permute(1, 0, 2) 214 | hs, references = self.detr_decoder(pos,batch_visual_scale,tgt, memory_local, memory_key_padding_mask=batch_vis_mask, 215 | pos=pos_embed_local, refpoints_unsigmoid=refpoint_embed) 216 | reference_before_sigmoid = inverse_sigmoid(references) 217 | for i in range(self.detr_layers): 218 | results.append(self.pred_results[i](hs[i,...]).squeeze(2)) 219 | d_cw = self.cw_pred[i](hs[i,...]) 220 | cw = (reference_before_sigmoid[i,...] 
+ d_cw) 221 | se = cw2se(torch.sigmoid(cw)) 222 | pred_start.append(se[...,0]) 223 | pred_end.append(se[...,1]) 224 | pred_inter = self.intering(fused_action_feature).squeeze(2) 225 | 226 | 227 | return pred_start,pred_end,pred_inter, query_features, video_features2, fg_prob.squeeze( 228 | 2), video_features, batch_word_vectors, batch_vis_feats,results,pos_indices,neg_indices,\ 229 | contrast_feature 230 | 231 | def _get_src_permutation_idx(self, indices): 232 | # permute predictions following indices 233 | batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) 234 | src_idx = torch.cat([src for (src, _) in indices]) 235 | return batch_idx, src_idx # two 1D tensors of the same length 236 | 237 | def contrast_loss(self,mask,key_frame,inter_label,contrast_feature,hy_sigma=1,weight = True): 238 | b,l,c = contrast_feature.shape 239 | gauss_weights = generate_gaussian_tensor(inter_label, key_frame, hy_sigma) 240 | contrast_feature = F.normalize(contrast_feature, p=2, dim=2) 241 | key_frame_feature = torch.gather(contrast_feature, 1, key_frame.view(-1, 1).expand(-1, c).unsqueeze(1)) 242 | score = torch.bmm(contrast_feature,key_frame_feature.transpose(1,2)).squeeze(2) 243 | loss = nn.BCEWithLogitsLoss(reduction='none')(score,inter_label) 244 | if weight is True: 245 | loss = loss * gauss_weights 246 | mask = mask.type(torch.float32) 247 | loss = torch.sum(loss * mask, 248 | dim=1) / (torch.sum(mask, dim=1) + 1e-13) 249 | return loss.mean() 250 | 251 | def PNcontrast_loss(self,mask,pos_frame,neg_frame,inter_label,contrast_feature,hy_sigma=1,weight = True): 252 | if self.top_k0>1: 253 | pos_loss = self.contrast_loss(mask,pos_frame[:,0],inter_label,contrast_feature,hy_sigma,False) 254 | else: 255 | pos_loss = self.contrast_loss(mask,pos_frame,inter_label,contrast_feature,hy_sigma,False) 256 | B,l = neg_frame.shape 257 | neg_loss = 0. 
258 | if self.neg is True: 259 | for i in range(l): 260 | neg_loss = neg_loss + self.contrast_loss(mask,neg_frame[:,i],(1.0-inter_label),contrast_feature,hy_sigma,weight) 261 | return pos_loss + neg_loss/l 262 | 263 | def compute_loss(self, pred_start,pred_end, pred_inter, start_labels, 264 | end_labels, inter_label, mask,duration,pred_pro): 265 | bce_loss,iou_loss,L1_loss = 0,0,0 266 | for i in range(len(pred_start)): 267 | pred_times = torch.cat([pred_start[i].unsqueeze(2),pred_end[i].unsqueeze(2)],dim=2) 268 | b,l,_ = pred_times.shape 269 | times = torch.cat([(start_labels/duration).unsqueeze(1),(end_labels/duration).unsqueeze(1)],dim=1) 270 | indices = self.match(pred_pro[i],pred_times,times) 271 | idx = self._get_src_permutation_idx(indices) 272 | src_spans = pred_times[idx] 273 | L1_loss =L1_loss + F.l1_loss(src_spans, times, reduction='none').mean() 274 | iou_loss = iou_loss + (1- self.calculate_giou(src_spans, times)[1]).mean() 275 | target_classes = torch.full(pred_pro[i].shape[:2], 1, 276 | dtype=torch.int64, device=pred_pro[i].device) 277 | target_classes[idx] = 0 278 | bce_loss = bce_loss + self.bce_rescale_loss(pred_pro[i],target_classes) 279 | 280 | inter_loss = self.compute_location_loss(pred_inter, inter_label, mask) 281 | return L1_loss, inter_loss, iou_loss,bce_loss 282 | 283 | def bce_rescale_loss(self,scores, targets): 284 | loss_value = F.cross_entropy(scores.transpose(1, 2),targets,self.empty_weight, reduction="none") 285 | loss_value = loss_value.mean() 286 | return loss_value 287 | 288 | def calculate_giou(self,box1, box2): 289 | iou,union = self.calculate_iou(box1,box2) 290 | box1_left, box1_right = box1[..., 0], box1[..., 1] 291 | box2_left, box2_right = box2[..., 0], box2[..., 1] 292 | right = torch.maximum(box2_right, box1_right) 293 | left = torch.minimum(box2_left, box1_left) 294 | enclosing_area = (right - left).clamp(min=0) 295 | giou = iou - (enclosing_area - union) / enclosing_area 296 | return iou,giou 297 | 298 | def calculate_iou(self,box1, box2): 299 | box1_left, box1_right = box1[..., 0], box1[..., 1] 300 | box2_left, box2_right = box2[..., 0], box2[..., 1] 301 | areas1 = box1_right-box1_left 302 | areas2 = box2_right-box2_left 303 | inter_left = torch.maximum(box1_left, box2_left) 304 | inter_right = torch.minimum(box1_right, box2_right) 305 | inter = (inter_right - inter_left).clamp(min=0) 306 | union = areas1 + areas2 - inter 307 | iou = inter/ union 308 | return iou,union 309 | 310 | def compute_boundary_loss(self, pred, targets): 311 | return F.cross_entropy(pred, targets.long()) 312 | 313 | def compute_location_loss(self, pred, targets, mask): 314 | weights_per_location = torch.where(targets == 0.0, targets + 1.0, 315 | 1.0 * targets) 316 | loss_per_location = nn.BCEWithLogitsLoss(reduction='none')(pred, 317 | targets) 318 | loss_per_location = loss_per_location * weights_per_location 319 | mask = mask.type(torch.float32) 320 | loss = torch.sum(loss_per_location * mask, 321 | dim=1) / (torch.sum(mask, dim=1) + 1e-13) 322 | return loss.mean() 323 | 324 | 325 | def compute_sim_loss(self, pred, pos, neg, saliency_margin = 0.2): 326 | b, l = pred.shape 327 | _, num_indices = pos.shape 328 | pos_indices = pos + (torch.arange(0, b).reshape(-1, 1) * l).cuda() 329 | neg_indices = neg + (torch.arange(0, b).reshape(-1, 1) * l).cuda() 330 | pred_score = torch.sigmoid(pred) 331 | pos_scores = pred_score.view(-1)[pos_indices.view(-1)].view(b, num_indices) 332 | neg_scores = pred_score.view(-1)[neg_indices.view(-1)].view(b, num_indices) 333 | loss_sim = 
torch.clamp(saliency_margin + neg_scores - pos_scores, min=0).sum() \ 334 | / (b * num_indices) * 2 # * 2 to keep the loss the same scale 335 | return loss_sim 336 | 337 | 338 | def early_pred_loss(self, video_features, pred, targets, mask): 339 | return self.compute_location_loss(pred, targets, mask) 340 | 341 | def aligment_score(self, 342 | query_features, 343 | video_features, 344 | query_mask, 345 | video_mask, 346 | inner_label, 347 | GT_inner=True): 348 | B, T, channels = video_features.shape 349 | 350 | query_features = query_features.sum(1) / query_mask.sum(1).unsqueeze(1) 351 | query_features = F.normalize(query_features, p=2, dim=1) # B, channels 352 | 353 | if GT_inner: 354 | frame_weights = inner_label / video_mask.sum(1, keepdim=True) 355 | else: 356 | norm_video = F.normalize(video_features, p=2, dim=-1) 357 | frame_weights = torch.bmm(query_features.unsqueeze(1), 358 | norm_video.transpose(1, 2)) # B,1,T 359 | frame_weights = mask_logits(frame_weights.squeeze(1), 360 | video_mask) # B,T 361 | frame_weights = torch.softmax(frame_weights, dim=-1) 362 | 363 | video_features = video_features * frame_weights.unsqueeze(2) 364 | video_features = video_features.sum(1) 365 | video_features = F.normalize(video_features, p=2, dim=1) 366 | video_sim = torch.matmul(video_features, video_features.T) 367 | video_sim = torch.softmax(video_sim, dim=-1) 368 | query_sim = torch.matmul(query_features, query_features.T) 369 | query_sim = torch.softmax(query_sim, dim=-1) 370 | kl_loss = (F.kl_div(query_sim.log(), video_sim, reduction='sum') + 371 | F.kl_div(video_sim.log(), query_sim, reduction='sum')) / 2 372 | 373 | return kl_loss 374 | 375 | @staticmethod 376 | def extract_index(start_logits, end_logits): 377 | start_prob = nn.Softmax(dim=1)(start_logits) 378 | end_prob = nn.Softmax(dim=1)(end_logits) 379 | outer = torch.matmul(start_prob.unsqueeze(dim=2), 380 | end_prob.unsqueeze(dim=1)) 381 | outer = torch.triu(outer, diagonal=0) 382 | _, start_index = torch.max(torch.max(outer, dim=2)[0], 383 | dim=1) # (batch_size, ) 384 | _, end_index = torch.max(torch.max(outer, dim=1)[0], 385 | dim=1) # (batch_size, ) 386 | return start_index, end_index 387 | 388 | @staticmethod 389 | def eval_test(model, 390 | data_loader, 391 | device, 392 | mode='test', 393 | epoch=None, 394 | shuffle_video_frame=False): 395 | ious = [] 396 | pos_labels = [] 397 | pseudo=[] 398 | preds, durations,names,times = [], [],[],[] 399 | with torch.no_grad(): 400 | for idx, batch_data in tqdm(enumerate(data_loader), 401 | total=len(data_loader), 402 | desc='evaluate {}'.format(mode)): 403 | data, annos = batch_data 404 | batch_word_vectors = data['batch_word_vectors'].to(device) 405 | batch_keyword_mask = data['batch_keyword_mask'].to(device) 406 | batch_txt_mask = data['batch_txt_mask'].squeeze(2).to(device) 407 | batch_vis_feats = data['batch_vis_feats'].to(device) 408 | batch_vis_mask = data['batch_vis_mask'].squeeze(2).to(device) 409 | batch_extend_pre = data['batch_extend_pre'].to(device) 410 | batch_extend_suf = data['batch_extend_suf'].to(device) 411 | batch_visual_scale = data["visual_scale"].unsqueeze(-1).to(device) 412 | if shuffle_video_frame: 413 | B = batch_vis_feats.shape[0] 414 | for i in range(B): 415 | T = batch_vis_mask[i].sum().int().item() 416 | pre = batch_extend_pre[i].item() 417 | new_T = torch.randperm(T) 418 | batch_vis_feats[i, torch.arange(T) + 419 | pre] = batch_vis_feats[i, new_T + pre] 420 | # compute predicted results 421 | with torch.cuda.amp.autocast(): 422 | output = 
model(batch_visual_scale,batch_word_vectors, batch_keyword_mask, 423 | batch_txt_mask, batch_vis_feats, 424 | batch_vis_mask) 425 | pseudo_pros=output[5] 426 | probalities_class = torch.softmax(output[9][-1],dim = -1) 427 | probalities = probalities_class[...,0] 428 | pred_p = torch.argmax(probalities,dim = 1) 429 | start_logits, end_logits = output[0][-1], output[1][-1] 430 | start_logits = torch.gather(start_logits, 1, pred_p.view(-1, 1)).clamp(0,1).squeeze(1) 431 | start_logits[torch.isnan(start_logits)] = 0. 432 | end_logits = torch.gather(end_logits, 1, pred_p.view(-1, 1)).clamp(0,1).squeeze(1) 433 | end_logits[torch.isnan(end_logits)] = 1. 434 | pos_frames = output[-3] 435 | 436 | start_indices = start_logits.cpu().numpy() 437 | end_indices = end_logits.cpu().numpy() 438 | batch_vis_mask = batch_vis_mask.cpu().numpy() 439 | batch_extend_pre = batch_extend_pre.cpu().numpy() 440 | batch_extend_suf = batch_extend_suf.cpu().numpy() 441 | pos_frames = pos_frames.cpu().numpy() 442 | 443 | 444 | for vis_mask, start_index, end_index, extend_pre, extend_suf, anno,pos_frame,pseudo_pro in zip( 445 | batch_vis_mask, start_indices, end_indices, 446 | batch_extend_pre, batch_extend_suf, annos,pos_frames,pseudo_pros): 447 | 448 | start_time, end_time = index_to_time2( 449 | start_index, end_index, vis_mask.sum(), extend_pre, 450 | extend_suf, anno["duration"]) 451 | 452 | iou = calculate_iou(i0=[start_time, end_time], 453 | i1=anno['times']) 454 | ious.append(iou) 455 | preds.append((start_time, end_time)) 456 | durations.append(anno["duration"]) 457 | times.append(anno["times"]) 458 | names.append(anno["video"]) 459 | pseudo.append(pseudo_pro) 460 | import pandas as pd 461 | df = pd.DataFrame({ 462 | 'Column1': names, 463 | 'Column2': times, 464 | 'Column3': preds, 465 | "Column4":pseudo 466 | }) 467 | df.to_excel('output.xlsx', index=False, engine='openpyxl') 468 | 469 | statistics_str = cal_statistics(preds, durations) 470 | r1i1 = calculate_iou_accuracy(ious, threshold=0.1) 471 | r1i2 = calculate_iou_accuracy(ious, threshold=0.2) 472 | r1i3 = calculate_iou_accuracy(ious, threshold=0.3) 473 | r1i4 = calculate_iou_accuracy(ious, threshold=0.4) 474 | r1i5 = calculate_iou_accuracy(ious, threshold=0.5) 475 | r1i6 = calculate_iou_accuracy(ious, threshold=0.6) 476 | r1i7 = calculate_iou_accuracy(ious, threshold=0.7) 477 | r1i8 = calculate_iou_accuracy(ious, threshold=0.8) 478 | r1i9 = calculate_iou_accuracy(ious, threshold=0.9) 479 | 480 | mi = np.mean(ious) * 100.0 481 | # write the scores 482 | score_str = "Epoch {}\n".format(epoch) 483 | score_str += "Rank@1, IoU=0.3: {:.2f}\t".format(r1i3) 484 | score_str += "Rank@1, IoU=0.5: {:.2f}\t".format(r1i5) 485 | score_str += "Rank@1, IoU=0.7: {:.2f}\t".format(r1i7) 486 | score_str += "mean IoU: {:.2f}\n".format(mi) 487 | return r1i3, r1i5, r1i7, mi, score_str, statistics_str 488 | --------------------------------------------------------------------------------
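For reference, the inference path at the end of PGBP.forward and in eval_test works as follows: each decoder layer predicts a (center, width) offset per query, which is added to the inverse-sigmoided reference point, squashed with a sigmoid, and converted to a normalized (start, end) pair by cw2se; at evaluation time the last layer's class logits are softmaxed, the query with the highest foreground probability (class index 0) is selected, and its normalized span is mapped back to seconds. The sketch below illustrates those two steps under stated assumptions: cw_to_se is a plausible stand-in for the repo's cw2se in operation.py (the exact clamping may differ), and the duration rescaling ignores the padded-frame bookkeeping that index_to_time2 handles.

# Illustrative sketch of the post-decoder step (assumed implementations, not the repo's)
import torch


def cw_to_se(cw):
    # cw: (..., 2) normalized (center, width) -> (..., 2) normalized (start, end) in [0, 1]
    center, width = cw[..., 0], cw[..., 1]
    start = (center - 0.5 * width).clamp(min=0.0)
    end = (center + 0.5 * width).clamp(max=1.0)
    return torch.stack([start, end], dim=-1)


def pick_best_span(class_logits, spans, duration):
    # class_logits: (num_queries, 2) with index 0 = foreground; spans: (num_queries, 2) normalized
    fg_prob = torch.softmax(class_logits, dim=-1)[:, 0]
    best = torch.argmax(fg_prob)
    return spans[best] * duration  # (start_sec, end_sec)


# Toy usage with 3 queries and a 30 s clip
cw = torch.tensor([[0.30, 0.20], [0.55, 0.50], [0.90, 0.40]])
spans = cw_to_se(cw)
logits = torch.tensor([[0.2, 0.1], [2.0, -1.0], [0.0, 0.5]])
print(pick_best_span(logits, spans, duration=30.0))  # ~ tensor([ 9., 24.])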