├── core
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── config.cpython-37.pyc
│   │   ├── config.cpython-38.pyc
│   │   ├── config.cpython-39.pyc
│   │   ├── meters.cpython-39.pyc
│   │   ├── optim.cpython-310.pyc
│   │   ├── optim.cpython-311.pyc
│   │   ├── optim.cpython-37.pyc
│   │   ├── optim.cpython-38.pyc
│   │   ├── optim.cpython-39.pyc
│   │   ├── __init__.cpython-37.pyc
│   │   ├── __init__.cpython-38.pyc
│   │   ├── __init__.cpython-39.pyc
│   │   ├── config.cpython-310.pyc
│   │   ├── config.cpython-311.pyc
│   │   ├── __init__.cpython-310.pyc
│   │   ├── __init__.cpython-311.pyc
│   │   ├── data_util.cpython-310.pyc
│   │   ├── data_util.cpython-311.pyc
│   │   ├── data_util.cpython-37.pyc
│   │   ├── data_util.cpython-38.pyc
│   │   ├── data_util.cpython-39.pyc
│   │   ├── runner_utils.cpython-310.pyc
│   │   ├── runner_utils.cpython-311.pyc
│   │   ├── runner_utils.cpython-37.pyc
│   │   ├── runner_utils.cpython-38.pyc
│   │   └── runner_utils.cpython-39.pyc
│   ├── optim.py
│   ├── meters.py
│   ├── config.py
│   ├── data_util.py
│   └── runner_utils.py
├── models
│   ├── __init__.py
│   ├── PGBP
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── PGBP.cpython-38.pyc
│   │   │   ├── EAMAT.cpython-310.pyc
│   │   │   ├── EAMAT.cpython-311.pyc
│   │   │   ├── EAMAT.cpython-37.pyc
│   │   │   ├── EAMAT.cpython-38.pyc
│   │   │   ├── EAMAT.cpython-39.pyc
│   │   │   ├── EAMAT1.cpython-38.pyc
│   │   │   ├── EAMAT2.cpython-38.pyc
│   │   │   ├── EAMAT3.cpython-38.pyc
│   │   │   ├── EAMAT4.cpython-38.pyc
│   │   │   ├── EAMAT5.cpython-38.pyc
│   │   │   ├── EAMAT6.cpython-38.pyc
│   │   │   ├── EAMAT7.cpython-38.pyc
│   │   │   ├── EAMAT8.cpython-38.pyc
│   │   │   ├── EAMAT9.cpython-38.pyc
│   │   │   ├── fusion.cpython-37.pyc
│   │   │   ├── fusion.cpython-38.pyc
│   │   │   ├── fusion.cpython-39.pyc
│   │   │   ├── gauss.cpython-38.pyc
│   │   │   ├── layers.cpython-37.pyc
│   │   │   ├── layers.cpython-38.pyc
│   │   │   ├── layers.cpython-39.pyc
│   │   │   ├── EAMAT10.cpython-38.pyc
│   │   │   ├── __init__.cpython-310.pyc
│   │   │   ├── __init__.cpython-311.pyc
│   │   │   ├── __init__.cpython-37.pyc
│   │   │   ├── __init__.cpython-38.pyc
│   │   │   ├── __init__.cpython-39.pyc
│   │   │   ├── attention.cpython-37.pyc
│   │   │   ├── attention.cpython-38.pyc
│   │   │   ├── attention.cpython-39.pyc
│   │   │   ├── decoder.cpython-38.pyc
│   │   │   ├── decoder1.cpython-38.pyc
│   │   │   ├── decoder2.cpython-38.pyc
│   │   │   ├── decoder3.cpython-38.pyc
│   │   │   ├── decoder4.cpython-38.pyc
│   │   │   ├── decoder5.cpython-38.pyc
│   │   │   ├── decoder6.cpython-38.pyc
│   │   │   ├── decoder7.cpython-38.pyc
│   │   │   ├── decoder8.cpython-38.pyc
│   │   │   ├── encoder.cpython-310.pyc
│   │   │   ├── encoder.cpython-311.pyc
│   │   │   ├── encoder.cpython-37.pyc
│   │   │   ├── encoder.cpython-38.pyc
│   │   │   ├── encoder.cpython-39.pyc
│   │   │   ├── fusion.cpython-310.pyc
│   │   │   ├── fusion.cpython-311.pyc
│   │   │   ├── layers.cpython-310.pyc
│   │   │   ├── layers.cpython-311.pyc
│   │   │   ├── matcher.cpython-38.pyc
│   │   │   ├── operation.cpython-37.pyc
│   │   │   ├── operation.cpython-38.pyc
│   │   │   ├── operation.cpython-39.pyc
│   │   │   ├── EAMAT_text1.cpython-38.pyc
│   │   │   ├── EAMAT_text2.cpython-38.pyc
│   │   │   ├── EAMAT_text3.cpython-38.pyc
│   │   │   ├── EAMAT_text4.cpython-38.pyc
│   │   │   ├── EAMATtext1.cpython-38.pyc
│   │   │   ├── EAMATtext2.cpython-38.pyc
│   │   │   ├── EAMATtext3.cpython-38.pyc
│   │   │   ├── EAMATtext4.cpython-38.pyc
│   │   │   ├── attention.cpython-310.pyc
│   │   │   ├── attention.cpython-311.pyc
│   │   │   ├── operation.cpython-310.pyc
│   │   │   ├── operation.cpython-311.pyc
│   │   │   ├── slidewindow.cpython-38.pyc
│   │   │   ├── span_utils.cpython-38.pyc
│   │   │   ├── EAMATseprate.cpython-38.pyc
│   │   │   ├── phraseEncoder.cpython-38.pyc
│   │   │   ├── triplet_loss.cpython-310.pyc
│   │   │   ├── triplet_loss.cpython-311.pyc
│   │   │   ├── triplet_loss.cpython-37.pyc
│   │   │   ├── triplet_loss.cpython-38.pyc
│   │   │   └── triplet_loss.cpython-39.pyc
│   │   ├── phraseEncoder.py
│   │   ├── gauss.py
│   │   ├── operation.py
│   │   ├── slidewindow.py
│   │   ├── span_utils.py
│   │   ├── matcher.py
│   │   ├── triplet_loss.py
│   │   ├── layers.py
│   │   ├── encoder.py
│   │   ├── fusion.py
│   │   ├── decoder.py
│   │   ├── attention.py
│   │   └── PGBP.py
│   └── __pycache__
│       ├── __init__.cpython-310.pyc
│       ├── __init__.cpython-311.pyc
│       ├── __init__.cpython-37.pyc
│       ├── __init__.cpython-38.pyc
│       └── __init__.cpython-39.pyc
├── datasets
│   ├── __pycache__
│   │   ├── tacos.cpython-310.pyc
│   │   ├── tacos.cpython-311.pyc
│   │   ├── tacos.cpython-37.pyc
│   │   ├── tacos.cpython-38.pyc
│   │   ├── tacos.cpython-39.pyc
│   │   ├── __init__.cpython-310.pyc
│   │   ├── __init__.cpython-311.pyc
│   │   ├── __init__.cpython-37.pyc
│   │   ├── __init__.cpython-38.pyc
│   │   ├── __init__.cpython-39.pyc
│   │   ├── charades.cpython-310.pyc
│   │   ├── charades.cpython-311.pyc
│   │   ├── charades.cpython-37.pyc
│   │   ├── charades.cpython-38.pyc
│   │   ├── charades.cpython-39.pyc
│   │   ├── BaseDataset.cpython-310.pyc
│   │   ├── BaseDataset.cpython-311.pyc
│   │   ├── BaseDataset.cpython-37.pyc
│   │   ├── BaseDataset.cpython-38.pyc
│   │   ├── BaseDataset.cpython-39.pyc
│   │   ├── activitynet.cpython-310.pyc
│   │   ├── activitynet.cpython-311.pyc
│   │   ├── activitynet.cpython-37.pyc
│   │   ├── activitynet.cpython-38.pyc
│   │   ├── activitynet.cpython-39.pyc
│   │   ├── charades_len.cpython-38.pyc
│   │   └── charades_mom.cpython-38.pyc
│   ├── charades_len.py
│   ├── charades_mom.py
│   ├── activitynet.py
│   ├── charades.py
│   ├── tacos.py
│   ├── __init__.py
│   └── BaseDataset.py
├── conda.sh
├── requirements.txt
├── experiments
│   ├── charades
│   │   └── PGBP.yaml
│   ├── tacos
│   │   └── PGBP.yaml
│   ├── activitynet
│   │   └── PGBP.yaml
│   ├── charades_len
│   │   └── PGBP.yaml
│   └── charades_mom
│       └── PGBP.yaml
└── README.md
/core/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .PGBP import PGBP
2 |
--------------------------------------------------------------------------------
/models/PGBP/__init__.py:
--------------------------------------------------------------------------------
1 | from .PGBP import PGBP
2 |
--------------------------------------------------------------------------------
/core/__pycache__/config.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/core/__pycache__/config.cpython-37.pyc
--------------------------------------------------------------------------------
/core/__pycache__/config.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/core/__pycache__/config.cpython-38.pyc
--------------------------------------------------------------------------------
/core/__pycache__/config.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/core/__pycache__/config.cpython-39.pyc
--------------------------------------------------------------------------------
/core/__pycache__/meters.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/core/__pycache__/meters.cpython-39.pyc
--------------------------------------------------------------------------------
/core/__pycache__/optim.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/core/__pycache__/optim.cpython-310.pyc
--------------------------------------------------------------------------------
/core/__pycache__/optim.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/core/__pycache__/optim.cpython-311.pyc
--------------------------------------------------------------------------------
/core/__pycache__/optim.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/core/__pycache__/optim.cpython-37.pyc
--------------------------------------------------------------------------------
/core/__pycache__/optim.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/core/__pycache__/optim.cpython-38.pyc
--------------------------------------------------------------------------------
/core/__pycache__/optim.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/core/__pycache__/optim.cpython-39.pyc
--------------------------------------------------------------------------------
/core/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/core/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/core/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/core/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/core/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/core/__pycache__/__init__.cpython-39.pyc
--------------------------------------------------------------------------------
/core/__pycache__/config.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/core/__pycache__/config.cpython-310.pyc
--------------------------------------------------------------------------------
/core/__pycache__/config.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/core/__pycache__/config.cpython-311.pyc
--------------------------------------------------------------------------------
/core/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/core/__pycache__/__init__.cpython-310.pyc
--------------------------------------------------------------------------------
/core/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/core/__pycache__/__init__.cpython-311.pyc
--------------------------------------------------------------------------------
/core/__pycache__/data_util.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/core/__pycache__/data_util.cpython-310.pyc
--------------------------------------------------------------------------------
/core/__pycache__/data_util.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/core/__pycache__/data_util.cpython-311.pyc
--------------------------------------------------------------------------------
/core/__pycache__/data_util.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/core/__pycache__/data_util.cpython-37.pyc
--------------------------------------------------------------------------------
/core/__pycache__/data_util.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/core/__pycache__/data_util.cpython-38.pyc
--------------------------------------------------------------------------------
/core/__pycache__/data_util.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/core/__pycache__/data_util.cpython-39.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/tacos.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/tacos.cpython-310.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/tacos.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/tacos.cpython-311.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/tacos.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/tacos.cpython-37.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/tacos.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/tacos.cpython-38.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/tacos.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/tacos.cpython-39.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/PGBP.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/PGBP.cpython-38.pyc
--------------------------------------------------------------------------------
/models/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/__pycache__/__init__.cpython-310.pyc
--------------------------------------------------------------------------------
/models/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/__pycache__/__init__.cpython-311.pyc
--------------------------------------------------------------------------------
/models/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/models/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/models/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/__pycache__/__init__.cpython-39.pyc
--------------------------------------------------------------------------------
/core/__pycache__/runner_utils.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/core/__pycache__/runner_utils.cpython-310.pyc
--------------------------------------------------------------------------------
/core/__pycache__/runner_utils.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/core/__pycache__/runner_utils.cpython-311.pyc
--------------------------------------------------------------------------------
/core/__pycache__/runner_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/core/__pycache__/runner_utils.cpython-37.pyc
--------------------------------------------------------------------------------
/core/__pycache__/runner_utils.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/core/__pycache__/runner_utils.cpython-38.pyc
--------------------------------------------------------------------------------
/core/__pycache__/runner_utils.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/core/__pycache__/runner_utils.cpython-39.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/__init__.cpython-310.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/__init__.cpython-311.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/__init__.cpython-39.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/charades.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/charades.cpython-310.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/charades.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/charades.cpython-311.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/charades.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/charades.cpython-37.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/charades.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/charades.cpython-38.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/charades.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/charades.cpython-39.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/EAMAT.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/EAMAT.cpython-310.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/EAMAT.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/EAMAT.cpython-311.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/EAMAT.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/EAMAT.cpython-37.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/EAMAT.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/EAMAT.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/EAMAT.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/EAMAT.cpython-39.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/EAMAT1.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/EAMAT1.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/EAMAT2.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/EAMAT2.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/EAMAT3.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/EAMAT3.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/EAMAT4.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/EAMAT4.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/EAMAT5.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/EAMAT5.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/EAMAT6.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/EAMAT6.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/EAMAT7.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/EAMAT7.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/EAMAT8.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/EAMAT8.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/EAMAT9.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/EAMAT9.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/fusion.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/fusion.cpython-37.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/fusion.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/fusion.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/fusion.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/fusion.cpython-39.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/gauss.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/gauss.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/layers.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/layers.cpython-37.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/layers.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/layers.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/layers.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/layers.cpython-39.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/BaseDataset.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/BaseDataset.cpython-310.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/BaseDataset.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/BaseDataset.cpython-311.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/BaseDataset.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/BaseDataset.cpython-37.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/BaseDataset.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/BaseDataset.cpython-38.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/BaseDataset.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/BaseDataset.cpython-39.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/activitynet.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/activitynet.cpython-310.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/activitynet.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/activitynet.cpython-311.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/activitynet.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/activitynet.cpython-37.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/activitynet.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/activitynet.cpython-38.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/activitynet.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/activitynet.cpython-39.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/charades_len.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/charades_len.cpython-38.pyc
--------------------------------------------------------------------------------
/datasets/__pycache__/charades_mom.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/datasets/__pycache__/charades_mom.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/EAMAT10.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/EAMAT10.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/__init__.cpython-310.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/__init__.cpython-311.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/__init__.cpython-39.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/attention.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/attention.cpython-37.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/attention.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/attention.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/attention.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/attention.cpython-39.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/decoder.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/decoder.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/decoder1.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/decoder1.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/decoder2.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/decoder2.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/decoder3.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/decoder3.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/decoder4.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/decoder4.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/decoder5.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/decoder5.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/decoder6.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/decoder6.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/decoder7.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/decoder7.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/decoder8.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/decoder8.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/encoder.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/encoder.cpython-310.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/encoder.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/encoder.cpython-311.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/encoder.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/encoder.cpython-37.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/encoder.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/encoder.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/encoder.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/encoder.cpython-39.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/fusion.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/fusion.cpython-310.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/fusion.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/fusion.cpython-311.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/layers.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/layers.cpython-310.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/layers.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/layers.cpython-311.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/matcher.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/matcher.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/operation.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/operation.cpython-37.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/operation.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/operation.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/operation.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/operation.cpython-39.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/EAMAT_text1.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/EAMAT_text1.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/EAMAT_text2.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/EAMAT_text2.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/EAMAT_text3.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/EAMAT_text3.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/EAMAT_text4.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/EAMAT_text4.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/EAMATtext1.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/EAMATtext1.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/EAMATtext2.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/EAMATtext2.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/EAMATtext3.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/EAMATtext3.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/EAMATtext4.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/EAMATtext4.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/attention.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/attention.cpython-310.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/attention.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/attention.cpython-311.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/operation.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/operation.cpython-310.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/operation.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/operation.cpython-311.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/slidewindow.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/slidewindow.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/span_utils.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/span_utils.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/EAMATseprate.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/EAMATseprate.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/phraseEncoder.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/phraseEncoder.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/triplet_loss.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/triplet_loss.cpython-310.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/triplet_loss.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/triplet_loss.cpython-311.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/triplet_loss.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/triplet_loss.cpython-37.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/triplet_loss.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/triplet_loss.cpython-38.pyc
--------------------------------------------------------------------------------
/models/PGBP/__pycache__/triplet_loss.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TJUMMG/PGBP/HEAD/models/PGBP/__pycache__/triplet_loss.cpython-39.pyc
--------------------------------------------------------------------------------
/conda.sh:
--------------------------------------------------------------------------------
1 | conda create --name PGBP python=3
2 | conda activate PGBP
3 | conda install pytorch torchvision cudatoolkit -c pytorch
4 | pip install easydict torchtext h5py nltk prettytable black transformers tensorboard
5 |
--------------------------------------------------------------------------------
/models/PGBP/phraseEncoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class PhraseEncodeNet(nn.Module):
5 |
6 | def __init__(self, dim):
7 | super(PhraseEncodeNet, self).__init__()
8 | self.unigram_conv = nn.Conv1d(dim, dim, 1, stride=1, padding=0)
9 | self.bigram_conv = nn.Conv1d(dim, dim, 2, stride=1, padding=1, dilation=2)
10 | self.trigram_conv = nn.Conv1d(dim, dim, 3, stride=1, padding=2, dilation=2)
11 | self.txt_linear = nn.Linear(dim * 3, dim)
12 |         # padding and dilation are set so that the sequence length L is preserved
13 | def forward(self, x):
14 | bs, _, dimc = x.size()
15 | words = x.transpose(-1, -2) # B, C, L
16 | unigrams = self.unigram_conv(words)
17 | bigrams = self.bigram_conv(words) # B, C, L
18 | trigrams = self.trigram_conv(words)
19 | phrase = torch.cat((unigrams, bigrams, trigrams), dim=1)
20 | phrase = phrase.transpose(-1, -2).view(bs, -1, dimc * 3)
21 | phrase = self.txt_linear(phrase)
22 | return phrase
23 |
24 |
--------------------------------------------------------------------------------
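
A minimal usage sketch for PhraseEncodeNet (shapes and values below are illustrative, not taken from the repository, and it assumes the repo root is on PYTHONPATH); it checks that the uni/bi/tri-gram convolutions preserve the word-sequence length:

    import torch
    from models.PGBP.phraseEncoder import PhraseEncodeNet

    net = PhraseEncodeNet(dim=256)
    words = torch.randn(2, 10, 256)   # (B, L, C): batch of 2, 10 words, 256-d features
    phrase = net(words)
    print(phrase.shape)               # torch.Size([2, 10, 256]) -- L is unchanged
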
/models/PGBP/gauss.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | def generate_gaussian_tensor(inter_label, key_frames, hp_sigma):
4 | """
5 | Generate a tensor with each batch as a Gaussian sequence.
6 |
7 | :param B: Batch size.
8 | :param L: Length of each sequence.
9 | :param key_frames: Tensor of shape (B,) containing key frames.
10 | :param variances: Tensor of shape (B,) containing variances.
11 | :return: Tensor with shape (B, L) containing Gaussian sequences.
12 | """
13 | # Generate a range of values from 0 to L-1
14 | B,L = inter_label.shape
15 | variances = hp_sigma * torch.sum(inter_label,dim =1)
16 | x_values = torch.arange(0, L, 1).float().cuda()
17 |
18 | # Repeat key_frames and variances for each batch
19 | key_frames = key_frames.view(-1, 1).repeat(1, L)
20 | variances = variances.view(-1, 1).repeat(1, L)
21 |
22 |     # Evaluate an (unnormalized) Gaussian centred at each key frame
23 | gaussian_values = torch.exp(-(x_values - key_frames)**2 / (2 * variances**2))
24 | return gaussian_values
25 |
26 |
27 |
--------------------------------------------------------------------------------
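
A small sketch of how generate_gaussian_tensor might be called (tensor values are made up; the inputs are placed on the GPU because the function moves x_values to CUDA internally):

    import torch

    inter_label = torch.zeros(2, 8).cuda()        # (B, L) binary interval masks
    inter_label[0, 2:5] = 1.0
    inter_label[1, 1:7] = 1.0
    key_frames = torch.tensor([3.0, 4.0]).cuda()  # one key-frame position per sample

    weights = generate_gaussian_tensor(inter_label, key_frames, hp_sigma=0.3)
    print(weights.shape)  # torch.Size([2, 8]); each row peaks (value 1.0) at its key frame
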
/core/optim.py:
--------------------------------------------------------------------------------
1 | from transformers import AdamW, get_linear_schedule_with_warmup
2 |
3 | from .config import config
4 |
5 |
6 | def build_optimizer_and_scheduler(model, lr, num_train_steps,
7 | warmup_proportion):
8 | no_decay = ['bias', 'layer_norm',
9 | 'LayerNorm'] # no decay for parameters of layer norm and bias
10 | optimizer_grouped_parameters = [{
11 | 'params': [
12 | p for n, p in model.named_parameters()
13 | if not any(nd in n for nd in no_decay)
14 | ],
15 | 'weight_decay':
16 | config.TRAIN.WEIGHT_DECAY
17 | }, {
18 | 'params': [
19 | p for n, p in model.named_parameters()
20 | if any(nd in n for nd in no_decay)
21 | ],
22 | 'weight_decay':
23 | 0.0
24 | }]
25 | optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
26 | scheduler = get_linear_schedule_with_warmup(
27 | optimizer, num_train_steps * warmup_proportion, num_train_steps)
28 | return optimizer, scheduler
--------------------------------------------------------------------------------
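
A hedged sketch of wiring this helper into a training loop; the toy model and hyper-parameter values are placeholders, and build_optimizer_and_scheduler also reads config.TRAIN.WEIGHT_DECAY from core.config:

    import torch.nn as nn
    from core.optim import build_optimizer_and_scheduler

    model = nn.Linear(128, 2)      # stand-in for the real PGBP model
    num_train_steps = 1000         # e.g. len(train_loader) * num_epochs
    optimizer, scheduler = build_optimizer_and_scheduler(
        model, lr=1e-4, num_train_steps=num_train_steps, warmup_proportion=0.1)

    # per iteration: loss.backward(); optimizer.step(); scheduler.step(); optimizer.zero_grad()
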
/requirements.txt:
--------------------------------------------------------------------------------
1 | absl-py==2.0.0
2 | black==23.10.1
3 | certifi==2023.7.22
4 | charset-normalizer==3.3.2
5 | click==8.1.7
6 | cmake==3.25.0
7 | contourpy==1.1.1
8 | cycler==0.12.1
9 | easydict==1.11
10 | einops==0.7.0
11 | filelock==3.13.1
12 | fonttools==4.44.3
13 | fsspec==2023.10.0
14 | grpcio==1.59.2
15 | h5py==3.10.0
16 | huggingface-hub==0.17.3
17 | idna==3.4
18 | importlib-metadata==6.8.0
19 | importlib-resources==6.1.1
20 | Jinja2==3.1.2
21 | joblib==1.3.2
22 | jstyleson==0.0.2
23 | kiwisolver==1.4.5
24 | lit==15.0.7
25 | Markdown==3.5.1
26 | MarkupSafe==2.1.3
27 | mpmath==1.3.0
28 | mypy-extensions==1.0.0
29 | networkx==3.0
30 | nltk==3.8.1
31 | numpy==1.18.5
32 | packaging==23.2
33 | pandas==1.1.5
34 | pathspec==0.11.2
35 | Pillow==10.1.0
36 | platformdirs==3.11.0
37 | prettytable==3.9.0
38 | protobuf==3.20.0
39 | pyparsing==3.1.1
40 | python-dateutil==2.8.2
41 | pytz==2023.3.post1
42 | PyYAML==6.0.1
43 | regex==2023.10.3
44 | requests==2.31.0
45 | safetensors==0.4.0
46 | six==1.16.0
47 | sympy==1.12
48 | tensorboard==1.15.0
49 | tokenizers==0.14.1
50 | tomli==2.0.1
51 | torch==2.0.0+cu118
52 | torchaudio==2.0.1+cu118
53 | torchdata==0.6.0
54 | torchtext==0.15.1
55 | torchvision==0.15.1+cu118
56 | tqdm==4.66.1
57 | transformers==4.35.0
58 | triton==2.0.0
59 | typing_extensions==4.8.0
60 | urllib3==2.0.7
61 | vpdb==1.0.0
62 | wcwidth==0.2.9
63 | Werkzeug==3.0.1
64 | zipp==3.17.0
65 |
--------------------------------------------------------------------------------
/models/PGBP/operation.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import numpy as np
5 | import math
6 |
7 | def cw2se(cw, fix_out_of_bound=False):
8 |     # Create a zero tensor se with the same shape as the input tensor cw
9 |     se = torch.zeros_like(cw)
10 | 
11 |     # start coordinate = center - width / 2
12 |     se[..., 0] = cw[..., 0] - cw[..., 1] / 2
13 | 
14 |     # end coordinate = center + width / 2
15 |     se[..., 1] = cw[..., 0] + cw[..., 1] / 2
16 | 
17 |     # Optionally clamp out-of-bound coordinates
18 |     if fix_out_of_bound:
19 |         # Clamp start coordinates below 0.0 to 0.0
20 |         se[..., 0][se[..., 0] < 0.0] = 0.0
21 |         # Clamp end coordinates above 1.0 to 1.0
22 |         se[..., 1][se[..., 1] > 1.0] = 1.0
23 | return se
24 |
25 | def mask_logits(inputs, mask, mask_value=-1e30):
26 | mask = mask.type(torch.float32)
27 | return inputs + (1.0 - mask) * mask_value
28 |
29 |
30 | class Conv1D(nn.Module):
31 | def __init__(self,
32 | in_dim,
33 | out_dim,
34 | kernel_size=1,
35 | stride=1,
36 | padding=0,
37 | bias=True):
38 | super(Conv1D, self).__init__()
39 | self.conv1d = nn.Conv1d(in_channels=in_dim,
40 | out_channels=out_dim,
41 | kernel_size=kernel_size,
42 | padding=padding,
43 | stride=stride,
44 | bias=bias)
45 |
46 | def forward(self, x):
47 | # suppose all the input with shape (batch_size, seq_len, dim)
48 | x = x.transpose(1, 2) # (batch_size, dim, seq_len)
49 | x = self.conv1d(x)
50 | return x.transpose(1, 2) # (batch_size, seq_len, dim)
--------------------------------------------------------------------------------
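
A few illustrative calls (values are made up) showing what the helpers above compute:

    import torch
    from models.PGBP.operation import cw2se, mask_logits, Conv1D

    # center/width spans -> start/end spans, clamped to [0, 1]
    cw = torch.tensor([[0.5, 0.4], [0.9, 0.4]])
    print(cw2se(cw, fix_out_of_bound=True))     # tensor([[0.3, 0.7], [0.7, 1.0]])

    # mask_logits pushes padded positions to a large negative value before softmax
    logits = torch.tensor([[1.0, 2.0, 3.0]])
    mask = torch.tensor([[1.0, 1.0, 0.0]])
    print(torch.softmax(mask_logits(logits, mask), dim=-1))  # last position ~0

    # Conv1D keeps the (batch, seq_len, dim) layout while projecting the channel dim
    proj = Conv1D(in_dim=256, out_dim=128)
    print(proj(torch.randn(2, 10, 256)).shape)  # torch.Size([2, 10, 128])
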
/datasets/charades_len.py:
--------------------------------------------------------------------------------
1 | """ Dataset loader for the Charades-STA dataset """
2 | import os
3 | import csv
4 |
5 | import h5py
6 | import numpy as np
7 | import torch
8 | from torch import nn
9 | import torch.nn.functional as F
10 | import torch.utils.data as data
11 | import torchtext
12 | import json
13 | from . import average_to_fixed_length
14 | from .BaseDataset import BaseDataset
15 | from core.config import config
16 |
17 |
18 | class Charades_len(BaseDataset):
19 | def __init__(self, split):
20 | # statistics for all video length
21 | # min:12 max:390 mean: 62, std:18
22 | # max sentence length:train->10, test->10
23 | super(Charades_len, self).__init__(split)
24 |
25 | def __len__(self):
26 | return len(self.annotations)
27 |
28 | def get_annotation(self):
29 |
30 | anno_file = open(
31 | os.path.join(self.anno_dirs['Charades'],
32 | "{}_len_80.jsonl".format(self.split)), 'r')
33 | annotations = []
34 | # max_sentence_length = 0
35 | for line in anno_file:
36 | line_obj = json.loads(line.strip())
37 | sent = line_obj["query"]
38 | vid = line_obj["vid"]
39 | times = line_obj["relevant_windows"][0]
40 | duration = line_obj["duration"]
41 | annotations.append({
42 | 'video': vid,
43 | 'times': times,
44 | 'description': sent,
45 | 'duration': duration,
46 | 'dataset': 'Charades_len'
47 | })
48 | anno_file.close()
49 | # print("charade max sentence length: ", max_sentence_length)
50 | return annotations
--------------------------------------------------------------------------------
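
The loader above expects one JSON object per line in {split}_len_80.jsonl; a hypothetical record (all values invented) containing the fields that get_annotation() reads could be built like this:

    import json

    line = json.dumps({
        "vid": "VIDEO_0001",                # video id
        "query": "person opens the door.",  # natural-language query
        "relevant_windows": [[2.0, 7.5]],   # [start, end] in seconds; only the first window is used
        "duration": 30.0                    # video length in seconds
    })
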
/datasets/charades_mom.py:
--------------------------------------------------------------------------------
1 | """ Dataset loader for the Charades-STA dataset """
2 | import os
3 | import csv
4 |
5 | import h5py
6 | import numpy as np
7 | import torch
8 | from torch import nn
9 | import torch.nn.functional as F
10 | import torch.utils.data as data
11 | import torchtext
12 | import json
13 | from . import average_to_fixed_length
14 | from .BaseDataset import BaseDataset
15 | from core.config import config
16 |
17 |
18 | class Charades_mom(BaseDataset):
19 | def __init__(self, split):
20 | # statistics for all video length
21 | # min:12 max:390 mean: 62, std:18
22 | # max sentence length:train->10, test->10
23 | super(Charades_mom, self).__init__(split)
24 |
25 | def __len__(self):
26 | return len(self.annotations)
27 |
28 | def get_annotation(self):
29 |
30 | anno_file = open(
31 | os.path.join(self.anno_dirs['Charades'],
32 | "{}_mom_80.jsonl".format(self.split)), 'r')
33 | annotations = []
34 | # max_sentence_length = 0
35 | for line in anno_file:
36 | line_obj = json.loads(line.strip())
37 | sent = line_obj["query"]
38 | vid = line_obj["vid"]
39 | times = line_obj["relevant_windows"][0]
40 | duration = line_obj["duration"]
41 | annotations.append({
42 | 'video': vid,
43 | 'times': times,
44 | 'description': sent,
45 | 'duration': duration,
46 | 'dataset': 'Charades_mom'
47 | })
48 | anno_file.close()
49 | # print("charade max sentence length: ", max_sentence_length)
50 | return annotations
--------------------------------------------------------------------------------
/datasets/activitynet.py:
--------------------------------------------------------------------------------
1 | """ Dataset loader for the ActivityNet Captions dataset """
2 | import os
3 | import json
4 |
5 | import h5py
6 | import torch
7 | from torch import nn
8 | import torch.nn.functional as F
9 | import torch.utils.data as data
10 | import torchtext
11 |
12 | from .BaseDataset import BaseDataset
13 | from . import average_to_fixed_length
14 | from core.config import config
15 |
16 |
17 | class ActivityNet(BaseDataset):
18 | def __init__(self, split):
19 | # statistics for all video length
20 |         # min:2 median: max:1415 mean: 204, std:120
21 | # max sentence length:train-->73, test-->73
22 | super(ActivityNet, self).__init__(split)
23 |
24 | def __len__(self):
25 | return len(self.annotations)
26 |
27 |
28 | def get_annotation(self):
29 |
30 | with open(
31 | os.path.join(self.anno_dirs['ActivityNet'],
32 | '{}_data.json'.format(self.split)), 'r') as f:
33 | annotations = json.load(f)
34 | anno_pairs = []
35 | for video_anno in annotations:
36 |
37 | vid = video_anno[0]
38 | duration = video_anno[1]
39 | timestamp = video_anno[2]
40 | sentence = video_anno[3]
41 |
42 | if timestamp[0] < timestamp[1]:
43 | anno_pairs.append({
44 | 'video':
45 | vid,
46 | 'duration':
47 | duration,
48 | 'times':
49 | [max(timestamp[0], 0),
50 | min(timestamp[1], duration)],
51 | 'description':
52 | sentence,
53 | 'dataset':
54 | 'ActivityNet'
55 | })
56 | # print("activitynet max sentence length: ", max_sentence_length)
57 | return anno_pairs
--------------------------------------------------------------------------------
/datasets/charades.py:
--------------------------------------------------------------------------------
1 | """ Dataset loader for the Charades-STA dataset """
2 | import os
3 | import csv
4 |
5 | import h5py
6 | import numpy as np
7 | import torch
8 | from torch import nn
9 | import torch.nn.functional as F
10 | import torch.utils.data as data
11 | import torchtext
12 |
13 | from . import average_to_fixed_length
14 | from .BaseDataset import BaseDataset
15 | from core.config import config
16 |
17 |
18 | class Charades(BaseDataset):
19 | def __init__(self, split):
20 | # statistics for all video length
21 | # min:12 max:390 mean: 62, std:18
22 | # max sentence length:train->10, test->10
23 | super(Charades, self).__init__(split)
24 |
25 | def __len__(self):
26 | return len(self.annotations)
27 |
28 | def get_annotation(self):
29 | self.durations = {}
30 | with open(
31 | os.path.join(self.anno_dirs['Charades'],
32 | 'Charades_v1_{}.csv'.format(self.split))) as f:
33 | reader = csv.DictReader(f)
34 | for row in reader:
35 | self.durations[row['id']] = float(row['length'])
36 |
37 | anno_file = open(
38 | os.path.join(self.anno_dirs['Charades'],
39 | "charades_sta_{}.txt".format(self.split)), 'r')
40 | annotations = []
41 | # max_sentence_length = 0
42 | for line in anno_file:
43 | anno, sent = line.split("##")
44 | sent = sent.split('.\n')[0]
45 | vid, s_time, e_time = anno.split(" ")
46 | s_time = float(s_time)
47 | e_time = min(float(e_time), self.durations[vid])
48 | if s_time < e_time:
49 | annotations.append({
50 | 'video': vid,
51 | 'times': [s_time, e_time],
52 | 'description': sent,
53 | 'duration': self.durations[vid],
54 | 'dataset': 'Charades'
55 | })
56 | anno_file.close()
57 | # print("charade max sentence length: ", max_sentence_length)
58 | return annotations
--------------------------------------------------------------------------------
/datasets/tacos.py:
--------------------------------------------------------------------------------
1 | """ Dataset loader for the TACoS dataset """
2 | import os
3 | import json
4 |
5 | import h5py
6 | import torch
7 | from torch import nn
8 | import torch.nn.functional as F
9 | import torch.utils.data as data
10 | import torchtext
11 |
12 | from . import average_to_fixed_length
13 | from .BaseDataset import BaseDataset
14 | from core.config import config
15 |
16 |
17 | class TACoS(BaseDataset):
18 | def __init__(self, split):
19 | # statistics for all video length
20 | # min:90 max:2578 mean: 528, std:436
21 | # max sentence length:train-->46, test-->50
22 | super(TACoS, self).__init__(split)
23 |
24 | def __len__(self):
25 | return len(self.annotations)
26 |
27 | def get_annotation(self):
28 | # val_1.json is renamed as val.json, val_2.json is renamed as test.json
29 | with open(
30 | os.path.join(self.anno_dirs['TACoS'],
31 | '{}.json'.format(self.split)), 'r') as f:
32 | annotations = json.load(f)
33 | anno_pairs = []
34 | # max_sentence_length = 0
35 | for vid, video_anno in annotations.items():
36 | duration = video_anno['num_frames'] / video_anno['fps']
37 | for timestamp, sentence in zip(video_anno['timestamps'],
38 | video_anno['sentences']):
39 | if timestamp[0] < timestamp[1]:
40 | anno_pairs.append({
41 | 'video':
42 | vid,
43 | # vid[:-4],
44 | 'duration':
45 | duration,
46 | 'times': [
47 | max(timestamp[0] / video_anno['fps'], 0),
48 | min(timestamp[1] / video_anno['fps'], duration)
49 | ],
50 | 'description':
51 | sentence,
52 | 'dataset':
53 | 'TACoS'
54 | })
55 | # print("tacos max sentence length: ", max_sentence_length)
56 | return anno_pairs
--------------------------------------------------------------------------------
/experiments/charades/PGBP.yaml:
--------------------------------------------------------------------------------
1 | WORKERS: 4
2 |
3 | SEED: 328
4 |
5 | DATASET:
6 | NAME: Charades
7 | NO_VAL: True
8 | NORMALIZE: False
9 | num_pairs: 15
10 | num_clips: 256
11 |
12 | MODEL:
13 | NAME: PGBP
14 | CHECKPOINT: /media/Harddisk/zzb/work1/202404/results/Charades/EAMATDETR901/checkpoints/EAMAT1_26.t7
15 | PARAMS:
16 | aligned_len: True
17 | sementic_fu: True
18 | use_keyword: True
19 | cost_class: 4.0
20 | cost_span: 10.0
21 | cost_giou: 1.0
22 | eos_coef: 0.1
23 | content_prior: "learned"
24 | neg: True
25 | top_k: 6
26 | pos: True
27 | top_k0: 1
28 | fusion_attention: True
29 | num_queries: 10
30 | windowsize: 10
31 | video_feature_dim: 1024
32 | query_feature_dim: 300
33 | max_len_query_tag: 8
34 | dim: 512
35 | query_position: False
36 | video_position: True
37 | query_attention_layers: 1
38 | video_attention_layers: 1
39 | query_attention: "MultiLSTMAttention"
40 | video_attention: "MultiHeadAttention"
41 | early_attention: "MultiHeadAttention"
42 | detr_attention: "DETR_Decoder"
43 | detr_layers: 5
44 | early_attention_layers: 1
45 | post_attention_layers: 2
46 | post_attention: "MultiLSTMAttention" # choice of [MultiHeadAttention, DaMultiHeadAttention, MultiLSTMAttention, MultiConvAttention]
47 | early_fusion_module: "CQFusion" # choice of [CQFusion, InteractorFusion, CosineFusion]
48 | fusion_module: "multiscale_CQFusion" #multiscale_Fusion, CQFusion
49 | beta: 2
50 | MULTI_SCALE: True
51 | MULTI_SCALE_LEN: 1
52 | num_heads: 8
53 | num_layers: 1
54 | num_step: 3
55 | bi_direction: True
56 | kernels: [3, 5, 7]
57 | drop_rate: 0.5
58 | DEBUG: False
59 |
60 | modulate_t_attn: True
61 | bbox_embed_diff_each_layer: False
62 | query_scale_type: 'cond_elewise' #'cond_elewise', 'cond_scalar', 'fix_elewise'
63 | query_dim: 2
64 | return_intermediate: True
65 | feedforward: 1024
66 | dropout: 0.1
67 | activation: "relu"
68 | normalize_before: False
69 | keep_query_pos: False
70 | rm_self_attn_decoder: False
71 |
72 | TRAIN:
73 | BATCH_SIZE: 16
74 | LR: 0.00005
75 | WEIGHT_DECAY: 0.01
76 | MAX_EPOCH: 50
77 | MILE_STONE: [80]
78 | GAMMA: 0.1
79 | CONTINUE: False
80 |
81 | LOSS:
82 | LOCALIZATION: 10.0
83 | MATCH: 10.0
84 | DISTANCE: 0.0
85 | KL: 100.0
86 | EARLY: 1.0
87 | contrast: 1.0
88 | cont: 1.0
89 | hy_sigma: 1.0
90 | contrast_weight: True
91 | bce: 4.0
92 | iou: 1.0
93 |
94 | TEST:
95 | BATCH_SIZE: 32
96 | EVAL_TRAIN: True
97 |
--------------------------------------------------------------------------------
/experiments/tacos/PGBP.yaml:
--------------------------------------------------------------------------------
1 | WORKERS: 2
2 |
3 | SEED: 12345
4 |
5 | DATASET:
6 | NAME: TACoS
7 | NO_VAL: True
8 | NORMALIZE: False
9 | num_pairs: 15
10 | num_clips: 500
11 |
12 | MODEL:
13 | NAME: PGBP
14 | CHECKPOINT: /media/Harddisk/zzb/work1/202404/results/Charades/EAMATDETR901/checkpoints/EAMAT1_26.t7
15 | PARAMS:
16 | aligned_len: True
17 | sementic_fu: True
18 | use_keyword: True
19 | cost_class: 4.0
20 | cost_span: 10.0
21 | cost_giou: 1.0
22 | eos_coef: 0.1
23 | content_prior: "learned"
24 | neg: True
25 | top_k: 5
26 | pos: True
27 | top_k0: 3
28 | fusion_attention: True
29 | num_queries: 10
30 | windowsize: 10
31 | video_feature_dim: 4096
32 | query_feature_dim: 300
33 | max_len_query_tag: 8
34 | dim: 512
35 | query_position: False
36 | video_position: True
37 | query_attention_layers: 1
38 | video_attention_layers: 1
39 | query_attention: "MultiLSTMAttention"
40 | video_attention: "MultiHeadAttention"
41 | early_attention: "MultiHeadAttention"
42 | detr_attention: "DETR_Decoder"
43 | detr_layers: 5
44 | early_attention_layers: 1
45 | post_attention_layers: 3
46 | post_attention: "MultiLSTMAttention" # choice of [MultiHeadAttention, DaMultiHeadAttention, MultiLSTMAttention, MultiConvAttention]
47 | early_fusion_module: "CQFusion" # choice of [CQFusion, InteractorFusion, CosineFusion]
48 | fusion_module: "multiscale_CQFusion" #multiscale_Fusion, CQFusion
49 | beta: 2
50 | MULTI_SCALE: True
51 | MULTI_SCALE_LEN: 1
52 | num_heads: 8
53 | num_layers: 1
54 | num_step: 3
55 | bi_direction: True
56 | kernels: [3, 5, 7]
57 | drop_rate: 0.5
58 | DEBUG: False
59 |
60 | modulate_t_attn: True
61 | bbox_embed_diff_each_layer: False
62 | query_scale_type: 'cond_elewise' #'cond_elewise', 'cond_scalar', 'fix_elewise'
63 | query_dim: 2
64 | return_intermediate: True
65 | feedforward: 1024
66 | dropout: 0.1
67 | activation: "relu"
68 | normalize_before: False
69 | keep_query_pos: False
70 | rm_self_attn_decoder: False
71 |
72 | TRAIN:
73 | BATCH_SIZE: 16
74 | LR: 0.00005
75 | WEIGHT_DECAY: 0.01
76 | MAX_EPOCH: 50
77 | MILE_STONE: [80]
78 | GAMMA: 0.1
79 | CONTINUE: False
80 |
81 | LOSS:
82 | LOCALIZATION: 10.0
83 | MATCH: 10.0
84 | DISTANCE: 0.0
85 | KL: 100.0
86 | EARLY: 1.0
87 | contrast: 1.0
88 | cont: 1.0
89 | hy_sigma: 1.0
90 | contrast_weight: True
91 | bce: 4.0
92 | iou: 1.0
93 |
94 | TEST:
95 | BATCH_SIZE: 32
96 | EVAL_TRAIN: True
97 |
--------------------------------------------------------------------------------
/experiments/activitynet/PGBP.yaml:
--------------------------------------------------------------------------------
1 | WORKERS: 4
2 |
3 | SEED: 12345
4 |
5 | DATASET:
6 | NAME: ActivityNet
7 | NO_VAL: True
8 | NORMALIZE: True
9 | num_pairs: 15
10 | num_clips: 256
11 |
12 | MODEL:
13 | NAME: PGBP
14 | CHECKPOINT: /media/Harddisk/zzb/work1/202404/results/Charades/EAMATDETR901/checkpoints/EAMAT1_26.t7
15 | PARAMS:
16 | aligned_len: True
17 | sementic_fu: True
18 | use_keyword: True
19 | cost_class: 4.0
20 | cost_span: 10.0
21 | cost_giou: 1.0
22 | eos_coef: 0.1
23 | content_prior: "learned"
24 | neg: True
25 | top_k: 5
26 | pos: True
27 | top_k0: 1
28 | fusion_attention: True
29 | num_queries: 10
30 | windowsize: 10
31 | video_feature_dim: 1024
32 | query_feature_dim: 300
33 | max_len_query_tag: 8
34 | dim: 512
35 | query_position: False
36 | video_position: True
37 | query_attention_layers: 1
38 | video_attention_layers: 1
39 | query_attention: "MultiLSTMAttention"
40 | video_attention: "MultiHeadAttention"
41 | early_attention: "MultiHeadAttention"
42 | detr_attention: "DETR_Decoder"
43 | detr_layers: 5
44 | early_attention_layers: 1
45 | post_attention_layers: 2
46 | post_attention: "MultiLSTMAttention" # choice of [MultiHeadAttention, DaMultiHeadAttention, MultiLSTMAttention, MultiConvAttention]
47 | early_fusion_module: "CQFusion" # choice of [CQFusion, InteractorFusion, CosineFusion]
48 | fusion_module: "multiscale_CQFusion" #multiscale_Fusion, CQFusion
49 | beta: 2
50 | MULTI_SCALE: True
51 | MULTI_SCALE_LEN: 1
52 | num_heads: 8
53 | num_layers: 1
54 | num_step: 3
55 | bi_direction: True
56 | kernels: [3, 5, 7]
57 | drop_rate: 0.5
58 | DEBUG: False
59 |
60 | modulate_t_attn: True
61 | bbox_embed_diff_each_layer: False
62 | query_scale_type: 'cond_elewise' #'cond_elewise', 'cond_scalar', 'fix_elewise'
63 | query_dim: 2
64 | return_intermediate: True
65 | feedforward: 1024
66 | dropout: 0.1
67 | activation: "relu"
68 | normalize_before: False
69 | keep_query_pos: False
70 | rm_self_attn_decoder: False
71 |
72 | TRAIN:
73 | BATCH_SIZE: 64
74 | LR: 0.00005
75 | WEIGHT_DECAY: 0.01
76 | MAX_EPOCH: 50
77 | MILE_STONE: [80]
78 | GAMMA: 0.1
79 | CONTINUE: False
80 |
81 | LOSS:
82 | LOCALIZATION: 10.0
83 | MATCH: 10.0
84 | DISTANCE: 0.0
85 | KL: 100.0
86 | EARLY: 1.0
87 | contrast: 1.0
88 | cont: 1.0
89 | hy_sigma: 1.0
90 | contrast_weight: True
91 | bce: 4.0
92 | iou: 1.0
93 |
94 | TEST:
95 | BATCH_SIZE: 64
96 | EVAL_TRAIN: True
97 |
--------------------------------------------------------------------------------
/experiments/charades_len/PGBP.yaml:
--------------------------------------------------------------------------------
1 | WORKERS: 4
2 |
3 | SEED: 328
4 |
5 | DATASET:
6 | NAME: Charades_len
7 | NO_VAL: True
8 | NORMALIZE: True
9 | num_pairs: 15
10 | num_clips: 256
11 |
12 | MODEL:
13 | NAME: PGBP
14 | CHECKPOINT: /media/Harddisk/zzb/work1/202404/results/Charades/EAMATDETR901/checkpoints/EAMAT1_26.t7
15 | PARAMS:
16 | aligned_len: True
17 | sementic_fu: True
18 | use_keyword: True
19 | cost_class: 4.0
20 | cost_span: 10.0
21 | cost_giou: 1.0
22 | eos_coef: 0.1
23 | content_prior: "learned"
24 | neg: True
25 | top_k: 6
26 | pos: True
27 | top_k0: 1
28 | fusion_attention: True
29 | num_queries: 10
30 | windowsize: 10
31 | video_feature_dim: 4096
32 | query_feature_dim: 300
33 | max_len_query_tag: 8
34 | dim: 512
35 | query_position: False
36 | video_position: True
37 | query_attention_layers: 1
38 | video_attention_layers: 1
39 | query_attention: "MultiLSTMAttention"
40 | video_attention: "MultiHeadAttention"
41 | early_attention: "MultiHeadAttention"
42 | detr_attention: "DETR_Decoder"
43 | detr_layers: 5
44 | early_attention_layers: 1
45 | post_attention_layers: 2
46 | post_attention: "MultiLSTMAttention" # choice of [MultiHeadAttention, DaMultiHeadAttention, MultiLSTMAttention, MultiConvAttention]
47 | early_fusion_module: "CQFusion" # choice of [CQFusion, InteractorFusion, CosineFusion]
48 | fusion_module: "multiscale_CQFusion" #multiscale_Fusion, CQFusion
49 | beta: 2
50 | MULTI_SCALE: True
51 | MULTI_SCALE_LEN: 1
52 | num_heads: 8
53 | num_layers: 1
54 | num_step: 3
55 | bi_direction: True
56 | kernels: [3, 5, 7]
57 | drop_rate: 0.5
58 | DEBUG: False
59 |
60 | modulate_t_attn: True
61 | bbox_embed_diff_each_layer: False
62 | query_scale_type: 'cond_elewise' #'cond_elewise', 'cond_scalar', 'fix_elewise'
63 | query_dim: 2
64 | return_intermediate: True
65 | feedforward: 1024
66 | dropout: 0.1
67 | activation: "relu"
68 | normalize_before: False
69 | keep_query_pos: False
70 | rm_self_attn_decoder: False
71 |
72 | TRAIN:
73 | BATCH_SIZE: 16
74 | LR: 0.00005
75 | WEIGHT_DECAY: 0.01
76 | MAX_EPOCH: 50
77 | MILE_STONE: [80]
78 | GAMMA: 0.1
79 | CONTINUE: False
80 |
81 | LOSS:
82 | LOCALIZATION: 10.0
83 | MATCH: 10.0
84 | DISTANCE: 0.0
85 | KL: 100.0
86 | EARLY: 1.0
87 | contrast: 1.0
88 | cont: 1.0
89 | hy_sigma: 1.0
90 | contrast_weight: True
91 | bce: 4.0
92 | iou: 1.0
93 |
94 | TEST:
95 | BATCH_SIZE: 32
96 | EVAL_TRAIN: True
97 |
--------------------------------------------------------------------------------
/experiments/charades_mom/PGBP.yaml:
--------------------------------------------------------------------------------
1 | WORKERS: 4
2 |
3 | SEED: 328
4 |
5 | DATASET:
6 | NAME: Charades_mom
7 | NO_VAL: True
8 | NORMALIZE: True
9 | num_pairs: 15
10 | num_clips: 256
11 |
12 | MODEL:
13 | NAME: PGBP
14 | CHECKPOINT: /media/Harddisk/zzb/work1/202404/results/Charades/EAMATDETR901/checkpoints/EAMAT1_26.t7
15 | PARAMS:
16 | aligned_len: True
17 | sementic_fu: True
18 | use_keyword: True
19 | cost_class: 4.0
20 | cost_span: 10.0
21 | cost_giou: 1.0
22 | eos_coef: 0.1
23 | content_prior: "learned"
24 | neg: True
25 | top_k: 6
26 | pos: True
27 | top_k0: 1
28 | fusion_attention: True
29 | num_queries: 10
30 | windowsize: 10
31 | video_feature_dim: 4096
32 | query_feature_dim: 300
33 | max_len_query_tag: 8
34 | dim: 512
35 | query_position: False
36 | video_position: True
37 | query_attention_layers: 1
38 | video_attention_layers: 1
39 | query_attention: "MultiLSTMAttention"
40 | video_attention: "MultiHeadAttention"
41 | early_attention: "MultiHeadAttention"
42 | detr_attention: "DETR_Decoder"
43 | detr_layers: 5
44 | early_attention_layers: 1
45 | post_attention_layers: 2
46 | post_attention: "MultiLSTMAttention" # choice of [MultiHeadAttention, DaMultiHeadAttention, MultiLSTMAttention, MultiConvAttention]
47 | early_fusion_module: "CQFusion" # choice of [CQFusion, InteractorFusion, CosineFusion]
48 | fusion_module: "multiscale_CQFusion" #multiscale_Fusion, CQFusion
49 | beta: 2
50 | MULTI_SCALE: True
51 | MULTI_SCALE_LEN: 1
52 | num_heads: 8
53 | num_layers: 1
54 | num_step: 3
55 | bi_direction: True
56 | kernels: [3, 5, 7]
57 | drop_rate: 0.5
58 | DEBUG: False
59 |
60 | modulate_t_attn: True
61 | bbox_embed_diff_each_layer: False
62 | query_scale_type: 'cond_elewise' #'cond_elewise', 'cond_scalar', 'fix_elewise'
63 | query_dim: 2
64 | return_intermediate: True
65 | feedforward: 1024
66 | dropout: 0.1
67 | activation: "relu"
68 | normalize_before: False
69 | keep_query_pos: False
70 | rm_self_attn_decoder: False
71 |
72 | TRAIN:
73 | BATCH_SIZE: 16
74 | LR: 0.00005
75 | WEIGHT_DECAY: 0.01
76 | MAX_EPOCH: 50
77 | MILE_STONE: [80]
78 | GAMMA: 0.1
79 | CONTINUE: False
80 |
81 | LOSS:
82 | LOCALIZATION: 10.0
83 | MATCH: 10.0
84 | DISTANCE: 0.0
85 | KL: 100.0
86 | EARLY: 1.0
87 | contrast: 1.0
88 | cont: 1.0
89 | hy_sigma: 1.0
90 | contrast_weight: True
91 | bce: 4.0
92 | iou: 1.0
93 |
94 | TEST:
95 | BATCH_SIZE: 32
96 | EVAL_TRAIN: True
97 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Aggregate and Discriminate: Pseudo Clips-Guided Boundary Perception for Video Moment Retrieval
2 |
3 | This is the implementation of the paper "Aggregate and Discriminate: Pseudo Clips-Guided Boundary Perception for Video Moment Retrieval" (**TMM 2024**).
4 |
5 | ```shell
6 | # preparing environment
7 | bash conda.sh
8 | ```
9 | ## Introduction
10 | Video moment retrieval (VMR) aims to localize the segment of an untrimmed video that is semantically relevant to a language query. The challenge of this task lies in effectively aligning the intricate, information-dense video modality with the succinctly summarized textual modality, and then localizing the starting and ending timestamps of the target moment. Previous works have attempted multi-granularity alignment of video and query in a coarse-to-fine manner, yet they still fall short in addressing the inherent disparities in representation and information density between videos and queries, leading to modal misalignment. In this paper, we propose a progressive video moment retrieval framework that first retrieves the video clips most relevant and most irrelevant to the query as semantic guidance, thereby bridging the semantic gap between the video and language modalities. Furthermore, we introduce a pseudo-clip-guided aggregation module to pull densely relevant moment clips closer together, and propose a discriminative boundary-enhanced decoder, guided by the pseudo clips, to push semantically confusing proposals away. Extensive experiments on the Charades-STA, ActivityNet Captions and TACoS datasets demonstrate that our method outperforms existing methods.
11 |
12 |

13 |
14 |
15 | ## Dataset Preparation
16 | We use [VSLNet's](https://github.com/IsaacChanghau/VSLNet) data. The visual features can be downloaded [here](https://app.box.com/s/h0sxa5klco6qve5ahnz50ly2nksmuedw); for Charades-STA we use the "new" folder, and for TACoS we use the "old" folder. Annotations and other details can be found [here](https://github.com/IsaacChanghau/VSLNet/tree/master/prepare).
17 | Then modify lines 81~91 of "datasets/BaseDataset.py" to point the annotation and feature directories to your own paths, as sketched below.
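For reference, the dataset loaders in this repo look up annotation folders through a dict called `anno_dirs` (see `datasets/charades.py`, `datasets/activitynet.py` and `datasets/tacos.py`). Below is a minimal sketch of the kind of mapping `BaseDataset` is expected to hold; only the `anno_dirs` name is confirmed by the loaders here, while `feature_dirs` and all paths are placeholders to adapt to the actual code:

```python
# Hypothetical sketch only: adapt names and paths to datasets/BaseDataset.py (lines 81~91).
anno_dirs = {
    'Charades': '/your/data/charades/annotations',
    'ActivityNet': '/your/data/activitynet/annotations',
    'TACoS': '/your/data/tacos/annotations',
}
feature_dirs = {                       # assumed name; use whatever BaseDataset actually defines
    'Charades': '/your/data/charades/features',      # the "new" folder from the download link
    'ActivityNet': '/your/data/activitynet/features',
    'TACoS': '/your/data/tacos/features',            # the "old" folder from the download link
}
```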
18 |
19 | ## Quick Start
20 | **Train**
21 | ```shell script
22 | python main.py --cfg experiments/activitynet/PGBP.yaml --mode train
23 | python main.py --cfg experiments/charades/PGBP.yaml --mode train
24 | python main.py --cfg experiments/tacos/PGBP.yaml --mode train
25 |
26 | python main.py --cfg experiments/charades_len/PGBP.yaml --mode train
27 | python main.py --cfg experiments/charades_mom/PGBP.yaml --mode train
28 | ```
29 | A new folder "results" is created.
30 |
31 | ## Citation
32 | If you find this project helpful to your research, please cite our work.
33 |
--------------------------------------------------------------------------------
/core/meters.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 |
4 |
5 | class CatMeter:
6 | '''
7 | Concatenate Meter for torch.Tensor
8 | '''
9 | def __init__(self):
10 | self.reset()
11 |
12 | def reset(self):
13 | self.val = None
14 |
15 | def update(self, val):
16 | if self.val is None:
17 | self.val = val
18 | else:
19 | self.val = torch.cat([self.val, val], dim=0)
20 |
21 | def get_val(self):
22 | return self.val
23 |
24 | def get_val_numpy(self):
25 | return self.val.data.cpu().numpy()
26 |
27 |
28 | class MultiItemAverageMeter:
29 | def __init__(self):
30 | self.reset()
31 |
32 | def reset(self):
33 | self.content = {}
34 |
35 | def update(self, val):
36 | '''
37 | :param val: dict, keys are strs, values are torch.Tensor or np.array
38 | '''
39 | for key in list(val.keys()):
40 | value = val[key]
41 | if key not in list(self.content.keys()):
42 | self.content[key] = {'avg': value, 'sum': value, 'count': 1.0}
43 | else:
44 | self.content[key]['sum'] += value
45 | self.content[key]['count'] += 1.0
46 | self.content[key]['avg'] = self.content[key]['sum'] / \
47 | self.content[key]['count']
48 |
49 | def get_val(self):
50 | keys = list(self.content.keys())
51 | values = []
52 | for key in keys:
53 | val = self.content[key]['avg']
54 | if isinstance(val, torch.Tensor):
55 | val = val.data.cpu().numpy()
56 | values.append(val)
57 | return keys, values
58 |
59 | def get_str(self):
60 |
61 | result = ''
62 | keys, values = self.get_val()
63 |
64 | for key, value in zip(keys, values):
65 | result += key
66 | result += ': '
67 | if isinstance(value, np.ndarray):
68 | value = np.round(value, 5)
69 | result += str(value)
70 | result += '; '
71 |
72 | return result
73 |
74 |
75 | # class AverageMeter:
76 | # """
77 | # Average Meter
78 | # """
79 |
80 | # def __init__(self):
81 | # self.reset()
82 |
83 | # def reset(self):
84 | # self.sum = 0
85 | # self.count = 0
86 |
87 | # def update(self, val):
88 | # self.sum += val
89 | # self.count += 1
90 |
91 | # def get_val(self):
92 | # return self.sum / self.count
93 |
94 |
95 | class AverageMeter(object):
96 | """Computes and stores the average and current value"""
97 | def __init__(self):
98 | self.reset()
99 |
100 | def reset(self):
101 | self.val = 0
102 | self.avg = 0
103 | self.sum = 0
104 | self.count = 0
105 |
106 | def update(self, val, n=1):
107 | self.val = val
108 | self.sum += val * n
109 | self.count += n
110 | self.avg = self.sum / self.count
111 |
112 | def get_val(self):
113 | return self.sum / self.count
114 |
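A minimal usage sketch for the two meters above; the values are made up:

```python
import torch
from core.meters import AverageMeter, MultiItemAverageMeter

# AverageMeter: running scalar average, e.g. for a per-step loss
loss_meter = AverageMeter()
for loss in [0.9, 0.7, 0.6]:
    loss_meter.update(loss, n=1)
print(loss_meter.avg)         # ~0.7333

# MultiItemAverageMeter: running average per named item
multi_meter = MultiItemAverageMeter()
multi_meter.update({'loc_loss': torch.tensor(1.2), 'match_loss': torch.tensor(0.4)})
multi_meter.update({'loc_loss': torch.tensor(0.8), 'match_loss': torch.tensor(0.6)})
print(multi_meter.get_str())  # e.g. "loc_loss: 1.0; match_loss: 0.5; "
```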
--------------------------------------------------------------------------------
/core/config.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import yaml
6 | from easydict import EasyDict as edict
7 |
8 | config = edict()
9 |
10 | config.WORKERS = 16
11 | config.SEED = 328
12 | config.LOG_DIR = ''
13 | config.MODEL_DIR = ''
14 | config.VERBOSE = False
15 | config.TAG = None
16 |
17 | # common params for NETWORK
18 | config.MODEL = edict()
19 | config.MODEL.NAME = ''
20 | config.MODEL.CHECKPOINT = '' # The checkpoint for the best performance
21 | config.MODEL.PARAMS = None
22 |
23 | # DATASET related params
24 | config.DATASET = edict()
25 | config.DATASET.NAME = ''
26 | config.DATASET.DATASETS = []
27 | config.DATASET.NO_VAL = True
28 | config.DATASET.NUM_SAMPLE_CLIPS = 128
29 | config.DATASET.SPLIT = ''
30 | config.DATASET.NORMALIZE = False
31 | config.DATASET.EXTEND_INNRE = 0.0 # extend the inner action label
32 | config.DATASET.EXTEND_TIME = False # extend TIME length of the input for bias
33 | config.DATASET.FLIP_TIME = False # flip the input in time direction
34 | config.DATASET.num_pairs = 10
35 | config.DATASET.num_clips = 256
36 | # train
37 | config.TRAIN = edict()
38 | config.TRAIN.LR = 0.001
39 | config.TRAIN.WEIGHT_DECAY = 0.0001
40 | config.TRAIN.FACTOR = 0.8
41 | config.TRAIN.PATIENCE = 5
42 | config.TRAIN.GAMMA = 0.5
43 | config.TRAIN.MILE_STONE = [10, 15]
44 | config.TRAIN.MAX_EPOCH = 20
45 | config.TRAIN.BATCH_SIZE = 4
46 | config.TRAIN.PER_NEGATIVE_PAIRS_INBATCH = 3
47 | config.TRAIN.SHUFFLE = True
48 | config.TRAIN.CONTINUE = False
49 | config.TRAIN.MILESTONES = [7,15]
50 |
51 | config.LOSS = edict()
52 | config.LOSS.NAME = 'bce_loss'
53 | config.LOSS.MATCH = 1.0
54 | config.LOSS.DISTANCE = 1.0
55 | config.LOSS.KL = 1.0
56 | config.LOSS.EARLY = 1.0
57 | config.LOSS.LOCALIZATION = 1.0
58 | config.LOSS.CLIP_NORM = 1.0
59 | config.LOSS.DCOR = 1.0
60 | config.LOSS.contrast = 1.0
61 | config.LOSS.cont = 1.0
62 | config.LOSS.iou = 1.0
63 | config.LOSS.saliency_margin = 0.2
64 | config.LOSS.hy_sigma = 1.0
65 | config.LOSS.contrast_weight = True
66 | config.LOSS.PARAMS = None
67 | config.LOSS.bce = 1.0
68 | # test
69 | config.TEST = edict()
70 | config.TEST.RECALL = []
71 | config.TEST.TIOU = []
72 | config.TEST.NMS_THRESH = 0.4
73 | config.TEST.INTERVAL = 1
74 | config.TEST.EVAL_TRAIN = False
75 | config.TEST.BATCH_SIZE = 1
76 | config.TEST.TOP_K = 10
77 | config.TEST.SHUFFLE_VIDEO_FRAME = False
78 |
79 |
80 | def _update_dict(cfg, value):
81 | for k, v in value.items():
82 | if k in cfg:
83 | if k == 'PARAMS':
84 | cfg[k] = v
85 | elif isinstance(v, dict):
86 | _update_dict(cfg[k], v)
87 | else:
88 | cfg[k] = v
89 | else:
90 | raise ValueError("{} does not exist in config.py".format(k))
91 |
92 |
93 | def update_config(config_file):
94 | with open(config_file) as f:
95 | exp_config = edict(yaml.load(f, Loader=yaml.FullLoader))
96 | for k, v in exp_config.items():
97 | if k in config:
98 | if isinstance(v, dict):
99 | _update_dict(config[k], v)
100 | else:
101 | config[k] = v
102 | else:
103 | raise ValueError("{} does not exist in config.py".format(k))
104 |
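A minimal sketch of how this module is typically used: merge one of the experiment YAMLs shipped under experiments/ into the defaults above, then read values from the global `config` object.

```python
from core.config import config, update_config

# merge an experiment file into the defaults defined above
update_config('experiments/charades/PGBP.yaml')

print(config.DATASET.NAME)       # 'Charades'
print(config.TRAIN.BATCH_SIZE)   # 16
print(config.MODEL.PARAMS.dim)   # 512 (PARAMS is replaced wholesale, see _update_dict)
```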
--------------------------------------------------------------------------------
/models/PGBP/slidewindow.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | def find_most_relevant_frame(probabilities, mask, window_size):
4 | """
5 | This function finds the most relevant frame in a batch of videos based on the probabilities of each frame
6 | being relevant to the text. It uses a sliding window approach to find a continuous sequence of frames
7 | with the highest average probability. The mask ensures that only valid values are considered.
8 |
9 | :param probabilities: Batched tensor of probabilities (shape: [B, L]).
10 | :param mask: Batched tensor of masks (shape: [B, L]) where 1 indicates a valid value and 0 indicates invalid.
11 | :param window_size: Size of the sliding window.
12 | :return: The index of the frame with the highest probability for each batch.
13 | """
14 | batch_size, L = probabilities.shape
15 |
16 | # Initialize arrays to store results
17 | indices_of_max_frames = torch.zeros(batch_size, dtype=int).cuda()
18 | visual_len = torch.sum(mask,dim=1).long()
19 | for batch_index in range(batch_size):
20 | # Slide the window across the valid probabilities
21 | max_avg_probability = 0
22 | index_of_max_frame = 0
23 | probability = probabilities[batch_index]
24 | if visual_len[batch_index] < window_size:
25 | index_of_max_frame = torch.max(probability[0:visual_len[batch_index]],dim = 0)[1]
26 | else:
27 | for start_index in range(visual_len[batch_index] - window_size + 1):
28 | # Compute the average probability for the current window
29 | window_avg = torch.mean(probability[start_index:start_index + window_size])
30 |
31 | # If the current window's average probability is greater than the max found so far, update max
32 | if window_avg > max_avg_probability:
33 | max_avg_probability = window_avg
34 | index_of_max_frame = torch.max(probability[start_index:start_index + window_size],dim = 0)[1]
35 | index_of_max_frame = index_of_max_frame + start_index
36 | indices_of_max_frames[batch_index] = index_of_max_frame
37 | if (indices_of_max_frames >= visual_len).any():
38 | print("indices_of_max_frames out of boundary")
39 | return indices_of_max_frames
40 |
41 | # Example usage:
42 |
43 | # torch.manual_seed(42) # For reproducibility
44 | # B = 3 # Batch size
45 | # L = 100 # Length of each video in frames
46 | # probabilities_batched = torch.rand(B, L) # Random probabilities
47 | # mask_batched = torch.ones(B, L)  # valid-frame mask; assumed to be a contiguous prefix of ones per video
48 | # # Define a window size, e.g., corresponding to 1 second of video at 30 fps
49 | # window_size = 30
50 |
51 | # # Find the index of the most relevant frame for each batch
52 | # index_of_max_frames_batched = find_most_relevant_frame(probabilities_batched, mask_batched, window_size)
53 |
54 | # index_of_max_frames_batched
55 | def get_neg_sample(pos_ind,mask,pred):
56 | B,L = mask.shape
57 | mask1 = mask.clone()
58 | for i in range(B):
59 | mask1[i, pos_ind[i]:] = 0.0
60 | mask2 = mask-mask1
61 | neg1_value,neg1 = torch.min(pred.masked_fill(~mask1.bool(), float('1.0')), dim=1)
62 | neg2_value,neg2 = torch.min(pred.masked_fill(~mask2.bool(), float('1.0')), dim=1)
63 | condition1 = (neg1_value == 1.0)
64 | neg1 = torch.where(condition1, neg2, neg1)
65 | condition2 = (neg2_value == 1.0)
66 | neg2 = torch.where(condition2, neg1, neg2)
67 | return neg1,neg2
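A minimal sketch chaining the two helpers above to pick one positive frame index and two negatives per video. The shapes are assumptions, and a CUDA device is assumed because find_most_relevant_frame allocates its index tensor with .cuda():

```python
import torch
from models.PGBP.slidewindow import find_most_relevant_frame, get_neg_sample

if torch.cuda.is_available():
    torch.manual_seed(0)
    probs = torch.rand(2, 64).cuda()    # frame-level relevance scores, (B, L)
    mask = torch.ones(2, 64).cuda()     # valid-frame mask, assumed to be a prefix of ones
    mask[1, 40:] = 0.0                  # the second video has only 40 valid frames

    pos_ind = find_most_relevant_frame(probs, mask, window_size=10)  # (B,)
    neg1, neg2 = get_neg_sample(pos_ind, mask, probs)                # (B,), (B,)
    print(pos_ind, neg1, neg2)
```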
--------------------------------------------------------------------------------
/models/PGBP/span_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def span_xx_to_cxw(xx_spans):
5 | """
6 | Args:
7 | xx_spans: tensor, (#windows, 2) or (..., 2), each row is a window of format (st, ed)
8 |
9 | Returns:
10 | cxw_spans: tensor, (#windows, 2), each row is a window of format (center=(st+ed)/2, width=(ed-st))
11 | >>> spans = torch.Tensor([[0, 1], [0.2, 0.4]])
12 | >>> span_xx_to_cxw(spans)
13 | tensor([[0.5000, 1.0000],
14 | [0.3000, 0.2000]])
15 | >>> spans = torch.Tensor([[[0, 1], [0.2, 0.4]]])
16 | >>> span_xx_to_cxw(spans)
17 | tensor([[[0.5000, 1.0000],
18 | [0.3000, 0.2000]]])
19 | """
20 | center = xx_spans.sum(-1) * 0.5
21 | width = xx_spans[..., 1] - xx_spans[..., 0]
22 | return torch.stack([center, width], dim=-1)
23 |
24 |
25 | def span_cxw_to_xx(cxw_spans):
26 | """
27 | Args:
28 | cxw_spans: tensor, (#windows, 2) or (..., 2), the last dim is a row denoting a window of format (center, width)
29 |
30 | >>> spans = torch.Tensor([[0.5000, 1.0000], [0.3000, 0.2000]])
31 | >>> span_cxw_to_xx(spans)
32 | tensor([[0.0000, 1.0000],
33 | [0.2000, 0.4000]])
34 | >>> spans = torch.Tensor([[[0.5000, 1.0000], [0.3000, 0.2000]]])
35 | >>> span_cxw_to_xx(spans)
36 | tensor([[[0.0000, 1.0000],
37 | [0.2000, 0.4000]]])
38 | """
39 | x1 = cxw_spans[..., 0] - 0.5 * cxw_spans[..., 1]
40 | x2 = cxw_spans[..., 0] + 0.5 * cxw_spans[..., 1]
41 | return torch.stack([x1, x2], dim=-1)
42 |
43 |
44 | def temporal_iou(spans1, spans2):
45 | """
46 | Args:
47 | spans1: (N, 2) torch.Tensor, each row defines a span [st, ed]
48 | spans2: (M, 2) torch.Tensor, ...
49 |
50 | Returns:
51 | iou: (N, M) torch.Tensor
52 | union: (N, M) torch.Tensor
53 | >>> test_spans1 = torch.Tensor([[0, 0.2], [0.5, 1.0]])
54 | >>> test_spans2 = torch.Tensor([[0, 0.3], [0., 1.0]])
55 | >>> temporal_iou(test_spans1, test_spans2)
56 | (tensor([[0.6667, 0.2000],
57 | [0.0000, 0.5000]]),
58 | tensor([[0.3000, 1.0000],
59 | [0.8000, 1.0000]]))
60 | """
61 | areas1 = spans1[:, 1] - spans1[:, 0] # (N, )
62 | areas2 = spans2[:, 1] - spans2[:, 0] # (M, )
63 |
64 | left = torch.max(spans1[:, None, 0], spans2[:, 0]) # (N, M)
65 | right = torch.min(spans1[:, None, 1], spans2[:, 1]) # (N, M)
66 |
67 | inter = (right - left).clamp(min=0) # (N, M)
68 | union = areas1[:, None] + areas2 - inter # (N, M)
69 |
70 | iou = inter / union
71 | return iou, union
72 |
73 |
74 | def temporal_intersection_over_pred(gt_spans, pred_spans):
75 | """ intersection over the second input spans
76 | Args:
77 | gt_spans: (N, 2),
78 | pred_spans: (M, 2)
79 |
80 | Returns:
81 |
82 | """
83 | left = torch.max(gt_spans[:, None, 0], pred_spans[:, 0])
84 | right = torch.min(gt_spans[:, None, 1], pred_spans[:, 1])
85 |
86 | inter = (right - left).clamp(min=0) # (N, M)
87 | inter_over_pred = inter / (pred_spans[:, 1] - pred_spans[:, 0])
88 | return inter_over_pred
89 |
90 |
91 | def generalized_temporal_iou(spans1, spans2):
92 | """
93 | Generalized IoU from https://giou.stanford.edu/
94 | Also reference to DETR implementation of generalized_box_iou
95 | https://github.com/facebookresearch/detr/blob/master/util/box_ops.py#L40
96 |
97 | Args:
98 | spans1: (N, 2) torch.Tensor, each row defines a span in xx format [st, ed]
99 | spans2: (M, 2) torch.Tensor, ...
100 |
101 | Returns:
102 | giou: (N, M) torch.Tensor
103 |
104 | >>> test_spans1 = torch.Tensor([[0, 0.2], [0.5, 1.0]])
105 | >>> test_spans2 = torch.Tensor([[0, 0.3], [0., 1.0]])
106 | >>> generalized_temporal_iou(test_spans1, test_spans2)
107 | tensor([[ 0.6667, 0.2000],
108 | [-0.2000, 0.5000]])
109 | """
110 | spans1 = spans1.float()
111 | spans2 = spans2.float()
112 | assert (spans1[:, 1] >= spans1[:, 0]).all()
113 | assert (spans2[:, 1] >= spans2[:, 0]).all()
114 | iou, union = temporal_iou(spans1, spans2)
115 |
116 | left = torch.min(spans1[:, None, 0], spans2[:, 0]) # (N, M)
117 | right = torch.max(spans1[:, None, 1], spans2[:, 1]) # (N, M)
118 | enclosing_area = (right - left).clamp(min=0) # (N, M)
119 |
120 | return iou - (enclosing_area - union) / enclosing_area
121 |
122 |
123 |
--------------------------------------------------------------------------------
/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.nn.utils import rnn
4 | from core.config import config
5 | import numpy as np
6 |
7 |
8 | def collate_fn(batch):
9 | batch_word_vectors = [b[0]['word_vectors'] for b in batch]
10 | # batch_pos_tags = [b[0]['pos_tags'] for b in batch]
11 | batch_txt_mask = [b[0]['txt_mask'] for b in batch]
12 | batch_vis_feats = [b[0]['visual_input'] for b in batch]
13 | batch_vis_mask = [b[0]['vis_mask'] for b in batch]
14 | batch_start_label = [b[0]['start_label'] for b in batch]
15 | batch_end_label = [b[0]['end_label'] for b in batch]
16 | batch_start_frame = [b[0]['start_frame'] for b in batch]
17 | batch_end_frame = [b[0]['end_frame'] for b in batch]
18 | batch_internel_label = [b[0]['internel_label'] for b in batch]
19 | batch_extend_pre = [b[0]['extend_pre'] for b in batch]
20 | batch_extend_suf = [b[0]['extend_suf'] for b in batch]
21 | batch_keyword_mask = [b[0]['keyword_mask'] for b in batch]
22 | batch_negative_indices =np.array([b[0]['selected_negative_indices'] for b in batch])
23 | batch_positive_indices = np.array([b[0]['selected_positive_indices'] for b in batch])
24 | batch_time = [b[1]["times"] for b in batch]
25 | batch_duration = [b[1]["duration"] for b in batch]
26 | annotations = [b[1] for b in batch]
27 | visual_len = torch.from_numpy(np.array([b[0]['visual_len'] for b in batch])).float()
28 | visual_scale = visual_len / torch.max(visual_len)
29 | batch_data = {
30 | 'batch_word_vectors':
31 | nn.utils.rnn.pad_sequence(batch_word_vectors, batch_first=True),
32 | # 'batch_pos_tags':
33 | # rnn.pad_sequence(batch_pos_tags, batch_first=True),
34 | 'batch_txt_mask':
35 | nn.utils.rnn.pad_sequence(batch_txt_mask, batch_first=True),
36 | 'batch_vis_feats':
37 | nn.utils.rnn.pad_sequence(batch_vis_feats, batch_first=True).float(),
38 | 'batch_vis_mask':
39 | nn.utils.rnn.pad_sequence(batch_vis_mask, batch_first=True).float(),
40 | 'batch_start_label':
41 | nn.utils.rnn.pad_sequence(batch_start_label, batch_first=True).float(),
42 | 'batch_end_label':
43 | nn.utils.rnn.pad_sequence(batch_end_label, batch_first=True).float(),
44 | 'batch_internel_label':
45 | nn.utils.rnn.pad_sequence(batch_internel_label,
46 | batch_first=True).float(),
47 | 'batch_start_frame':
48 | torch.tensor(batch_start_frame).long(),
49 | 'batch_end_frame':
50 | torch.tensor(batch_end_frame).long(),
51 | 'batch_extend_pre':
52 | torch.tensor(batch_extend_pre).long(),
53 | 'batch_extend_suf':
54 | torch.tensor(batch_extend_suf).long(),
55 | "batch_keyword_mask":
56 | nn.utils.rnn.pad_sequence(batch_keyword_mask,
57 | batch_first=True).float(),
58 | "batch_negative_indices":
59 | torch.from_numpy(batch_negative_indices).long(),
60 | "batch_positive_indices":
61 | torch.from_numpy(batch_positive_indices).long(),
62 | "batch_start_time":
63 | torch.tensor(batch_time).float()[:,0],
64 | "batch_end_time":
65 | torch.tensor(batch_time).float()[:,1],
66 | "batch_duration":
67 | torch.tensor(batch_duration).float(),
68 | "visual_scale":
69 | visual_scale
70 |
71 | }
72 |
73 | return batch_data, annotations
74 |
75 |
76 | def average_to_fixed_length(visual_input, num_sample_clips=0):
77 | if num_sample_clips == 0:
78 | num_sample_clips = config.DATASET.NUM_SAMPLE_CLIPS
79 | num_clips = visual_input.shape[0]
80 | idxs = torch.arange(0, num_sample_clips + 1,
81 | 1.0) / num_sample_clips * num_clips
82 | idxs = torch.min(torch.round(idxs).long(), torch.tensor(num_clips - 1))
83 | new_visual_input = []
84 | for i in range(num_sample_clips):
85 | s_idx, e_idx = idxs[i].item(), idxs[i + 1].item()
86 | if s_idx < e_idx:
87 | new_visual_input.append(
88 | torch.mean(visual_input[s_idx:e_idx], dim=0))
89 | else:
90 | new_visual_input.append(visual_input[s_idx])
91 | new_visual_input = torch.stack(new_visual_input, dim=0)
92 | return new_visual_input
93 |
94 |
95 | from datasets.activitynet import ActivityNet
96 | from datasets.charades import Charades
97 | from datasets.charades_len import Charades_len
98 | from datasets.charades_mom import Charades_mom
99 | from datasets.tacos import TACoS
100 |
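A minimal sketch of how these pieces are usually wired together. It assumes an experiment config has been loaded first (the dataset classes read clip counts and paths from core.config) and that the annotation/feature paths in datasets/BaseDataset.py point to your data:

```python
import torch.utils.data as data
from core.config import update_config
from datasets import Charades, collate_fn

update_config('experiments/charades/PGBP.yaml')   # populate the global config first

train_set = Charades('train')                     # annotations built via get_annotation()
train_loader = data.DataLoader(train_set,
                               batch_size=16,
                               shuffle=True,
                               num_workers=4,
                               collate_fn=collate_fn)

batch_data, annotations = next(iter(train_loader))
print(batch_data['batch_vis_feats'].shape)        # (16, max_clip_len, feature_dim)
```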
--------------------------------------------------------------------------------
/core/data_util.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import json
4 | import pickle
5 | import numpy as np
6 | from tqdm import tqdm
7 |
8 |
9 | def load_json(filename):
10 | with open(filename, mode='r', encoding='utf-8') as f:
11 | data = json.load(f)
12 | return data
13 |
14 |
15 | def save_json(data, filename, save_pretty=False, sort_keys=False):
16 | with open(filename, mode='w', encoding='utf-8') as f:
17 | if save_pretty:
18 | f.write(json.dumps(data, indent=4, sort_keys=sort_keys))
19 | else:
20 | json.dump(data, f)
21 |
22 |
23 | def load_lines(filename):
24 | with open(filename, mode='r', encoding='utf-8') as f:
25 | return [e.strip("\n") for e in f.readlines()]
26 |
27 |
28 | def save_lines(data, filename):
29 | with open(filename, mode='w', encoding='utf-8') as f:
30 | f.write("\n".join(data))
31 |
32 |
33 | def load_pickle(filename):
34 | with open(filename, mode='rb') as handle:
35 | data = pickle.load(handle)
36 | return data
37 |
38 |
39 | def save_pickle(data, filename):
40 | with open(filename, mode='wb') as handle:
41 | pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
42 |
43 |
44 | def compute_overlap(pred, gt):
45 | # check format
46 | assert isinstance(pred, list) and isinstance(gt, list)
47 | pred_is_list = isinstance(pred[0], list)
48 | gt_is_list = isinstance(gt[0], list)
49 | pred = pred if pred_is_list else [pred]
50 | gt = gt if gt_is_list else [gt]
51 | # compute overlap
52 | pred, gt = np.array(pred), np.array(gt)
53 | inter_left = np.maximum(pred[:, 0, None], gt[None, :, 0])
54 | inter_right = np.minimum(pred[:, 1, None], gt[None, :, 1])
55 | inter = np.maximum(0.0, inter_right - inter_left)
56 | union_left = np.minimum(pred[:, 0, None], gt[None, :, 0])
57 | union_right = np.maximum(pred[:, 1, None], gt[None, :, 1])
58 | union = np.maximum(1e-12, union_right - union_left)
59 | overlap = 1.0 * inter / union
60 | # reformat output
61 | overlap = overlap if gt_is_list else overlap[:, 0]
62 | overlap = overlap if pred_is_list else overlap[0]
63 | return overlap
64 |
65 |
66 | # def time_to_index(start_time, end_time, num_units, duration):
67 | # s_times = np.arange(0, num_units).astype(
68 | # np.float32) / float(num_units) * duration
69 | # e_times = np.arange(1, num_units + 1).astype(
70 | # np.float32) / float(num_units) * duration
71 | # candidates = np.stack([
72 | # np.repeat(s_times[:, None], repeats=num_units, axis=1),
73 | # np.repeat(e_times[None, :], repeats=num_units, axis=0)
74 | # ],
75 | # axis=2).reshape((-1, 2))
76 | # overlaps = compute_overlap(candidates.tolist(),
77 | # [start_time, end_time]).reshape(
78 | # num_units, num_units)
79 | # start_index = np.argmax(overlaps) // num_units
80 | # end_index = np.argmax(overlaps) % num_units
81 | # return start_index, end_index, overlaps
82 |
83 |
84 | # def index_to_time(start_index, end_index, num_units, extend_pre, extend_suf,
85 | # duration):
86 | # if start_index <= extend_pre:
87 | # start_index = extend_pre
88 | # if end_index <= extend_pre:
89 | # end_index = extend_pre
90 | # s_times = np.arange(0, num_units).astype(
91 | # np.float32) * duration / float(num_units)
92 | # e_times = np.arange(1, num_units + 1).astype(
93 | # np.float32) * duration / float(num_units)
94 | # start_time = s_times[start_index - extend_pre]
95 | # end_time = e_times[end_index - extend_pre]
96 | # return start_time, end_time
97 |
98 | def index_to_time(start_index, end_index, num_units, extend_pre, extend_suf,
99 | duration,pos_index):
100 | p_times = np.arange(0, num_units).astype(
101 | np.float32) * duration / float(num_units)
102 | pos_time = p_times[pos_index - extend_pre]
103 | start_time = start_index * duration
104 | end_time = end_index * duration
105 | return start_time, end_time,pos_time
106 |
107 | def index_to_time1(num_units, extend_pre, extend_suf,
108 | duration,pos_index):
109 | p_times = np.arange(0, num_units).astype(
110 | np.float32) * duration / float(num_units)
111 | pos_time = p_times[pos_index - extend_pre]
112 | return pos_time
113 |
114 | def index_to_time2(start_index, end_index, num_units, extend_pre, extend_suf,
115 | duration):
116 | start_time = start_index * duration
117 | end_time = end_index * duration
118 | return start_time, end_time
119 |
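A small worked example of compute_overlap, the temporal IoU helper above; the numbers are chosen by hand:

```python
from core.data_util import compute_overlap

# one prediction vs. one ground truth: intersection [1, 2] = 1s, union [0, 3] = 3s
print(compute_overlap([0.0, 2.0], [1.0, 3.0]))                  # ~0.3333

# one prediction vs. several ground-truth windows -> a vector of IoUs
print(compute_overlap([0.0, 2.0], [[1.0, 3.0], [0.0, 2.0]]))    # [~0.3333, 1.0]
```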
--------------------------------------------------------------------------------
/models/PGBP/matcher.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2 | """
3 | Modules to compute the matching cost and solve the corresponding LSAP.
4 | """
5 | import torch
6 | from scipy.optimize import linear_sum_assignment
7 | from torch import nn
8 | import torch.nn.functional as F
9 | from .span_utils import generalized_temporal_iou, span_cxw_to_xx
10 |
11 | class HungarianMatcher(nn.Module):
12 | """This class computes an assignment between the targets and the predictions of the network
13 |
14 | For efficiency reasons, the targets don't include the no_object. Because of this, in general,
15 | there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
16 | while the others are un-matched (and thus treated as non-objects).
17 | """
18 | def __init__(self, cost_class: float = 1, cost_span: float = 1, cost_giou: float = 1,
19 | span_loss_type: str = "l1"):
20 | """Creates the matcher
21 |
22 | Params:
23 | cost_span: This is the relative weight of the L1 error of the span coordinates in the matching cost
24 | cost_giou: This is the relative weight of the giou loss of the spans in the matching cost
25 | """
26 | super().__init__()
27 | self.cost_class = cost_class
28 | self.cost_span = cost_span
29 | self.cost_giou = cost_giou
30 | self.span_loss_type = span_loss_type
31 | self.foreground_label = 0
32 | assert cost_class != 0 or cost_span != 0 or cost_giou != 0, "all costs can't be 0"
33 |
34 | @torch.no_grad()
35 | def forward(self, pred_logits,pred_spans, tgt_spans):
36 | """ Performs the matching
37 |
38 | Params:
39 | outputs: This is a dict that contains at least these entries:
40 | "pred_spans": Tensor of dim [batch_size, num_queries, 2] with the predicted span coordinates,
41 | in normalized (cx, w) format
42 | "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
43 |
44 | targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
45 | "spans": Tensor of dim [num_target_spans, 2] containing the target span coordinates. The spans are
46 | in normalized (cx, w) format
47 |
48 | Returns:
49 | A list of size batch_size, containing tuples of (index_i, index_j) where:
50 | - index_i is the indices of the selected predictions (in order)
51 | - index_j is the indices of the corresponding selected targets (in order)
52 | For each batch element, it holds:
53 | len(index_i) = len(index_j) = min(num_queries, num_target_spans)
54 | """
55 | bs, num_queries = pred_spans.shape[:2]
56 | # Also concat the target labels and spans
57 | out_prob = pred_logits.flatten(0, 1).softmax(-1) # [batch_size * num_queries, num_classes]
58 | tgt_ids = torch.full([len(tgt_spans)], self.foreground_label) # [total #spans in the batch]
59 | # Compute the classification cost. Contrary to the loss, we don't use the NLL,
60 | # but approximate it in 1 - prob[target class].
61 | # The 1 is a constant that doesn't change the matching, it can be omitted.
62 | cost_class = -out_prob[:, tgt_ids] # [batch_size * num_queries, total #spans in the batch]
63 |
64 | if self.span_loss_type == "l1":
65 | # We flatten to compute the cost matrices in a batch
66 | out_spans = pred_spans.flatten(0, 1) # [batch_size * num_queries, 2]
67 |
68 | # Compute the L1 cost between spans
69 | cost_span = torch.cdist(out_spans, tgt_spans, p=1) # [batch_size * num_queries, total #spans in the batch]
70 |
71 | # Compute the giou cost between spans
72 | # [batch_size * num_queries, total #spans in the batch]
73 | cost_giou = - generalized_temporal_iou(out_spans,tgt_spans)
74 | # Final cost matrix
75 | # import ipdb; ipdb.set_trace()
76 | C = self.cost_span * cost_span + self.cost_giou * cost_giou + self.cost_class * cost_class
77 | C = C.view(bs, num_queries, -1).cpu()
78 |
79 | sizes = [1]*bs
80 | indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
81 | return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
82 |
83 |
84 | def build_matcher(args):
85 | return HungarianMatcher(
86 | cost_span=args.set_cost_span, cost_giou=args.set_cost_giou,
87 | cost_class=args.set_cost_class, span_loss_type=args.span_loss_type
88 | )
89 |
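A minimal sketch of calling the matcher directly on toy tensors. It assumes spans are given in normalized [st, ed] format (the GIoU term asserts ed >= st) and exactly one ground-truth span per batch element, which is what the `sizes = [1] * bs` split encodes; the cost weights mirror the experiment configs:

```python
import torch
from models.PGBP.matcher import HungarianMatcher

matcher = HungarianMatcher(cost_class=4.0, cost_span=10.0, cost_giou=1.0)

bs, num_queries, num_classes = 2, 3, 2           # class 0 = foreground
pred_logits = torch.randn(bs, num_queries, num_classes)
pred_spans = torch.tensor([[[0.10, 0.30], [0.40, 0.60], [0.20, 0.80]],
                           [[0.00, 0.50], [0.50, 1.00], [0.30, 0.40]]])
tgt_spans = torch.tensor([[0.20, 0.70],          # ground-truth span for sample 0
                          [0.40, 0.90]])         # ground-truth span for sample 1

indices = matcher(pred_logits, pred_spans, tgt_spans)
print(indices)   # one (pred_idx, tgt_idx) pair per sample, e.g. [(tensor([2]), tensor([0])), ...]
```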
--------------------------------------------------------------------------------
/models/PGBP/triplet_loss.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def pairwise_distances(embeddings, squared=False):
5 | """
6 | ||a - b||^2 = ||a||^2 - 2 <a, b> + ||b||^2
7 | """
8 | # get dot product (batch_size, batch_size)
9 | dot_product = embeddings.mm(embeddings.t())
10 |
11 | # a vector
12 | square_sum = dot_product.diag()
13 |
14 | distances = square_sum.unsqueeze(
15 | 1) - 2 * dot_product + square_sum.unsqueeze(0)
16 |
17 | distances = distances.clamp(min=1e-16)
18 |
19 | if not squared:
20 | epsilon = 1e-16
21 | mask = torch.eq(distances, 0).float()
22 | distances += (mask + epsilon)
23 | distances = torch.sqrt(distances)
24 | distances *= (1 - mask)
25 |
26 | return distances
27 |
28 |
29 | def get_valid_positive_mask(labels):
30 | """
31 | To be a valid positive pair (a,p),
32 | - a and p are different embeddings
33 | - a and p have the same label
34 | """
35 | indices_equal = torch.eye(labels.size(0), dtype=torch.bool, device=labels.device)
36 | indices_not_equal = ~indices_equal
37 |
38 | label_equal = torch.eq(labels.unsqueeze(1), labels.unsqueeze(0))
39 |
40 | mask = indices_not_equal & label_equal
41 | return mask
42 |
43 |
44 | def get_valid_negative_mask(labels):
45 | """
46 | To be a valid negative pair (a,n),
47 | - a and n are different embeddings
48 | - a and n have different labels
49 | """
50 | indices_equal = torch.eye(labels.size(0), dtype=torch.bool, device=labels.device)
51 | indices_not_equal = ~indices_equal
52 |
53 | label_not_equal = torch.ne(labels.unsqueeze(1), labels.unsqueeze(0))
54 |
55 | mask = indices_not_equal & label_not_equal
56 | return mask
57 |
58 |
59 | def get_valid_triplets_mask(labels):
60 | """
61 | To be valid, a triplet (a,p,n) has to satisfy:
62 | - a,p,n are distinct embeddings
63 | - a and p have the same label, while a and n have different label
64 | """
65 | indices_equal = torch.eye(labels.size(0), dtype=torch.bool, device=labels.device)
66 | indices_not_equal = ~indices_equal
67 | i_ne_j = indices_not_equal.unsqueeze(2)
68 | i_ne_k = indices_not_equal.unsqueeze(1)
69 | j_ne_k = indices_not_equal.unsqueeze(0)
70 | distinct_indices = i_ne_j & i_ne_k & j_ne_k
71 |
72 | label_equal = torch.eq(labels.unsqueeze(1), labels.unsqueeze(0))
73 | i_eq_j = label_equal.unsqueeze(2)
74 | i_eq_k = label_equal.unsqueeze(1)
75 | i_ne_k = ~i_eq_k
76 | valid_labels = i_eq_j & i_ne_k
77 |
78 | mask = distinct_indices & valid_labels
79 | return mask
80 |
81 |
82 | def batch_all_triplet_loss(labels, embeddings, margin, squared=False):
83 | """
84 | get triplet loss for all valid triplets and average over those triplets whose loss is positive.
85 | """
86 |
87 | distances = pairwise_distances(embeddings, squared=squared)
88 |
89 | anchor_positive_dist = distances.unsqueeze(2)
90 | anchor_negative_dist = distances.unsqueeze(1)
91 | triplet_loss = anchor_positive_dist - anchor_negative_dist + margin
92 |
93 | # get a 3D mask to filter out invalid triplets
94 | mask = get_valid_triplets_mask(labels)
95 |
96 | triplet_loss = triplet_loss * mask.float()
97 | triplet_loss.clamp_(min=0)
98 |
99 | # count the number of positive triplets
100 | epsilon = 1e-16
101 | num_positive_triplets = (triplet_loss > 0).float().sum()
102 | num_valid_triplets = mask.float().sum()
103 | fraction_positive_triplets = num_positive_triplets / (num_valid_triplets +
104 | epsilon)
105 |
106 | triplet_loss = triplet_loss.sum() / (num_positive_triplets + epsilon)
107 |
108 | return triplet_loss, fraction_positive_triplets
109 |
110 |
111 | def batch_hard_triplet_loss(labels, embeddings, margin, squared=False):
112 | """
113 | - compute distance matrix
114 | - for each anchor a0, find the (a0,p0) pair with greatest distance s.t. a0 and p0 have the same label
115 | - for each anchor a0, find the (a0,n0) pair with smallest distance s.t. a0 and n0 have different label
116 | - compute triplet loss for each triplet (a0, p0, n0), average them
117 | """
118 | distances = pairwise_distances(embeddings, squared=squared)
119 |
120 | mask_positive = get_valid_positive_mask(labels)
121 | hardest_positive_dist = (distances * mask_positive.float()).max(dim=1)[0]
122 |
123 | mask_negative = get_valid_negative_mask(labels)
124 | max_negative_dist = distances.max(dim=1, keepdim=True)[0]
125 | distances = distances + max_negative_dist * (~mask_negative).float()
126 | hardest_negative_dist = distances.min(dim=1)[0]
127 |
128 | triplet_loss = (hardest_positive_dist - hardest_negative_dist +
129 | margin).clamp(min=0)
130 | triplet_loss = triplet_loss.mean()
131 |
132 | return triplet_loss
133 |
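A minimal sketch of both triplet-loss variants on random embeddings; the labels mark which embeddings should be pulled together, and all values are made up:

```python
import torch
from models.PGBP.triplet_loss import batch_all_triplet_loss, batch_hard_triplet_loss

torch.manual_seed(0)
embeddings = torch.randn(8, 16)                      # 8 samples, 16-d embeddings
labels = torch.tensor([0, 0, 1, 1, 2, 2, 3, 3])      # 4 classes, 2 samples each

loss_all, frac_positive = batch_all_triplet_loss(labels, embeddings, margin=0.3)
loss_hard = batch_hard_triplet_loss(labels, embeddings, margin=0.3)
print(loss_all.item(), frac_positive.item(), loss_hard.item())
```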
--------------------------------------------------------------------------------
/models/PGBP/layers.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import numpy as np
5 | import math
6 |
7 | from .operation import Conv1D, mask_logits
8 |
9 |
10 | class PositionEmbeddingSine(nn.Module):
11 | """
12 | This is a more standard version of the position embedding, very similar to the one
13 | used in the Attention Is All You Need paper, generalized to work on images and adapted here to 1D sequences.
14 | """
15 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
16 | super().__init__()
17 | self.num_pos_feats = num_pos_feats
18 | self.temperature = temperature
19 | self.normalize = normalize
20 | if scale is not None and normalize is False:
21 | raise ValueError("normalize should be True if scale is passed")
22 | if scale is None:
23 | scale = 2 * math.pi
24 | self.scale = scale
25 |
26 | def forward(self, x, mask):
27 | """
28 | Args:
29 | x: torch.tensor, (batch_size, L, d)
30 | mask: torch.tensor, (batch_size, L), with 1 as valid
31 |
32 | Returns:
33 |
34 | """
35 | assert mask is not None
36 | x_embed = mask.cumsum(1, dtype=torch.float32) # (bsz, L)
37 | if self.normalize:
38 | eps = 1e-6
39 | x_embed = x_embed / (x_embed[:, -1:] + eps) * self.scale
40 |
41 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
42 | # dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
43 | dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode='trunc') / self.num_pos_feats)
44 | pos_x = x_embed[:, :, None] / dim_t # (bsz, L, num_pos_feats)
45 | pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) # (bsz, L, num_pos_feats)
46 | # import ipdb; ipdb.set_trace()
47 | return pos_x # .permute(0, 2, 1) # (bsz, num_pos_feats, L)
48 |
49 | class TransformerPositionalEmbedding(nn.Module):
50 | def __init__(self, dim, max_len=512):
51 | super().__init__()
52 |
53 | # Compute the positional encodings once in log space.
54 | pe = torch.zeros(max_len, dim).float()
55 | pe.requires_grad = False
56 |
57 | position = torch.arange(0, max_len).float().unsqueeze(1)
58 | div_term = (torch.arange(0, dim, 2).float() *
59 | -(math.log(10000.0) / dim)).exp()
60 |
61 | pe[:, 0::2] = torch.sin(position * div_term)
62 | pe[:, 1::2] = torch.cos(position * div_term)
63 |
64 | pe = pe.unsqueeze(0)
65 | self.register_buffer('pe', pe)
66 |
67 | def forward(self, x):
68 | return self.pe[:, :x.size(1)]
69 |
70 |
71 | class PositionalEmbedding(nn.Module):
72 | """Construct the embeddings from word, position and token_type embeddings."""
73 | def __init__(self, embedding_dim, num_embeddings):
74 | super(PositionalEmbedding, self).__init__()
75 | self.position_embeddings = nn.Embedding(num_embeddings, embedding_dim)
76 |
77 | def forward(self, inputs):
78 | bsz, seq_length = inputs.shape[:2]
79 | position_ids = torch.arange(seq_length,
80 | dtype=torch.long,
81 | device=inputs.device)
82 | position_ids = position_ids.unsqueeze(0).repeat(bsz, 1) # (N, L)
83 | position_embeddings = self.position_embeddings(position_ids)
84 | return position_embeddings
85 |
86 |
87 | class Projection(nn.Module):
88 | def __init__(self, in_dim, dim, drop_rate=0.0):
89 | super(Projection, self).__init__()
90 | self.drop = nn.Dropout(p=drop_rate)
91 | self.projection = Conv1D(in_dim=in_dim,
92 | out_dim=dim,
93 | kernel_size=1,
94 | stride=1,
95 | bias=True,
96 | padding=0)
97 | self.layer_norm = nn.LayerNorm(dim, eps=1e-6)
98 |
99 | def forward(self, input_features):
100 | # the input feature with shape (batch_size, seq_len, in_dim)
101 | input_features = self.drop(input_features)
102 | output = self.projection(input_features) # (batch_size, seq_len, dim)
103 | output = self.layer_norm(output)
104 | return output
105 |
106 |
107 | class Prediction(nn.Module):
108 | def __init__(self, in_dim, hidden_dim, out_dim, drop_rate=0.):
109 | super(Prediction, self).__init__()
110 | self.fc1 = Conv1D(in_dim=in_dim,
111 | out_dim=hidden_dim,
112 | kernel_size=1,
113 | stride=1,
114 | padding=0,
115 | bias=True)
116 | self.dropout = nn.Dropout(p=drop_rate)
117 | self.fc2 = Conv1D(in_dim=hidden_dim,
118 | out_dim=out_dim,
119 | kernel_size=1,
120 | stride=1,
121 | padding=0,
122 | bias=True)
123 |
124 | def forward(self, input_feature):
125 | output = self.fc1(input_feature)
126 | output = F.gelu(output)
127 | output = self.dropout(output)
128 | output = self.fc2(output)
129 | return output
130 |
131 | class MLP(nn.Module):
132 |
133 | def __init__(self, dims, dropout=0.1) -> None:
134 | super().__init__()
135 | # assert num_layers > 1, "this class is intended for multiple linear layers"
136 | # dims = dims
137 | num_layers = len(dims) - 1
138 | self.layers = nn.ModuleList([nn.Linear(dims[i], dims[i + 1]) for i in range(num_layers)])
139 | self.do = nn.Dropout(dropout)
140 |
141 | def forward(self, x):
142 | for idx, layer in enumerate(self.layers):
143 | x = layer(x)
144 | if idx != len(self.layers) - 1:
145 | x = F.gelu(x)
146 | x = self.do(x)
147 | return x
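A minimal sketch exercising a few of the building blocks above. The shapes follow the dims used in the experiment configs (1024-d video features projected to a 512-d model dim), but the way the pieces are combined here is an assumption, not the model's actual forward pass:

```python
import torch
from models.PGBP.layers import PositionEmbeddingSine, Projection, Prediction

pos_enc = PositionEmbeddingSine(num_pos_feats=512, normalize=True)
proj = Projection(in_dim=1024, dim=512, drop_rate=0.1)
head = Prediction(in_dim=512, hidden_dim=256, out_dim=1, drop_rate=0.1)

video_feats = torch.randn(2, 64, 1024)     # (batch, clips, feature_dim)
vis_mask = torch.ones(2, 64)               # 1 = valid clip

video_emb = proj(video_feats)              # (2, 64, 512)
pos = pos_enc(video_emb, vis_mask)         # (2, 64, 512) sinusoidal positions
scores = head(video_emb + pos)             # (2, 64, 1) per-clip logits
```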
--------------------------------------------------------------------------------
/models/PGBP/encoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | from .operation import Conv1D
5 |
6 |
7 | class LSTMEncoder(nn.Module):
8 | def __init__(self,
9 | in_dim,
10 | out_dim,
11 | num_layers,
12 | bi_direction=False,
13 | drop_rate=0.0):
14 | super(LSTMEncoder, self).__init__()
15 |
16 | self.layers_norm1 = nn.LayerNorm(in_dim, eps=1e-6)
17 | self.layers_norm2 = nn.LayerNorm(out_dim, eps=1e-6)
18 |
19 | self.dropout = nn.Dropout(p=drop_rate)
20 | self.encoder = nn.LSTM(in_dim,
21 | out_dim // 2 if bi_direction else out_dim,
22 | num_layers=num_layers,
23 | bidirectional=bi_direction,
24 | dropout=drop_rate,
25 | batch_first=True)
26 |
27 | self.linear = Conv1D(in_dim=out_dim,
28 | out_dim=out_dim,
29 | kernel_size=1,
30 | stride=1,
31 | bias=True,
32 | padding=0)
33 |
34 | def forward(self, input_feature):
35 | input_feature = self.layers_norm1(input_feature)
36 | output, _ = self.encoder(input_feature)
37 | output = self.layers_norm2(output)
38 | output = self.dropout(output)
39 | output = self.linear(output)
40 | return output
41 |
42 |
43 | class MultiStepLSTMEncoder(nn.Module):
44 | def __init__(self,
45 | in_dim,
46 | out_dim,
47 | num_layers,
48 | num_step=1,
49 | bi_direction=False,
50 | drop_rate=0.0):
51 | super(MultiStepLSTMEncoder, self).__init__()
52 |
53 | self.num_step = num_step
54 | self.out_dim = out_dim
55 | self.layers_norm = nn.LayerNorm(in_dim, eps=1e-6)
56 |
57 | self.dropout = nn.Dropout(p=drop_rate)
58 |
59 | self.encoder = nn.ModuleList([
60 | nn.LSTM(in_dim,
61 | out_dim // 2 if bi_direction else out_dim,
62 | num_layers=num_layers,
63 | bidirectional=bi_direction,
64 | dropout=drop_rate,
65 | batch_first=True) for _ in range(num_step)
66 | ])
67 | self.linear = Conv1D(in_dim=int(num_step * out_dim),
68 | out_dim=out_dim,
69 | kernel_size=1,
70 | stride=1,
71 | bias=True,
72 | padding=0)
73 |
74 | def forward(self, input_feature):
75 | input_feature = self.layers_norm(input_feature)
76 | B, seq_len, _ = input_feature.shape
77 | # assert seq_len % self.num_step == 0, "length of sequence({}) must be divisible by num_step({})".format(
78 | # seq_len, self.num_step)
79 | output = []
80 | for i in range(self.num_step):
81 | encoder_i = self.encoder[i]
82 | output_i = input_feature.new_zeros([B, seq_len, self.out_dim])
83 | input_i_len = (seq_len // (i + 1)) * (i + 1)
84 | for j in range(i + 1):
85 | input_j = input_feature[:, j:input_i_len:(i + 1), :]
86 | output_j, _ = encoder_i(input_j)
87 | output_i[:, j:input_i_len:(i + 1), :] = output_j
88 | output_i = self.dropout(output_i)
89 | output.append(output_i)
90 | output = torch.cat(output, dim=2)
91 | output = self.linear(output)
92 | return output
93 |
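# Illustrative note on the multi-step scheme above: for step i the same LSTM is run over
# (i + 1) interleaved sub-sequences. With seq_len = 6 and i = 1, the two sub-sequences are
# positions [0, 2, 4] and [1, 3, 5]; their outputs are written back to those positions of
# output_i, so each step keeps the full (B, seq_len, out_dim) shape before the num_step
# outputs are concatenated and projected by the final Conv1D.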
94 | class TemporalContextModule(nn.Module):
95 | def __init__(self, in_dim, out_dim, kernels=[3], drop_rate=0.):
96 | super(TemporalContextModule, self).__init__()
97 | self.dropout = nn.Dropout(p=drop_rate)
98 | self.temporal_convs = nn.ModuleList([
99 | Conv1D(in_dim=in_dim,
100 | out_dim=out_dim,
101 | kernel_size=s,
102 | stride=1,
103 | padding=s // 2,
104 | bias=True) for s in kernels
105 | ])
106 | self.out_layer = Conv1D(in_dim=out_dim * len(kernels),
107 | out_dim=out_dim,
108 | kernel_size=1,
109 | stride=1,
110 | padding=0,
111 | bias=True)
112 |
113 | def forward(self, input_feature):
114 | intermediate = []
115 | for layer in self.temporal_convs:
116 | intermediate.append(layer(input_feature))
117 | intermediate = torch.cat(intermediate, dim=-1)
118 | out = self.out_layer(intermediate)
119 | return out
120 |
121 |
122 | class MultiStepGRUEncoder(nn.Module):
123 | def __init__(self,
124 | in_dim,
125 | out_dim,
126 | num_layers,
127 | num_step=1,
128 | bi_direction=False,
129 | drop_rate=0.0):
130 | super(MultiStepGRUEncoder, self).__init__()
131 |
132 | self.num_step = num_step
133 | self.out_dim = out_dim
134 | self.layers_norm = nn.LayerNorm(in_dim, eps=1e-6)
135 |
136 | self.dropout = nn.Dropout(p=drop_rate)
137 |
138 | self.encoder = nn.ModuleList([
139 | nn.GRU(in_dim,
140 | out_dim // 2 if bi_direction else out_dim,
141 | num_layers=num_layers,
142 | bidirectional=bi_direction,
143 | dropout=drop_rate,
144 | batch_first=True) for _ in range(num_step)
145 | ])
146 | self.linear = Conv1D(in_dim=int(num_step * out_dim),
147 | out_dim=out_dim,
148 | kernel_size=1,
149 | stride=1,
150 | bias=True,
151 | padding=0)
152 |
153 | def forward(self, input_feature):
154 | input_feature = self.layers_norm(input_feature)
155 | B, seq_len, _ = input_feature.shape
156 | # assert seq_len % self.num_step == 0, "length of sequence({}) must be divisible by num_step({})".format(
157 | # seq_len, self.num_step)
158 | output = []
159 | for i in range(self.num_step):
160 | encoder_i = self.encoder[i]
161 | output_i = input_feature.new_zeros([B, seq_len, self.out_dim])
162 | input_i_len = (seq_len // (i + 1)) * (i + 1)
163 | for j in range(i + 1):
164 | input_j = input_feature[:, j:input_i_len:(i + 1), :]
165 | output_j, _ = encoder_i(input_j)
166 | output_i[:, j:input_i_len:(i + 1), :] = output_j
167 | output_i = self.dropout(output_i)
168 | output.append(output_i)
169 | output = torch.cat(output, dim=2)
170 | output = self.linear(output)
171 | return output
--------------------------------------------------------------------------------
/core/runner_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import random
4 | import numpy as np
5 | import torch
6 | from torch.cuda.profiler import start
7 | import torch.utils.data
8 | import torch.backends.cudnn
9 | from tqdm import tqdm
10 | from prettytable import PrettyTable
11 |
12 | from .data_util import index_to_time,index_to_time1,index_to_time2
13 |
14 |
15 | def set_th_config(seed):
16 | random.seed(seed)
17 | np.random.seed(seed)
18 | torch.manual_seed(seed)
19 | torch.cuda.manual_seed(seed)
20 | torch.cuda.manual_seed_all(seed)
21 | torch.backends.cudnn.benchmark = False
22 | torch.backends.cudnn.deterministic = True
23 | torch.backends.cudnn.enabled = True
24 |
25 |
26 | def dcor(x, y):
27 | m, _ = x.shape
28 | assert len(x.shape) == 2
29 | assert len(y.shape) == 2
30 |
31 | dx = pairwise_dist(x)
32 | dy = pairwise_dist(y)
33 |
34 | dx_m = dx - dx.mean(dim=0)[None, :] - dx.mean(dim=1)[:, None] + dx.mean()
35 | dy_m = dy - dy.mean(dim=0)[None, :] - dy.mean(dim=1)[:, None] + dy.mean()
36 |
37 | dcov2_xy = (dx_m * dy_m).sum() / float(m * m)
38 | dcov2_xx = (dx_m * dx_m).sum() / float(m * m)
39 | dcov2_yy = (dy_m * dy_m).sum() / float(m * m)
40 |
41 | dcor = torch.sqrt(dcov2_xy) / torch.sqrt(
42 | (torch.sqrt(dcov2_xx) * torch.sqrt(dcov2_yy)).clamp(min=0) + 1e-10)
43 |
44 | return dcor
45 |
46 |
47 | def pairwise_dist(x):
48 | #x should be two dimensional
49 | instances_norm = torch.sum(x**2, -1).reshape((-1, 1))
50 | output = -2 * torch.mm(x, x.t()) + instances_norm + instances_norm.t()
51 | return torch.sqrt(output.clamp(min=0) + 1e-10)
52 |
53 |
54 | def filter_checkpoints(model_dir, suffix='t7', max_to_keep=5):
55 | model_paths = glob.glob(os.path.join(model_dir, '*.{}'.format(suffix)))
56 | if len(model_paths) > max_to_keep:
57 | model_file_dict = dict()
58 | suffix_len = len(suffix) + 1
59 | for model_path in model_paths:
60 | step = int(
61 | os.path.basename(model_path).split('_')[1][0:-suffix_len])
62 | model_file_dict[step] = model_path
63 | sorted_tuples = sorted(model_file_dict.items())
64 | unused_tuples = sorted_tuples[0:-max_to_keep]
65 | for _, model_path in unused_tuples:
66 | os.remove(model_path)
67 |
68 |
69 | def get_last_checkpoint(model_dir, suffix='t7'):
70 | model_filenames = glob.glob(os.path.join(model_dir, '*.{}'.format(suffix)))
71 | model_file_dict = dict()
72 | suffix_len = len(suffix) + 1
73 | for model_filename in model_filenames:
74 | step = int(
75 | os.path.basename(model_filename).split('_')[1][0:-suffix_len])
76 | model_file_dict[step] = model_filename
77 | sorted_tuples = sorted(model_file_dict.items())
78 | last_checkpoint = sorted_tuples[-1]
79 | return last_checkpoint[1]
80 |
81 |
82 | def convert_length_to_mask(lengths):
83 | max_len = lengths.max().item()
84 | mask = torch.arange(max_len, device=lengths.device).expand(
85 | lengths.size()[0], max_len) < lengths.unsqueeze(1)
86 | mask = mask.float()
87 | return mask
88 |
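# Illustrative example: convert_length_to_mask(torch.tensor([2, 4])) returns
# tensor([[1., 1., 0., 0.],
#         [1., 1., 1., 1.]])
# i.e. a (batch_size, max_len) float mask with ones over the valid positions.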
89 |
90 | def calculate_iou_accuracy(ious, threshold):
91 | total_size = float(len(ious))
92 | count = 0
93 | for iou in ious:
94 | if iou >= threshold:
95 | count += 1
96 | return float(count) / total_size * 100.0
97 |
98 |
99 | def calculate_iou(i0, i1):
100 | union = (min(i0[0], i1[0]), max(i0[1], i1[1]))
101 | inter = (max(i0[0], i1[0]), min(i0[1], i1[1]))
102 | iou = 1.0 * (inter[1] - inter[0]) / (union[1] - union[0])
103 | return max(0.0,iou)
104 |
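# Worked example (illustrative numbers): calculate_iou([2.0, 5.0], [3.0, 7.0])
# union = (2.0, 7.0), inter = (3.0, 5.0), iou = (5.0 - 3.0) / (7.0 - 2.0) = 0.4;
# disjoint intervals yield a negative ratio, which max(0.0, iou) clips to 0.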
105 |
106 | def cal_statistics(preds, durations):
107 | start_fre = [0] * 10
108 | end_fre = [0] * 10
109 | duration_fre = [0] * 10
110 | start_end_fre = [[0] * 10 for _ in range(10)]
111 | tb = PrettyTable()
112 | tb.field_names = [
113 | "type", "0.1", "0.2", "0.3", "0.4", "0.5", "0.6", "0.7", "0.8", "0.9",
114 | "1.0"
115 | ]
116 | for pred, duration in zip(preds, durations):
117 | start_f = int(pred[0] / duration * 10)
118 | end_f = min(int(pred[1] / duration * 10), 9)
119 | duration_f = min(int((pred[1] - pred[0]) / duration * 10), 9)
120 | start_fre[start_f] += 1
121 | end_fre[end_f] += 1
122 | duration_fre[duration_f] += 1
123 | start_end_fre[start_f][end_f] += 1
124 | assert len(preds) == len(durations)
125 | all_len = len(durations)
126 | for i in range(10):
127 | start_fre[i] /= all_len
128 | end_fre[i] /= all_len
129 | duration_fre[i] /= all_len
130 | for j in range(10):
131 | start_end_fre[i][j] /= all_len
132 | start_end_fre[i][j] = "{:.6f}".format(start_end_fre[i][j])
133 | start_fre = ["{:.6f}".format(s) for s in start_fre]
134 | end_fre = ["{:.6f}".format(s) for s in end_fre]
135 | duration_fre = ["{:.6f}".format(s) for s in duration_fre]
136 | tb.add_row(["start_fre"] + start_fre)
137 | tb.add_row(["end_fre"] + end_fre)
138 | tb.add_row(["duration_fre"] + duration_fre)
139 | tb.add_row(["--"] * 11)
140 | for i in range(10):
141 | tb.add_row([str((i + 1) / 10)] + start_end_fre[i])
142 | return tb.get_string()
143 |
144 |
145 | def eval_test(model,
146 | data_loader,
147 | device,
148 | mode='test',
149 | epoch=None,
150 | global_step=None):
151 | ious = []
152 | with torch.no_grad():
153 | for idx, batch_data in tqdm(enumerate(data_loader),
154 | total=len(data_loader),
155 | desc='evaluate {}'.format(mode)):
156 | data, annos = batch_data
157 | batch_word_vectors = data['batch_word_vectors'].to(device)
158 | batch_txt_mask = data['batch_txt_mask'].squeeze().to(device)
159 | batch_vis_feats = data['batch_vis_feats'].to(device)
160 | batch_vis_mask = data['batch_vis_mask'].squeeze().to(device)
161 |
162 | # compute predicted results
163 | _, start_logits, end_logits = model(batch_word_vectors,
164 | batch_txt_mask,
165 | batch_vis_feats,
166 | batch_vis_mask)
167 | start_indices, end_indices = model.extract_index(
168 | start_logits, end_logits)
169 | start_indices = start_indices.cpu().numpy()
170 | end_indices = end_indices.cpu().numpy()
171 | batch_vis_mask = batch_vis_mask.cpu().numpy()
172 | for vis_mask, start_index, end_index, anno in zip(
173 | batch_vis_mask, start_indices, end_indices, annos):
174 | start_time, end_time = index_to_time(start_index, end_index,
175 | vis_mask.sum(),
176 | anno["duration"])
177 | iou = calculate_iou(i0=[start_time, end_time],
178 | i1=anno['times'])
179 | ious.append(iou)
180 | r1i3 = calculate_iou_accuracy(ious, threshold=0.3)
181 | r1i5 = calculate_iou_accuracy(ious, threshold=0.5)
182 | r1i7 = calculate_iou_accuracy(ious, threshold=0.7)
183 | mi = np.mean(ious) * 100.0
184 | # write the scores
185 | score_str = "Epoch {}, Step {}:\n".format(epoch, global_step)
186 | score_str += "Rank@1, IoU=0.3: {:.2f}\t".format(r1i3)
187 | score_str += "Rank@1, IoU=0.5: {:.2f}\t".format(r1i5)
188 | score_str += "Rank@1, IoU=0.7: {:.2f}\t".format(r1i7)
189 | score_str += "mean IoU: {:.2f}\n".format(mi)
190 | return r1i3, r1i5, r1i7, mi, score_str
--------------------------------------------------------------------------------
/models/PGBP/fusion.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch.nn.modules.container import ModuleList
5 | import numpy as np
6 | import math
7 | from .attention import TemporalMaxer,Cross_Attention,MultiHeadAttention
8 | from .operation import Conv1D, mask_logits
9 |
10 |
11 | class CQFusion(nn.Module):
12 | def __init__(self, configs, drop_rate=0.0):
13 | dim = configs.dim
14 | super(CQFusion, self).__init__()
15 | w4C = torch.empty(dim, 1)
16 | w4Q = torch.empty(dim, 1)
17 | w4mlu = torch.empty(1, 1, dim)
18 | nn.init.xavier_uniform_(w4C)
19 | nn.init.xavier_uniform_(w4Q)
20 | nn.init.xavier_uniform_(w4mlu)
21 | self.w4C = nn.Parameter(w4C, requires_grad=True)
22 | self.w4Q = nn.Parameter(w4Q, requires_grad=True)
23 | self.w4mlu = nn.Parameter(w4mlu, requires_grad=True)
24 | self.dropout = nn.Dropout(p=drop_rate)
25 | self.cqa_linear = Conv1D(in_dim=4 * dim,
26 | out_dim=dim,
27 | kernel_size=1,
28 | stride=1,
29 | padding=0,
30 | bias=True)
31 |
32 | def forward(self, context, query, c_mask, q_mask):
33 | score = self.trilinear_attention(
34 | context, query) # (batch_size, c_seq_len, q_seq_len)
35 | score_ = torch.softmax(mask_logits(score, q_mask.unsqueeze(1)),
36 | dim=2) # (batch_size, c_seq_len, q_seq_len)
37 | score_t = torch.softmax(mask_logits(score, c_mask.unsqueeze(2)),
38 | dim=1) # (batch_size, c_seq_len, q_seq_len)
39 | score_t = score_t.transpose(1, 2) # (batch_size, q_seq_len, c_seq_len)
40 | c2q = torch.matmul(score_, query) # (batch_size, c_seq_len, dim)
41 | q2c = torch.matmul(torch.matmul(score_, score_t),
42 | context) # (batch_size, c_seq_len, dim)
43 | output = torch.cat(
44 | [context, c2q,
45 | torch.mul(context, c2q),
46 | torch.mul(context, q2c)],
47 | dim=2)
48 | output = self.cqa_linear(output) # (batch_size, c_seq_len, dim)
49 | return output * c_mask.unsqueeze(2)
50 |
51 | def trilinear_attention(self, context, query):
52 | batch_size, c_seq_len, dim = context.shape
53 | batch_size, q_seq_len, dim = query.shape
54 | context = self.dropout(context)
55 | query = self.dropout(query)
56 | subres0 = torch.matmul(context, self.w4C).expand(
57 | [-1, -1, q_seq_len]) # (batch_size, c_seq_len, q_seq_len)
58 | subres1 = torch.matmul(query, self.w4Q).transpose(1, 2).expand(
59 | [-1, c_seq_len, -1])
60 | subres2 = torch.matmul(context * self.w4mlu, query.transpose(1, 2))
61 | res = subres0 + subres1 + subres2 # (batch_size, c_seq_len, q_seq_len)
62 | return res
63 |
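# Illustrative note: the trilinear score decomposes as
# score[b, i, j] = <context[b, i], w4C> + <query[b, j], w4Q> + <context[b, i] * query[b, j], w4mlu>,
# and subres0 / subres1 / subres2 above are exactly these three terms broadcast to
# (batch_size, c_seq_len, q_seq_len).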
64 | class multiscale_Fusion(nn.Module):
65 | def __init__(self, configs):
66 | super(multiscale_Fusion, self).__init__()
67 | self.branch = nn.ModuleList()
68 | self.fusion = nn.ModuleList()
69 | self.fusion.append(Cross_Attention(configs))
70 | self.MULTI_SCALE = configs.MULTI_SCALE
71 | if configs.MULTI_SCALE == True:
72 | for idx in range(configs.MULTI_SCALE_LEN):
73 | self.branch.append(TemporalMaxer(kernel_size=3,
74 | stride=2,
75 | padding=1,
76 | n_embd=configs.dim))
77 | self.fusion.append(Cross_Attention(configs))
78 | self.attention = MultiHeadAttention(configs)
79 |
80 | def forward(self, context, query, c_mask, q_mask):
81 | b,l,d = context.shape
82 | fusion = self.fusion[0](context,query,c_mask,q_mask)
83 | if self.MULTI_SCALE == True:
84 | for i in range(len(self.branch)):
85 | if i == 0:
86 | multi_feature,multi_feature_mask = self.branch[i](context,c_mask)
87 | else:
88 | multi_feature,multi_feature_mask = self.branch[i](multi_feature,multi_feature_mask)
89 | multi_fusion = self.fusion[i+1](multi_feature,query,multi_feature_mask,q_mask)
90 | fusion = torch.cat((fusion,multi_fusion),dim = 1)
91 | c_mask = torch.cat((c_mask,multi_feature_mask),dim = 1)
92 | fusion = self.attention(fusion,c_mask)
93 | fusion = fusion[:,:l,:]
94 | c_mask = c_mask[:,:l]
95 | return fusion
96 |
97 |
98 | class multiscale_CQFusion(nn.Module):
99 | def __init__(self, configs):
100 | super(multiscale_CQFusion, self).__init__()
101 | self.branch = nn.ModuleList()
102 | self.fusion = nn.ModuleList()
103 | self.fusion.append(CQFusion(configs))
104 | self.MULTI_SCALE = configs.MULTI_SCALE
105 | if configs.MULTI_SCALE == True:
106 | for idx in range(configs.MULTI_SCALE_LEN):
107 | self.branch.append(TemporalMaxer(kernel_size=3,
108 | stride=2,
109 | padding=1,
110 | n_embd=configs.dim))
111 | self.fusion.append(CQFusion(configs))
112 | self.attention = MultiHeadAttention(configs)
113 |
114 | def forward(self, context, query, c_mask, q_mask):
115 | b,l,d = context.shape
116 | fusion = self.fusion[0](context,query,c_mask,q_mask)
117 | if self.MULTI_SCALE == True:
118 | for i in range(len(self.branch)):
119 | if i == 0:
120 | multi_feature,multi_feature_mask = self.branch[i](context,c_mask)
121 | else:
122 | multi_feature,multi_feature_mask = self.branch[i](multi_feature,multi_feature_mask)
123 | multi_fusion = self.fusion[i+1](multi_feature,query,multi_feature_mask,q_mask)
124 | # modified
125 | # fusion = self.muti_fuse[i](fusion,multi_fusion,multi_feature_mask)
126 | fusion = torch.cat((fusion,multi_fusion),dim = 1)
127 | c_mask = torch.cat((c_mask,multi_feature_mask),dim = 1)
128 | fusion = self.attention(fusion,c_mask)
129 | fusion = fusion[:,:l,:]
130 | c_mask = c_mask[:,:l]
131 | fusion = fusion * c_mask.unsqueeze(2)
132 | return fusion
133 |
134 |
135 | class multiscale_CQFusion1(nn.Module):
136 | def __init__(self, configs):
137 | super(multiscale_CQFusion1, self).__init__()
138 | self.branch = nn.ModuleList()
139 | self.fusion = nn.ModuleList()
140 | self.muti_fuse = nn.ModuleList()
141 | self.fusion.append(CQFusion(configs))
142 | self.MULTI_SCALE = configs.MULTI_SCALE
143 | self.fusion_attention = configs.fusion_attention
144 | if configs.MULTI_SCALE == True:
145 | for idx in range(configs.MULTI_SCALE_LEN):
146 | self.branch.append(TemporalMaxer(kernel_size=3,
147 | stride=2,
148 | padding=1,
149 | n_embd=configs.dim))
150 | self.fusion.append(CQFusion(configs))
151 | self.muti_fuse.append(MutiFuse(configs))
152 | self.attention = MultiHeadAttention(configs)
153 |
154 | def forward(self, context, query, c_mask, q_mask):
155 | b,l,d = context.shape
156 | fusion = self.fusion[0](context,query,c_mask,q_mask)
157 | if self.fusion_attention is True:
158 | fusion = self.attention(fusion,c_mask)
159 | if self.MULTI_SCALE == True:
160 | for i in range(len(self.branch)):
161 | if i == 0:
162 | multi_feature,multi_feature_mask = self.branch[i](context,c_mask)
163 | else:
164 | multi_feature,multi_feature_mask = self.branch[i](multi_feature,multi_feature_mask)
165 | multi_fusion = self.fusion[i+1](multi_feature,query,multi_feature_mask,q_mask)
166 | # modified
167 | fusion = self.muti_fuse[i](fusion,multi_fusion,multi_feature_mask)
168 | fusion = fusion * c_mask.unsqueeze(2)
169 | return fusion
170 |
171 | class MutiFuse(nn.Module):
172 | def __init__(self, cfg):
173 | super(MutiFuse, self).__init__()
174 | self.txt_softmax = nn.Softmax(1)
175 | self.txt_linear1 = nn.Linear(cfg.dim, 1)
176 | self.layernorm = nn.LayerNorm(cfg.dim, eps=1e-6)
177 |
178 | def forward(self, vis_encoded, txt_encoded,txt_mask):
179 | # vis_encoded: B, C, T
180 | # txt_encoded: B, L, C
181 | vis_encoded = vis_encoded.permute(0,2,1)
182 | txt_attn = self.txt_softmax(self.txt_linear1(txt_encoded)) # B, L, 1
183 | txt_attn = txt_attn * txt_mask.unsqueeze(2)
184 | txt_pool = torch.sum(txt_attn * txt_encoded, dim=1)[:,:,None] # B, C, 1
185 | # first compute attention weights and sum over the word dimension; the result is 2-D, so a third dimension is added to ease the following computation
186 | vis_fused =self.layernorm((txt_pool * vis_encoded).permute(0,2,1)) + vis_encoded.permute(0,2,1) # B, C, T
187 | return vis_fused
--------------------------------------------------------------------------------
/datasets/BaseDataset.py:
--------------------------------------------------------------------------------
1 | """ Dataset loader for the ActivityNet Captions dataset """
2 | import os
3 | import json
4 | import h5py
5 | from nltk.tag import pos_tag
6 | import torch
7 | from torch import nn
8 | from torch._C import _resolve_type_from_object
9 | import torch.nn.functional as F
10 | import torch.utils.data as data
11 | import torchtext
12 | import numpy as np
13 |
14 | from . import average_to_fixed_length
15 | from core.config import config
16 | import nltk
17 |
18 | try: # the tagger data is already present if nltk.data.find succeeds; it raises LookupError otherwise
19 | nltk.data.find('taggers/averaged_perceptron_tagger')
20 | except LookupError:
21 | nltk.download("averaged_perceptron_tagger") # download the tagger data on the first run
22 |
23 |
24 | class BaseDataset(data.Dataset):
25 | vocab = torchtext.vocab.pretrained_aliases["glove.840B.300d"]()
26 | vocab.itos.extend([""])
27 | vocab.stoi[""] = vocab.vectors.shape[0]
28 | vocab.vectors = torch.cat([vocab.vectors, torch.zeros(1, vocab.dim)], dim=0)
29 | word_embedding = nn.Embedding.from_pretrained(vocab.vectors)
30 |
31 | # CC coordinating conjunction 0    NNS noun, plural 1              UH interjection 2
32 | # CD cardinal number 3             NNP proper noun, singular 1     VB verb, base form 4
33 | # DT determiner 5                  NNPS proper noun, plural 1      VBD verb, past tense 4
34 | # EX existential "there" 6         PDT predeterminer 5             VBG gerund or present participle 4
35 | # FW foreign word 7                POS possessive ending 8         VBN verb, past participle 4
36 | # IN preposition or subordinating conjunction 9   PRP personal pronoun 10   VBP verb, non-3rd person singular present 4
37 | # JJ adjective 11                  PRP$ possessive pronoun 17      VBZ verb, 3rd person singular present 4
38 | # JJR adjective, comparative 11    RB adverb 12                    WDT wh-determiner 18
39 | # JJS adjective, superlative 11    RBR adverb, comparative 12      WP wh-pronoun 19
40 | # LS list item marker 13           RBS adverb, superlative 12      WP$ possessive wh-pronoun 20
41 | # MD modal verb 4                  RP particle 14                  WRB wh-adverb 21
42 | # NN noun, singular 1              SYM symbol 15                   TO to 16
43 | # ',': 22, '.': 23,
44 |
45 | # POS-tag share (%) on the three datasets: Charades, ActivityNet, TACoS
46 | # 1: 38.6, 25.8, 27.8 || 4: 20, 18, 16 || 5: 22, 16, 20 || 9: 11.4, 15.6, 11
47 | # 10: 2, 3.5, 5.7 || 12: 0.6, 2.8, 1.5 || 14: 2.1, 1.1, 3.3 || 16: 0.9, 2, 1.1
48 | # 17: 1.3, 1.3, 0.35 || 22: 0.03, 1, 1.2 || 23: 0, 7.1, 8.3
49 |
50 | pos_tags = {
51 | "NNS": 0,
52 | "NNP": 0,
53 | "NN": 0,
54 | "VB": 1,
55 | "VBD": 1,
56 | "VBN": 1,
57 | "VBP": 1,
58 | "VBG": 1,
59 | "VBZ": 1,
60 | "MD": 1,
61 | "IN": 2,
62 | "JJ": 0,
63 | "PRP": 0,
64 | "JJR": 7,
65 | "JJS": 7,
66 | "RB": 1,
67 | "RBR": 1,
68 | "RBS": 1,
69 | "LS": 7,
70 | "RP": 0,
71 | "SYM": 7,
72 | "TO": 5,
73 | "PRP$": 0,
74 | "WDT": 5,
75 | "WP": 3,
76 | "WP$": 3,
77 | "WRB": 1,
78 | }
79 |
80 | def __init__(self, split):
81 | super(BaseDataset, self).__init__()
82 |
83 | self.anno_dirs = {}
84 | self.anno_dirs["Charades"] = "/media/HardDisk_A/users/zzb/dataset/Charades-STA"
85 | self.anno_dirs["Charades_len"] = "/media/HardDisk_A/users/zzb/dataset/Charades-STA"
86 | self.anno_dirs["Charades_mom"] = "/media/HardDisk_A/users/zzb/dataset/Charades-STA"
87 | self.anno_dirs["ActivityNet"] = "/media/HardDisk_A/users/zzb/dataset/ActivityNet"
88 | self.anno_dirs["TACoS"] = "/media/HardDisk_A/users/zzb/dataset/TACoS"
89 | self.feature_dirs = {}
90 | self.feature_dirs["Charades"] = "/media/HardDisk_A/users/zzb/dataset/Charades-STA"
91 | self.feature_dirs["Charades_mom"] = "/media/HardDisk_A/users/zzb/dataset/Charades-STA"
92 | self.feature_dirs["ActivityNet"] = "/media/HardDisk_A/users/zzb/dataset/ActivityNet"
93 | self.feature_dirs["TACoS"] = "/media/HardDisk_A/users/zzb/dataset/TACoS"
94 | self.feature_dirs["Charades_len"] = "/media/HardDisk_A/users/zzb/dataset/Charades-STA"
95 | self.input_type = {}
96 | self.input_type["Charades"] = "i3d_adam_epoch16_25fps"
97 | self.input_type["Charades_len"] = "vgg_rgb_features"
98 | self.input_type["Charades_mom"] = "vgg_rgb_features"
99 | self.input_type["ActivityNet"] = "cmcs_features"
100 | self.input_type["TACoS"] = "tall_c3d_features"
101 | self.split = split
102 | self.num_pairs = config.DATASET.num_pairs
103 | self.annotations = self.get_annotation()
104 | self.num_clips = config.DATASET.num_clips
105 |
106 | self.epsilon = 1e-10
107 |
108 | def __getitem__(self, index):
109 | video_id = self.annotations[index]["video"]
110 | gt_s_time, gt_e_time = self.annotations[index]["times"]
111 | sentence = self.annotations[index]["description"]
112 | duration = self.annotations[index]["duration"]
113 | dataset = self.annotations[index]["dataset"]
114 | # words = sentence.split()
115 | # tokenize the sentence
116 | words = nltk.word_tokenize(sentence)
117 | if len(words) >= 30:
118 | words = words[:30]
119 | words_tags = nltk.pos_tag(words)
120 | word_idxs, pos_tags,keyword_mask,keyword_idxs = [], [],[],[]
121 | # print(sentence)
122 | for keyword, tag in words_tags:
123 | if tag in self.pos_tags.keys():
124 | keyword_idxs.append(self.vocab.stoi.get(keyword.lower(), 400000))
125 | pos_tags.append(self.pos_tags[tag] + 1)
126 | # print(word, self.pos_tags[tag] + 1)
127 | keyword_idxs = torch.tensor(keyword_idxs, dtype=torch.long)
128 | # print(sentence)
129 | for word in words:
130 | word_idxs.append(self.vocab.stoi.get(word.lower(), 400000))
131 | word_idxs = torch.tensor(word_idxs, dtype=torch.long)
132 | word_vectors = self.word_embedding(word_idxs)
133 | keyword_mask = [1 if v in keyword_idxs else 0 for v in word_idxs]
134 |
135 | (
136 | visual_input,
137 | visual_mask,
138 | extend_pre,
139 | extend_suf,
140 | flip_in_time_direction,
141 | ) = self.get_video_features(video_id, dataset)
142 |
143 | feat_length = visual_input.shape[0]
144 | ori_feat_length = feat_length - extend_pre - extend_suf
145 | fps = ori_feat_length / duration
146 | start_frame = int(fps * gt_s_time)
147 | end_frame = int(fps * gt_e_time)
148 | if end_frame >= ori_feat_length:
149 | end_frame = ori_feat_length - 1
150 | if start_frame > end_frame:
151 | start_frame = end_frame
152 |
153 | if flip_in_time_direction:
154 | start_frame, end_frame = (
155 | ori_feat_length - 1 - end_frame,
156 | ori_feat_length - 1 - start_frame,
157 | )
158 | assert start_frame <= end_frame
159 | assert 0 <= start_frame < ori_feat_length
160 | assert 0 <= end_frame < ori_feat_length
161 | start_frame += extend_pre
162 | end_frame += extend_pre
163 |
164 | start_label = np.ones(feat_length, dtype=np.float32) * self.epsilon
165 | end_label = np.ones(feat_length, dtype=np.float32) * self.epsilon
166 |
167 | y = (1 - (ori_feat_length - 3) * self.epsilon - 0.5) / 2
168 |
169 | if start_frame > 0:
170 | start_label[start_frame - 1] = y
171 | if start_frame < feat_length - 1:
172 | start_label[start_frame + 1] = y
173 | start_label[start_frame] = 0.5
174 |
175 | if end_frame > 0:
176 | end_label[end_frame - 1] = y
177 | if end_frame < feat_length - 1:
178 | end_label[end_frame + 1] = y
179 | end_label[end_frame] = 0.5
180 | # ---- above part is for ACRM use only------
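# Illustrative example (hypothetical numbers): with feat_length = 6 and start_frame = 2, the
# smoothed start_label is [eps, y, 0.5, y, eps, eps], where eps = self.epsilon and
# y = (1 - (ori_feat_length - 3) * eps - 0.5) / 2, i.e. the probability mass left after the
# 0.5 peak is split between the two frames adjacent to the ground-truth boundary.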
181 |
182 | internel_label = np.zeros(feat_length, dtype=np.float32)
183 | extend_inner_len = round(
184 | config.DATASET.EXTEND_INNRE * float(end_frame - start_frame + 1)
185 | )
186 | if extend_inner_len > 0:
187 | st_ = max(0, start_frame - extend_inner_len)
188 | et_ = min(end_frame + extend_inner_len, feat_length - 1)
189 | internel_label[st_ : (et_ + 1)] = 1.0
190 | else:
191 | internel_label[start_frame:(end_frame+1)] = 1.0
192 |
193 | if np.all(internel_label==1.0):
194 | choice = np.random.choice([0, -1])
195 | internel_label[choice] = 0.0
196 | neg_label = 1.0 - internel_label
197 | if len(internel_label) ==1:
198 | internel_label[0] = neg_label[0] = 1.0
199 | positive_indices = np.nonzero(internel_label)[0] # indices of the positive samples
200 | if len(positive_indices) == 0:
201 | print("wrong")
202 | positive_indices = positive_indices.tolist()
203 | np.random.shuffle(positive_indices)
204 | if len(positive_indices) >= self.num_pairs:
205 | selected_positive_indices = positive_indices[:self.num_pairs] # randomly select num_pairs positive-sample indices
206 | else:
207 | selected_positive_indices = positive_indices
208 | while len(selected_positive_indices) < self.num_pairs:
209 | random_positive_indices = np.random.choice(positive_indices)
210 | selected_positive_indices = np.hstack((selected_positive_indices, random_positive_indices))
211 |
212 | # randomly select the corresponding negative-sample indices
213 | negative_indices = np.nonzero(neg_label)[0] # indices of the negative samples
214 | if len(negative_indices) == 0:
215 | print("wrong")
216 | negative_indices = negative_indices.tolist()
217 | np.random.shuffle(negative_indices)
218 | if len(negative_indices) >=self.num_pairs:
219 | selected_negative_indices = negative_indices[:self.num_pairs] # randomly select num_pairs negative-sample indices
220 | else:
221 | selected_negative_indices = negative_indices
222 | while len(selected_negative_indices) < self.num_pairs:
223 | random_negative_indices = np.random.choice(negative_indices)
224 | selected_negative_indices = np.hstack((selected_negative_indices, random_negative_indices))
225 |
226 | start_frame = np.array(start_frame)
227 | end_frame = np.array(end_frame)
228 | extend_pre = np.array(extend_pre)
229 | extend_suf = np.array(extend_suf)
230 | item = {
231 | "visual_input": visual_input,
232 | "vis_mask": visual_mask,
233 | "word_vectors": word_vectors,
234 | # "pos_tags": pos_tags,
235 | "txt_mask": torch.ones(word_vectors.shape[0], 1),
236 | "start_label": torch.from_numpy(start_label),
237 | "end_label": torch.from_numpy(end_label),
238 | "internel_label": torch.from_numpy(internel_label),
239 | "start_frame": torch.from_numpy(start_frame),
240 | "end_frame": torch.from_numpy(end_frame),
241 | "extend_pre": torch.from_numpy(extend_pre),
242 | "extend_suf": torch.from_numpy(extend_suf),
243 | "keyword_mask":torch.tensor(keyword_mask),
244 | "selected_positive_indices":np.array(selected_positive_indices),
245 | "selected_negative_indices":np.array(selected_negative_indices),
246 | "visual_len": len(visual_input)
247 |
248 | }
249 | return item, self.annotations[index]
250 |
251 | def __len__(self):
252 | return len(self.annotations)
253 |
254 | def get_video_features(self, vid, dataset):
255 | with h5py.File(os.path.join(self.feature_dirs[dataset], '{}.hdf5'.format(self.input_type[dataset])), 'r') as f:
256 | if dataset == "ActivityNet" and self.input_type["ActivityNet"]=="sub_activitynet_v1-3.c3d":
257 | features = torch.from_numpy(f[vid]['c3d_features'][:])
258 | else:
259 | features = torch.from_numpy(f[vid][:])
260 | if dataset != "Charades":
261 | if features.shape[0] > self.num_clips:
262 | features = average_to_fixed_length(features, num_sample_clips=self.num_clips)
263 | frame_rate = 1
264 | features = features[list(range(0, features.shape[0], frame_rate))]
265 | if config.DATASET.NORMALIZE:
266 | features = F.normalize(features, dim=1)
267 |
268 | # flip the input in time direction
269 | flip_in_time_direction = False # use for start/end label flip
270 | if (
271 | self.split == "train"
272 | and config.DATASET.FLIP_TIME
273 | and np.random.random() < 0.5
274 | ):
275 | features = torch.flip(features, dims=[0])
276 | flip_in_time_direction = True
277 |
278 | length = features.shape[0]
279 | prefix, suffix = 0, 0
280 | # add a mean_feature in front of and end of the video to double the time length
281 | if (
282 | self.split == "train"
283 | and config.DATASET.EXTEND_TIME
284 | and np.random.random() < 0.7
285 | ):
286 | # mean_feature = torch.mean(features, dim=0)
287 | # extend_feature = mean_feature.unsqueeze(0).repeat((prefix, 1)) # add mean feature
288 | # extend_feature = torch.zeros((prefix, features.shape[1])) # add zeros feature
289 | # --->add another_features start<---
290 | index = np.random.randint(len(self.annotations)) # another_video
291 | video_id = self.annotations[index]["video"]
292 | while video_id == vid:
293 | index = np.random.randint(len(self.annotations)) # another_video
294 | video_id = self.annotations[index]["video"]
295 | featurePath = os.path.join(self.feature_dirs[dataset], video_id + ".npy")
296 | another_features = np.load(featurePath)
297 | another_features = np.squeeze(another_features)
298 | another_features = torch.from_numpy(another_features).float()
299 | # cap the feature length at 1500
300 | if another_features.shape[0] > 1500:
301 | another_features = average_to_fixed_length(
302 | another_features, num_sample_clips=1500
303 | )
304 | another_features = another_features[
305 | list(range(0, another_features.shape[0], frame_rate))
306 | ]
307 | prefix = round(np.random.random() * another_features.shape[0])
308 | extend_feature = another_features[:prefix]
309 | assert extend_feature.shape[0] == prefix
310 | # --->add another_features end<---
311 | features = torch.cat([extend_feature, features], dim=0)
312 | vis_mask = torch.ones((features.shape[0], 1))
313 |
314 | return features, vis_mask, prefix, suffix, flip_in_time_direction
315 |
316 | def get_annotation(self, dataset):
317 | raise NotImplementedError
318 |
--------------------------------------------------------------------------------
/models/PGBP/decoder.py:
--------------------------------------------------------------------------------
1 | import copy
2 | from typing import Optional
3 | import torch
4 | import torch.nn.functional as F
5 | from torch import nn, Tensor
6 | import math
7 | from .operation import Conv1D, mask_logits
8 | from torchvision.ops import RoIAlign
9 | from .layers import Prediction
10 |
11 | class MultiheadAttention(nn.Module):
12 | def __init__(self, dim,num_heads,dropout,dim_v):
13 | super(MultiheadAttention, self).__init__()
14 | assert dim % num_heads == 0, 'The number of channels (%d) is not a multiple of the number of attention heads (%d)' % (
15 | dim, num_heads)
16 | self.head_size, self.num_heads, self.dim = int(
17 | dim / num_heads), num_heads, dim
18 | self.head_size_v = int(dim_v/num_heads)
19 | self.dim_v = dim_v
20 | self.dropout = nn.Dropout(p=dropout)
21 | def transpose_for_scores(self, x):
22 | new_x_shape = x.size()[:-1] + (self.num_heads, self.head_size)
23 | x = x.view(*new_x_shape)
24 | return x.permute(0, 2, 1,
25 | 3) # (batch_size, num_heads, w_seq_len, head_size)
26 |
27 | def transpose_for_scores_v(self, x):
28 | new_x_shape = x.size()[:-1] + (self.num_heads, self.head_size_v)
29 | x = x.view(*new_x_shape)
30 | return x.permute(0, 2, 1,
31 | 3) # (batch_size, num_heads, w_seq_len, head_size)
32 | @staticmethod
33 | def combine_last_two_dim(x):
34 | old_shape = list(x.size())
35 | new_shape = old_shape[:-2] + [old_shape[-2] * old_shape[-1]]
36 | return x.reshape(shape=new_shape)
37 |
38 | def forward(self, q,k,v, mask=None):
39 | query = self.transpose_for_scores(
40 | q.permute(1, 0, 2)) # (batch_size, num_heads, seq_len, head_size)
41 | key = self.transpose_for_scores(k.permute(1, 0, 2))
42 | value = self.transpose_for_scores_v(v.permute(1, 0, 2))
43 | attention_scores = torch.matmul(query, key.transpose(
44 | -1, -2)) # (batch_size, num_heads, seq_len, seq_len)
45 | attention_scores = attention_scores / math.sqrt(self.head_size)
46 | if mask is not None: # masking
47 | mask = mask.unsqueeze(1).unsqueeze(
48 | 2) # (batch_size, 1, 1, seq_len)
49 | attention_scores = mask_logits(attention_scores, mask)
50 | attention_probs = torch.softmax(
51 | attention_scores,
52 | dim=-1) # (batch_size, num_heads, seq_len, seq_len)
53 | attention_probs = self.dropout(attention_probs)
54 | value = torch.matmul(
55 | attention_probs,
56 | value) # (batch_size, num_heads, seq_len, head_size)
57 | value = self.combine_last_two_dim(value.permute(
58 | 0, 2, 1, 3)) # (batch_size, seq_len, dim)
59 | # intermediate layer
60 | return value.permute(1, 0, 2)
61 |
62 |
63 | class TransformerDecoder(nn.Module):
64 |
65 | def __init__(self, decoder_layer, configs, norm=None):
66 | super().__init__()
67 | self.layers = _get_clones(decoder_layer, configs.detr_layers)
68 | self.detr_layers = configs.detr_layers
69 | self.norm = norm
70 | self.return_intermediate = configs.return_intermediate
71 | assert configs.return_intermediate
72 | self.query_dim = configs.query_dim
73 | self.dim = configs.dim
74 | self.norm1 = nn.LayerNorm(configs.dim)
75 | self.norm2 = nn.LayerNorm(configs.dim)
76 | assert configs.query_scale_type in ['cond_elewise', 'cond_scalar', 'fix_elewise']
77 | self.query_scale_type = configs.query_scale_type
78 | if configs.query_scale_type == 'cond_elewise':
79 | self.query_scale = MLP(configs.dim, configs.dim, configs.dim, 2)
80 | elif configs.query_scale_type == 'cond_scalar':
81 | self.query_scale = MLP(configs.dim, configs.dim, 1, 2)
82 | elif configs.query_scale_type == 'fix_elewise':
83 | self.query_scale = nn.Embedding(configs.detr_layers, configs.dim)
84 | else:
85 | raise NotImplementedError("Unknown query_scale_type: {}".format(configs.query_scale_type))
86 |
87 | self.ref_point_head = MLP(configs.dim, configs.dim, configs.dim, 2)
88 |
89 | # self.bbox_embed = None
90 | # for DAB-deter
91 | if configs.bbox_embed_diff_each_layer:
92 | self.bbox_embed = nn.ModuleList([MLP(configs.dim, configs.dim, 2, 3) for i in range(configs.detr_layers)])
93 | else:
94 | self.bbox_embed = MLP(configs.dim, configs.dim, 2, 3)
95 | # init bbox_embed
96 | if configs.bbox_embed_diff_each_layer:
97 | for bbox_embed in self.bbox_embed:
98 | nn.init.constant_(bbox_embed.layers[-1].weight.data, 0)
99 | nn.init.constant_(bbox_embed.layers[-1].bias.data, 0)
100 | else:
101 | nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0)
102 | nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0)
103 | self.d_model =configs.dim
104 | self.modulate_t_attn = configs.modulate_t_attn
105 | self.bbox_embed_diff_each_layer = configs.bbox_embed_diff_each_layer
106 |
107 | if configs.modulate_t_attn:
108 | self.ref_anchor_head = MLP(configs.dim, configs.dim, 1, 2)
109 |
110 | if not configs.keep_query_pos:
111 | for layer_id in range(configs.detr_layers - 1):
112 | self.layers[layer_id + 1].ca_qpos_proj = None
113 |
114 | def forward(self,pos_feature,scale,tgt, memory,
115 | tgt_mask: Optional[Tensor] = None,
116 | memory_mask: Optional[Tensor] = None,
117 | tgt_key_padding_mask: Optional[Tensor] = None,
118 | memory_key_padding_mask: Optional[Tensor] = None,
119 | pos: Optional[Tensor] = None,
120 | refpoints_unsigmoid: Optional[Tensor] = None, # num_queries, bs, 2
121 | ):
122 | output =self.norm1(tgt) #torch.Size([10, 32, 256])
123 | memory = self.norm2(memory)
124 | intermediate = []
125 | reference_points = refpoints_unsigmoid.sigmoid()
126 | ref_points = [reference_points]
127 | # import ipdb; ipdb.set_trace()
128 |
129 | for layer_id, layer in enumerate(self.layers): #rence_points torch.Size([10, 32, 2])
130 | obj_center = reference_points[..., :self.query_dim]#torch.Size([10, 32, 2])
131 | # get sine embedding for the query vector
132 | query_sine_embed = gen_sineembed_for_position(obj_center,self.dim//2)
133 | # print('line230', query_sine_embed.shape)
134 | query_pos = self.ref_point_head(query_sine_embed) #torch.Size([10, 32, 256])
135 | # print('line232',query_sine_embed.shape)
136 | # For the first decoder layer, we do not apply transformation over p_s
137 | if self.query_scale_type != 'fix_elewise':
138 | if layer_id == 0:
139 | pos_transformation = 1
140 | else:
141 | pos_transformation = self.query_scale(output)
142 | else:
143 | pos_transformation = self.query_scale.weight[layer_id]
144 |
145 | # apply transformation
146 | # print(query_sine_embed.shape) # 10 32 512
147 | query_sine_embed = query_sine_embed * pos_transformation
148 |
149 | # modulated HW attentions
150 | if self.modulate_t_attn:
151 | reft_cond = self.ref_anchor_head(output).sigmoid() # nq, bs, 1
152 | # print(reft_cond.shape, reft_cond[..., 0].shape) # 10 32 1, 10 32
153 | # print(obj_center.shape, obj_center[..., 1].shape) # 10 32 2, 10 32
154 | # print(query_sine_embed.shape) # 10 32 256
155 |
156 | query_sine_embed *= (reft_cond[..., 0] / obj_center[..., 1]).unsqueeze(-1)
157 |
158 | output = layer(pos_feature,scale,reference_points,output, memory, tgt_mask=tgt_mask,
159 | memory_mask=memory_mask,
160 | tgt_key_padding_mask=tgt_key_padding_mask,
161 | memory_key_padding_mask=memory_key_padding_mask,
162 | pos=pos, query_pos=query_pos, query_sine_embed=query_sine_embed,
163 | is_first=(layer_id == 0)) #torch.Size([10, 32, 256])
164 |
165 | # iter update
166 | if self.bbox_embed is not None:
167 | if self.bbox_embed_diff_each_layer:
168 | tmp = self.bbox_embed[layer_id](output)
169 | else:
170 | tmp = self.bbox_embed(output)
171 | # import ipdb; ipdb.set_trace()
172 | tmp[..., :self.query_dim] += inverse_sigmoid(reference_points)
173 | new_reference_points = tmp[..., :self.query_dim].sigmoid()
174 | if layer_id != self.detr_layers - 1:
175 | ref_points.append(new_reference_points)
176 | reference_points = new_reference_points.detach() #torch.Size([10, 32, 2])
177 |
178 | if self.return_intermediate:
179 | intermediate.append(self.norm(output))
180 |
181 | if self.norm is not None:
182 | output = self.norm(output)
183 | if self.return_intermediate:
184 | intermediate.pop()
185 | intermediate.append(output)
186 |
187 | if self.return_intermediate:
188 | if self.bbox_embed is not None:
189 | return [
190 | torch.stack(intermediate).transpose(1, 2),
191 | torch.stack(ref_points).transpose(1, 2),
192 | ]
193 | else:
194 | return [
195 | torch.stack(intermediate).transpose(1, 2),
196 | reference_points.unsqueeze(0).transpose(1, 2)
197 | ]
198 |
199 | return output.unsqueeze(0)
200 |
201 | class TransformerDecoderLayer(nn.Module):
202 |
203 | def __init__(self, configs):
204 | super().__init__()
205 | # Decoder Self-Attention
206 | d_model = configs.dim
207 | nhead =configs.num_heads
208 | rm_self_attn_decoder = configs.rm_self_attn_decoder
209 | dropout = configs.dropout
210 | dim_feedforward = configs.feedforward
211 | beta = configs.beta
212 | self.sementic_fu = configs.sementic_fu
213 | self.aligned_len = configs.aligned_len
214 |
215 | if not rm_self_attn_decoder:
216 | self.sa_qcontent_proj = nn.Linear(d_model, d_model)
217 | self.sa_qpos_proj = nn.Linear(d_model, d_model)
218 | self.sa_kcontent_proj = nn.Linear(d_model, d_model)
219 | self.sa_kpos_proj = nn.Linear(d_model, d_model)
220 | self.sa_v_proj = nn.Linear(d_model, d_model)
221 | self.self_attn = MultiheadAttention(d_model, nhead,dropout,dim_v=d_model)
222 | self.norm1 = nn.LayerNorm(d_model)
223 | self.dropout1 = nn.Dropout(dropout)
224 |
225 | # Decoder Cross-Attention
226 | self.ca_qcontent_proj = nn.Linear(d_model, d_model)
227 | self.ca_qpos_proj = nn.Linear(d_model, d_model)
228 | self.ca_kcontent_proj = nn.Linear(d_model, d_model)
229 | self.ca_kpos_proj = nn.Linear(d_model, d_model)
230 | self.ca_v_proj = nn.Linear(d_model, d_model)
231 | self.ca_qpos_sine_proj = nn.Linear(d_model, d_model)
232 | self.cross_attn = MultiheadAttention(d_model * 2, nhead,dropout, dim_v=d_model)
233 |
234 | self.nhead = nhead
235 | self.rm_self_attn_decoder = rm_self_attn_decoder
236 |
237 | # Implementation of Feedforward model
238 | self.linear1 = nn.Linear(d_model, dim_feedforward)
239 | self.dropout = nn.Dropout(dropout)
240 | self.linear2 = nn.Linear(dim_feedforward, d_model)
241 |
242 | self.norm2 = nn.LayerNorm(d_model)
243 | self.norm3 = nn.LayerNorm(d_model)
244 | self.norm4 = nn.LayerNorm(d_model)
245 | self.dropout2 = nn.Dropout(dropout)
246 | self.dropout3 = nn.Dropout(dropout)
247 | self.dropout4 = nn.Dropout(dropout)
248 |
249 | self.activation = _get_activation_fn(configs.activation)
250 | self.normalize_before = configs.normalize_before
251 | self.keep_query_pos = configs.keep_query_pos
252 | if self.sementic_fu is True:
253 | self.sementic_fusion = semantic_align(d_model,dropout,beta,self.aligned_len)
254 | def with_pos_embed(self, tensor, pos: Optional[Tensor]):
255 | return tensor if pos is None else tensor + pos
256 |
257 | def forward(self, pos_feature,scale,ref_points,tgt, memory,
258 | tgt_mask: Optional[Tensor] = None,
259 | memory_mask: Optional[Tensor] = None,
260 | tgt_key_padding_mask: Optional[Tensor] = None,
261 | memory_key_padding_mask: Optional[Tensor] = None,
262 | pos: Optional[Tensor] = None,
263 | query_pos: Optional[Tensor] = None,
264 | query_sine_embed=None,
265 | is_first=False,
266 | ):
267 |
268 | # ========== Begin of Self-Attention =============
269 | if not self.rm_self_attn_decoder:
270 | # Apply projections here
271 | # shape: num_queries x batch_size x 256
272 | q_content = self.sa_qcontent_proj(tgt) # target is the input of the first decoder layer. zero by default.
273 | q_pos = self.sa_qpos_proj(query_pos)
274 | k_content = self.sa_kcontent_proj(tgt)
275 | k_pos = self.sa_kpos_proj(query_pos)
276 | v = self.sa_v_proj(tgt)
277 |
278 | num_queries, bs, n_model = q_content.shape
279 | hw, _, _ = k_content.shape
280 |
281 | q = q_content + q_pos
282 | k = k_content + k_pos
283 |
284 | tgt2 = self.self_attn(q, k, v, mask=tgt_key_padding_mask)
285 | # ========== End of Self-Attention =============
286 | box = ref_points.transpose(0,1) * scale.unsqueeze(1)
287 | tgt = tgt + self.dropout1(tgt2)
288 | tgt = self.norm1(tgt)
289 | if self.sementic_fu is True:
290 | tgt3 = self.sementic_fusion(memory.transpose(0,1),box,tgt.transpose(0,1),pos_feature)
291 | tgt3 =tgt + self.dropout4(tgt3)
292 | tgt = self.norm4(tgt3)
293 | # ========== Begin of Cross-Attention =============
294 | # Apply projections here
295 | # shape: num_queries x batch_size x 256
296 | q_content = self.ca_qcontent_proj(tgt)
297 | k_content = self.ca_kcontent_proj(memory)
298 | v = self.ca_v_proj(memory)
299 |
300 | num_queries, bs, n_model = q_content.shape
301 | hw, _, _ = k_content.shape
302 |
303 | k_pos = self.ca_kpos_proj(pos)
304 |
305 | # For the first decoder layer, we concatenate the positional embedding predicted from
306 | # the object query (the positional embedding) into the original query (key) in DETR.
307 | if is_first or self.keep_query_pos:
308 | q_pos = self.ca_qpos_proj(query_pos)
309 | q = q_content + q_pos
310 | k = k_content + k_pos
311 | else:
312 | q = q_content
313 | k = k_content
314 |
315 | q = q.view(num_queries, bs, self.nhead, n_model // self.nhead)
316 | query_sine_embed = self.ca_qpos_sine_proj(query_sine_embed)
317 | query_sine_embed = query_sine_embed.view(num_queries, bs, self.nhead, n_model // self.nhead)
318 | q = torch.cat([q, query_sine_embed], dim=3).view(num_queries, bs, n_model * 2)
319 | k = k.view(hw, bs, self.nhead, n_model // self.nhead)
320 | k_pos = k_pos.view(hw, bs, self.nhead, n_model // self.nhead)
321 | k = torch.cat([k, k_pos], dim=3).view(hw, bs, n_model * 2)
322 |
323 | tgt2 = self.cross_attn(q,k,v,mask=memory_key_padding_mask)
324 | # ========== End of Cross-Attention =============
325 |
326 | tgt = tgt + self.dropout2(tgt2)
327 | tgt = self.norm2(tgt)
328 | tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
329 | tgt = tgt + self.dropout3(tgt2)
330 | tgt = self.norm3(tgt)
331 | return tgt
332 |
333 | def _get_activation_fn(activation):
334 | """Return an activation function given a string"""
335 | if activation == "relu":
336 | return F.relu
337 | if activation == "gelu":
338 | return F.gelu
339 | if activation == "glu":
340 | return F.glu
341 | if activation == "prelu":
342 | return nn.PReLU()
343 | if activation == "selu":
344 | return F.selu
345 | raise RuntimeError(f"activation should be relu/gelu/glu/prelu/selu, not {activation}.")
346 |
347 | class MLP(nn.Module):
348 | """ Very simple multi-layer perceptron (also called FFN)"""
349 |
350 | def __init__(self, input_dim, hidden_dim, output_dim, detr_layers):
351 | super().__init__()
352 | self.detr_layers = detr_layers
353 | h = [hidden_dim] * (detr_layers - 1)
354 | self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
355 |
356 | def forward(self, x):
357 | for i, layer in enumerate(self.layers):
358 | x = F.relu(layer(x)) if i < self.detr_layers - 1 else layer(x)
359 | return x
360 |
361 | def inverse_sigmoid(x, eps=1e-3):
362 | x = x.clamp(min=0, max=1)
363 | x1 = x.clamp(min=eps)
364 | x2 = (1 - x).clamp(min=eps)
365 | return torch.log(x1/x2)
366 |
367 | def gen_sineembed_for_position(pos_tensor,dim):
368 | # n_query, bs, _ = pos_tensor.size()
369 | # sineembed_tensor = torch.zeros(n_query, bs, 256)
370 | scale = 2 * math.pi
371 | dim_t = torch.arange(dim, dtype=torch.float32, device=pos_tensor.device)
372 | # dim_t = 10000 ** (2 * (dim_t // 2) / dim)
373 | dim_t = 10000 ** (2 * torch.div(dim_t, 2, rounding_mode='trunc') / dim)
374 | center_embed = pos_tensor[:, :, 0] * scale
375 | pos_x = center_embed[:, :, None] / dim_t
376 | pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2)
377 |
378 | span_embed = pos_tensor[:, :, 1] * scale
379 | pos_w = span_embed[:, :, None] / dim_t
380 | pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2)
381 |
382 | pos = torch.cat((pos_x, pos_w), dim=2)
383 | return pos
384 |
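# Shape note (illustrative): pos_tensor is (n_query, bs, 2) holding normalized (center, span)
# pairs in [0, 1]. Each coordinate is scaled by 2*pi, divided by a geometric progression of
# `dim` frequency terms, passed through interleaved sin/cos, and the two encodings are
# concatenated, so the returned embedding has shape (n_query, bs, 2 * dim).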
385 | def _get_clones(module, N):
386 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
387 |
388 | class semantic_align(nn.Module):
389 | def __init__(self, dim, dropout,beta,aligned_len):
390 | super().__init__()
391 | self.aligned_len = aligned_len
392 | self.gate = Prediction(in_dim= 2*dim, hidden_dim= dim, out_dim=2,drop_rate=dropout)
393 | self.softmax = nn.Softmax(2)
394 | self.contrast1 = ContrastBlock(dim,beta)
395 | self.contrast2 = ContrastBlock(dim,beta)
396 | def forward(self,features,quires_box,quires_features,pos_feature):
397 | B, L1, _ = quires_box.shape
398 | _,L,C = features.shape
399 | batch_feature = []
400 | roi_start = torch.round(((quires_box[..., 0] - quires_box[..., 1] / 2)*L).clamp(0, L-1)).long()
401 | roi_end = torch.round(((quires_box[..., 0] + quires_box[..., 1] / 2)*L).clamp(0, L-1)).long()
402 | start_features = torch.gather(features, dim=1, index=roi_start.unsqueeze(-1).expand(-1, -1, C))
403 | start_features = self.contrast1(start_features,pos_feature).unsqueeze(-2)
404 | end_features = torch.gather(features, dim=1, index=roi_end.unsqueeze(-1).expand(-1, -1, C))
405 | end_features = self.contrast2(end_features,pos_feature).unsqueeze(-2)
406 | boundary_features = torch.cat((start_features,end_features),dim = -2)
407 | if self.aligned_len:
408 | pool_boundary_features = torch.mean(boundary_features, dim=2, keepdim=False)
409 | else:
410 | pool_boundary_features,_ = torch.max(boundary_features, dim=2, keepdim=False)
411 | x = torch.cat([pool_boundary_features ,quires_features],dim = -1)
412 | gate =self.softmax(self.gate(x))
413 | x = pool_boundary_features*gate[...,0:1] + quires_features*gate[...,1:2]
414 | return x.transpose(0,1)
415 |
416 |
417 | class ContrastBlock(nn.Module):
418 | def __init__(self, dim, beta):
419 | super(ContrastBlock, self).__init__()
420 | self.conv1 = nn.Conv1d(in_channels=dim,
421 | out_channels=dim//beta,
422 | kernel_size=1,
423 | stride=1,
424 | padding=0,
425 | bias=True)
426 | self.conv2 = nn.Conv1d(in_channels=dim//beta,
427 | out_channels=dim,
428 | kernel_size=1,
429 | stride=1,
430 | padding=0,
431 | bias=True)
432 | self.activation = nn.ReLU()
433 | self.layer_norm1 = nn.LayerNorm(dim, eps=1e-6)
434 |
435 | def forward(self,v_1,v_2):
436 | v_1 = v_1.transpose(1, 2)
437 | v_2 = v_2.transpose(1, 2)
438 | v = v_1 * v_2
439 | v = self.conv1(v)
440 | v = self.activation(v)
441 | v = torch.sigmoid(self.layer_norm1(self.conv2(v).transpose(1, 2)))
442 | v = v * v_1.transpose(1, 2)
443 | return v
444 | # class semantic_align(nn.Module):
445 | # def __init__(self, dim,beta,aligned_len):
446 | # super().__init__()
447 | # self.aligned_len = aligned_len
448 | # self.gate = nn.Linear(in_features=2*dim,out_features=2)
449 | # self.softmax = nn.Softmax(2)
450 | # self.contrast = ContrastBlock(dim,beta)
451 | # def forward(self, features,quires_box,quires_features,pos_feature):
452 | # B, L1, _ = quires_box.shape
453 | # _,L,C = features.shape
454 | # batch_feature = []
455 | # roi_start = torch.round(((quires_box[..., 0] - quires_box[..., 1] / 2)*L).clamp(0, L-1)).long()
456 | # roi_end = torch.round(((quires_box[..., 0] + quires_box[..., 1] / 2)*L).clamp(0, L-1)).long()
457 | # start_features = torch.gather(features, dim=1, index=roi_start.unsqueeze(-1).expand(-1, -1, C)).unsqueeze(-2)
458 | # end_features = torch.gather(features, dim=1, index=roi_end.unsqueeze(-1).expand(-1, -1, C)).unsqueeze(-2)
459 | # boundary_features = torch.cat((start_features,end_features),dim = -2)
460 | # boundary_features = self.contrast(boundary_features,pos_feature)
461 | # if self.aligned_len:
462 | # pool_boundary_features = torch.mean(boundary_features, dim=2, keepdim=False)
463 | # else:
464 | # pool_boundary_features,_ = torch.max(boundary_features, dim=2, keepdim=False)
465 | # x = torch.cat([pool_boundary_features ,quires_features],dim = -1)
466 | # gate =self.softmax(self.gate(x))
467 | # x = pool_boundary_features*gate[...,0:1] + quires_features*gate[...,1:2]
468 | # return x.transpose(0,1)
469 |
470 |
471 | # class ContrastBlock(nn.Module):
472 | # def __init__(self, dim, beta):
473 | # super(ContrastBlock, self).__init__()
474 | # self.conv1 = nn.Conv1d(in_channels=dim,
475 | # out_channels=dim//beta,
476 | # kernel_size=1,
477 | # stride=1,
478 | # padding=0,
479 | # bias=True)
480 | # self.conv2 = nn.Conv1d(in_channels=dim//beta,
481 | # out_channels=dim,
482 | # kernel_size=1,
483 | # stride=1,
484 | # padding=0,
485 | # bias=True)
486 | # self.activation = nn.ReLU()
487 | # self.layer_norm1 = nn.LayerNorm(dim, eps=1e-6)
488 |
489 | # def forward(self,v_1,v_2):
490 | # v_1 = v_1.transpose(1, 2)
491 | # v_2 = v_2.transpose(1, 2)
492 | # v = v_1 * v_2
493 | # v = self.conv1(v)
494 | # v = self.activation(v)
495 | # v = torch.sigmoid(self.layer_norm1(self.conv2(v).transpose(1, 2)))
496 | # v = v * v_1.transpose(1, 2)
497 | # return v
--------------------------------------------------------------------------------
/models/PGBP/attention.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import math
4 | import torch.nn.functional as F
5 | from .operation import Conv1D, mask_logits
6 | from .encoder import MultiStepLSTMEncoder, TemporalContextModule
7 | from .phraseEncoder import PhraseEncodeNet
8 |
9 | class TemporalMaxer(nn.Module):
10 | def __init__(
11 | self,
12 | kernel_size,
13 | stride,
14 | padding,
15 | n_embd):
16 | super().__init__()
17 | self.ds_pooling = nn.MaxPool1d(
18 | kernel_size, stride=stride, padding=padding)
19 |
20 | self.stride = stride
21 |
22 | def forward(self, x, mask):
23 |
24 | # out, out_mask = self.channel_att(x, mask)
25 | x = x.permute(0,2,1)
26 | mask = mask.unsqueeze(1)
27 | if self.stride > 1:
28 | # downsample the mask using nearest neighbor
29 | out_mask = F.interpolate(
30 | mask.to(x.dtype), size=(x.size(-1)+self.stride-1)//self.stride, mode='nearest')
31 | else:
32 | # masking out the features
33 | out_mask = mask
34 |
35 | out = self.ds_pooling(x) * out_mask.to(x.dtype)
36 | out = out.permute(0,2,1)
37 | return out, out_mask.squeeze(1)
38 |
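# Illustrative note: with kernel_size=3, stride=2, padding=1 a (B, T, C) input is max-pooled
# along time to ceil(T / 2) steps (e.g. T = 100 -> 50), and the mask is downsampled to the
# same length with nearest-neighbour interpolation before being re-applied to the pooled
# features.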
39 | class DETR_Decoder(nn.Module):
40 | def __init__(self, configs):
41 | super(DETR_Decoder, self).__init__()
42 | dim = configs.dim
43 | num_heads = configs.num_heads
44 | drop_rate = configs.drop_rate
45 | assert dim % num_heads == 0, 'The number of channels (%d) is not a multiple of the number of attention heads (%d)' % (
46 | dim, num_heads)
47 | self.head_size, self.num_heads, self.dim = int(
48 | dim / num_heads), num_heads, dim
49 | self.attention = Cross_Attention(configs)
50 | self.dropout = nn.Dropout(p=drop_rate)
51 | self.query = Conv1D(in_dim=dim,
52 | out_dim=dim,
53 | kernel_size=1,
54 | stride=1,
55 | padding=0,
56 | bias=True)
57 | self.key = Conv1D(in_dim=dim,
58 | out_dim=dim,
59 | kernel_size=1,
60 | stride=1,
61 | padding=0,
62 | bias=True)
63 | self.value = Conv1D(in_dim=dim,
64 | out_dim=dim,
65 | kernel_size=1,
66 | stride=1,
67 | padding=0,
68 | bias=True)
69 | # self.value_visual = None
70 | self.layer_norm1 = nn.LayerNorm(dim, eps=1e-6)
71 | self.layer_norm2 = nn.LayerNorm(dim, eps=1e-6)
72 | self.out_layer1 = Conv1D(in_dim=dim,
73 | out_dim=dim,
74 | kernel_size=1,
75 | stride=1,
76 | padding=0,
77 | bias=True)
78 | self.output_activation = nn.GELU()
79 | self.out_layer2 = Conv1D(in_dim=dim,
80 | out_dim=dim,
81 | kernel_size=1,
82 | stride=1,
83 | padding=0,
84 | bias=True)
85 |
86 | def transpose_for_scores(self, x):
87 | new_x_shape = x.size()[:-1] + (self.num_heads, self.head_size)
88 | x = x.view(*new_x_shape)
89 | return x.permute(0, 2, 1,
90 | 3) # (batch_size, num_heads, w_seq_len, head_size)
91 |
92 | @staticmethod
93 | def combine_last_two_dim(x):
94 | old_shape = list(x.size())
95 | new_shape = old_shape[:-2] + [old_shape[-2] * old_shape[-1]]
96 | return x.reshape(shape=new_shape)
97 |
98 | def forward(self, memory, x,mask = None):
99 | output = self.layer_norm1(memory)
100 | query = self.transpose_for_scores(
101 | self.query(output)) # (batch_size, num_heads, seq_len, head_size)
102 | key = self.transpose_for_scores(self.key(output))
103 | value = self.transpose_for_scores(self.value(output))
104 | attention_scores = torch.matmul(query, key.transpose(
105 | -1, -2)) # (batch_size, num_heads, seq_len, seq_len)
106 | attention_scores = attention_scores / math.sqrt(self.head_size)
107 | attention_probs = torch.softmax(
108 | attention_scores,
109 | dim=-1) # (batch_size, num_heads, seq_len, seq_len)
110 | attention_probs = self.dropout(attention_probs)
111 | value = torch.matmul(
112 | attention_probs,
113 | value) # (batch_size, num_heads, seq_len, head_size)
114 | value = self.combine_last_two_dim(value.permute(
115 | 0, 2, 1, 3)) # (batch_size, seq_len, dim)
116 | # intermediate layer
117 | output = self.dropout(value)
118 | residual = output + memory
119 | residual = self.layer_norm2(residual)
120 | output = self.attention(residual,x,mask)
121 | return output
122 |
123 | class Cross_Attention(nn.Module):
124 | def __init__(self, configs):
125 | super(Cross_Attention, self).__init__()
126 | dim = configs.dim
127 | num_heads = configs.num_heads
128 | drop_rate = configs.drop_rate
129 | assert dim % num_heads == 0, 'The number of channels (%d) is not a multiple of the number of attention heads (%d)' % (
130 | dim, num_heads)
131 | self.head_size, self.num_heads, self.dim = int(
132 | dim / num_heads), num_heads, dim
133 | self.dropout = nn.Dropout(p=drop_rate)
134 | self.query = Conv1D(in_dim=dim,
135 | out_dim=dim,
136 | kernel_size=1,
137 | stride=1,
138 | padding=0,
139 | bias=True)
140 | self.key = Conv1D(in_dim=dim,
141 | out_dim=dim,
142 | kernel_size=1,
143 | stride=1,
144 | padding=0,
145 | bias=True)
146 | self.value = Conv1D(in_dim=dim,
147 | out_dim=dim,
148 | kernel_size=1,
149 | stride=1,
150 | padding=0,
151 | bias=True)
152 | # self.value_visual = None
153 | self.layer_norm1 = nn.LayerNorm(dim, eps=1e-6)
154 | self.layer_norm2 = nn.LayerNorm(dim, eps=1e-6)
155 | self.layer_norm3 = nn.LayerNorm(dim, eps=1e-6)
156 | self.out_layer1 = Conv1D(in_dim=dim,
157 | out_dim=dim,
158 | kernel_size=1,
159 | stride=1,
160 | padding=0,
161 | bias=True)
162 | self.output_activation = nn.GELU()
163 | self.out_layer2 = Conv1D(in_dim=dim,
164 | out_dim=dim,
165 | kernel_size=1,
166 | stride=1,
167 | padding=0,
168 | bias=True)
169 |
170 | def transpose_for_scores(self, x):
171 | new_x_shape = x.size()[:-1] + (self.num_heads, self.head_size)
172 | x = x.view(*new_x_shape)
173 | return x.permute(0, 2, 1,
174 | 3) # (batch_size, num_heads, w_seq_len, head_size)
175 |
176 | @staticmethod
177 | def combine_last_two_dim(x):
178 | old_shape = list(x.size())
179 | new_shape = old_shape[:-2] + [old_shape[-2] * old_shape[-1]]
180 | return x.reshape(shape=new_shape)
181 |
182 | def forward(self, memory, x, mask=None):
183 | output = self.layer_norm1(memory)
184 | x = self.layer_norm3(x)
185 | query = self.transpose_for_scores(
186 | self.query(output)) # (batch_size, num_heads, seq_len, head_size)
187 | key = self.transpose_for_scores(self.key(x))
188 | value = self.transpose_for_scores(self.value(x))
189 | attention_scores = torch.matmul(query, key.transpose(
190 | -1, -2)) # (batch_size, num_heads, seq_len, seq_len)
191 | attention_scores = attention_scores / math.sqrt(self.head_size)
192 | if mask is not None: # masking
193 | mask = mask.unsqueeze(1).unsqueeze(
194 | 2) # (batch_size, 1, 1, seq_len)
195 | attention_scores = mask_logits(attention_scores, mask)
196 | attention_probs = torch.softmax(
197 | attention_scores,
198 | dim=-1) # (batch_size, num_heads, seq_len, seq_len)
199 | attention_probs = self.dropout(attention_probs)
200 | value = torch.matmul(
201 | attention_probs,
202 | value) # (batch_size, num_heads, seq_len, head_size)
203 | value = self.combine_last_two_dim(value.permute(
204 | 0, 2, 1, 3)) # (batch_size, seq_len, dim)
205 | # intermediate layer
206 | output = self.dropout(value)
207 | residual = output + memory
208 | output = self.layer_norm2(residual)
209 | output = self.out_layer1(output)
210 | output = self.output_activation(output)
211 | output = self.dropout(output)
212 | output = self.out_layer2(output) + residual
213 | return output
214 |
215 | class MultiHeadAttention(nn.Module):
216 | def __init__(self, configs):
217 | super(MultiHeadAttention, self).__init__()
218 | dim = configs.dim
219 | num_heads = configs.num_heads
220 | drop_rate = configs.drop_rate
221 | assert dim % num_heads == 0, 'The number of channels (%d) is not a multiple of the number of attention heads (%d)' % (
222 | dim, num_heads)
223 | self.head_size, self.num_heads, self.dim = int(
224 | dim / num_heads), num_heads, dim
225 | self.dropout = nn.Dropout(p=drop_rate)
226 | self.query = Conv1D(in_dim=dim,
227 | out_dim=dim,
228 | kernel_size=1,
229 | stride=1,
230 | padding=0,
231 | bias=True)
232 | self.key = Conv1D(in_dim=dim,
233 | out_dim=dim,
234 | kernel_size=1,
235 | stride=1,
236 | padding=0,
237 | bias=True)
238 | self.value = Conv1D(in_dim=dim,
239 | out_dim=dim,
240 | kernel_size=1,
241 | stride=1,
242 | padding=0,
243 | bias=True)
244 | # self.value_visual = None
245 | self.layer_norm1 = nn.LayerNorm(dim, eps=1e-6)
246 | self.layer_norm2 = nn.LayerNorm(dim, eps=1e-6)
247 | self.out_layer1 = Conv1D(in_dim=dim,
248 | out_dim=dim,
249 | kernel_size=1,
250 | stride=1,
251 | padding=0,
252 | bias=True)
253 | self.output_activation = nn.GELU()
254 | self.out_layer2 = Conv1D(in_dim=dim,
255 | out_dim=dim,
256 | kernel_size=1,
257 | stride=1,
258 | padding=0,
259 | bias=True)
260 |
261 | def transpose_for_scores(self, x):
262 | new_x_shape = x.size()[:-1] + (self.num_heads, self.head_size)
263 | x = x.view(*new_x_shape)
264 | return x.permute(0, 2, 1,
265 | 3) # (batch_size, num_heads, w_seq_len, head_size)
266 |
267 | @staticmethod
268 | def combine_last_two_dim(x):
269 | old_shape = list(x.size())
270 | new_shape = old_shape[:-2] + [old_shape[-2] * old_shape[-1]]
271 | return x.reshape(shape=new_shape)
272 |
273 | def forward(self, x, mask=None):
274 | output = self.layer_norm1(x) # (batch_size, seq_len, dim)
275 | # output = self.dropout(output)
276 | # multi-head attention layer
277 | query = self.transpose_for_scores(
278 | self.query(output)) # (batch_size, num_heads, seq_len, head_size)
279 | key = self.transpose_for_scores(self.key(output))
280 | value = self.transpose_for_scores(self.value(output))
281 | attention_scores = torch.matmul(query, key.transpose(
282 | -1, -2)) # (batch_size, num_heads, seq_len, seq_len)
283 | attention_scores = attention_scores / math.sqrt(self.head_size)
284 | if mask is not None: # masking
285 | mask = mask.unsqueeze(1).unsqueeze(
286 | 2) # (batch_size, 1, 1, seq_len)
287 | attention_scores = mask_logits(attention_scores, mask)
288 | attention_probs = torch.softmax(
289 | attention_scores,
290 | dim=-1) # (batch_size, num_heads, seq_len, seq_len)
291 | attention_probs = self.dropout(attention_probs)
292 | value = torch.matmul(
293 | attention_probs,
294 | value) # (batch_size, num_heads, seq_len, head_size)
295 | value = self.combine_last_two_dim(value.permute(
296 | 0, 2, 1, 3)) # (batch_size, seq_len, dim)
297 | # intermediate layer
298 | output = self.dropout(value)
299 | residual = x + output
300 | output = self.layer_norm2(residual)
301 | output = self.out_layer1(output)
302 | output = self.output_activation(output)
303 | output = self.dropout(output)
304 | output = self.out_layer2(output) + residual
305 | return output
306 |
307 |
308 | class MultiLSTMAttention(nn.Module):
309 | def __init__(self, configs):
310 | super(MultiLSTMAttention, self).__init__()
311 | dim = configs.dim
312 | num_heads = configs.num_heads
313 | drop_rate = configs.drop_rate
314 | num_layers = configs.num_layers
315 | num_step = configs.num_step
316 | bi_direction = configs.bi_direction
317 |
318 | assert dim % num_heads == 0, 'The number of channels (%d) is not a multiple of the number of attention heads (%d)' % (
319 | dim, num_heads)
320 | self.head_size, self.num_heads, self.dim = int(
321 | dim / num_heads), num_heads, dim
322 | self.dropout = nn.Dropout(p=drop_rate)
323 | self.query = MultiStepLSTMEncoder(in_dim=dim,
324 | out_dim=dim,
325 | num_layers=num_layers,
326 | num_step=num_step,
327 | bi_direction=bi_direction,
328 | drop_rate=drop_rate)
329 | self.key = MultiStepLSTMEncoder(in_dim=dim,
330 | out_dim=dim,
331 | num_layers=num_layers,
332 | num_step=num_step,
333 | bi_direction=bi_direction,
334 | drop_rate=drop_rate)
335 | self.value = MultiStepLSTMEncoder(in_dim=dim,
336 | out_dim=dim,
337 | num_layers=num_layers,
338 | num_step=num_step,
339 | bi_direction=bi_direction,
340 | drop_rate=drop_rate)
341 | # self.value_visual = None
342 | self.layer_norm1 = nn.LayerNorm(dim, eps=1e-6)
343 | self.layer_norm2 = nn.LayerNorm(dim, eps=1e-6)
344 | self.out_layer1 = Conv1D(in_dim=dim,
345 | out_dim=dim,
346 | kernel_size=1,
347 | stride=1,
348 | padding=0,
349 | bias=True)
350 | self.output_activation = nn.GELU()
351 | self.out_layer2 = Conv1D(in_dim=dim,
352 | out_dim=dim,
353 | kernel_size=1,
354 | stride=1,
355 | padding=0,
356 | bias=True)
357 |
358 | def transpose_for_scores(self, x):
359 | new_x_shape = x.size()[:-1] + (self.num_heads, self.head_size)
360 | x = x.view(*new_x_shape)
361 | return x.permute(0, 2, 1,
362 | 3) # (batch_size, num_heads, w_seq_len, head_size)
363 |
364 | @staticmethod
365 | def combine_last_two_dim(x):
366 | old_shape = list(x.size())
367 | new_shape = old_shape[:-2] + [old_shape[-2] * old_shape[-1]]
368 | return x.reshape(shape=new_shape)
369 |
370 | def forward(self, x, mask=None):
371 | output = self.layer_norm1(x) # (batch_size, seq_len, dim)
372 | # output = self.dropout(output)
373 | # multi-head attention layer
374 | query = self.transpose_for_scores(
375 | self.query(output)) # (batch_size, num_heads, seq_len, head_size)
376 | key = self.transpose_for_scores(self.key(output))
377 | value = self.transpose_for_scores(self.value(output))
378 | attention_scores = torch.matmul(query, key.transpose(
379 | -1, -2)) # (batch_size, num_heads, seq_len, seq_len)
380 | attention_scores = attention_scores / math.sqrt(self.head_size)
381 | if mask is not None: # masking
382 | mask = mask.unsqueeze(1).unsqueeze(
383 | 2) # (batch_size, 1, 1, seq_len)
384 | attention_scores = mask_logits(attention_scores, mask)
385 | attention_probs = torch.softmax(
386 | attention_scores,
387 | dim=-1) # (batch_size, num_heads, seq_len, seq_len)
388 | attention_probs = self.dropout(attention_probs)
389 | value = torch.matmul(
390 | attention_probs,
391 | value) # (batch_size, num_heads, seq_len, head_size)
392 | value = self.combine_last_two_dim(value.permute(
393 | 0, 2, 1, 3)) # (batch_size, seq_len, dim)
394 | # intermediate layer
395 | output = self.dropout(value)
396 | residual = x + output
397 | output = self.layer_norm2(residual)
398 | output = self.out_layer1(output)
399 | output = self.output_activation(output)
400 | output = self.dropout(output)
401 | output = self.out_layer2(output) + residual
402 | return output
403 |
404 |
405 | class MultiConvAttention(nn.Module):
406 | def __init__(self, configs):
407 | super(MultiConvAttention, self).__init__()
408 | dim = configs.dim
409 | num_heads = configs.num_heads
410 | drop_rate = configs.drop_rate
411 | kernels = configs.kernels
412 |
413 | assert dim % num_heads == 0, 'The number of channels (%d) is not a multiple of the number of attention heads (%d)' % (
414 | dim, num_heads)
415 | self.head_size, self.num_heads, self.dim = int(
416 | dim / num_heads), num_heads, dim
417 | self.dropout = nn.Dropout(p=drop_rate)
418 | self.query = TemporalContextModule(in_dim=dim,
419 | out_dim=dim,
420 | kernels=kernels,
421 | drop_rate=drop_rate)
422 | self.key = TemporalContextModule(in_dim=dim,
423 | out_dim=dim,
424 | kernels=kernels,
425 | drop_rate=drop_rate)
426 | self.value = TemporalContextModule(in_dim=dim,
427 | out_dim=dim,
428 | kernels=kernels,
429 | drop_rate=drop_rate)
430 | # self.value_visual = None
431 | self.layer_norm1 = nn.LayerNorm(dim, eps=1e-6)
432 | self.layer_norm2 = nn.LayerNorm(dim, eps=1e-6)
433 | self.out_layer1 = Conv1D(in_dim=dim,
434 | out_dim=dim,
435 | kernel_size=1,
436 | stride=1,
437 | padding=0,
438 | bias=True)
439 | self.output_activation = nn.GELU()
440 | self.out_layer2 = Conv1D(in_dim=dim,
441 | out_dim=dim,
442 | kernel_size=1,
443 | stride=1,
444 | padding=0,
445 | bias=True)
446 |
447 | def transpose_for_scores(self, x):
448 | new_x_shape = x.size()[:-1] + (self.num_heads, self.head_size)
449 | x = x.view(*new_x_shape)
450 | return x.permute(0, 2, 1,
451 | 3) # (batch_size, num_heads, w_seq_len, head_size)
452 |
453 | @staticmethod
454 | def combine_last_two_dim(x):
455 | old_shape = list(x.size())
456 | new_shape = old_shape[:-2] + [old_shape[-2] * old_shape[-1]]
457 | return x.reshape(shape=new_shape)
458 |
459 | def forward(self, x, mask=None):
460 | output = self.layer_norm1(x) # (batch_size, seq_len, dim)
461 | # output = self.dropout(output)
462 | # multi-head attention layer
463 | query = self.transpose_for_scores(
464 | self.query(output)) # (batch_size, num_heads, seq_len, head_size)
465 | key = self.transpose_for_scores(self.key(output))
466 | value = self.transpose_for_scores(self.value(output))
467 | attention_scores = torch.matmul(query, key.transpose(
468 | -1, -2)) # (batch_size, num_heads, seq_len, seq_len)
469 | attention_scores = attention_scores / math.sqrt(self.head_size)
470 | if mask is not None: # masking
471 | mask = mask.unsqueeze(1).unsqueeze(
472 | 2) # (batch_size, 1, 1, seq_len)
473 | attention_scores = mask_logits(attention_scores, mask)
474 | attention_probs = torch.softmax(
475 | attention_scores,
476 | dim=-1) # (batch_size, num_heads, seq_len, seq_len)
477 | attention_probs = self.dropout(attention_probs)
478 | value = torch.matmul(
479 | attention_probs,
480 | value) # (batch_size, num_heads, seq_len, head_size)
481 | value = self.combine_last_two_dim(value.permute(
482 | 0, 2, 1, 3)) # (batch_size, seq_len, dim)
483 | # intermediate layer
484 | output = self.dropout(value)
485 | residual = x + output
486 | output = self.layer_norm2(residual)
487 | output = self.out_layer1(output)
488 | output = self.output_activation(output)
489 | output = self.dropout(output)
490 | output = self.out_layer2(output) + residual
491 | return output
492 |
493 | class ConvMultiAttention(nn.Module):
494 | def __init__(self, configs):
495 | super(ConvMultiAttention, self).__init__()
496 | self.attention = MultiHeadAttention(configs)
497 | self.multi_grain = PhraseEncodeNet(configs.dim)
498 |
499 | def forward(self, x, mask=None):
500 | x = self.attention(x, mask)
501 | x = self.multi_grain(x)
502 | return x * mask.unsqueeze(2)  # a mask must be supplied here despite the None default
503 |
504 | class ContrastBlock(nn.Module):
505 | def __init__(self, dim, beta):
506 | super(ContrastBlock, self).__init__()
507 | self.conv1 = nn.Conv1d(in_channels=dim,
508 | out_channels=dim//beta,
509 | kernel_size=1,
510 | stride=1,
511 | padding=0,
512 | bias=True)
513 | self.conv2 = nn.Conv1d(in_channels=dim//beta,
514 | out_channels=dim,
515 | kernel_size=1,
516 | stride=1,
517 | padding=0,
518 | bias=True)
519 | self.activation = nn.ReLU()
520 | self.layer_norm1 = nn.LayerNorm(dim, eps=1e-6)
521 |
522 | def forward(self, v_1, v_2):
523 | v_1 = v_1.transpose(1, 2)
524 | v_2 = v_2.transpose(1, 2)
525 | v = v_1 * v_2
526 | v = self.conv1(v)
527 | v = self.activation(v)
528 | v = torch.sigmoid(self.layer_norm1(self.conv2(v).transpose(1, 2)))
529 | v = v * v_1.transpose(1, 2)
530 | return v
531 |
532 | class MLP(nn.Module):
533 | """ Very simple multi-layer perceptron (also called FFN)"""
534 |
535 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
536 | super().__init__()
537 | self.num_layers = num_layers
538 | h = [hidden_dim] * (num_layers - 1)
539 | self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
540 |
541 | def forward(self, x):
542 | for i, layer in enumerate(self.layers):
543 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
544 | return x
545 |
546 |
--------------------------------------------------------------------------------
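
The attention variants above share the same pre-norm multi-head layout and differ only in how the query/key/value projections are produced (Conv1D, MultiStepLSTMEncoder, or TemporalContextModule). A minimal, self-contained sketch of driving MultiHeadAttention follows; the SimpleNamespace config, tensor shapes, and import path are illustrative assumptions (repository root assumed on PYTHONPATH), not part of the repository.

# Minimal sketch, assuming config fields as read in MultiHeadAttention.__init__.
from types import SimpleNamespace

import torch

from models.PGBP.attention import MultiHeadAttention

configs = SimpleNamespace(dim=128, num_heads=8, drop_rate=0.1)
layer = MultiHeadAttention(configs)

x = torch.randn(2, 32, 128)   # (batch, seq_len, dim)
mask = torch.ones(2, 32)      # 1 = valid frame, 0 = padding
mask[:, 24:] = 0              # pretend the last 8 frames are padding
out = layer(x, mask)          # same shape as the input: (2, 32, 128)
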
/models/PGBP/PGBP.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | import numpy as np
6 | from tqdm import tqdm
7 | import copy
8 | from core.config import config
9 | from core.runner_utils import index_to_time2, calculate_iou, calculate_iou_accuracy, cal_statistics
10 | from . import attention
11 | from .encoder import LSTMEncoder, MultiStepLSTMEncoder, TemporalContextModule
12 | from . import fusion
13 | from .layers import Projection, Prediction, PositionalEmbedding, PositionEmbeddingSine
14 | from .operation import Conv1D, mask_logits,cw2se
15 | from .triplet_loss import batch_all_triplet_loss, pairwise_distances
16 | import random
17 | from .slidewindow import find_most_relevant_frame
18 | from .gauss import generate_gaussian_tensor
19 | from einops import repeat,rearrange
20 | from .decoder import TransformerDecoder,TransformerDecoderLayer
21 | from torchvision.ops import sigmoid_focal_loss
22 | from .matcher import HungarianMatcher
23 | # torch.set_printoptions(profile="full", linewidth=1000, precision=2)
24 |
25 | def inverse_sigmoid(x, eps=1e-3):
26 | x = x.clamp(min=0, max=1)
27 | x1 = x.clamp(min=eps)
28 | x2 = (1 - x).clamp(min=eps)
29 | return torch.log(x1/x2)
30 |
31 |
32 | class PGBP(nn.Module):
33 | def __init__(self):
34 | super(PGBP, self).__init__()
35 | configs = config.MODEL.PARAMS
36 | self.use_keyword = configs.use_keyword
37 | self.windowsize = configs.windowsize
38 | self.debug_print = configs.DEBUG
39 | self.top_k = configs.top_k
40 | self.top_k0 = configs.top_k0
41 | self.neg = configs.neg
42 | self.pos = configs.pos
43 | self.detr_layers = configs.detr_layers
44 | self.content_prior = configs.content_prior
45 | self.match = HungarianMatcher(configs.cost_class,configs.cost_span,configs.cost_giou)
46 | empty_weight = torch.ones(2)
47 | empty_weight[-1] = configs.eos_coef # lower weight for background (index 1, foreground index 0)
48 | self.register_buffer('empty_weight', empty_weight)
49 | self.video_affine = Projection(in_dim=configs.video_feature_dim,
50 | dim=configs.dim,
51 | drop_rate=configs.drop_rate)
52 |
53 | self.query_affine = Projection(in_dim=configs.query_feature_dim,
54 | dim=configs.dim,
55 | drop_rate=configs.drop_rate)
56 | self.query_position = configs.query_position
57 | self.video_position = configs.video_position
58 | if self.query_position:
59 | self.q_pos_embedding = PositionalEmbedding(configs.dim, 30)
60 | if self.video_position:
61 | self.v_pos_embedding = PositionEmbeddingSine(configs.dim, normalize=True)
62 | if configs.content_prior == "learned":
63 | self.pattern = nn.Embedding(configs.num_queries, configs.dim)
64 | # self.pos_embedding = TransformerPositionalEmbedding(configs.dim, 500,drop_rate=configs.drop_rate)
65 | self.query_embeddings = nn.Embedding(configs.num_queries, 2)
66 | query_attention_layer = getattr(attention,
67 | configs.query_attention)(configs)
68 | video_attention_layer = getattr(attention,
69 | configs.video_attention)(configs)
70 | decoder_layer = TransformerDecoderLayer(configs)
71 | decoder_norm = nn.LayerNorm(configs.dim)
72 | self.detr_decoder = TransformerDecoder(decoder_layer,configs,decoder_norm)
73 | self.query_encoder = nn.Sequential(*[
74 | copy.deepcopy(query_attention_layer)
75 | for _ in range(configs.query_attention_layers)
76 | ])
77 | self.video_encoder = nn.Sequential(*[
78 | copy.deepcopy(video_attention_layer)
79 | for _ in range(configs.video_attention_layers)
80 | ])
81 | early_attention_layer = getattr(attention,
82 | configs.early_attention)(configs)
83 | self.early_encoder = nn.Sequential(*[
84 | copy.deepcopy(early_attention_layer)
85 | for _ in range(configs.early_attention_layers)
86 | ])
87 | self.contrastlayer = copy.deepcopy(video_attention_layer)
88 | self.fg_prediction_layer = Prediction(in_dim=configs.dim,
89 | hidden_dim=configs.dim // 2,
90 | out_dim=1,
91 | drop_rate=configs.drop_rate)
92 | self.early_fusion_layer = getattr(fusion,
93 | configs.early_fusion_module)(configs)
94 |
95 | self.fusion_layer = getattr(fusion, configs.fusion_module)(configs)
96 |
97 | post_attention_layer = getattr(attention,
98 | configs.post_attention)(configs)
99 | self.post_attention_layer = nn.Sequential(*[
100 | copy.deepcopy(post_attention_layer)
101 | for _ in range(configs.post_attention_layers)
102 | ])
103 | self.video_encoder2 = nn.Sequential(*[
104 | copy.deepcopy(post_attention_layer)
105 | for _ in range(configs.video_attention_layers)
106 | ])
107 | self.linear = nn.Linear(in_features=2 * configs.dim, out_features=configs.dim, bias=True)
108 | cw_pred = Prediction(in_dim=configs.dim,
109 | hidden_dim=configs.dim // 2,
110 | out_dim=2,
111 | drop_rate=configs.drop_rate)
112 | self.cw_pred = nn.Sequential(*[
113 | copy.deepcopy(cw_pred)
114 | for _ in range(configs.detr_layers)
115 | ])
116 | pred_results = Prediction(in_dim=configs.dim,
117 | hidden_dim=configs.dim // 2,
118 | out_dim=2,
119 | drop_rate=configs.drop_rate)
120 | self.pred_results = nn.Sequential(*[
121 | copy.deepcopy(pred_results)
122 | for _ in range(configs.detr_layers)
123 | ])
124 | self.intering = Prediction(in_dim=configs.dim,
125 | hidden_dim=configs.dim // 2,
126 | out_dim=1,
127 | drop_rate=configs.drop_rate)
128 | self.pos_fused_layer =attention.ContrastBlock(configs.dim,configs.beta)
129 | self.neg_fused_layer =attention.ContrastBlock(configs.dim,configs.beta)
130 | self.pn_fused_layer =attention.ContrastBlock(configs.dim,configs.beta)
131 |
132 | def forward(self, batch_visual_scale, batch_word_vectors, batch_keyword_mask, batch_txt_mask,
133 | batch_vis_feats, batch_vis_mask):
134 | batch_vis_feats = self.video_affine(batch_vis_feats)
135 | batch_vis_feats = batch_vis_feats * batch_vis_mask.unsqueeze(2)
136 | for i, module in enumerate(self.video_encoder):
137 | if i == 0:
138 | video_features = module(batch_vis_feats, batch_vis_mask)
139 | else:
140 | video_features = module(video_features, batch_vis_mask)
141 | for i, module in enumerate(self.video_encoder2):
142 | if i == 0:
143 | video_features2 = module(batch_vis_feats, batch_vis_mask)
144 | else:
145 | video_features2 = module(video_features2, batch_vis_mask)
146 |
147 | batch_word_vectors = self.query_affine(batch_word_vectors)
148 | if self.query_position:
149 | batch_word_vectors = batch_word_vectors + self.q_pos_embedding(
150 | batch_word_vectors)
151 | batch_word_vectors = batch_word_vectors * batch_txt_mask.unsqueeze(2)
152 | for i, module in enumerate(self.query_encoder):
153 | if i == 0:
154 | query_features = module(batch_word_vectors, batch_txt_mask)
155 | else:
156 | query_features = module(query_features, batch_txt_mask)
157 | if self.use_keyword:
158 | entity_features = batch_word_vectors * batch_keyword_mask.unsqueeze(2)
159 | entity_features = query_features + entity_features
160 | else:
161 | entity_features = query_features
162 | # First stage
163 | entity_video_fused = self.early_fusion_layer(video_features,
164 | entity_features,
165 | batch_vis_mask,
166 | batch_txt_mask)
167 | for i, module in enumerate(self.early_encoder):
168 | entity_video_fused = module(entity_video_fused, batch_vis_mask)
169 | fg_prob = self.fg_prediction_layer(entity_video_fused)
170 |
171 | fg_prob1 = torch.sigmoid(fg_prob.squeeze(2))
172 |
173 | pos_values, pos_indices = torch.topk(fg_prob1.masked_fill(~batch_vis_mask.bool(), 0.0), k=self.top_k0, dim=1, largest=True)  # top-k0 most confident foreground frames
174 | neg_values, neg_indices = torch.topk(fg_prob1.masked_fill(~batch_vis_mask.bool(), 1.0), k=self.top_k, dim=1, largest=False)  # top-k least confident frames used as negatives
175 | B, l, c = entity_video_fused.shape
176 | if self.top_k0 > 1:
177 | pos=torch.gather(entity_video_fused, dim=1, index=pos_indices.unsqueeze(-1).expand(-1, -1, c))
178 | pos=F.max_pool1d(pos.transpose(1,2),kernel_size=self.top_k0).transpose(1,2)
179 | else:
180 | pos = torch.gather(entity_video_fused, 1, pos_indices.view(-1, 1).expand(-1, c).unsqueeze(1))
181 | neg = torch.gather(entity_video_fused, dim=1, index=neg_indices.unsqueeze(-1).expand(-1, -1, c))
182 | if not self.training and self.debug_print:
183 | print('fg_prob', torch.sigmoid(fg_prob))
184 | fg_vis_feature = (video_features2 +
185 | video_features) * torch.sigmoid(fg_prob)
186 | fused_pos_feature = self.pos_fused_layer(fg_vis_feature,pos)
187 | contrast_feature = self.contrastlayer(fg_vis_feature,batch_vis_mask)
188 | if self.pos is True:
189 | contrast_feature = contrast_feature + fused_pos_feature
190 | if self.neg is True:
191 | fused_neg_feature = torch.mean(self.neg_fused_layer(neg, pos), dim=1).unsqueeze(1)
192 | fused_pn_feature = fg_vis_feature - self.pn_fused_layer(fg_vis_feature, fused_neg_feature)
193 | contrast_feature = contrast_feature + fused_pn_feature
194 | fg_vis_feature = torch.cat((fg_vis_feature,contrast_feature),dim=2)
195 | fg_vis_feature = self.linear(fg_vis_feature)
196 | fused_action_feature = self.fusion_layer(fg_vis_feature,
197 | entity_features,
198 | batch_vis_mask,
199 | batch_txt_mask)
200 | for i, module in enumerate(self.post_attention_layer):
201 | fused_action_feature = module(fused_action_feature, batch_vis_mask)
202 | query_embeddings = self.query_embeddings.weight
203 | refpoint_embed = repeat(query_embeddings, "nq d -> b nq d", b=B).transpose(0,1)
204 | if self.content_prior == "learned":
205 | pattern = self.pattern.weight
206 | tgt = repeat(pattern, "nq d -> b nq d", b=B).transpose(0,1)
207 | else:
208 | tgt = torch.zeros(refpoint_embed.shape[0], B, c, device=refpoint_embed.device)
209 | pred_start = []
210 | pred_end = []
211 | results = []
212 | memory_local = fused_action_feature.permute(1, 0, 2)
213 | pos_embed_local = self.v_pos_embedding(fused_action_feature,batch_vis_mask).permute(1, 0, 2)
214 | hs, references = self.detr_decoder(pos,batch_visual_scale,tgt, memory_local, memory_key_padding_mask=batch_vis_mask,
215 | pos=pos_embed_local, refpoints_unsigmoid=refpoint_embed)
216 | reference_before_sigmoid = inverse_sigmoid(references)
217 | for i in range(self.detr_layers):
218 | results.append(self.pred_results[i](hs[i,...]).squeeze(2))
219 | d_cw = self.cw_pred[i](hs[i,...])
220 | cw = (reference_before_sigmoid[i,...] + d_cw)
221 | se = cw2se(torch.sigmoid(cw))
222 | pred_start.append(se[...,0])
223 | pred_end.append(se[...,1])
224 | pred_inter = self.intering(fused_action_feature).squeeze(2)
225 |
226 |
227 | return pred_start,pred_end,pred_inter, query_features, video_features2, fg_prob.squeeze(
228 | 2), video_features, batch_word_vectors, batch_vis_feats,results,pos_indices,neg_indices,\
229 | contrast_feature
230 |
231 | def _get_src_permutation_idx(self, indices):
232 | # permute predictions following indices
233 | batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
234 | src_idx = torch.cat([src for (src, _) in indices])
235 | return batch_idx, src_idx # two 1D tensors of the same length
236 |
237 | def contrast_loss(self, mask, key_frame, inter_label, contrast_feature, hy_sigma=1, weight=True):
238 | b,l,c = contrast_feature.shape
239 | gauss_weights = generate_gaussian_tensor(inter_label, key_frame, hy_sigma)
240 | contrast_feature = F.normalize(contrast_feature, p=2, dim=2)
241 | key_frame_feature = torch.gather(contrast_feature, 1, key_frame.view(-1, 1).expand(-1, c).unsqueeze(1))
242 | score = torch.bmm(contrast_feature,key_frame_feature.transpose(1,2)).squeeze(2)
243 | loss = nn.BCEWithLogitsLoss(reduction='none')(score,inter_label)
244 | if weight is True:
245 | loss = loss * gauss_weights
246 | mask = mask.type(torch.float32)
247 | loss = torch.sum(loss * mask,
248 | dim=1) / (torch.sum(mask, dim=1) + 1e-13)
249 | return loss.mean()
250 |
251 | def PNcontrast_loss(self, mask, pos_frame, neg_frame, inter_label, contrast_feature, hy_sigma=1, weight=True):
252 | if self.top_k0>1:
253 | pos_loss = self.contrast_loss(mask,pos_frame[:,0],inter_label,contrast_feature,hy_sigma,False)
254 | else:
255 | pos_loss = self.contrast_loss(mask,pos_frame,inter_label,contrast_feature,hy_sigma,False)
256 | B,l = neg_frame.shape
257 | neg_loss = 0.
258 | if self.neg is True:
259 | for i in range(l):
260 | neg_loss = neg_loss + self.contrast_loss(mask,neg_frame[:,i],(1.0-inter_label),contrast_feature,hy_sigma,weight)
261 | return pos_loss + neg_loss/l
262 |
263 | def compute_loss(self, pred_start, pred_end, pred_inter, start_labels,
264 | end_labels, inter_label, mask, duration, pred_pro):
265 | bce_loss, iou_loss, L1_loss = 0, 0, 0
266 | for i in range(len(pred_start)):
267 | pred_times = torch.cat([pred_start[i].unsqueeze(2),pred_end[i].unsqueeze(2)],dim=2)
268 | b,l,_ = pred_times.shape
269 | times = torch.cat([(start_labels/duration).unsqueeze(1),(end_labels/duration).unsqueeze(1)],dim=1)
270 | indices = self.match(pred_pro[i],pred_times,times)
271 | idx = self._get_src_permutation_idx(indices)
272 | src_spans = pred_times[idx]
273 | L1_loss = L1_loss + F.l1_loss(src_spans, times, reduction='none').mean()
274 | iou_loss = iou_loss + (1 - self.calculate_giou(src_spans, times)[1]).mean()
275 | target_classes = torch.full(pred_pro[i].shape[:2], 1,
276 | dtype=torch.int64, device=pred_pro[i].device)
277 | target_classes[idx] = 0
278 | bce_loss = bce_loss + self.bce_rescale_loss(pred_pro[i],target_classes)
279 |
280 | inter_loss = self.compute_location_loss(pred_inter, inter_label, mask)
281 | return L1_loss, inter_loss, iou_loss,bce_loss
282 |
283 | def bce_rescale_loss(self,scores, targets):
284 | loss_value = F.cross_entropy(scores.transpose(1, 2), targets, self.empty_weight, reduction="none")  # two-class cross-entropy; empty_weight down-weights the background class
285 | loss_value = loss_value.mean()
286 | return loss_value
287 |
288 | def calculate_giou(self,box1, box2):
289 | iou,union = self.calculate_iou(box1,box2)
290 | box1_left, box1_right = box1[..., 0], box1[..., 1]
291 | box2_left, box2_right = box2[..., 0], box2[..., 1]
292 | right = torch.maximum(box2_right, box1_right)
293 | left = torch.minimum(box2_left, box1_left)
294 | enclosing_area = (right - left).clamp(min=0)
295 | giou = iou - (enclosing_area - union) / enclosing_area
296 | return iou,giou
297 |
298 | def calculate_iou(self,box1, box2):
299 | box1_left, box1_right = box1[..., 0], box1[..., 1]
300 | box2_left, box2_right = box2[..., 0], box2[..., 1]
301 | areas1 = box1_right-box1_left
302 | areas2 = box2_right-box2_left
303 | inter_left = torch.maximum(box1_left, box2_left)
304 | inter_right = torch.minimum(box1_right, box2_right)
305 | inter = (inter_right - inter_left).clamp(min=0)
306 | union = areas1 + areas2 - inter
307 | iou = inter / union
308 | return iou,union
309 |
310 | def compute_boundary_loss(self, pred, targets):
311 | return F.cross_entropy(pred, targets.long())
312 |
313 | def compute_location_loss(self, pred, targets, mask):
314 | weights_per_location = torch.where(targets == 0.0, targets + 1.0,
315 | 1.0 * targets)
316 | loss_per_location = nn.BCEWithLogitsLoss(reduction='none')(pred,
317 | targets)
318 | loss_per_location = loss_per_location * weights_per_location
319 | mask = mask.type(torch.float32)
320 | loss = torch.sum(loss_per_location * mask,
321 | dim=1) / (torch.sum(mask, dim=1) + 1e-13)
322 | return loss.mean()
323 |
324 |
325 | def compute_sim_loss(self, pred, pos, neg, saliency_margin = 0.2):
326 | b, l = pred.shape
327 | _, num_indices = pos.shape
328 | pos_indices = pos + (torch.arange(0, b).reshape(-1, 1) * l).cuda()
329 | neg_indices = neg + (torch.arange(0, b).reshape(-1, 1) * l).cuda()
330 | pred_score = torch.sigmoid(pred)
331 | pos_scores = pred_score.view(-1)[pos_indices.view(-1)].view(b, num_indices)
332 | neg_scores = pred_score.view(-1)[neg_indices.view(-1)].view(b, num_indices)
333 | loss_sim = torch.clamp(saliency_margin + neg_scores - pos_scores, min=0).sum() \
334 | / (b * num_indices) * 2 # * 2 to keep the loss the same scale
335 | return loss_sim
336 |
337 |
338 | def early_pred_loss(self, video_features, pred, targets, mask):
339 | return self.compute_location_loss(pred, targets, mask)
340 |
341 | def aligment_score(self,
342 | query_features,
343 | video_features,
344 | query_mask,
345 | video_mask,
346 | inner_label,
347 | GT_inner=True):
348 | B, T, channels = video_features.shape
349 |
350 | query_features = query_features.sum(1) / query_mask.sum(1).unsqueeze(1)
351 | query_features = F.normalize(query_features, p=2, dim=1) # B, channels
352 |
353 | if GT_inner:
354 | frame_weights = inner_label / video_mask.sum(1, keepdim=True)
355 | else:
356 | norm_video = F.normalize(video_features, p=2, dim=-1)
357 | frame_weights = torch.bmm(query_features.unsqueeze(1),
358 | norm_video.transpose(1, 2)) # B,1,T
359 | frame_weights = mask_logits(frame_weights.squeeze(1),
360 | video_mask) # B,T
361 | frame_weights = torch.softmax(frame_weights, dim=-1)
362 |
363 | video_features = video_features * frame_weights.unsqueeze(2)
364 | video_features = video_features.sum(1)
365 | video_features = F.normalize(video_features, p=2, dim=1)
366 | video_sim = torch.matmul(video_features, video_features.T)
367 | video_sim = torch.softmax(video_sim, dim=-1)
368 | query_sim = torch.matmul(query_features, query_features.T)
369 | query_sim = torch.softmax(query_sim, dim=-1)
370 | kl_loss = (F.kl_div(query_sim.log(), video_sim, reduction='sum') +
371 | F.kl_div(video_sim.log(), query_sim, reduction='sum')) / 2
372 |
373 | return kl_loss
374 |
375 | @staticmethod
376 | def extract_index(start_logits, end_logits):
377 | start_prob = nn.Softmax(dim=1)(start_logits)
378 | end_prob = nn.Softmax(dim=1)(end_logits)
379 | outer = torch.matmul(start_prob.unsqueeze(dim=2),
380 | end_prob.unsqueeze(dim=1))
381 | outer = torch.triu(outer, diagonal=0)
382 | _, start_index = torch.max(torch.max(outer, dim=2)[0],
383 | dim=1) # (batch_size, )
384 | _, end_index = torch.max(torch.max(outer, dim=1)[0],
385 | dim=1) # (batch_size, )
386 | return start_index, end_index
387 |
388 | @staticmethod
389 | def eval_test(model,
390 | data_loader,
391 | device,
392 | mode='test',
393 | epoch=None,
394 | shuffle_video_frame=False):
395 | ious = []
396 | pos_labels = []
397 | pseudo = []
398 | preds, durations, names, times = [], [], [], []
399 | with torch.no_grad():
400 | for idx, batch_data in tqdm(enumerate(data_loader),
401 | total=len(data_loader),
402 | desc='evaluate {}'.format(mode)):
403 | data, annos = batch_data
404 | batch_word_vectors = data['batch_word_vectors'].to(device)
405 | batch_keyword_mask = data['batch_keyword_mask'].to(device)
406 | batch_txt_mask = data['batch_txt_mask'].squeeze(2).to(device)
407 | batch_vis_feats = data['batch_vis_feats'].to(device)
408 | batch_vis_mask = data['batch_vis_mask'].squeeze(2).to(device)
409 | batch_extend_pre = data['batch_extend_pre'].to(device)
410 | batch_extend_suf = data['batch_extend_suf'].to(device)
411 | batch_visual_scale = data["visual_scale"].unsqueeze(-1).to(device)
412 | if shuffle_video_frame:
413 | B = batch_vis_feats.shape[0]
414 | for i in range(B):
415 | T = batch_vis_mask[i].sum().int().item()
416 | pre = batch_extend_pre[i].item()
417 | new_T = torch.randperm(T)
418 | batch_vis_feats[i, torch.arange(T) +
419 | pre] = batch_vis_feats[i, new_T + pre]
420 | # compute predicted results
421 | with torch.cuda.amp.autocast():
422 | output = model(batch_visual_scale,batch_word_vectors, batch_keyword_mask,
423 | batch_txt_mask, batch_vis_feats,
424 | batch_vis_mask)
425 | pseudo_pros = output[5]  # first-stage foreground logits
426 | probabilities_class = torch.softmax(output[9][-1], dim=-1)  # per-query class logits from the last decoder layer
427 | probabilities = probabilities_class[..., 0]  # foreground probability per query
428 | pred_p = torch.argmax(probabilities, dim=1)
429 | start_logits, end_logits = output[0][-1], output[1][-1]  # normalized start/end times from the last decoder layer
430 | start_logits = torch.gather(start_logits, 1, pred_p.view(-1, 1)).clamp(0, 1).squeeze(1)
431 | start_logits[torch.isnan(start_logits)] = 0.
432 | end_logits = torch.gather(end_logits, 1, pred_p.view(-1, 1)).clamp(0, 1).squeeze(1)
433 | end_logits[torch.isnan(end_logits)] = 1.
434 | pos_frames = output[-3]
435 |
436 | start_indices = start_logits.cpu().numpy()
437 | end_indices = end_logits.cpu().numpy()
438 | batch_vis_mask = batch_vis_mask.cpu().numpy()
439 | batch_extend_pre = batch_extend_pre.cpu().numpy()
440 | batch_extend_suf = batch_extend_suf.cpu().numpy()
441 | pos_frames = pos_frames.cpu().numpy()
442 |
443 |
444 | for vis_mask, start_index, end_index, extend_pre, extend_suf, anno,pos_frame,pseudo_pro in zip(
445 | batch_vis_mask, start_indices, end_indices,
446 | batch_extend_pre, batch_extend_suf, annos,pos_frames,pseudo_pros):
447 |
448 | start_time, end_time = index_to_time2(
449 | start_index, end_index, vis_mask.sum(), extend_pre,
450 | extend_suf, anno["duration"])
451 |
452 | iou = calculate_iou(i0=[start_time, end_time],
453 | i1=anno['times'])
454 | ious.append(iou)
455 | preds.append((start_time, end_time))
456 | durations.append(anno["duration"])
457 | times.append(anno["times"])
458 | names.append(anno["video"])
459 | pseudo.append(pseudo_pro)
460 | import pandas as pd  # local import so pandas/openpyxl stay optional for training-only runs
461 | df = pd.DataFrame({
462 | 'video': names,
463 | 'gt_times': times,
464 | 'pred_times': preds,
465 | 'fg_prob': pseudo
466 | })
467 | df.to_excel('output.xlsx', index=False, engine='openpyxl')
468 |
469 | statistics_str = cal_statistics(preds, durations)
470 | r1i1 = calculate_iou_accuracy(ious, threshold=0.1)
471 | r1i2 = calculate_iou_accuracy(ious, threshold=0.2)
472 | r1i3 = calculate_iou_accuracy(ious, threshold=0.3)
473 | r1i4 = calculate_iou_accuracy(ious, threshold=0.4)
474 | r1i5 = calculate_iou_accuracy(ious, threshold=0.5)
475 | r1i6 = calculate_iou_accuracy(ious, threshold=0.6)
476 | r1i7 = calculate_iou_accuracy(ious, threshold=0.7)
477 | r1i8 = calculate_iou_accuracy(ious, threshold=0.8)
478 | r1i9 = calculate_iou_accuracy(ious, threshold=0.9)
479 |
480 | mi = np.mean(ious) * 100.0
481 | # write the scores
482 | score_str = "Epoch {}\n".format(epoch)
483 | score_str += "Rank@1, IoU=0.3: {:.2f}\t".format(r1i3)
484 | score_str += "Rank@1, IoU=0.5: {:.2f}\t".format(r1i5)
485 | score_str += "Rank@1, IoU=0.7: {:.2f}\t".format(r1i7)
486 | score_str += "mean IoU: {:.2f}\n".format(mi)
487 | return r1i3, r1i5, r1i7, mi, score_str, statistics_str
488 |
--------------------------------------------------------------------------------
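
The span regression in compute_loss is scored with a 1-D GIoU, mirrored by calculate_iou/calculate_giou above. The standalone sketch below reproduces that arithmetic on a toy pair of normalized spans so the numbers can be checked by hand; the function name and the values are illustrative only, not part of the repository.

# Standalone sketch of the 1-D span IoU / GIoU used by PGBP.calculate_giou (toy inputs).
import torch

def span_iou_giou(box1, box2):
    # box1, box2: (..., 2) tensors of normalized (start, end) pairs
    inter = (torch.minimum(box1[..., 1], box2[..., 1]) -
             torch.maximum(box1[..., 0], box2[..., 0])).clamp(min=0)
    union = (box1[..., 1] - box1[..., 0]) + (box2[..., 1] - box2[..., 0]) - inter
    iou = inter / union
    enclose = (torch.maximum(box1[..., 1], box2[..., 1]) -
               torch.minimum(box1[..., 0], box2[..., 0])).clamp(min=0)
    giou = iou - (enclose - union) / enclose
    return iou, giou

pred = torch.tensor([[0.2, 0.6]])
gt = torch.tensor([[0.4, 0.8]])
iou, giou = span_iou_giou(pred, gt)
# inter = 0.2, union = 0.6, enclose = 0.6  ->  iou = giou = 1/3
print(iou.item(), giou.item())

For this pair the matched GIoU term of the loss, (1 - giou).mean(), evaluates to 2/3.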