├── .gitignore ├── LICENSE ├── README.md ├── TF_logger.py ├── __init__.py ├── ava ├── __init__.py ├── ava_action_list_v2.0.csv ├── ava_action_list_v2.1_for_activitynet_2018.pbtxt.txt ├── label_map_util.py ├── metrics.py ├── np_box_list.py ├── np_box_list_ops.py ├── np_box_mask_list.py ├── np_box_mask_list_ops.py ├── np_box_ops.py ├── np_mask_ops.py ├── object_detection_evaluation.py ├── per_image_evaluation.py ├── standard_fields.py └── teat.py ├── backbone ├── __init__.py ├── base.py ├── hidden_for_roi.py ├── hidden_for_roi2.py ├── hidden_for_roi_maxpool.py ├── resnet101.py ├── resnet18.py ├── resnet50.py ├── slowfast_res101.py ├── slowfast_res50.py └── slowfastnet.py ├── bbox.py ├── bbox1.py ├── cfg ├── tiny-yolo-voc.cfg ├── yolo-voc.cfg ├── yolo.cfg └── yolov3.cfg ├── config ├── config.py ├── eval_config.py └── train_config.py ├── darknet.py ├── data └── pam.gif ├── dataset ├── AVA.py ├── AVA_video_OLD.py ├── AVA_video_v1.py ├── AVA_video_v2.py ├── base.py ├── coco2017.py ├── coco2017_animal.py ├── coco2017_car.py ├── coco2017_person.py ├── voc2007.py └── voc2007_cat_dog.py ├── deep ├── __init__.py ├── checkpoint │ └── original_ckpt.t7 ├── evaluate.py ├── feature_extractor.py ├── model.py ├── original_model.py ├── test.py ├── train.jpg └── train.py ├── deep_sort.py ├── det ├── det_dog.jpg ├── det_eagle.jpg ├── det_giraffe.jpg ├── det_herd_of_horses.jpg ├── det_img1.jpg ├── det_img2.jpg ├── det_img3.jpg ├── det_img4.jpg ├── det_messi.jpg ├── det_person.jpg └── det_scream.jpg ├── detect.py ├── eval.py ├── evaluator.py ├── extention ├── functional.py └── lr_scheduler.py ├── f.py ├── functional.py ├── get_ava_performance.py ├── img_to_video.py ├── imgs ├── dog.jpg ├── eagle.jpg ├── giraffe.jpg ├── herd_of_horses.jpg ├── img1.jpg ├── img2.jpg ├── img3.jpg ├── img4.jpg ├── messi.jpg ├── person.jpg └── scream.jpg ├── imshow_result.py ├── imshow_result_OLD.py ├── infer.py ├── infer_stream.py ├── infer_websocket.py ├── logger.py ├── logs ├── events.out.tfevents.1555900792.aiuser-Z390-GAMING-X └── events.out.tfevents.1555900949.aiuser-Z390-GAMING-X ├── model.py ├── outputs └── frames │ └── blank.TXT ├── preprocess.py ├── requirements.txt ├── roi ├── pooler.py └── pooler_.py ├── rpn ├── mkf.py └── region_proposal_network.py ├── runs ├── Apr15_19-42-07_aiuser-Z390-GAMING-XNet1 │ └── events.out.tfevents.1555328527.aiuser-Z390-GAMING-X ├── Apr15_19-42-31_aiuser-Z390-GAMING-XNet1 │ └── events.out.tfevents.1555328551.aiuser-Z390-GAMING-X ├── Apr15_19-42-47_aiuser-Z390-GAMING-XNet1 │ └── events.out.tfevents.1555328567.aiuser-Z390-GAMING-X ├── Apr15_19-44-13_aiuser-Z390-GAMING-XNet1 │ └── events.out.tfevents.1555328653.aiuser-Z390-GAMING-X ├── Apr15_19-47-03_aiuser-Z390-GAMING-XNet1 │ └── events.out.tfevents.1555328823.aiuser-Z390-GAMING-X ├── Apr15_19-53-21_aiuser-Z390-GAMING-XNet1 │ └── events.out.tfevents.1555329201.aiuser-Z390-GAMING-X ├── Apr15_19-56-51_aiuser-Z390-GAMING-XNet1 │ └── events.out.tfevents.1555329411.aiuser-Z390-GAMING-X ├── Apr15_20-00-31_aiuser-Z390-GAMING-XNet1 │ └── events.out.tfevents.1555329631.aiuser-Z390-GAMING-X └── Apr15_20-12-31_aiuser-Z390-GAMING-XNet1 │ └── events.out.tfevents.1555330351.aiuser-Z390-GAMING-X ├── scripts ├── coco2017 │ ├── eval.sh │ ├── infer.sh │ ├── train-bs1.sh │ ├── train-bs16.sh │ ├── train-bs2.sh │ ├── train-bs4.sh │ └── train-bs8.sh └── voc2007 │ ├── eval.sh │ ├── infer.sh │ ├── train-bs1.sh │ ├── train-bs16.sh │ ├── train-bs2.sh │ ├── train-bs4.sh │ └── train-bs8.sh ├── slow_fast.ipynb ├── sort ├── __init__.py ├── detection.py ├── iou_matching.py 
├── kalman_filter.py ├── linear_assignment.py ├── nn_matching.py ├── preprocessing.py ├── track.py └── tracker.py ├── support ├── layer │ ├── nms.py │ └── roi_align.py ├── setup.py └── src │ ├── ROIAlign.h │ ├── cpu │ ├── ROIAlign_cpu.cpp │ ├── nms_cpu.cpp │ └── vision.h │ ├── cuda │ ├── ROIAlign_cuda.cu │ ├── nms.cu │ └── vision.h │ ├── nms.h │ └── vision.cpp ├── test.py ├── test └── nms │ ├── nms-large-input.npy │ ├── nms-large-output.npy │ └── test_nms.py ├── test_con.py ├── test_daptice.py ├── test_nms.py ├── tiny-yolo-voc.cfg ├── train.py ├── trainvideo.py ├── util.py ├── video_demo.py ├── voc_eval.py ├── yolo-voc.cfg ├── yolo.cfg └── yolov3.cfg /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Slow-Fast-pytorch-implementation with Colab notebook 2 | ![](data/pam.gif) 3 | 4 | # Run the demo on your own data 5 | 6 | 1.Clone the repository: git clone https://github.com/vaib-saxena/Slow-Fast-pytorch-implementation.git 7 | 8 | 2.Download Yolo v3 weights: https://drive.google.com/file/d/1SSpVueL6W_4BE3sFDkzAgdMd35Mtl2N5/view?usp=sharing and paste in the directory 9 | 10 | 3.Download DeepSort re-id weights: https://drive.google.com/file/d/1bwLHXS5TocUfDL2-iLNJLs8WfUOZtg9B/view?usp=sharing and paste in deep\checkpoint directory 11 | 12 | 4.Download Pre-trained SlowFast Network weights: https://drive.google.com/file/d/1ooE-qh7LBL7kWceZRHPyIIBslWCBwdwy/view?usp=sharing and paste in the directory 13 | 14 | 5.Modify the weights path and your video path in video_demo.py. 15 | 16 | 6.Run video_demo.py. 17 | 18 | # Colab notebook 19 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vaib-saxena/Slow-Fast-pytorch-implementation/blob/master/slow_fast.ipynb) 20 | 21 | 22 | # Dependencies 23 | - python 3 (python2 not sure) 24 | - numpy 25 | - scipy 26 | - opencv-python 27 | - torch >= 1.0.0 28 | - torchvision = 0.2.1 29 | - youtube-dl 30 | - ffmpeg 31 | 32 | 33 | # Reference 34 | - paper: [Slow Fast Networks](https://arxiv.org/pdf/1812.03982.pdf) 35 | 36 | - code: [facebookresearch/SlowFast](https://github.com/facebookresearch/SlowFast) 37 | 38 | - https://github.com/MagicChuyi/SlowFast-Network-pytorch 39 | 40 | - paper: [Simple Online and Realtime Tracking with a Deep Association Metric](https://arxiv.org/abs/1703.07402) 41 | 42 | - code: [nwojke/deep_sort](https://github.com/nwojke/deep_sort) 43 | 44 | - paper: [YOLOv3](https://pjreddie.com/media/files/papers/YOLOv3.pdf) 45 | 46 | - code: [Joseph Redmon/yolov3](https://pjreddie.com/darknet/yolo/) 47 | -------------------------------------------------------------------------------- /TF_logger.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import scipy.misc 4 | try: 5 | from StringIO import StringIO # Python 2.7 6 | except ImportError: 7 | from io import BytesIO # Python 3.x 8 | 9 | 10 | class Logger(object): 11 | 12 | def __init__(self, log_dir): 13 | """Create a summary writer logging to log_dir.""" 14 | # 创建一个指向log文件夹的summary writer 15 | self.writer = tf.summary.FileWriter(log_dir) 16 | 17 | def scalar_summary(self, tag, value, step): 18 | """Log a scalar variable.""" 19 | # 标量信息 日志 20 | summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)]) 21 | self.writer.add_summary(summary, step) 22 | 23 | def image_summary(self, tag, images, step): 24 | """Log a list of images.""" 25 | # 图像信息 日志 26 | img_summaries = 
[] 27 | for i, img in enumerate(images): 28 | # Write the image to a string 29 | try: 30 | s = StringIO() 31 | except: 32 | s = BytesIO() 33 | scipy.misc.toimage(img).save(s, format="png") 34 | 35 | # Create an Image object 36 | img_sum = tf.Summary.Image(encoded_image_string=s.getvalue(), 37 | height=img.shape[0], 38 | width=img.shape[1]) 39 | # Create a Summary value 40 | img_summaries.append(tf.Summary.Value(tag='%s/%d' % (tag, i), image=img_sum)) 41 | 42 | # Create and write Summary 43 | summary = tf.Summary(value=img_summaries) 44 | self.writer.add_summary(summary, step) 45 | 46 | def histo_summary(self, tag, values, step, bins=1000): 47 | """Log a histogram of the tensor of values.""" 48 | # 直方图信息 日志 49 | # Create a histogram using numpy 50 | counts, bin_edges = np.histogram(values, bins=bins) 51 | 52 | # Fill the fields of the histogram proto 53 | hist = tf.HistogramProto() 54 | hist.min = float(np.min(values)) 55 | hist.max = float(np.max(values)) 56 | hist.num = int(np.prod(values.shape)) 57 | hist.sum = float(np.sum(values)) 58 | hist.sum_squares = float(np.sum(values ** 2)) 59 | 60 | # Drop the start of the first bin 61 | bin_edges = bin_edges[1:] 62 | 63 | # Add bin edges and counts 64 | for edge in bin_edges: 65 | hist.bucket_limit.append(edge) 66 | for c in counts: 67 | hist.bucket.append(c) 68 | 69 | # Create and write Summary 70 | summary = tf.Summary(value=[tf.Summary.Value(tag=tag, histo=hist)]) 71 | self.writer.add_summary(summary, step) 72 | self.writer.flush() -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/__init__.py -------------------------------------------------------------------------------- /ava/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/ava/__init__.py -------------------------------------------------------------------------------- /ava/ava_action_list_v2.0.csv: -------------------------------------------------------------------------------- 1 | label_id,label_name,label_type 2 | 1,bend/bow (at the waist),PERSON_MOVEMENT 3 | 2,crawl,PERSON_MOVEMENT 4 | 3,crouch/kneel,PERSON_MOVEMENT 5 | 4,dance,PERSON_MOVEMENT 6 | 5,fall down,PERSON_MOVEMENT 7 | 6,get up,PERSON_MOVEMENT 8 | 7,jump/leap,PERSON_MOVEMENT 9 | 8,lie/sleep,PERSON_MOVEMENT 10 | 9,martial art,PERSON_MOVEMENT 11 | 10,run/jog,PERSON_MOVEMENT 12 | 11,sit,PERSON_MOVEMENT 13 | 12,stand,PERSON_MOVEMENT 14 | 13,swim,PERSON_MOVEMENT 15 | 14,walk,PERSON_MOVEMENT 16 | 15,answer phone,OBJECT_MANIPULATION 17 | 16,brush teeth,OBJECT_MANIPULATION 18 | 17,carry/hold (an object),OBJECT_MANIPULATION 19 | 18,catch (an object),OBJECT_MANIPULATION 20 | 19,chop,OBJECT_MANIPULATION 21 | 20,"climb (e.g., a mountain)",OBJECT_MANIPULATION 22 | 21,clink glass,OBJECT_MANIPULATION 23 | 22,"close (e.g., a door, a box)",OBJECT_MANIPULATION 24 | 23,cook,OBJECT_MANIPULATION 25 | 24,cut,OBJECT_MANIPULATION 26 | 25,dig,OBJECT_MANIPULATION 27 | 26,dress/put on clothing,OBJECT_MANIPULATION 28 | 27,drink,OBJECT_MANIPULATION 29 | 28,"drive (e.g., a car, a truck)",OBJECT_MANIPULATION 30 | 29,eat,OBJECT_MANIPULATION 31 | 30,enter,OBJECT_MANIPULATION 32 | 31,exit,OBJECT_MANIPULATION 33 | 32,extract,OBJECT_MANIPULATION 34 | 
33,fishing,OBJECT_MANIPULATION 35 | 34,hit (an object),OBJECT_MANIPULATION 36 | 35,kick (an object),OBJECT_MANIPULATION 37 | 36,lift/pick up,OBJECT_MANIPULATION 38 | 37,"listen (e.g., to music)",OBJECT_MANIPULATION 39 | 38,"open (e.g., a window, a car door)",OBJECT_MANIPULATION 40 | 39,paint,OBJECT_MANIPULATION 41 | 40,play board game,OBJECT_MANIPULATION 42 | 41,play musical instrument,OBJECT_MANIPULATION 43 | 42,play with pets,OBJECT_MANIPULATION 44 | 43,point to (an object),OBJECT_MANIPULATION 45 | 44,press,OBJECT_MANIPULATION 46 | 45,pull (an object),OBJECT_MANIPULATION 47 | 46,push (an object),OBJECT_MANIPULATION 48 | 47,put down,OBJECT_MANIPULATION 49 | 48,read,OBJECT_MANIPULATION 50 | 49,"ride (e.g., a bike, a car, a horse)",OBJECT_MANIPULATION 51 | 50,row boat,OBJECT_MANIPULATION 52 | 51,sail boat,OBJECT_MANIPULATION 53 | 52,shoot,OBJECT_MANIPULATION 54 | 53,shovel,OBJECT_MANIPULATION 55 | 54,smoke,OBJECT_MANIPULATION 56 | 55,stir,OBJECT_MANIPULATION 57 | 56,take a photo,OBJECT_MANIPULATION 58 | 57,text on/look at a cellphone,OBJECT_MANIPULATION 59 | 58,throw,OBJECT_MANIPULATION 60 | 59,touch (an object),OBJECT_MANIPULATION 61 | 60,"turn (e.g., a screwdriver)",OBJECT_MANIPULATION 62 | 61,"watch (e.g., TV)",OBJECT_MANIPULATION 63 | 62,work on a computer,OBJECT_MANIPULATION 64 | 63,write,OBJECT_MANIPULATION 65 | 64,fight/hit (a person),PERSON_INTERACTION 66 | 65,give/serve (an object) to (a person),PERSON_INTERACTION 67 | 66,grab (a person),PERSON_INTERACTION 68 | 67,hand clap,PERSON_INTERACTION 69 | 68,hand shake,PERSON_INTERACTION 70 | 69,hand wave,PERSON_INTERACTION 71 | 70,hug (a person),PERSON_INTERACTION 72 | 71,kick (a person),PERSON_INTERACTION 73 | 72,kiss (a person),PERSON_INTERACTION 74 | 73,lift (a person),PERSON_INTERACTION 75 | 74,listen to (a person),PERSON_INTERACTION 76 | 75,play with kids,PERSON_INTERACTION 77 | 76,push (another person),PERSON_INTERACTION 78 | 77,"sing to (e.g., self, a person, a group)",PERSON_INTERACTION 79 | 78,take (an object) from (a person),PERSON_INTERACTION 80 | 79,"talk to (e.g., self, a person, a group)",PERSON_INTERACTION 81 | 80,watch (a person),PERSON_INTERACTION 82 | -------------------------------------------------------------------------------- /ava/ava_action_list_v2.1_for_activitynet_2018.pbtxt.txt: -------------------------------------------------------------------------------- 1 | item { 2 | name: "bend/bow (at the waist)" 3 | id: 1 4 | } 5 | item { 6 | name: "crouch/kneel" 7 | id: 3 8 | } 9 | item { 10 | name: "dance" 11 | id: 4 12 | } 13 | item { 14 | name: "fall down" 15 | id: 5 16 | } 17 | item { 18 | name: "get up" 19 | id: 6 20 | } 21 | item { 22 | name: "jump/leap" 23 | id: 7 24 | } 25 | item { 26 | name: "lie/sleep" 27 | id: 8 28 | } 29 | item { 30 | name: "martial art" 31 | id: 9 32 | } 33 | item { 34 | name: "run/jog" 35 | id: 10 36 | } 37 | item { 38 | name: "sit" 39 | id: 11 40 | } 41 | item { 42 | name: "stand" 43 | id: 12 44 | } 45 | item { 46 | name: "swim" 47 | id: 13 48 | } 49 | item { 50 | name: "walk" 51 | id: 14 52 | } 53 | item { 54 | name: "answer phone" 55 | id: 15 56 | } 57 | item { 58 | name: "carry/hold (an object)" 59 | id: 17 60 | } 61 | item { 62 | name: "climb (e.g., a mountain)" 63 | id: 20 64 | } 65 | item { 66 | name: "close (e.g., a door, a box)" 67 | id: 22 68 | } 69 | item { 70 | name: "cut" 71 | id: 24 72 | } 73 | item { 74 | name: "dress/put on clothing" 75 | id: 26 76 | } 77 | item { 78 | name: "drink" 79 | id: 27 80 | } 81 | item { 82 | name: "drive (e.g., a car, a truck)" 83 | id: 28 84 | } 
85 | item { 86 | name: "eat" 87 | id: 29 88 | } 89 | item { 90 | name: "enter" 91 | id: 30 92 | } 93 | item { 94 | name: "hit (an object)" 95 | id: 34 96 | } 97 | item { 98 | name: "lift/pick up" 99 | id: 36 100 | } 101 | item { 102 | name: "listen (e.g., to music)" 103 | id: 37 104 | } 105 | item { 106 | name: "open (e.g., a window, a car door)" 107 | id: 38 108 | } 109 | item { 110 | name: "play musical instrument" 111 | id: 41 112 | } 113 | item { 114 | name: "point to (an object)" 115 | id: 43 116 | } 117 | item { 118 | name: "pull (an object)" 119 | id: 45 120 | } 121 | item { 122 | name: "push (an object)" 123 | id: 46 124 | } 125 | item { 126 | name: "put down" 127 | id: 47 128 | } 129 | item { 130 | name: "read" 131 | id: 48 132 | } 133 | item { 134 | name: "ride (e.g., a bike, a car, a horse)" 135 | id: 49 136 | } 137 | item { 138 | name: "sail boat" 139 | id: 51 140 | } 141 | item { 142 | name: "shoot" 143 | id: 52 144 | } 145 | item { 146 | name: "smoke" 147 | id: 54 148 | } 149 | item { 150 | name: "take a photo" 151 | id: 56 152 | } 153 | item { 154 | name: "text on/look at a cellphone" 155 | id: 57 156 | } 157 | item { 158 | name: "throw" 159 | id: 58 160 | } 161 | item { 162 | name: "touch (an object)" 163 | id: 59 164 | } 165 | item { 166 | name: "turn (e.g., a screwdriver)" 167 | id: 60 168 | } 169 | item { 170 | name: "watch (e.g., TV)" 171 | id: 61 172 | } 173 | item { 174 | name: "work on a computer" 175 | id: 62 176 | } 177 | item { 178 | name: "write" 179 | id: 63 180 | } 181 | item { 182 | name: "fight/hit (a person)" 183 | id: 64 184 | } 185 | item { 186 | name: "give/serve (an object) to (a person)" 187 | id: 65 188 | } 189 | item { 190 | name: "grab (a person)" 191 | id: 66 192 | } 193 | item { 194 | name: "hand clap" 195 | id: 67 196 | } 197 | item { 198 | name: "hand shake" 199 | id: 68 200 | } 201 | item { 202 | name: "hand wave" 203 | id: 69 204 | } 205 | item { 206 | name: "hug (a person)" 207 | id: 70 208 | } 209 | item { 210 | name: "kiss (a person)" 211 | id: 72 212 | } 213 | item { 214 | name: "lift (a person)" 215 | id: 73 216 | } 217 | item { 218 | name: "listen to (a person)" 219 | id: 74 220 | } 221 | item { 222 | name: "push (another person)" 223 | id: 76 224 | } 225 | item { 226 | name: "sing to (e.g., self, a person, a group)" 227 | id: 77 228 | } 229 | item { 230 | name: "take (an object) from (a person)" 231 | id: 78 232 | } 233 | item { 234 | name: "talk to (e.g., self, a person, a group)" 235 | id: 79 236 | } 237 | item { 238 | name: "watch (a person)" 239 | id: 80 240 | } 241 | -------------------------------------------------------------------------------- /ava/label_map_util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """Label map utility functions.""" 16 | 17 | import logging 18 | 19 | # from google.protobuf import text_format 20 | # from google3.third_party.tensorflow_models.object_detection.protos import string_int_label_map_pb2 21 | 22 | 23 | def _validate_label_map(label_map): 24 | """Checks if a label map is valid. 25 | 26 | Args: 27 | label_map: StringIntLabelMap to validate. 28 | 29 | Raises: 30 | ValueError: if label map is invalid. 31 | """ 32 | for item in label_map.item: 33 | if item.id < 1: 34 | raise ValueError('Label map ids should be >= 1.') 35 | 36 | 37 | def create_category_index(categories): 38 | """Creates dictionary of COCO compatible categories keyed by category id. 39 | 40 | Args: 41 | categories: a list of dicts, each of which has the following keys: 42 | 'id': (required) an integer id uniquely identifying this category. 43 | 'name': (required) string representing category name 44 | e.g., 'cat', 'dog', 'pizza'. 45 | 46 | Returns: 47 | category_index: a dict containing the same entries as categories, but keyed 48 | by the 'id' field of each category. 49 | """ 50 | category_index = {} 51 | for cat in categories: 52 | category_index[cat['id']] = cat 53 | return category_index 54 | 55 | 56 | def get_max_label_map_index(label_map): 57 | """Get maximum index in label map. 58 | 59 | Args: 60 | label_map: a StringIntLabelMapProto 61 | 62 | Returns: 63 | an integer 64 | """ 65 | return max([item.id for item in label_map.item]) 66 | 67 | 68 | def convert_label_map_to_categories(label_map, 69 | max_num_classes, 70 | use_display_name=True): 71 | """Loads label map proto and returns categories list compatible with eval. 72 | 73 | This function loads a label map and returns a list of dicts, each of which 74 | has the following keys: 75 | 'id': (required) an integer id uniquely identifying this category. 76 | 'name': (required) string representing category name 77 | e.g., 'cat', 'dog', 'pizza'. 78 | We only allow class into the list if its id-label_id_offset is 79 | between 0 (inclusive) and max_num_classes (exclusive). 80 | If there are several items mapping to the same id in the label map, 81 | we will only keep the first one in the categories list. 82 | 83 | Args: 84 | label_map: a StringIntLabelMapProto or None. If None, a default categories 85 | list is created with max_num_classes categories. 86 | max_num_classes: maximum number of (consecutive) label indices to include. 87 | use_display_name: (boolean) choose whether to load 'display_name' field 88 | as category name. If False or if the display_name field does not exist, 89 | uses 'name' field as category names instead. 90 | Returns: 91 | categories: a list of dictionaries representing all possible categories. 
92 | """ 93 | categories = [] 94 | list_of_ids_already_added = [] 95 | if not label_map: 96 | label_id_offset = 1 97 | for class_id in range(max_num_classes): 98 | categories.append({ 99 | 'id': class_id + label_id_offset, 100 | 'name': 'category_{}'.format(class_id + label_id_offset) 101 | }) 102 | return categories 103 | for item in label_map.item: 104 | if not 0 < item.id <= max_num_classes: 105 | logging.info('Ignore item %d since it falls outside of requested ' 106 | 'label range.', item.id) 107 | continue 108 | if use_display_name and item.HasField('display_name'): 109 | name = item.display_name 110 | else: 111 | name = item.name 112 | if item.id not in list_of_ids_already_added: 113 | list_of_ids_already_added.append(item.id) 114 | categories.append({'id': item.id, 'name': name}) 115 | return categories 116 | 117 | 118 | def load_labelmap(path): 119 | """Loads label map proto. 120 | 121 | Args: 122 | path: path to StringIntLabelMap proto text file. 123 | Returns: 124 | a StringIntLabelMapProto 125 | """ 126 | with open(path, 'r') as fid: 127 | label_map_string = fid.read() 128 | label_map = string_int_label_map_pb2.StringIntLabelMap() 129 | try: 130 | text_format.Merge(label_map_string, label_map) 131 | except text_format.ParseError: 132 | label_map.ParseFromString(label_map_string) 133 | _validate_label_map(label_map) 134 | return label_map 135 | 136 | 137 | def get_label_map_dict(label_map_path, use_display_name=False): 138 | """Reads a label map and returns a dictionary of label names to id. 139 | 140 | Args: 141 | label_map_path: path to label_map. 142 | use_display_name: whether to use the label map items' display names as keys. 143 | 144 | Returns: 145 | A dictionary mapping label names to id. 146 | """ 147 | label_map = load_labelmap(label_map_path) 148 | label_map_dict = {} 149 | for item in label_map.item: 150 | if use_display_name: 151 | label_map_dict[item.display_name] = item.id 152 | else: 153 | label_map_dict[item.name] = item.id 154 | return label_map_dict 155 | 156 | 157 | def create_category_index_from_labelmap(label_map_path): 158 | """Reads a label map and returns a category index. 159 | 160 | Args: 161 | label_map_path: Path to `StringIntLabelMap` proto text file. 162 | 163 | Returns: 164 | A category index, which is a dictionary that maps integer ids to dicts 165 | containing categories, e.g. 166 | {1: {'id': 1, 'name': 'dog'}, 2: {'id': 2, 'name': 'cat'}, ...} 167 | """ 168 | label_map = load_labelmap(label_map_path) 169 | max_num_classes = max(item.id for item in label_map.item) 170 | categories = convert_label_map_to_categories(label_map, max_num_classes) 171 | return create_category_index(categories) 172 | 173 | 174 | def create_class_agnostic_category_index(): 175 | """Creates a category index with a single `object` class.""" 176 | return {1: {'id': 1, 'name': 'object'}} 177 | -------------------------------------------------------------------------------- /ava/metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Functions for computing metrics like precision, recall, CorLoc and etc.""" 17 | from __future__ import division 18 | 19 | import numpy as np 20 | 21 | 22 | def compute_precision_recall(scores, labels, num_gt): 23 | """Compute precision and recall. 24 | 25 | Args: 26 | scores: A float numpy array representing detection score 27 | labels: A boolean numpy array representing true/false positive labels 28 | num_gt: Number of ground truth instances 29 | 30 | Raises: 31 | ValueError: if the input is not of the correct format 32 | 33 | Returns: 34 | precision: Fraction of positive instances over detected ones. This value is 35 | None if no ground truth labels are present. 36 | recall: Fraction of detected positive instance over all positive instances. 37 | This value is None if no ground truth labels are present. 38 | 39 | """ 40 | if not isinstance( 41 | labels, np.ndarray) or labels.dtype != np.bool or len(labels.shape) != 1: 42 | raise ValueError("labels must be single dimension bool numpy array") 43 | 44 | if not isinstance( 45 | scores, np.ndarray) or len(scores.shape) != 1: 46 | raise ValueError("scores must be single dimension numpy array") 47 | 48 | if num_gt < np.sum(labels): 49 | raise ValueError("Number of true positives must be smaller than num_gt.") 50 | 51 | if len(scores) != len(labels): 52 | raise ValueError("scores and labels must be of the same size.") 53 | 54 | if num_gt == 0: 55 | return None, None 56 | 57 | sorted_indices = np.argsort(scores) 58 | sorted_indices = sorted_indices[::-1] 59 | labels = labels.astype(int) 60 | true_positive_labels = labels[sorted_indices] 61 | false_positive_labels = 1 - true_positive_labels 62 | cum_true_positives = np.cumsum(true_positive_labels) 63 | cum_false_positives = np.cumsum(false_positive_labels) 64 | precision = cum_true_positives.astype(float) / ( 65 | cum_true_positives + cum_false_positives) 66 | recall = cum_true_positives.astype(float) / num_gt 67 | return precision, recall 68 | 69 | 70 | def compute_average_precision(precision, recall): 71 | """Compute Average Precision according to the definition in VOCdevkit. 72 | 73 | Precision is modified to ensure that it does not decrease as recall 74 | decrease. 75 | 76 | Args: 77 | precision: A float [N, 1] numpy array of precisions 78 | recall: A float [N, 1] numpy array of recalls 79 | 80 | Raises: 81 | ValueError: if the input is not of the correct format 82 | 83 | Returns: 84 | average_precison: The area under the precision recall curve. NaN if 85 | precision and recall are None. 
86 | 87 | """ 88 | if precision is None: 89 | if recall is not None: 90 | raise ValueError("If precision is None, recall must also be None") 91 | return np.NAN 92 | 93 | if not isinstance(precision, np.ndarray) or not isinstance(recall, 94 | np.ndarray): 95 | raise ValueError("precision and recall must be numpy array") 96 | if precision.dtype != np.float or recall.dtype != np.float: 97 | raise ValueError("input must be float numpy array.") 98 | if len(precision) != len(recall): 99 | raise ValueError("precision and recall must be of the same size.") 100 | if not precision.size: 101 | return 0.0 102 | if np.amin(precision) < 0 or np.amax(precision) > 1: 103 | raise ValueError("Precision must be in the range of [0, 1].") 104 | if np.amin(recall) < 0 or np.amax(recall) > 1: 105 | raise ValueError("recall must be in the range of [0, 1].") 106 | if not all(recall[i] <= recall[i + 1] for i in range(len(recall) - 1)): 107 | raise ValueError("recall must be a non-decreasing array") 108 | 109 | recall = np.concatenate([[0], recall, [1]]) 110 | precision = np.concatenate([[0], precision, [0]]) 111 | 112 | # Preprocess precision to be a non-decreasing array 113 | for i in range(len(precision) - 2, -1, -1): 114 | precision[i] = np.maximum(precision[i], precision[i + 1]) 115 | 116 | indices = np.where(recall[1:] != recall[:-1])[0] + 1 117 | average_precision = np.sum( 118 | (recall[indices] - recall[indices - 1]) * precision[indices]) 119 | return average_precision 120 | 121 | 122 | def compute_cor_loc(num_gt_imgs_per_class, 123 | num_images_correctly_detected_per_class): 124 | """Compute CorLoc according to the definition in the following paper. 125 | 126 | https://www.robots.ox.ac.uk/~vgg/rg/papers/deselaers-eccv10.pdf 127 | 128 | Returns nans if there are no ground truth images for a class. 129 | 130 | Args: 131 | num_gt_imgs_per_class: 1D array, representing number of images containing 132 | at least one object instance of a particular class 133 | num_images_correctly_detected_per_class: 1D array, representing number of 134 | images that are correctly detected at least one object instance of a 135 | particular class 136 | 137 | Returns: 138 | corloc_per_class: A float numpy array represents the corloc score of each 139 | class 140 | """ 141 | # Divide by zero expected for classes with no gt examples. 142 | with np.errstate(divide="ignore", invalid="ignore"): 143 | return np.where( 144 | num_gt_imgs_per_class == 0, np.nan, 145 | num_images_correctly_detected_per_class / num_gt_imgs_per_class) 146 | -------------------------------------------------------------------------------- /ava/np_box_list.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | """Numpy BoxList classes and functions.""" 17 | 18 | import numpy as np 19 | 20 | 21 | class BoxList(object): 22 | """Box collection. 23 | 24 | BoxList represents a list of bounding boxes as numpy array, where each 25 | bounding box is represented as a row of 4 numbers, 26 | [y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes within a 27 | given list correspond to a single image. 28 | 29 | Optionally, users can add additional related fields (such as 30 | objectness/classification scores). 31 | """ 32 | 33 | def __init__(self, data): 34 | """Constructs box collection. 35 | 36 | Args: 37 | data: a numpy array of shape [N, 4] representing box coordinates 38 | 39 | Raises: 40 | ValueError: if bbox data is not a numpy array 41 | ValueError: if invalid dimensions for bbox data 42 | """ 43 | if not isinstance(data, np.ndarray): 44 | raise ValueError('data must be a numpy array.') 45 | if len(data.shape) != 2 or data.shape[1] != 4: 46 | raise ValueError('Invalid dimensions for box data.') 47 | if data.dtype != np.float32 and data.dtype != np.float64: 48 | raise ValueError('Invalid data type for box data: float is required.') 49 | if not self._is_valid_boxes(data): 50 | raise ValueError('Invalid box data. data must be a numpy array of ' 51 | 'N*[y_min, x_min, y_max, x_max]') 52 | self.data = {'boxes': data} 53 | 54 | def num_boxes(self): 55 | """Return number of boxes held in collections.""" 56 | return self.data['boxes'].shape[0] 57 | 58 | def get_extra_fields(self): 59 | """Return all non-box fields.""" 60 | return [k for k in self.data.keys() if k != 'boxes'] 61 | 62 | def has_field(self, field): 63 | return field in self.data 64 | 65 | def add_field(self, field, field_data): 66 | """Add data to a specified field. 67 | 68 | Args: 69 | field: a string parameter used to speficy a related field to be accessed. 70 | field_data: a numpy array of [N, ...] representing the data associated 71 | with the field. 72 | Raises: 73 | ValueError: if the field is already exist or the dimension of the field 74 | data does not matches the number of boxes. 75 | """ 76 | if self.has_field(field): 77 | raise ValueError('Field ' + field + 'already exists') 78 | if len(field_data.shape) < 1 or field_data.shape[0] != self.num_boxes(): 79 | raise ValueError('Invalid dimensions for field data') 80 | self.data[field] = field_data 81 | 82 | def get(self): 83 | """Convenience function for accesssing box coordinates. 84 | 85 | Returns: 86 | a numpy array of shape [N, 4] representing box corners 87 | """ 88 | return self.get_field('boxes') 89 | 90 | def get_field(self, field): 91 | """Accesses data associated with the specified field in the box collection. 92 | 93 | Args: 94 | field: a string parameter used to speficy a related field to be accessed. 95 | 96 | Returns: 97 | a numpy 1-d array representing data of an associated field 98 | 99 | Raises: 100 | ValueError: if invalid field 101 | """ 102 | if not self.has_field(field): 103 | raise ValueError('field {} does not exist'.format(field)) 104 | return self.data[field] 105 | 106 | def get_coordinates(self): 107 | """Get corner coordinates of boxes. 
108 | 109 | Returns: 110 | a list of 4 1-d numpy arrays [y_min, x_min, y_max, x_max] 111 | """ 112 | box_coordinates = self.get() 113 | y_min = box_coordinates[:, 0] 114 | x_min = box_coordinates[:, 1] 115 | y_max = box_coordinates[:, 2] 116 | x_max = box_coordinates[:, 3] 117 | return [y_min, x_min, y_max, x_max] 118 | 119 | def _is_valid_boxes(self, data): 120 | """Check whether data fullfills the format of N*[ymin, xmin, ymax, xmin]. 121 | 122 | Args: 123 | data: a numpy array of shape [N, 4] representing box coordinates 124 | 125 | Returns: 126 | a boolean indicating whether all ymax of boxes are equal or greater than 127 | ymin, and all xmax of boxes are equal or greater than xmin. 128 | """ 129 | if data.shape[0] > 0: 130 | for i in range(data.shape[0]): 131 | if data[i, 0] > data[i, 2] or data[i, 1] > data[i, 3]: 132 | return False 133 | return True 134 | -------------------------------------------------------------------------------- /ava/np_box_mask_list.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Numpy BoxMaskList classes and functions.""" 17 | 18 | import numpy as np 19 | import ava.np_box_list as np_box_list 20 | 21 | 22 | class BoxMaskList(np_box_list.BoxList): 23 | """Convenience wrapper for BoxList with masks. 24 | 25 | BoxMaskList extends the np_box_list.BoxList to contain masks as well. 26 | In particular, its constructor receives both boxes and masks. Note that the 27 | masks correspond to the full image. 28 | """ 29 | 30 | def __init__(self, box_data, mask_data): 31 | """Constructs box collection. 32 | 33 | Args: 34 | box_data: a numpy array of shape [N, 4] representing box coordinates 35 | mask_data: a numpy array of shape [N, height, width] representing masks 36 | with values are in {0,1}. The masks correspond to the full 37 | image. The height and the width will be equal to image height and width. 38 | 39 | Raises: 40 | ValueError: if bbox data is not a numpy array 41 | ValueError: if invalid dimensions for bbox data 42 | ValueError: if mask data is not a numpy array 43 | ValueError: if invalid dimension for mask data 44 | """ 45 | super(BoxMaskList, self).__init__(box_data) 46 | if not isinstance(mask_data, np.ndarray): 47 | raise ValueError('Mask data must be a numpy array.') 48 | if len(mask_data.shape) != 3: 49 | raise ValueError('Invalid dimensions for mask data.') 50 | if mask_data.dtype != np.uint8: 51 | raise ValueError('Invalid data type for mask data: uint8 is required.') 52 | if mask_data.shape[0] != box_data.shape[0]: 53 | raise ValueError('There should be the same number of boxes and masks.') 54 | self.data['masks'] = mask_data 55 | 56 | def get_masks(self): 57 | """Convenience function for accessing masks. 
58 | 59 | Returns: 60 | a numpy array of shape [N, height, width] representing masks 61 | """ 62 | return self.get_field('masks') 63 | 64 | -------------------------------------------------------------------------------- /ava/np_box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Operations for [N, 4] numpy arrays representing bounding boxes. 17 | 18 | Example box operations that are supported: 19 | * Areas: compute bounding box areas 20 | * IOU: pairwise intersection-over-union scores 21 | """ 22 | import numpy as np 23 | 24 | 25 | def area(boxes): 26 | """Computes area of boxes. 27 | 28 | Args: 29 | boxes: Numpy array with shape [N, 4] holding N boxes 30 | 31 | Returns: 32 | a numpy array with shape [N*1] representing box areas 33 | """ 34 | return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) 35 | 36 | 37 | def intersection(boxes1, boxes2): 38 | """Compute pairwise intersection areas between boxes. 39 | 40 | Args: 41 | boxes1: a numpy array with shape [N, 4] holding N boxes 42 | boxes2: a numpy array with shape [M, 4] holding M boxes 43 | 44 | Returns: 45 | a numpy array with shape [N*M] representing pairwise intersection area 46 | """ 47 | [y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1) 48 | [y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1) 49 | 50 | all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2)) 51 | all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2)) 52 | intersect_heights = np.maximum( 53 | np.zeros(all_pairs_max_ymin.shape), 54 | all_pairs_min_ymax - all_pairs_max_ymin) 55 | all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2)) 56 | all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2)) 57 | intersect_widths = np.maximum( 58 | np.zeros(all_pairs_max_xmin.shape), 59 | all_pairs_min_xmax - all_pairs_max_xmin) 60 | return intersect_heights * intersect_widths 61 | 62 | 63 | def iou(boxes1, boxes2): 64 | """Computes pairwise intersection-over-union between box collections. 65 | 66 | Args: 67 | boxes1: a numpy array with shape [N, 4] holding N boxes. 68 | boxes2: a numpy array with shape [M, 4] holding N boxes. 69 | 70 | Returns: 71 | a numpy array with shape [N, M] representing pairwise iou scores. 72 | """ 73 | intersect = intersection(boxes1, boxes2) 74 | area1 = area(boxes1) 75 | area2 = area(boxes2) 76 | union = np.expand_dims(area1, axis=1) + np.expand_dims( 77 | area2, axis=0) - intersect 78 | return intersect / union 79 | 80 | 81 | def ioa(boxes1, boxes2): 82 | """Computes pairwise intersection-over-area between box collections. 83 | 84 | Intersection-over-area (ioa) between two boxes box1 and box2 is defined as 85 | their intersection area over box2's area. 
Note that ioa is not symmetric, 86 | that is, IOA(box1, box2) != IOA(box2, box1). 87 | 88 | Args: 89 | boxes1: a numpy array with shape [N, 4] holding N boxes. 90 | boxes2: a numpy array with shape [M, 4] holding N boxes. 91 | 92 | Returns: 93 | a numpy array with shape [N, M] representing pairwise ioa scores. 94 | """ 95 | intersect = intersection(boxes1, boxes2) 96 | areas = np.expand_dims(area(boxes2), axis=0) 97 | return intersect / areas 98 | -------------------------------------------------------------------------------- /ava/np_mask_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Operations for [N, height, width] numpy arrays representing masks. 17 | 18 | Example mask operations that are supported: 19 | * Areas: compute mask areas 20 | * IOU: pairwise intersection-over-union scores 21 | """ 22 | import numpy as np 23 | 24 | EPSILON = 1e-7 25 | 26 | 27 | def area(masks): 28 | """Computes area of masks. 29 | 30 | Args: 31 | masks: Numpy array with shape [N, height, width] holding N masks. Masks 32 | values are of type np.uint8 and values are in {0,1}. 33 | 34 | Returns: 35 | a numpy array with shape [N*1] representing mask areas. 36 | 37 | Raises: 38 | ValueError: If masks.dtype is not np.uint8 39 | """ 40 | if masks.dtype != np.uint8: 41 | raise ValueError('Masks type should be np.uint8') 42 | return np.sum(masks, axis=(1, 2), dtype=np.float32) 43 | 44 | 45 | def intersection(masks1, masks2): 46 | """Compute pairwise intersection areas between masks. 47 | 48 | Args: 49 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 50 | values are of type np.uint8 and values are in {0,1}. 51 | masks2: a numpy array with shape [M, height, width] holding M masks. Masks 52 | values are of type np.uint8 and values are in {0,1}. 53 | 54 | Returns: 55 | a numpy array with shape [N*M] representing pairwise intersection area. 56 | 57 | Raises: 58 | ValueError: If masks1 and masks2 are not of type np.uint8. 59 | """ 60 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 61 | raise ValueError('masks1 and masks2 should be of type np.uint8') 62 | n = masks1.shape[0] 63 | m = masks2.shape[0] 64 | answer = np.zeros([n, m], dtype=np.float32) 65 | for i in np.arange(n): 66 | for j in np.arange(m): 67 | answer[i, j] = np.sum(np.minimum(masks1[i], masks2[j]), dtype=np.float32) 68 | return answer 69 | 70 | 71 | def iou(masks1, masks2): 72 | """Computes pairwise intersection-over-union between mask collections. 73 | 74 | Args: 75 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 76 | values are of type np.uint8 and values are in {0,1}. 77 | masks2: a numpy array with shape [M, height, width] holding N masks. Masks 78 | values are of type np.uint8 and values are in {0,1}. 
79 | 80 | Returns: 81 | a numpy array with shape [N, M] representing pairwise iou scores. 82 | 83 | Raises: 84 | ValueError: If masks1 and masks2 are not of type np.uint8. 85 | """ 86 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 87 | raise ValueError('masks1 and masks2 should be of type np.uint8') 88 | intersect = intersection(masks1, masks2) 89 | area1 = area(masks1) 90 | area2 = area(masks2) 91 | union = np.expand_dims(area1, axis=1) + np.expand_dims( 92 | area2, axis=0) - intersect 93 | return intersect / np.maximum(union, EPSILON) 94 | 95 | 96 | def ioa(masks1, masks2): 97 | """Computes pairwise intersection-over-area between box collections. 98 | 99 | Intersection-over-area (ioa) between two masks, mask1 and mask2 is defined as 100 | their intersection area over mask2's area. Note that ioa is not symmetric, 101 | that is, IOA(mask1, mask2) != IOA(mask2, mask1). 102 | 103 | Args: 104 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 105 | values are of type np.uint8 and values are in {0,1}. 106 | masks2: a numpy array with shape [M, height, width] holding N masks. Masks 107 | values are of type np.uint8 and values are in {0,1}. 108 | 109 | Returns: 110 | a numpy array with shape [N, M] representing pairwise ioa scores. 111 | 112 | Raises: 113 | ValueError: If masks1 and masks2 are not of type np.uint8. 114 | """ 115 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 116 | raise ValueError('masks1 and masks2 should be of type np.uint8') 117 | intersect = intersection(masks1, masks2) 118 | areas = np.expand_dims(area(masks2), axis=0) 119 | return intersect / (areas + EPSILON) 120 | -------------------------------------------------------------------------------- /ava/teat.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def t(): 4 | list=[1,2,3,4,5] 5 | list=np.array(list) 6 | list2=[2,3] 7 | print(list[list2]) 8 | if __name__ == '__main__': 9 | t() -------------------------------------------------------------------------------- /backbone/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/backbone/__init__.py -------------------------------------------------------------------------------- /backbone/base.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Type 2 | 3 | from torch import nn 4 | 5 | 6 | class Base(object): 7 | OPTIONS = ['resnet18', 'resnet50', 'resnet101','slowfastnet'] 8 | @staticmethod 9 | def from_name(name: str) -> Type['Base']: 10 | if name == 'resnet18': 11 | from backbone.resnet18 import ResNet18 12 | return ResNet18 13 | elif name == 'resnet50': 14 | from backbone.resnet50 import ResNet50 15 | return ResNet50 16 | elif name == 'resnet101': 17 | from backbone.resnet101 import ResNet101 18 | return ResNet101 19 | elif name == 'slowfastnet101': 20 | from backbone.slowfast_res101 import slowfast_res101 21 | return slowfast_res101 22 | elif name == 'slowfastnet50': 23 | from backbone.slowfast_res50 import slowfast_res50 24 | return slowfast_res50 25 | else: 26 | raise ValueError 27 | 28 | def __init__(self, pretrained: bool): 29 | super().__init__() 30 | self._pretrained = pretrained 31 | 32 | 33 | 34 | def features(self) -> Tuple[nn.Module, nn.Module, int, int]: 35 | raise NotImplementedError 
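
`backbone/base.py` above only declares the factory interface (`from_name`, the `pretrained` flag, and `features()`); the concrete call site is elsewhere in the repository. For reference, a minimal usage sketch follows — the caller shown here is an assumption for illustration, not repository code; only the names defined in the file above are relied on.

```python
# Illustrative sketch only: how the Base backbone factory above is typically consumed.
# The call site (e.g. somewhere like model.py) is assumed; from_name(), pretrained,
# and features() come from backbone/base.py and its subclasses.
from backbone.base import Base

backbone_class = Base.from_name('resnet50')   # returns backbone.resnet50.ResNet50
backbone = backbone_class(pretrained=True)    # subclasses take a `pretrained` bool

# features() is implemented by each subclass and returns:
#   features        - nn.Module producing the shared feature map
#   hidden          - nn.Module used as the per-RoI head
#   num_features_out, num_hidden_out - their output channel counts
features, hidden, num_features_out, num_hidden_out = backbone.features()
```
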
-------------------------------------------------------------------------------- /backbone/hidden_for_roi.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import numpy as np 6 | 7 | 8 | class Hidden(nn.Module): 9 | 10 | def __init__(self, inplanes, planes, stride=1): 11 | super(Hidden, self).__init__() 12 | self.conv1 = nn.Conv2d(inplanes, inplanes, kernel_size=3,padding=1, bias=False) 13 | self.bn1 = nn.BatchNorm2d(planes) 14 | self.conv2 = nn.Conv2d(inplanes, inplanes, kernel_size=3, stride=stride, 15 | padding=1, bias=False) 16 | self.bn2 = nn.BatchNorm2d(planes) 17 | self.conv3 = nn.Conv2d(inplanes, planes, kernel_size=3, padding=1,bias=False) 18 | self.bn3 = nn.BatchNorm2d(planes) 19 | self.relu = nn.ReLU(inplace=True) 20 | 21 | def forward(self, x): 22 | out = self.conv1(x) 23 | out = self.bn1(out) 24 | out = self.relu(out) 25 | 26 | out = self.conv2(out) 27 | out = self.bn2(out) 28 | out = self.relu(out) 29 | 30 | out = self.conv3(out) 31 | out = self.bn3(out) 32 | out = self.relu(out) 33 | out = nn.AdaptiveAvgPool2d(1)(out) 34 | out = out.view(-1, out.size(1)) 35 | return out 36 | 37 | def weight_init(m): 38 | # if isinstance(m, nn.Linear): 39 | # nn.init.xavier_normal_(m.weight) 40 | # nn.init.constant_(m.bias, 0) 41 | # 也可以判断是否为conv2d,使用相应的初始化方式 42 | if isinstance(m, nn.Conv3d): 43 | print("using kaiming") 44 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 45 | # 是否为批归一化层 46 | # elif isinstance(m, nn.BatchNorm3d): 47 | # nn.init.constant_(m.weight, 1) 48 | # nn.init.constant_(m.bias, 0) 49 | def hidden50(**kwargs): 50 | """Constructs a ResNet-50 model. 51 | """ 52 | model = Hidden(2304,2304,2) 53 | # model.apply(weight_init) 54 | print('model', model) 55 | return model -------------------------------------------------------------------------------- /backbone/hidden_for_roi2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import numpy as np 6 | 7 | class Bottleneck(nn.Module): 8 | expansion = 4 9 | 10 | def __init__(self, inplanes, planes, stride=1, downsample=None, head_conv=1): 11 | super(Bottleneck, self).__init__() 12 | # 2d 1*1 13 | if head_conv == 1: 14 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False,dilation=2) 15 | self.bn1 = nn.BatchNorm3d(planes) 16 | 17 | #3d 1*1 18 | elif head_conv == 3: 19 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=(3, 1, 1), bias=False, padding=(2, 0, 0),dilation=2) 20 | self.bn1 = nn.BatchNorm3d(planes) 21 | else: 22 | raise ValueError("Unsupported head_conv!") 23 | self.conv2 = nn.Conv3d( 24 | planes, planes, kernel_size=(1, 3, 3), stride=(1,stride,stride), padding=(0, 2, 2), bias=False,dilation=2) 25 | self.bn2 = nn.BatchNorm3d(planes) 26 | self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False,dilation=2) 27 | self.bn3 = nn.BatchNorm3d(planes * 4) 28 | self.relu = nn.ReLU(inplace=True) 29 | self.downsample = downsample 30 | self.stride = stride 31 | 32 | def forward(self, x): 33 | residual = x 34 | 35 | out = self.conv1(x) 36 | out = self.bn1(out) 37 | out = self.relu(out) 38 | 39 | out = self.conv2(out) 40 | out = self.bn2(out) 41 | out = self.relu(out) 42 | 43 | out = self.conv3(out) 44 | out = self.bn3(out) 45 | 46 | if self.downsample is not None: 47 | 
residual = self.downsample(x) 48 | out += residual 49 | out = self.relu(out) 50 | 51 | return out 52 | 53 | 54 | class Hidden(nn.Module): 55 | def __init__(self, block=Bottleneck, layers=[3, 4, 6, 3], class_num=10, dropout=0.5): 56 | super(Hidden, self).__init__() 57 | self.slow_inplanes = 1280 58 | self.fast_inplanes = 128 59 | self.fast_res5 = self._make_layer_fast( 60 | block, 64, layers[3], stride=1, head_conv=3) 61 | self.slow_res5 = self._make_layer_slow( 62 | block, 512, layers[3], stride=1, head_conv=3) 63 | 64 | 65 | def _make_layer_fast(self, block, planes, blocks, stride=1, head_conv=1): 66 | downsample = None 67 | if stride != 1 or self.fast_inplanes != planes * block.expansion: 68 | downsample = nn.Sequential( 69 | nn.Conv3d( 70 | self.fast_inplanes, 71 | planes * block.expansion, 72 | kernel_size=1, 73 | stride=(1,stride,stride), 74 | bias=False,dilation=2), nn.BatchNorm3d(planes * block.expansion)) 75 | 76 | layers = [] 77 | layers.append(block(self.fast_inplanes, planes, stride, downsample, head_conv=head_conv)) 78 | self.fast_inplanes = planes * block.expansion 79 | for i in range(1, blocks): 80 | layers.append(block(self.fast_inplanes, planes, head_conv=head_conv)) 81 | return nn.Sequential(*layers) 82 | 83 | def _make_layer_slow(self, block, planes, blocks, stride=1, head_conv=1): 84 | #print('_make_layer_slow',planes) 85 | downsample = None 86 | if stride != 1 or self.slow_inplanes != planes * block.expansion: 87 | #print('self.slow_inplanes',self.slow_inplanes) 88 | downsample = nn.Sequential( 89 | nn.Conv3d( 90 | self.slow_inplanes, 91 | planes * block.expansion, 92 | kernel_size=1, 93 | stride=(1,stride,stride), 94 | bias=False,dilation=2), nn.BatchNorm3d(planes * block.expansion)) 95 | layers = [] 96 | layers.append(block(self.slow_inplanes, planes, stride, downsample, head_conv=head_conv)) 97 | self.slow_inplanes = planes * block.expansion 98 | for i in range(1, blocks): 99 | layers.append(block(self.slow_inplanes, planes, head_conv=head_conv)) 100 | #self.slow_inplanes = planes * block.expansion + planes * block.expansion // 8 * 2 101 | self.slow_inplanes = planes * block.expansion 102 | return nn.Sequential(*layers) 103 | 104 | def forward(self,fast_input,slow_input): 105 | fast_output=self.fast_res5(fast_input) 106 | slow_output=self.slow_res5(slow_input) 107 | x1 = nn.AdaptiveAvgPool3d(1)(fast_output) 108 | x2 = nn.AdaptiveAvgPool3d(1)(slow_output) 109 | x1 = x1.view(-1, x1.size(1)) 110 | x2 = x2.view(-1, x2.size(1)) 111 | x = torch.cat([x1, x2], dim=1) 112 | return x 113 | 114 | 115 | def hidden50(**kwargs): 116 | """Constructs a ResNet-50 model. 
117 | """ 118 | model = Hidden(Bottleneck, [3, 4, 6, 3], **kwargs) 119 | print('model', model) 120 | return model -------------------------------------------------------------------------------- /backbone/hidden_for_roi_maxpool.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import numpy as np 6 | 7 | 8 | class Hidden(nn.Module): 9 | 10 | def __init__(self, inplanes, planes, stride=1): 11 | super(Hidden, self).__init__() 12 | # self.conv1 = nn.Conv2d(inplanes, inplanes, kernel_size=3,padding=1, bias=False) 13 | # self.bn1 = nn.BatchNorm2d(planes) 14 | # self.conv2 = nn.Conv2d(inplanes, inplanes, kernel_size=3, stride=stride, 15 | # padding=1, bias=False) 16 | # self.bn2 = nn.BatchNorm2d(planes) 17 | # self.conv3 = nn.Conv2d(inplanes, planes, kernel_size=3, padding=1,bias=False) 18 | # self.bn3 = nn.BatchNorm2d(planes) 19 | # self.relu = nn.ReLU(inplace=True) 20 | #self.fc=nn.Linear(in_features=2304*3*3,out_features=4096) 21 | 22 | def forward(self, x): 23 | # out = self.conv1(x) 24 | # out = self.bn1(out) 25 | # out = self.relu(out) 26 | # 27 | # out = self.conv2(out) 28 | # out = self.bn2(out) 29 | # out = self.relu(out) 30 | # 31 | # out = self.conv3(out) 32 | # out = self.bn3(out) 33 | # out = self.relu(out) 34 | #x = nn.MaxPool2d(2,2)(x) 35 | out=x.view(x.shape[0],-1) 36 | #print(x.shape) 37 | #out=self.fc(x) 38 | out = out.view(-1, out.size(1)) 39 | return out 40 | 41 | def weight_init(m): 42 | # if isinstance(m, nn.Linear): 43 | # nn.init.xavier_normal_(m.weight) 44 | # nn.init.constant_(m.bias, 0) 45 | # 也可以判断是否为conv2d,使用相应的初始化方式 46 | if isinstance(m, nn.Conv3d): 47 | print("using kaiming") 48 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 49 | # 是否为批归一化层 50 | # elif isinstance(m, nn.BatchNorm3d): 51 | # nn.init.constant_(m.weight, 1) 52 | # nn.init.constant_(m.bias, 0) 53 | def hidden50(**kwargs): 54 | """Constructs a ResNet-50 model. 
55 | """ 56 | model = Hidden(2304,2304,2) 57 | # model.apply(weight_init) 58 | #print('model', model) 59 | return model -------------------------------------------------------------------------------- /backbone/resnet101.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import torchvision 4 | from torch import nn 5 | 6 | import backbone.base 7 | 8 | 9 | class ResNet101(backbone.base.Base): 10 | 11 | def __init__(self, pretrained: bool): 12 | super().__init__(pretrained) 13 | 14 | def features(self) -> Tuple[nn.Module, nn.Module, int, int]: 15 | resnet101 = torchvision.models.resnet101(pretrained=self._pretrained) 16 | 17 | # list(resnet101.children()) consists of following modules 18 | # [0] = Conv2d, [1] = BatchNorm2d, [2] = ReLU, 19 | # [3] = MaxPool2d, [4] = Sequential(Bottleneck...), 20 | # [5] = Sequential(Bottleneck...), 21 | # [6] = Sequential(Bottleneck...), 22 | # [7] = Sequential(Bottleneck...), 23 | # [8] = AvgPool2d, [9] = Linear 24 | children = list(resnet101.children()) 25 | features = children[:-3] 26 | num_features_out = 1024 27 | print('children',type(children)) 28 | hidden = children[-3] 29 | num_hidden_out = 2048 30 | 31 | for parameters in [feature.parameters() for i, feature in enumerate(features) if i <= 4]: 32 | for parameter in parameters: 33 | parameter.requires_grad = False 34 | 35 | features = nn.Sequential(*features) 36 | print('features',type(features)) 37 | print('hidden',type(hidden)) 38 | return features, hidden, num_features_out, num_hidden_out 39 | -------------------------------------------------------------------------------- /backbone/resnet18.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import torchvision 4 | from torch import nn 5 | 6 | import backbone.base 7 | 8 | 9 | class ResNet18(backbone.base.Base): 10 | 11 | def __init__(self, pretrained: bool): 12 | super().__init__(pretrained) 13 | 14 | def features(self) -> Tuple[nn.Module, nn.Module, int, int]: 15 | resnet18 = torchvision.models.resnet18(pretrained=self._pretrained) 16 | 17 | # list(resnet18.children()) consists of following modules 18 | # [0] = Conv2d, [1] = BatchNorm2d, [2] = ReLU, 19 | # [3] = MaxPool2d, [4] = Sequential(Bottleneck...), 20 | # [5] = Sequential(Bottleneck...), 21 | # [6] = Sequential(Bottleneck...), 22 | # [7] = Sequential(Bottleneck...), 23 | # [8] = AvgPool2d, [9] = Linear 24 | children = list(resnet18.children()) 25 | features = children[:-3] 26 | num_features_out = 256 27 | 28 | hidden = children[-3] 29 | num_hidden_out = 512 30 | 31 | for parameters in [feature.parameters() for i, feature in enumerate(features) if i <= 4]: 32 | for parameter in parameters: 33 | parameter.requires_grad = False 34 | 35 | features = nn.Sequential(*features) 36 | 37 | return features, hidden, num_features_out, num_hidden_out 38 | -------------------------------------------------------------------------------- /backbone/resnet50.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import torchvision 4 | from torch import nn 5 | 6 | import backbone.base 7 | 8 | 9 | class ResNet50(backbone.base.Base): 10 | 11 | def __init__(self, pretrained: bool): 12 | super().__init__(pretrained) 13 | 14 | def features(self) -> Tuple[nn.Module, nn.Module, int, int]: 15 | resnet50 = torchvision.models.resnet50(pretrained=self._pretrained) 16 | print("resnet50") 17 | # list(resnet50.children()) 
consists of following modules 18 | # [0] = Conv2d, [1] = BatchNorm2d, [2] = ReLU, 19 | # [3] = MaxPool2d, [4] = Sequential(Bottleneck...), 20 | # [5] = Sequential(Bottleneck...), 21 | # [6] = Sequential(Bottleneck...), 22 | # [7] = Sequential(Bottleneck...), 23 | # [8] = AvgPool2d, [9] = Linear 24 | children = list(resnet50.children()) 25 | features = children[:-3] 26 | num_features_out = 1024 27 | 28 | hidden = children[-3] 29 | num_hidden_out = 2048 30 | 31 | for parameters in [feature.parameters() for i, feature in enumerate(features) if i <= 4]: 32 | for parameter in parameters: 33 | parameter.requires_grad = False 34 | 35 | features = nn.Sequential(*features) 36 | 37 | return features, hidden, num_features_out, num_hidden_out 38 | -------------------------------------------------------------------------------- /backbone/slowfast_res101.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import torchvision 4 | from torch import nn 5 | 6 | import backbone.base 7 | from backbone.slowfastnet import resnet101 as rs101 8 | from backbone.slowfastnet import resnet50 as rs50 9 | from backbone.hidden_for_roi import hidden50 10 | class slowfast_res101(backbone.base.Base): 11 | 12 | def __init__(self): 13 | super().__init__(False) 14 | 15 | def features(self): 16 | resnet101 = rs101() 17 | num_features_out = 1280 18 | hidden = hidden50() 19 | num_hidden_out = 2048 + 256 20 | return resnet101, hidden, num_features_out, num_hidden_out 21 | 22 | if __name__ == '__main__': 23 | s=slowfast_res101() 24 | s.features() 25 | -------------------------------------------------------------------------------- /backbone/slowfast_res50.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import torchvision 4 | from torch import nn 5 | 6 | import backbone.base 7 | from backbone.slowfastnet import resnet101 as rs101 8 | from backbone.slowfastnet import resnet50 as rs50 9 | from backbone.hidden_for_roi_maxpool import hidden50 10 | class slowfast_res50(backbone.base.Base): 11 | 12 | def __init__(self): 13 | super().__init__(False) 14 | 15 | def features(self): 16 | print("slowfast_res50") 17 | resnet50 = rs50() 18 | hidden = hidden50() 19 | num_features_out = 2304 20 | num_hidden_out = 2304*3*3 21 | 22 | return resnet50, hidden, num_features_out, num_hidden_out 23 | 24 | if __name__ == '__main__': 25 | s=slowfast_res50() 26 | s.features() 27 | -------------------------------------------------------------------------------- /bbox.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import torch 4 | import random 5 | 6 | import numpy as np 7 | import cv2 8 | 9 | def confidence_filter(result, confidence): 10 | conf_mask = (result[:,:,4] > confidence).float().unsqueeze(2) 11 | result = result*conf_mask 12 | 13 | return result 14 | 15 | def confidence_filter_cls(result, confidence): 16 | max_scores = torch.max(result[:,:,5:25], 2)[0] 17 | res = torch.cat((result, max_scores),2) 18 | print(res.shape) 19 | 20 | 21 | cond_1 = (res[:,:,4] > confidence).float() 22 | cond_2 = (res[:,:,25] > 0.995).float() 23 | 24 | conf = cond_1 + cond_2 25 | conf = torch.clamp(conf, 0.0, 1.0) 26 | conf = conf.unsqueeze(2) 27 | result = result*conf 28 | return result 29 | 30 | 31 | 32 | def get_abs_coord(box): 33 | box[2], box[3] = abs(box[2]), abs(box[3]) 34 | x1 = (box[0] - box[2]/2) - 1 35 | y1 = (box[1] - box[3]/2) - 1 36 | x2 = 
(box[0] + box[2]/2) - 1 37 | y2 = (box[1] + box[3]/2) - 1 38 | return x1, y1, x2, y2 39 | 40 | 41 | 42 | def sanity_fix(box): 43 | if (box[0] > box[2]): 44 | box[0], box[2] = box[2], box[0] 45 | 46 | if (box[1] > box[3]): 47 | box[1], box[3] = box[3], box[1] 48 | 49 | return box 50 | 51 | def bbox_iou(box1, box2): 52 | """ 53 | Returns the IoU of two bounding boxes 54 | 55 | 56 | """ 57 | #Get the coordinates of bounding boxes 58 | b1_x1, b1_y1, b1_x2, b1_y2 = box1[:,0], box1[:,1], box1[:,2], box1[:,3] 59 | b2_x1, b2_y1, b2_x2, b2_y2 = box2[:,0], box2[:,1], box2[:,2], box2[:,3] 60 | 61 | #get the corrdinates of the intersection rectangle 62 | inter_rect_x1 = torch.max(b1_x1, b2_x1) 63 | inter_rect_y1 = torch.max(b1_y1, b2_y1) 64 | inter_rect_x2 = torch.min(b1_x2, b2_x2) 65 | inter_rect_y2 = torch.min(b1_y2, b2_y2) 66 | 67 | #Intersection area 68 | if torch.cuda.is_available(): 69 | inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1,torch.zeros(inter_rect_x2.shape).cuda())*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape).cuda()) 70 | else: 71 | inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1,torch.zeros(inter_rect_x2.shape))*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape)) 72 | 73 | #Union Area 74 | b1_area = (b1_x2 - b1_x1 + 1)*(b1_y2 - b1_y1 + 1) 75 | b2_area = (b2_x2 - b2_x1 + 1)*(b2_y2 - b2_y1 + 1) 76 | 77 | iou = inter_area / (b1_area + b2_area - inter_area) 78 | 79 | return iou 80 | 81 | 82 | def pred_corner_coord(prediction): 83 | #Get indices of non-zero confidence bboxes 84 | ind_nz = torch.nonzero(prediction[:,:,4]).transpose(0,1).contiguous() 85 | 86 | box = prediction[ind_nz[0], ind_nz[1]] 87 | 88 | 89 | box_a = box.new(box.shape) 90 | box_a[:,0] = (box[:,0] - box[:,2]/2) 91 | box_a[:,1] = (box[:,1] - box[:,3]/2) 92 | box_a[:,2] = (box[:,0] + box[:,2]/2) 93 | box_a[:,3] = (box[:,1] + box[:,3]/2) 94 | box[:,:4] = box_a[:,:4] 95 | 96 | prediction[ind_nz[0], ind_nz[1]] = box 97 | 98 | return prediction 99 | 100 | 101 | 102 | 103 | def write(x, batches, results, colors, classes): 104 | c1 = tuple(x[1:3].int()) 105 | c2 = tuple(x[3:5].int()) 106 | img = results[int(x[0])] 107 | cls = int(x[-1]) 108 | label = "{0}".format(classes[cls]) 109 | color = random.choice(colors) 110 | cv2.rectangle(img, c1, c2,color, 1) 111 | t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)[0] 112 | c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 113 | cv2.rectangle(img, c1, c2,color, -1) 114 | cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1); 115 | return img 116 | -------------------------------------------------------------------------------- /bbox1.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import torch 4 | from torch import Tensor 5 | 6 | 7 | class BBox(object): 8 | 9 | def __init__(self, left: float, top: float, right: float, bottom: float): 10 | super().__init__() 11 | self.left = left 12 | self.top = top 13 | self.right = right 14 | self.bottom = bottom 15 | 16 | def __repr__(self) -> str: 17 | return 'BBox[l={:.1f}, t={:.1f}, r={:.1f}, b={:.1f}]'.format( 18 | self.left, self.top, self.right, self.bottom) 19 | 20 | def tolist(self) -> List[float]: 21 | return [self.left, self.top, self.right, self.bottom] 22 | 23 | @staticmethod 24 | def to_center_base(bboxes: Tensor) -> Tensor: 25 | return torch.stack([ 26 | (bboxes[..., 0] + bboxes[..., 2]) / 2, 27 | (bboxes[..., 1] + bboxes[..., 3]) 
/ 2, 28 | bboxes[..., 2] - bboxes[..., 0], 29 | bboxes[..., 3] - bboxes[..., 1] 30 | ], dim=-1) 31 | 32 | @staticmethod 33 | def from_center_base(center_based_bboxes: Tensor) -> Tensor: 34 | return torch.stack([ 35 | center_based_bboxes[..., 0] - center_based_bboxes[..., 2] / 2, 36 | center_based_bboxes[..., 1] - center_based_bboxes[..., 3] / 2, 37 | center_based_bboxes[..., 0] + center_based_bboxes[..., 2] / 2, 38 | center_based_bboxes[..., 1] + center_based_bboxes[..., 3] / 2 39 | ], dim=-1) 40 | 41 | @staticmethod 42 | def calc_transformer(src_bboxes: Tensor, dst_bboxes: Tensor) -> Tensor: 43 | center_based_src_bboxes = BBox.to_center_base(src_bboxes) 44 | center_based_dst_bboxes = BBox.to_center_base(dst_bboxes) 45 | transformers = torch.stack([ 46 | (center_based_dst_bboxes[..., 0] - center_based_src_bboxes[..., 0]) / center_based_dst_bboxes[..., 2], 47 | (center_based_dst_bboxes[..., 1] - center_based_src_bboxes[..., 1]) / center_based_dst_bboxes[..., 3], 48 | torch.log(center_based_dst_bboxes[..., 2] / center_based_src_bboxes[..., 2]), 49 | torch.log(center_based_dst_bboxes[..., 3] / center_based_src_bboxes[..., 3]) 50 | ], dim=-1) 51 | return transformers 52 | 53 | @staticmethod 54 | def apply_transformer(src_bboxes: Tensor, transformers: Tensor) -> Tensor: 55 | center_based_src_bboxes = BBox.to_center_base(src_bboxes) 56 | center_based_dst_bboxes = torch.stack([ 57 | transformers[..., 0] * center_based_src_bboxes[..., 2] + center_based_src_bboxes[..., 0], 58 | transformers[..., 1] * center_based_src_bboxes[..., 3] + center_based_src_bboxes[..., 1], 59 | torch.exp(transformers[..., 2]) * center_based_src_bboxes[..., 2], 60 | torch.exp(transformers[..., 3]) * center_based_src_bboxes[..., 3] 61 | ], dim=-1) 62 | dst_bboxes = BBox.from_center_base(center_based_dst_bboxes) 63 | return dst_bboxes 64 | 65 | @staticmethod 66 | def iou(source: Tensor, other: Tensor) -> Tensor: 67 | source, other = source.unsqueeze(dim=-2).repeat(1, 1, other.shape[-2], 1), \ 68 | other.unsqueeze(dim=-3).repeat(1, source.shape[-2], 1, 1) 69 | 70 | source_area = (source[..., 2] - source[..., 0]) * (source[..., 3] - source[..., 1]) 71 | other_area = (other[..., 2] - other[..., 0]) * (other[..., 3] - other[..., 1]) 72 | 73 | intersection_left = torch.max(source[..., 0], other[..., 0]) 74 | intersection_top = torch.max(source[..., 1], other[..., 1]) 75 | intersection_right = torch.min(source[..., 2], other[..., 2]) 76 | intersection_bottom = torch.min(source[..., 3], other[..., 3]) 77 | intersection_width = torch.clamp(intersection_right - intersection_left, min=0) 78 | intersection_height = torch.clamp(intersection_bottom - intersection_top, min=0) 79 | intersection_area = intersection_width * intersection_height 80 | 81 | return intersection_area / (source_area + other_area - intersection_area) 82 | 83 | @staticmethod 84 | def inside(bboxes: Tensor, left: float, top: float, right: float, bottom: float) -> Tensor: 85 | return ((bboxes[..., 0] >= left) * (bboxes[..., 1] >= top) * 86 | (bboxes[..., 2] <= right) * (bboxes[..., 3] <= bottom)) 87 | 88 | @staticmethod 89 | def clip(bboxes: Tensor, left: float, top: float, right: float, bottom: float) -> Tensor: 90 | bboxes[..., [0, 2]] = bboxes[..., [0, 2]].clamp(min=left, max=right) 91 | bboxes[..., [1, 3]] = bboxes[..., [1, 3]].clamp(min=top, max=bottom) 92 | return bboxes 93 | -------------------------------------------------------------------------------- /cfg/tiny-yolo-voc.cfg: -------------------------------------------------------------------------------- 1 | 
[net] 2 | batch=64 3 | subdivisions=8 4 | width=416 5 | height=416 6 | channels=3 7 | momentum=0.9 8 | decay=0.0005 9 | angle=0 10 | saturation = 1.5 11 | exposure = 1.5 12 | hue=.1 13 | 14 | learning_rate=0.001 15 | max_batches = 40200 16 | policy=steps 17 | steps=-1,100,20000,30000 18 | scales=.1,10,.1,.1 19 | 20 | [convolutional] 21 | batch_normalize=1 22 | filters=16 23 | size=3 24 | stride=1 25 | pad=1 26 | activation=leaky 27 | 28 | [maxpool] 29 | size=2 30 | stride=2 31 | 32 | [convolutional] 33 | batch_normalize=1 34 | filters=32 35 | size=3 36 | stride=1 37 | pad=1 38 | activation=leaky 39 | 40 | [maxpool] 41 | size=2 42 | stride=2 43 | 44 | [convolutional] 45 | batch_normalize=1 46 | filters=64 47 | size=3 48 | stride=1 49 | pad=1 50 | activation=leaky 51 | 52 | [maxpool] 53 | size=2 54 | stride=2 55 | 56 | [convolutional] 57 | batch_normalize=1 58 | filters=128 59 | size=3 60 | stride=1 61 | pad=1 62 | activation=leaky 63 | 64 | [maxpool] 65 | size=2 66 | stride=2 67 | 68 | [convolutional] 69 | batch_normalize=1 70 | filters=256 71 | size=3 72 | stride=1 73 | pad=1 74 | activation=leaky 75 | 76 | [maxpool] 77 | size=2 78 | stride=2 79 | 80 | [convolutional] 81 | batch_normalize=1 82 | filters=512 83 | size=3 84 | stride=1 85 | pad=1 86 | activation=leaky 87 | 88 | [maxpool] 89 | size=2 90 | stride=1 91 | 92 | [convolutional] 93 | batch_normalize=1 94 | filters=1024 95 | size=3 96 | stride=1 97 | pad=1 98 | activation=leaky 99 | 100 | ########### 101 | 102 | [convolutional] 103 | batch_normalize=1 104 | size=3 105 | stride=1 106 | pad=1 107 | filters=1024 108 | activation=leaky 109 | 110 | [convolutional] 111 | size=1 112 | stride=1 113 | pad=1 114 | filters=125 115 | activation=linear 116 | 117 | [region] 118 | anchors = 1.08,1.19, 3.42,4.41, 6.63,11.38, 9.42,5.11, 16.62,10.52 119 | bias_match=1 120 | classes=20 121 | coords=4 122 | num=5 123 | softmax=1 124 | jitter=.2 125 | rescore=1 126 | 127 | object_scale=5 128 | noobject_scale=1 129 | class_scale=1 130 | coord_scale=1 131 | 132 | absolute=1 133 | thresh = .6 134 | random=1 135 | -------------------------------------------------------------------------------- /cfg/yolo-voc.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=64 4 | subdivisions=8 5 | # Training 6 | # batch=64 7 | # subdivisions=8 8 | height=416 9 | width=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 80200 21 | policy=steps 22 | steps=-1,500,40000,60000 23 | scales=0.1,10,.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=64 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=128 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [convolutional] 58 | batch_normalize=1 59 | filters=64 60 | size=1 61 | stride=1 62 | pad=1 63 | activation=leaky 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=1 70 | pad=1 71 | activation=leaky 72 | 73 | [maxpool] 74 | size=2 75 | stride=2 76 | 77 | [convolutional] 78 | batch_normalize=1 79 | filters=256 80 | size=3 81 | stride=1 82 | pad=1 83 | activation=leaky 84 | 
85 | [convolutional] 86 | batch_normalize=1 87 | filters=128 88 | size=1 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=256 96 | size=3 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [maxpool] 102 | size=2 103 | stride=2 104 | 105 | [convolutional] 106 | batch_normalize=1 107 | filters=512 108 | size=3 109 | stride=1 110 | pad=1 111 | activation=leaky 112 | 113 | [convolutional] 114 | batch_normalize=1 115 | filters=256 116 | size=1 117 | stride=1 118 | pad=1 119 | activation=leaky 120 | 121 | [convolutional] 122 | batch_normalize=1 123 | filters=512 124 | size=3 125 | stride=1 126 | pad=1 127 | activation=leaky 128 | 129 | [convolutional] 130 | batch_normalize=1 131 | filters=256 132 | size=1 133 | stride=1 134 | pad=1 135 | activation=leaky 136 | 137 | [convolutional] 138 | batch_normalize=1 139 | filters=512 140 | size=3 141 | stride=1 142 | pad=1 143 | activation=leaky 144 | 145 | [maxpool] 146 | size=2 147 | stride=2 148 | 149 | [convolutional] 150 | batch_normalize=1 151 | filters=1024 152 | size=3 153 | stride=1 154 | pad=1 155 | activation=leaky 156 | 157 | [convolutional] 158 | batch_normalize=1 159 | filters=512 160 | size=1 161 | stride=1 162 | pad=1 163 | activation=leaky 164 | 165 | [convolutional] 166 | batch_normalize=1 167 | filters=1024 168 | size=3 169 | stride=1 170 | pad=1 171 | activation=leaky 172 | 173 | [convolutional] 174 | batch_normalize=1 175 | filters=512 176 | size=1 177 | stride=1 178 | pad=1 179 | activation=leaky 180 | 181 | [convolutional] 182 | batch_normalize=1 183 | filters=1024 184 | size=3 185 | stride=1 186 | pad=1 187 | activation=leaky 188 | 189 | 190 | ####### 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | size=3 195 | stride=1 196 | pad=1 197 | filters=1024 198 | activation=leaky 199 | 200 | [convolutional] 201 | batch_normalize=1 202 | size=3 203 | stride=1 204 | pad=1 205 | filters=1024 206 | activation=leaky 207 | 208 | [route] 209 | layers=-9 210 | 211 | [convolutional] 212 | batch_normalize=1 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=64 217 | activation=leaky 218 | 219 | [reorg] 220 | stride=2 221 | 222 | [route] 223 | layers=-1,-4 224 | 225 | [convolutional] 226 | batch_normalize=1 227 | size=3 228 | stride=1 229 | pad=1 230 | filters=1024 231 | activation=leaky 232 | 233 | [convolutional] 234 | size=1 235 | stride=1 236 | pad=1 237 | filters=125 238 | activation=linear 239 | 240 | 241 | [region] 242 | anchors = 1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071 243 | bias_match=1 244 | classes=20 245 | coords=4 246 | num=5 247 | softmax=1 248 | jitter=.3 249 | rescore=1 250 | 251 | object_scale=5 252 | noobject_scale=1 253 | class_scale=1 254 | coord_scale=1 255 | 256 | absolute=1 257 | thresh = .6 258 | random=1 259 | -------------------------------------------------------------------------------- /cfg/yolo.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=8 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | 
[maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=64 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=128 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [convolutional] 58 | batch_normalize=1 59 | filters=64 60 | size=1 61 | stride=1 62 | pad=1 63 | activation=leaky 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=1 70 | pad=1 71 | activation=leaky 72 | 73 | [maxpool] 74 | size=2 75 | stride=2 76 | 77 | [convolutional] 78 | batch_normalize=1 79 | filters=256 80 | size=3 81 | stride=1 82 | pad=1 83 | activation=leaky 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=128 88 | size=1 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=256 96 | size=3 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [maxpool] 102 | size=2 103 | stride=2 104 | 105 | [convolutional] 106 | batch_normalize=1 107 | filters=512 108 | size=3 109 | stride=1 110 | pad=1 111 | activation=leaky 112 | 113 | [convolutional] 114 | batch_normalize=1 115 | filters=256 116 | size=1 117 | stride=1 118 | pad=1 119 | activation=leaky 120 | 121 | [convolutional] 122 | batch_normalize=1 123 | filters=512 124 | size=3 125 | stride=1 126 | pad=1 127 | activation=leaky 128 | 129 | [convolutional] 130 | batch_normalize=1 131 | filters=256 132 | size=1 133 | stride=1 134 | pad=1 135 | activation=leaky 136 | 137 | [convolutional] 138 | batch_normalize=1 139 | filters=512 140 | size=3 141 | stride=1 142 | pad=1 143 | activation=leaky 144 | 145 | [maxpool] 146 | size=2 147 | stride=2 148 | 149 | [convolutional] 150 | batch_normalize=1 151 | filters=1024 152 | size=3 153 | stride=1 154 | pad=1 155 | activation=leaky 156 | 157 | [convolutional] 158 | batch_normalize=1 159 | filters=512 160 | size=1 161 | stride=1 162 | pad=1 163 | activation=leaky 164 | 165 | [convolutional] 166 | batch_normalize=1 167 | filters=1024 168 | size=3 169 | stride=1 170 | pad=1 171 | activation=leaky 172 | 173 | [convolutional] 174 | batch_normalize=1 175 | filters=512 176 | size=1 177 | stride=1 178 | pad=1 179 | activation=leaky 180 | 181 | [convolutional] 182 | batch_normalize=1 183 | filters=1024 184 | size=3 185 | stride=1 186 | pad=1 187 | activation=leaky 188 | 189 | 190 | ####### 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | size=3 195 | stride=1 196 | pad=1 197 | filters=1024 198 | activation=leaky 199 | 200 | [convolutional] 201 | batch_normalize=1 202 | size=3 203 | stride=1 204 | pad=1 205 | filters=1024 206 | activation=leaky 207 | 208 | [route] 209 | layers=-9 210 | 211 | [convolutional] 212 | batch_normalize=1 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=64 217 | activation=leaky 218 | 219 | [reorg] 220 | stride=2 221 | 222 | [route] 223 | layers=-1,-4 224 | 225 | [convolutional] 226 | batch_normalize=1 227 | size=3 228 | stride=1 229 | pad=1 230 | filters=1024 231 | activation=leaky 232 | 233 | [convolutional] 234 | size=1 235 | stride=1 236 | pad=1 237 | filters=425 238 | activation=linear 239 | 240 | 241 | [region] 242 | anchors = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828 243 | bias_match=1 244 | classes=80 245 | coords=4 246 | num=5 247 | softmax=1 248 | jitter=.3 249 | rescore=1 250 | 251 | object_scale=5 252 | noobject_scale=1 253 | class_scale=1 254 | coord_scale=1 255 | 
256 | absolute=1 257 | thresh = .6 258 | random=1 259 | -------------------------------------------------------------------------------- /config/config.py: -------------------------------------------------------------------------------- 1 | import ast 2 | from typing import Tuple, List 3 | 4 | from roi.pooler_ import Pooler 5 | 6 | 7 | class Config(object): 8 | ANCHOR_RATIOS = [(1, 2), (1, 1), (2, 1)] 9 | #ANCHOR_SIZES = [128, 256, 512] 10 | ANCHOR_SIZES = [64, 128] 11 | POOLER_MODE = Pooler.Mode.POOLING 12 | BACKBONE_NAME='slowfastnet50' 13 | #DETECTOR_RESULT_PATH='detection_train_result.txt' 14 | DETECTOR_RESULT_PATH = 'detection_train_result.txt' 15 | @classmethod 16 | def describe(cls): 17 | text = '\nConfig:\n' 18 | attrs = [attr for attr in dir(cls) if not callable(getattr(cls, attr)) and not attr.startswith('__')] 19 | text += '\n'.join(['\t{:s} = {:s}'.format(attr, str(getattr(cls, attr))) for attr in attrs]) + '\n' 20 | return text 21 | @classmethod 22 | def setup(cls, image_min_side: float = None, image_max_side: float = None, 23 | anchor_ratios: List[Tuple[int, int]] = None, anchor_sizes: List[int] = None, pooler_mode: str = None): 24 | if image_min_side is not None: 25 | cls.IMAGE_MIN_SIDE = image_min_side 26 | if image_max_side is not None: 27 | cls.IMAGE_MAX_SIDE = image_max_side 28 | 29 | if anchor_ratios is not None: 30 | cls.ANCHOR_RATIOS = ast.literal_eval(anchor_ratios) 31 | if anchor_sizes is not None: 32 | cls.ANCHOR_SIZES = ast.literal_eval(anchor_sizes) 33 | if pooler_mode is not None: 34 | cls.POOLER_MODE = Pooler.Mode(pooler_mode) 35 | -------------------------------------------------------------------------------- /config/eval_config.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | from config.config import Config 4 | 5 | 6 | class EvalConfig(Config): 7 | 8 | RPN_PRE_NMS_TOP_N = 6000 9 | RPN_POST_NMS_TOP_N = 300 10 | VAL_DATA='ava_train_v2.2_sub_5.txt' 11 | PATH_TO_CHECKPOINT='/home/aiuser/Downloads/NEW-FRCNN-rewrite_with_yolo/temp_3/model-20700-v100.pth' 12 | PATH_TO_RESULTS='result.txt' 13 | #PATH_TO_ACTION_LIST='ava_action_list_v2.2.pbtxt' 14 | PATH_TO_ACTION_LIST='ava_action_list_v2.2_for_activitynet_2019.pbtxt' 15 | PATH_TO_LABLE='ava_train_v2.2_sub_5.txt' 16 | KEEP=0.05 17 | 18 | -------------------------------------------------------------------------------- /config/train_config.py: -------------------------------------------------------------------------------- 1 | import ast 2 | from typing import List, Tuple 3 | 4 | from config.config import Config 5 | 6 | 7 | class TrainConfig(Config): 8 | 9 | RPN_PRE_NMS_TOP_N= 12000 10 | RPN_POST_NMS_TOP_N = 2000 11 | 12 | ANCHOR_SMOOTH_L1_LOSS_BETA = 1.0 13 | PROPOSAL_SMOOTH_L1_LOSS_BETA = 1.0 14 | 15 | BATCH_SIZE=4 16 | LEARNING_RATE = 0.0001 17 | MOMENTUM = 0.9 18 | WEIGHT_DECAY = 0.0005 19 | STEP_LR_SIZES = [90000,180000] 20 | STEP_LR_GAMMA = 0.1 21 | WARM_UP_FACTOR = 0.3333 22 | WARM_UP_NUM_ITERS = 500 23 | NUM_STEPS_TO_DISPLAY = 20 24 | NUM_STEPS_TO_SNAPSHOT = 20000 25 | NUM_STEPS_TO_FINISH = 222670 26 | TRAIN_DATA='ava_train_v2.2_remove_badlist.csv' 27 | 28 | #PATH_TO_RESUMEING_CHECKPOINT='/home/aiuser/Downloads/NEW-FRCNN-rewrite/temp_3/model-19800.pth' 29 | PATH_TO_RESUMEING_CHECKPOINT =None 30 | PATH_TO_OUTPUTS_DIR = '/home/aiuser/Downloads/NEW-FRCNN-rewrite/outputs/' 31 | 32 | 33 | -------------------------------------------------------------------------------- /data/pam.gif: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/data/pam.gif -------------------------------------------------------------------------------- /deep/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/deep/__init__.py -------------------------------------------------------------------------------- /deep/checkpoint/original_ckpt.t7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/deep/checkpoint/original_ckpt.t7 -------------------------------------------------------------------------------- /deep/evaluate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | features = torch.load("features.pth") 4 | qf = features["qf"] 5 | ql = features["ql"] 6 | gf = features["gf"] 7 | gl = features["gl"] 8 | 9 | scores = qf.mm(gf.t()) 10 | res = scores.topk(5, dim=1)[1][:,0] 11 | top1correct = gl[res].eq(ql).sum().item() 12 | 13 | print("Acc top1:{:.3f}".format(top1correct/ql.size(0))) 14 | 15 | 16 | -------------------------------------------------------------------------------- /deep/feature_extractor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision.transforms as transforms 3 | import numpy as np 4 | import cv2 5 | 6 | from .model import Net 7 | 8 | class Extractor(object): 9 | def __init__(self, model_path, use_cuda=True): 10 | self.net = Net(reid=True) 11 | self.device = "cuda" if torch.cuda.is_available() and use_cuda else "cpu" 12 | state_dict = torch.load(model_path)['net_dict'] 13 | self.net.load_state_dict(state_dict) 14 | print("Loading weights from {}... Done!".format(model_path)) 15 | self.net.to(self.device) 16 | self.norm = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 17 | 18 | def __call__(self, img): 19 | assert isinstance(img, np.ndarray), "type error" 20 | img = img.astype(np.float)#/255. 
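# note: transforms.Normalize below uses ImageNet mean/std, which assume inputs scaled to
# [0, 1]; with '/255.' commented out above, the values passed on remain in the [0, 255] range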
21 | img = cv2.resize(img, (64,128)) 22 | img = torch.from_numpy(img).float().permute(2,0,1) 23 | img = self.norm(img).unsqueeze(0) 24 | with torch.no_grad(): 25 | img = img.to(self.device) 26 | feature = self.net(img) 27 | return feature.cpu().numpy() 28 | 29 | 30 | if __name__ == '__main__': 31 | img = cv2.imread("demo.jpg")[:,:,(2,1,0)] 32 | extr = Extractor("checkpoint/ckpt.t7") 33 | feature = extr(img) 34 | print(feature.shape) 35 | 36 | -------------------------------------------------------------------------------- /deep/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class BasicBlock(nn.Module): 6 | def __init__(self, c_in, c_out,is_downsample=False): 7 | super(BasicBlock,self).__init__() 8 | self.is_downsample = is_downsample 9 | if is_downsample: 10 | self.conv1 = nn.Conv2d(c_in, c_out, 3, stride=2, padding=1, bias=False) 11 | else: 12 | self.conv1 = nn.Conv2d(c_in, c_out, 3, stride=1, padding=1, bias=False) 13 | self.bn1 = nn.BatchNorm2d(c_out) 14 | self.relu = nn.ReLU(True) 15 | self.conv2 = nn.Conv2d(c_out,c_out,3,stride=1,padding=1, bias=False) 16 | self.bn2 = nn.BatchNorm2d(c_out) 17 | if is_downsample: 18 | self.downsample = nn.Sequential( 19 | nn.Conv2d(c_in, c_out, 1, stride=2, bias=False), 20 | nn.BatchNorm2d(c_out) 21 | ) 22 | elif c_in != c_out: 23 | self.downsample = nn.Sequential( 24 | nn.Conv2d(c_in, c_out, 1, stride=1, bias=False), 25 | nn.BatchNorm2d(c_out) 26 | ) 27 | self.is_downsample = True 28 | 29 | def forward(self,x): 30 | y = self.conv1(x) 31 | y = self.bn1(y) 32 | y = self.relu(y) 33 | y = self.conv2(y) 34 | y = self.bn2(y) 35 | if self.is_downsample: 36 | x = self.downsample(x) 37 | return F.relu(x.add(y),True) 38 | 39 | def make_layers(c_in,c_out,repeat_times, is_downsample=False): 40 | blocks = [] 41 | for i in range(repeat_times): 42 | if i ==0: 43 | blocks += [BasicBlock(c_in,c_out, is_downsample=is_downsample),] 44 | else: 45 | blocks += [BasicBlock(c_out,c_out),] 46 | return nn.Sequential(*blocks) 47 | 48 | class Net(nn.Module): 49 | def __init__(self, num_classes=751 ,reid=False): 50 | super(Net,self).__init__() 51 | # 3 128 64 52 | self.conv = nn.Sequential( 53 | nn.Conv2d(3,64,3,stride=1,padding=1), 54 | nn.BatchNorm2d(64), 55 | nn.ReLU(inplace=True), 56 | # nn.Conv2d(32,32,3,stride=1,padding=1), 57 | # nn.BatchNorm2d(32), 58 | # nn.ReLU(inplace=True), 59 | nn.MaxPool2d(3,2,padding=1), 60 | ) 61 | # 32 64 32 62 | self.layer1 = make_layers(64,64,2,False) 63 | # 32 64 32 64 | self.layer2 = make_layers(64,128,2,True) 65 | # 64 32 16 66 | self.layer3 = make_layers(128,256,2,True) 67 | # 128 16 8 68 | self.layer4 = make_layers(256,512,2,True) 69 | # 256 8 4 70 | self.avgpool = nn.AvgPool2d((8,4),1) 71 | # 256 1 1 72 | self.reid = reid 73 | self.classifier = nn.Sequential( 74 | nn.Linear(512, 256), 75 | nn.BatchNorm1d(256), 76 | nn.ReLU(inplace=True), 77 | nn.Dropout(), 78 | nn.Linear(256, num_classes), 79 | ) 80 | 81 | def forward(self, x): 82 | x = self.conv(x) 83 | x = self.layer1(x) 84 | x = self.layer2(x) 85 | x = self.layer3(x) 86 | x = self.layer4(x) 87 | x = self.avgpool(x) 88 | x = x.view(x.size(0),-1) 89 | # B x 128 90 | if self.reid: 91 | x = x.div(x.norm(p=2,dim=1,keepdim=True)) 92 | return x 93 | # classifier 94 | x = self.classifier(x) 95 | return x 96 | 97 | 98 | if __name__ == '__main__': 99 | net = Net() 100 | x = torch.randn(4,3,128,64) 101 | y = net(x) 102 | import ipdb; ipdb.set_trace() 103 | 104 | 105 | 
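A minimal usage sketch for the re-ID network defined in deep/model.py above: with reid=True the classifier head is bypassed and forward() returns one L2-normalised 512-dimensional embedding per 128x64 crop. The random crops and the dot-product comparison below are illustrative assumptions, not values from the repository, and the snippet assumes the repository root is on PYTHONPATH.

import torch
from deep.model import Net

net = Net(reid=True).eval()                  # reid=True: return embeddings instead of class logits
crops = torch.randn(2, 3, 128, 64)           # two hypothetical person crops, HxW = 128x64
with torch.no_grad():
    emb = net(crops)                         # shape (2, 512); each row has unit L2 norm
similarity = float((emb[0] * emb[1]).sum())  # cosine similarity reduces to a dot product
print(emb.shape, similarity)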
-------------------------------------------------------------------------------- /deep/original_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class BasicBlock(nn.Module): 6 | def __init__(self, c_in, c_out,is_downsample=False): 7 | super(BasicBlock,self).__init__() 8 | self.is_downsample = is_downsample 9 | if is_downsample: 10 | self.conv1 = nn.Conv2d(c_in, c_out, 3, stride=2, padding=1, bias=False) 11 | else: 12 | self.conv1 = nn.Conv2d(c_in, c_out, 3, stride=1, padding=1, bias=False) 13 | self.bn1 = nn.BatchNorm2d(c_out) 14 | self.relu = nn.ReLU(True) 15 | self.conv2 = nn.Conv2d(c_out,c_out,3,stride=1,padding=1, bias=False) 16 | self.bn2 = nn.BatchNorm2d(c_out) 17 | if is_downsample: 18 | self.downsample = nn.Sequential( 19 | nn.Conv2d(c_in, c_out, 1, stride=2, bias=False), 20 | nn.BatchNorm2d(c_out) 21 | ) 22 | elif c_in != c_out: 23 | self.downsample = nn.Sequential( 24 | nn.Conv2d(c_in, c_out, 1, stride=1, bias=False), 25 | nn.BatchNorm2d(c_out) 26 | ) 27 | self.is_downsample = True 28 | 29 | def forward(self,x): 30 | y = self.conv1(x) 31 | y = self.bn1(y) 32 | y = self.relu(y) 33 | y = self.conv2(y) 34 | y = self.bn2(y) 35 | if self.is_downsample: 36 | x = self.downsample(x) 37 | return F.relu(x.add(y),True) 38 | 39 | def make_layers(c_in,c_out,repeat_times, is_downsample=False): 40 | blocks = [] 41 | for i in range(repeat_times): 42 | if i ==0: 43 | blocks += [BasicBlock(c_in,c_out, is_downsample=is_downsample),] 44 | else: 45 | blocks += [BasicBlock(c_out,c_out),] 46 | return nn.Sequential(*blocks) 47 | 48 | class Net(nn.Module): 49 | def __init__(self, num_classes=625 ,reid=False): 50 | super(Net,self).__init__() 51 | # 3 128 64 52 | self.conv = nn.Sequential( 53 | nn.Conv2d(3,32,3,stride=1,padding=1), 54 | nn.BatchNorm2d(32), 55 | nn.ELU(inplace=True), 56 | nn.Conv2d(32,32,3,stride=1,padding=1), 57 | nn.BatchNorm2d(32), 58 | nn.ELU(inplace=True), 59 | nn.MaxPool2d(3,2,padding=1), 60 | ) 61 | # 32 64 32 62 | self.layer1 = make_layers(32,32,2,False) 63 | # 32 64 32 64 | self.layer2 = make_layers(32,64,2,True) 65 | # 64 32 16 66 | self.layer3 = make_layers(64,128,2,True) 67 | # 128 16 8 68 | self.dense = nn.Sequential( 69 | nn.Dropout(p=0.6), 70 | nn.Linear(128*16*8, 128), 71 | nn.BatchNorm1d(128), 72 | nn.ELU(inplace=True) 73 | ) 74 | # 256 1 1 75 | self.reid = reid 76 | self.batch_norm = nn.BatchNorm1d(128) 77 | self.classifier = nn.Sequential( 78 | nn.Linear(128, num_classes), 79 | ) 80 | 81 | def forward(self, x): 82 | x = self.conv(x) 83 | x = self.layer1(x) 84 | x = self.layer2(x) 85 | x = self.layer3(x) 86 | 87 | x = x.view(x.size(0),-1) 88 | if self.reid: 89 | x = self.dense[0](x) 90 | x = self.dense[1](x) 91 | x = x.div(x.norm(p=2,dim=1,keepdim=True)) 92 | return x 93 | x = self.dense(x) 94 | # B x 128 95 | # classifier 96 | x = self.classifier(x) 97 | return x 98 | 99 | 100 | if __name__ == '__main__': 101 | net = Net(reid=True) 102 | x = torch.randn(4,3,128,64) 103 | y = net(x) 104 | import ipdb; ipdb.set_trace() 105 | 106 | 107 | -------------------------------------------------------------------------------- /deep/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.backends.cudnn as cudnn 3 | import torchvision 4 | 5 | import argparse 6 | import os 7 | 8 | from model import Net 9 | 10 | parser = argparse.ArgumentParser(description="Train on market1501") 11 | 
parser.add_argument("--data-dir",default='data',type=str) 12 | parser.add_argument("--no-cuda",action="store_true") 13 | parser.add_argument("--gpu-id",default=0,type=int) 14 | args = parser.parse_args() 15 | 16 | # device 17 | device = "cuda:{}".format(args.gpu_id) if torch.cuda.is_available() and not args.no_cuda else "cpu" 18 | if torch.cuda.is_available() and not args.no_cuda: 19 | cudnn.benchmark = True 20 | 21 | # data loader 22 | root = args.data_dir 23 | query_dir = os.path.join(root,"query") 24 | gallery_dir = os.path.join(root,"gallery") 25 | transform = torchvision.transforms.Compose([ 26 | torchvision.transforms.Resize((128,64)), 27 | torchvision.transforms.ToTensor(), 28 | torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 29 | ]) 30 | queryloader = torch.utils.data.DataLoader( 31 | torchvision.datasets.ImageFolder(query_dir, transform=transform), 32 | batch_size=64, shuffle=False 33 | ) 34 | galleryloader = torch.utils.data.DataLoader( 35 | torchvision.datasets.ImageFolder(gallery_dir, transform=transform), 36 | batch_size=64, shuffle=False 37 | ) 38 | 39 | # net definition 40 | net = Net(reid=True) 41 | assert os.path.isfile("./checkpoint/ckpt.t7"), "Error: no checkpoint file found!" 42 | print('Loading from checkpoint/ckpt.t7') 43 | checkpoint = torch.load("./checkpoint/ckpt.t7") 44 | net_dict = checkpoint['net_dict'] 45 | net.load_state_dict(net_dict) 46 | net.eval() 47 | net.to(device) 48 | 49 | # compute features 50 | query_features = torch.tensor([]).float() 51 | query_labels = torch.tensor([]).long() 52 | gallery_features = torch.tensor([]).float() 53 | gallery_labels = torch.tensor([]).long() 54 | 55 | with torch.no_grad(): 56 | for idx,(inputs,labels) in enumerate(queryloader): 57 | inputs = inputs.to(device) 58 | features = net(inputs).cpu() 59 | query_features = torch.cat((query_features, features), dim=0) 60 | query_labels = torch.cat((query_labels, labels)) 61 | 62 | for idx,(inputs,labels) in enumerate(galleryloader): 63 | inputs = inputs.to(device) 64 | features = net(inputs).cpu() 65 | gallery_features = torch.cat((gallery_features, features), dim=0) 66 | gallery_labels = torch.cat((gallery_labels, labels)) 67 | 68 | gallery_labels -= 2 69 | 70 | # save features 71 | features = { 72 | "qf": query_features, 73 | "ql": query_labels, 74 | "gf": gallery_features, 75 | "gl": gallery_labels 76 | } 77 | torch.save(features,"features.pth") -------------------------------------------------------------------------------- /deep/train.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/deep/train.jpg -------------------------------------------------------------------------------- /deep/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import time 4 | 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | import torch 8 | import torch.backends.cudnn as cudnn 9 | import torchvision 10 | 11 | from model import Net 12 | 13 | parser = argparse.ArgumentParser(description="Train on market1501") 14 | parser.add_argument("--data-dir",default='data',type=str) 15 | parser.add_argument("--no-cuda",action="store_true") 16 | parser.add_argument("--gpu-id",default=0,type=int) 17 | parser.add_argument("--lr",default=0.1, type=float) 18 | parser.add_argument("--interval",'-i',default=20,type=int) 19 | 
parser.add_argument('--resume', '-r',action='store_true') 20 | args = parser.parse_args() 21 | 22 | # device 23 | device = "cuda:{}".format(args.gpu_id) if torch.cuda.is_available() and not args.no_cuda else "cpu" 24 | if torch.cuda.is_available() and not args.no_cuda: 25 | cudnn.benchmark = True 26 | 27 | # data loading 28 | root = args.data_dir 29 | train_dir = os.path.join(root,"train") 30 | test_dir = os.path.join(root,"test") 31 | transform_train = torchvision.transforms.Compose([ 32 | torchvision.transforms.RandomCrop((128,64),padding=4), 33 | torchvision.transforms.RandomHorizontalFlip(), 34 | torchvision.transforms.ToTensor(), 35 | torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 36 | ]) 37 | transform_test = torchvision.transforms.Compose([ 38 | torchvision.transforms.Resize((128,64)), 39 | torchvision.transforms.ToTensor(), 40 | torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 41 | ]) 42 | trainloader = torch.utils.data.DataLoader( 43 | torchvision.datasets.ImageFolder(train_dir, transform=transform_train), 44 | batch_size=64,shuffle=True 45 | ) 46 | testloader = torch.utils.data.DataLoader( 47 | torchvision.datasets.ImageFolder(test_dir, transform=transform_test), 48 | batch_size=64,shuffle=True 49 | ) 50 | num_classes = len(trainloader.dataset.classes) 51 | 52 | # net definition 53 | start_epoch = 0 54 | net = Net(num_classes=num_classes) 55 | if args.resume: 56 | assert os.path.isfile("./checkpoint/ckpt.t7"), "Error: no checkpoint file found!" 57 | print('Loading from checkpoint/ckpt.t7') 58 | checkpoint = torch.load("./checkpoint/ckpt.t7") 59 | # import ipdb; ipdb.set_trace() 60 | net_dict = checkpoint['net_dict'] 61 | net.load_state_dict(net_dict) 62 | best_acc = checkpoint['acc'] 63 | start_epoch = checkpoint['epoch'] 64 | net.to(device) 65 | 66 | # loss and optimizer 67 | criterion = torch.nn.CrossEntropyLoss() 68 | optimizer = torch.optim.SGD(net.parameters(), args.lr, momentum=0.9, weight_decay=5e-4) 69 | best_acc = 0. 70 | 71 | # train function for each epoch 72 | def train(epoch): 73 | print("\nEpoch : %d"%(epoch+1)) 74 | net.train() 75 | training_loss = 0. 76 | train_loss = 0. 77 | correct = 0 78 | total = 0 79 | interval = args.interval 80 | start = time.time() 81 | for idx, (inputs, labels) in enumerate(trainloader): 82 | # forward 83 | inputs,labels = inputs.to(device),labels.to(device) 84 | outputs = net(inputs) 85 | loss = criterion(outputs, labels) 86 | 87 | # backward 88 | optimizer.zero_grad() 89 | loss.backward() 90 | optimizer.step() 91 | 92 | # accumurating 93 | training_loss += loss.item() 94 | train_loss += loss.item() 95 | correct += outputs.max(dim=1)[1].eq(labels).sum().item() 96 | total += labels.size(0) 97 | 98 | # print 99 | if (idx+1)%interval == 0: 100 | end = time.time() 101 | print("[progress:{:.1f}%]time:{:.2f}s Loss:{:.5f} Correct:{}/{} Acc:{:.3f}%".format( 102 | 100.*(idx+1)/len(trainloader), end-start, training_loss/interval, correct, total, 100.*correct/total 103 | )) 104 | training_loss = 0. 105 | start = time.time() 106 | 107 | return train_loss/len(trainloader), 1.- correct/total 108 | 109 | def test(epoch): 110 | global best_acc 111 | net.eval() 112 | test_loss = 0. 
113 | correct = 0 114 | total = 0 115 | start = time.time() 116 | with torch.no_grad(): 117 | for idx, (inputs, labels) in enumerate(testloader): 118 | inputs, labels = inputs.to(device), labels.to(device) 119 | outputs = net(inputs) 120 | loss = criterion(outputs, labels) 121 | 122 | test_loss += loss.item() 123 | correct += outputs.max(dim=1)[1].eq(labels).sum().item() 124 | total += labels.size(0) 125 | 126 | print("Testing ...") 127 | end = time.time() 128 | print("[progress:{:.1f}%]time:{:.2f}s Loss:{:.5f} Correct:{}/{} Acc:{:.3f}%".format( 129 | 100.*(idx+1)/len(testloader), end-start, test_loss/len(testloader), correct, total, 100.*correct/total 130 | )) 131 | 132 | # saving checkpoint 133 | acc = 100.*correct/total 134 | if acc > best_acc: 135 | best_acc = acc 136 | print("Saving parameters to checkpoint/ckpt.t7") 137 | checkpoint = { 138 | 'net_dict':net.state_dict(), 139 | 'acc':acc, 140 | 'epoch':epoch, 141 | } 142 | if not os.path.isdir('checkpoint'): 143 | os.mkdir('checkpoint') 144 | torch.save(checkpoint, './checkpoint/ckpt.t7') 145 | 146 | return test_loss/len(testloader), 1.- correct/total 147 | 148 | # plot figure 149 | x_epoch = [] 150 | record = {'train_loss':[], 'train_err':[], 'test_loss':[], 'test_err':[]} 151 | fig = plt.figure() 152 | ax0 = fig.add_subplot(121, title="loss") 153 | ax1 = fig.add_subplot(122, title="top1err") 154 | def draw_curve(epoch, train_loss, train_err, test_loss, test_err): 155 | global record 156 | record['train_loss'].append(train_loss) 157 | record['train_err'].append(train_err) 158 | record['test_loss'].append(test_loss) 159 | record['test_err'].append(test_err) 160 | 161 | x_epoch.append(epoch) 162 | ax0.plot(x_epoch, record['train_loss'], 'bo-', label='train') 163 | ax0.plot(x_epoch, record['test_loss'], 'ro-', label='val') 164 | ax1.plot(x_epoch, record['train_err'], 'bo-', label='train') 165 | ax1.plot(x_epoch, record['test_err'], 'ro-', label='val') 166 | if epoch == 0: 167 | ax0.legend() 168 | ax1.legend() 169 | fig.savefig("train.jpg") 170 | 171 | # lr decay 172 | def lr_decay(): 173 | global optimizer 174 | for params in optimizer.param_groups: 175 | params['lr'] *= 0.1 176 | lr = params['lr'] 177 | print("Learning rate adjusted to {}".format(lr)) 178 | 179 | def main(): 180 | for epoch in range(start_epoch, start_epoch+40): 181 | train_loss, train_err = train(epoch) 182 | test_loss, test_err = test(epoch) 183 | draw_curve(epoch, train_loss, train_err, test_loss, test_err) 184 | if (epoch+1)%20==0: 185 | lr_decay() 186 | 187 | 188 | if __name__ == '__main__': 189 | main() -------------------------------------------------------------------------------- /deep_sort.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | 4 | from deep.feature_extractor import Extractor 5 | from sort.nn_matching import NearestNeighborDistanceMetric 6 | from sort.preprocessing import non_max_suppression 7 | from sort.detection import Detection 8 | from sort.tracker import Tracker 9 | 10 | 11 | class DeepSort(object): 12 | def __init__(self, model_path): 13 | self.min_confidence = 0.3 14 | self.nms_max_overlap = 1.0 15 | 16 | self.extractor = Extractor(model_path, use_cuda=True) 17 | 18 | max_cosine_distance = 0.2 19 | nn_budget = 100 20 | metric = NearestNeighborDistanceMetric("cosine", max_cosine_distance, nn_budget) 21 | self.tracker = Tracker(metric) 22 | 23 | def update(self, bbox_xywh, confidences, ori_img): 24 | self.height, self.width = ori_img.shape[:2] 25 | # generate 
detections 26 | # features: the appearance feature vector for each box 27 | features = self._get_features(bbox_xywh, ori_img) 28 | # each detection holds self.tlwh (top-left x, y, width, height), self.confidence and self.feature 29 | # detections are stored as ndarrays 30 | # the confidence filtering and NMS below could be considered for removal 31 | detections = [Detection(bbox_xywh[i], conf, features[i]) for i,conf in enumerate(confidences) if conf>self.min_confidence] 32 | 33 | # run non-maximum suppression 34 | boxes = np.array([d.tlwh for d in detections]) 35 | scores = np.array([d.confidence for d in detections]) 36 | indices = non_max_suppression( boxes, self.nms_max_overlap, scores) 37 | detections = [detections[i] for i in indices] 38 | 39 | # update tracker 40 | self.tracker.predict() 41 | self.tracker.update(detections) 42 | 43 | # output bbox identities 44 | outputs = [] 45 | for track in self.tracker.tracks: 46 | if not track.is_confirmed() or track.time_since_update > 1: 47 | continue 48 | box = track.to_tlwh() 49 | x1,y1,x2,y2 = self._xywh_to_xyxy(box) 50 | track_id = track.track_id 51 | outputs.append(np.array([x1,y1,x2,y2,track_id], dtype=np.int)) 52 | if len(outputs) > 0: 53 | outputs = np.stack(outputs,axis=0) 54 | return outputs 55 | 56 | def _xywh_to_xyxy(self, bbox_xywh): 57 | x,y,w,h = bbox_xywh 58 | x1 = max(int(x-w/2),0) 59 | x2 = min(int(x+w/2),self.width-1) 60 | y1 = max(int(y-h/2),0) 61 | y2 = min(int(y+h/2),self.height-1) 62 | return x1,y1,x2,y2 63 | 64 | def _get_features(self, bbox_xywh, ori_img): 65 | features = [] 66 | for box in bbox_xywh: 67 | x1,y1,x2,y2 = self._xywh_to_xyxy(box) 68 | # print(y1,y2,x1,x2) 69 | im = ori_img[y1:y2,x1:x2] 70 | #cv2.imshow("d",im) 71 | #cv2.waitKey(0) 72 | feature = self.extractor(im)[0] 73 | features.append(feature) 74 | if len(features): 75 | features = np.stack(features, axis=0) 76 | else: 77 | features = np.array([]) 78 | return features 79 | 80 | 81 | 82 | if __name__ == '__main__': 83 | pass 84 | -------------------------------------------------------------------------------- /det/det_dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/det/det_dog.jpg -------------------------------------------------------------------------------- /det/det_eagle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/det/det_eagle.jpg -------------------------------------------------------------------------------- /det/det_giraffe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/det/det_giraffe.jpg -------------------------------------------------------------------------------- /det/det_herd_of_horses.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/det/det_herd_of_horses.jpg -------------------------------------------------------------------------------- /det/det_img1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/det/det_img1.jpg
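A minimal usage sketch for the DeepSort wrapper defined in deep_sort.py above: update() takes boxes as centre-x, centre-y, width, height in pixels, one confidence per box, and the original BGR frame, and returns [x1, y1, x2, y2, track_id] rows for confirmed tracks. The checkpoint path, input image and detection box below are illustrative assumptions rather than values shipped with the repository.

import cv2
import numpy as np
from deep_sort import DeepSort

tracker = DeepSort("deep/checkpoint/ckpt.t7")          # hypothetical path to a trained re-ID checkpoint
frame = cv2.imread("imgs/person.jpg")                  # any BGR frame
bbox_xywh = np.array([[200.0, 150.0, 80.0, 160.0]])    # centre-x, centre-y, width, height
confidences = [0.9]

outputs = tracker.update(bbox_xywh, confidences, frame)  # empty until a track is confirmed
for x1, y1, x2, y2, track_id in outputs:
    print("track", track_id, "box", (x1, y1, x2, y2))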
-------------------------------------------------------------------------------- /det/det_img2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/det/det_img2.jpg -------------------------------------------------------------------------------- /det/det_img3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/det/det_img3.jpg -------------------------------------------------------------------------------- /det/det_img4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/det/det_img4.jpg -------------------------------------------------------------------------------- /det/det_messi.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/det/det_messi.jpg -------------------------------------------------------------------------------- /det/det_person.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/det/det_person.jpg -------------------------------------------------------------------------------- /det/det_scream.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/det/det_scream.jpg -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import time 4 | 5 | import uuid 6 | 7 | from backbone.base import Base as BackboneBase 8 | from config.train_config import TrainConfig 9 | from config.eval_config import EvalConfig 10 | from config.config import Config 11 | from dataset.base import Base as DatasetBase 12 | from evaluator import Evaluator 13 | from logger import Logger as Log 14 | from model import Model 15 | from roi.pooler_ import Pooler 16 | from dataset.AVA_video_v2 import AVA_video 17 | def _eval(path_to_checkpoint, backbone_name, path_to_results_dir): 18 | dataset = AVA_video(EvalConfig.VAL_DATA) 19 | evaluator = Evaluator(dataset, path_to_results_dir) 20 | 21 | Log.i('Found {:d} samples'.format(len(dataset))) 22 | 23 | backbone = BackboneBase.from_name(backbone_name)() 24 | model = Model(backbone, dataset.num_classes(), pooler_mode=Config.POOLER_MODE, 25 | anchor_ratios=Config.ANCHOR_RATIOS, anchor_sizes=Config.ANCHOR_SIZES, 26 | rpn_pre_nms_top_n=TrainConfig.RPN_PRE_NMS_TOP_N, rpn_post_nms_top_n=TrainConfig.RPN_POST_NMS_TOP_N).cuda() 27 | model.load(path_to_checkpoint) 28 | print("load from:",path_to_checkpoint) 29 | Log.i('Start evaluating with 1 GPU (1 batch per GPU)') 30 | mean_ap, detail = evaluator.evaluate(model) 31 | Log.i('Done') 32 | Log.i('mean AP = {:.4f}'.format(mean_ap)) 33 | Log.i('\n' + detail) 34 | 35 | 36 | if __name__ == '__main__': 37 | def main(): 38 | path_to_checkpoint = EvalConfig.PATH_TO_CHECKPOINT 39 | 
backbone_name = Config.BACKBONE_NAME 40 | path_to_results_dir='/home/aiuser/ava/ava/'+EvalConfig.PATH_TO_RESULTS 41 | Log.initialize(os.path.join('/home/aiuser/ava_v2.2', 'eval.log')) 42 | _eval(path_to_checkpoint, backbone_name, path_to_results_dir) 43 | 44 | main() 45 | -------------------------------------------------------------------------------- /extention/functional.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from torch import Tensor 4 | 5 | 6 | def beta_smooth_l1_loss(input: Tensor, target: Tensor, beta: float) -> Tensor: 7 | diff = torch.abs(input - target) 8 | loss = torch.where(diff < beta, 0.5 * diff ** 2 / beta, diff - 0.5 * beta) 9 | loss = loss.sum() / (input.numel() + 1e-8) 10 | return loss 11 | -------------------------------------------------------------------------------- /extention/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from torch.optim import Optimizer 3 | from torch.optim.lr_scheduler import MultiStepLR 4 | 5 | 6 | class WarmUpMultiStepLR(MultiStepLR): 7 | def __init__(self, optimizer: Optimizer, milestones: List[int], gamma: float = 0.1, 8 | factor: float = 0.3333, num_iters: int = 500, last_epoch: int = -1): 9 | self.factor = factor 10 | self.num_iters = num_iters 11 | super().__init__(optimizer, milestones, gamma, last_epoch) 12 | 13 | def get_lr(self) -> List[float]: 14 | if self.last_epoch < self.num_iters: 15 | alpha = self.last_epoch / self.num_iters 16 | factor = (1 - self.factor) * alpha + self.factor 17 | else: 18 | factor = 1 19 | 20 | return [lr * factor for lr in super().get_lr()] 21 | -------------------------------------------------------------------------------- /f.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import division 3 | 4 | import warnings 5 | import math 6 | import types 7 | from torch.nn import functional as F 8 | import torch 9 | from torch._C import _infer_size, _add_docstr 10 | torch.nn.MultiLabelSoftMarginLoss 11 | @torch._jit_internal.weak_script 12 | def multilabel_soft_margin_loss(input, target, weight=None, size_average=None, 13 | reduce=None, reduction='mean'): 14 | # type: (Tensor, Tensor, Optional[Tensor], Optional[bool], Optional[bool], str) -> Tensor 15 | r"""multilabel_soft_margin_loss(input, target, weight=None, size_average=None) -> Tensor 16 | 17 | See :class:`~torch.nn.MultiLabelSoftMarginLoss` for details. 
18 | """ 19 | loss = -(target * torch.log(input) + (1 - target) * torch.log(-input)) 20 | 21 | if weight is not None: 22 | loss = loss * torch.jit._unwrap_optional(weight) 23 | loss = loss.sum(dim=1) / input.size(1) # only return N loss values 24 | #loss = loss.sum(dim=1) 25 | if reduction == 'none': 26 | ret = loss 27 | elif reduction == 'mean': 28 | ret = loss.mean() 29 | elif reduction == 'sum': 30 | ret = loss.sum() 31 | else: 32 | ret = input 33 | raise ValueError(reduction + " is not valid") 34 | return ret 35 | 36 | def focal_cross_entropy(input, target, weight=None, ignore_index=-100,reduction='mean'): 37 | input=torch.mul(torch.mul((1-F.softmax(input, 1)),(1-F.softmax(input, 1))),(F.log_softmax(input, 1))) 38 | return F.nll_loss(input, target, weight, None, ignore_index, None, reduction) 39 | 40 | if __name__ == '__main__': 41 | input=[[0.4,0.9]] 42 | input=torch.tensor(input,dtype=torch.float) 43 | target=[0] 44 | target=torch.tensor(target,dtype=torch.long) 45 | print(F.softmax(input, 1)) 46 | print((1-F.softmax(input, 1))) 47 | print(torch.mul((1-F.softmax(input, 1)),(1-F.softmax(input, 1)))) 48 | print(F.log_softmax(input, 1)) 49 | print(F.cross_entropy(input,target)) 50 | print(focal_cross_entropy(input,target)) -------------------------------------------------------------------------------- /img_to_video.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | img_root = '/home/aiuser/frames/'#这里写你的文件夹路径,比如:/home/youname/data/img/,注意最后一个文件夹要有斜杠 4 | fps = 15 #保存视频的FPS,可以适当调整 5 | fourcc = cv2.VideoWriter_fourcc(*'MJPG') 6 | videoWriter = cv2.VideoWriter('/home/aiuser/frames/saveVideo.avi',fourcc,fps,(656,480)) 7 | for i in range(121): 8 | if i>=10: 9 | frame = cv2.imread(img_root + str(i) + '.jpg') 10 | videoWriter.write(frame) 11 | videoWriter.release() -------------------------------------------------------------------------------- /imgs/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/imgs/dog.jpg -------------------------------------------------------------------------------- /imgs/eagle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/imgs/eagle.jpg -------------------------------------------------------------------------------- /imgs/giraffe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/imgs/giraffe.jpg -------------------------------------------------------------------------------- /imgs/herd_of_horses.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/imgs/herd_of_horses.jpg -------------------------------------------------------------------------------- /imgs/img1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/imgs/img1.jpg -------------------------------------------------------------------------------- /imgs/img2.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/imgs/img2.jpg -------------------------------------------------------------------------------- /imgs/img3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/imgs/img3.jpg -------------------------------------------------------------------------------- /imgs/img4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/imgs/img4.jpg -------------------------------------------------------------------------------- /imgs/messi.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/imgs/messi.jpg -------------------------------------------------------------------------------- /imgs/person.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/imgs/person.jpg -------------------------------------------------------------------------------- /imgs/scream.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/imgs/scream.jpg -------------------------------------------------------------------------------- /infer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import random 4 | import torch 5 | 6 | from PIL import ImageDraw 7 | from torchvision.transforms import transforms 8 | from dataset.base import Base as DatasetBase 9 | from backbone.base import Base as BackboneBase 10 | from bbox import BBox 11 | from model import Model 12 | from roi.pooler_ import Pooler 13 | from config.eval_config import EvalConfig as Config 14 | 15 | 16 | def _infer(path_to_input_image: str, path_to_output_image: str, path_to_checkpoint: str, dataset_name: str, backbone_name: str, prob_thresh: float): 17 | #dataset_class = DatasetBase.from_name(dataset_name) 18 | dataset_class=80 19 | backbone = BackboneBase.from_name(backbone_name)() 20 | # model = Model(backbone, dataset_class.num_classes(), pooler_mode=Config.POOLER_MODE, 21 | # anchor_ratios=Config.ANCHOR_RATIOS, anchor_sizes=Config.ANCHOR_SIZES, 22 | # rpn_pre_nms_top_n=Config.RPN_PRE_NMS_TOP_N, rpn_post_nms_top_n=Config.RPN_POST_NMS_TOP_N).cuda() 23 | model = Model(backbone, dataset_class, pooler_mode=Config.POOLER_MODE, 24 | anchor_ratios=Config.ANCHOR_RATIOS, anchor_sizes=Config.ANCHOR_SIZES, 25 | rpn_pre_nms_top_n=Config.RPN_PRE_NMS_TOP_N, rpn_post_nms_top_n=Config.RPN_POST_NMS_TOP_N).cuda() 26 | model.load(path_to_checkpoint) 27 | 28 | with torch.no_grad(): 29 | image = transforms.Image.open(path_to_input_image) 30 | image_tensor, scale = dataset_class.preprocess(image, Config.IMAGE_MIN_SIDE, Config.IMAGE_MAX_SIDE) 31 | 32 | detection_bboxes, detection_classes, detection_probs, _ = \ 33 | model.eval().forward(image_tensor.unsqueeze(dim=0).cuda()) 34 | 
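# `scale` is the resize factor returned by preprocess(), so the detections produced
# above are expressed in resized-image coordinates; dividing by `scale` on the next
# line maps them back onto the original image. Illustrative numbers (hypothetical,
# not taken from the config): if a 600-pixel shorter side were resized to 800,
# scale = 800 / 600 ~ 1.33 and a predicted x of 400 corresponds to 400 / 1.33 ~ 300
# in the original frame.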
detection_bboxes /= scale 35 | 36 | kept_indices = detection_probs > prob_thresh 37 | detection_bboxes = detection_bboxes[kept_indices] 38 | detection_classes = detection_classes[kept_indices] 39 | detection_probs = detection_probs[kept_indices] 40 | 41 | draw = ImageDraw.Draw(image) 42 | 43 | for bbox, cls, prob in zip(detection_bboxes.tolist(), detection_classes.tolist(), detection_probs.tolist()): 44 | color = random.choice(['red', 'green', 'blue', 'yellow', 'purple', 'white']) 45 | bbox = BBox(left=bbox[0], top=bbox[1], right=bbox[2], bottom=bbox[3]) 46 | category = dataset_class.LABEL_TO_CATEGORY_DICT[cls] 47 | 48 | draw.rectangle(((bbox.left, bbox.top), (bbox.right, bbox.bottom)), outline=color) 49 | draw.text((bbox.left, bbox.top), text=f'{category:s} {prob:.3f}', fill=color) 50 | 51 | image.save(path_to_output_image) 52 | print(f'Output image is saved to {path_to_output_image}') 53 | 54 | 55 | if __name__ == '__main__': 56 | def main(): 57 | parser = argparse.ArgumentParser() 58 | parser.add_argument('-s', '--dataset', type=str, choices=DatasetBase.OPTIONS, required=True, help='name of dataset') 59 | parser.add_argument('-b', '--backbone', type=str, choices=BackboneBase.OPTIONS, required=True, help='name of backbone model') 60 | parser.add_argument('-c', '--checkpoint', type=str, required=True, help='path to checkpoint') 61 | parser.add_argument('-p', '--probability_threshold', type=float, default=0.6, help='threshold of detection probability') 62 | parser.add_argument('--image_min_side', type=float, help='default: {:g}'.format(Config.IMAGE_MIN_SIDE)) 63 | parser.add_argument('--image_max_side', type=float, help='default: {:g}'.format(Config.IMAGE_MAX_SIDE)) 64 | parser.add_argument('--anchor_ratios', type=str, help='default: "{!s}"'.format(Config.ANCHOR_RATIOS)) 65 | parser.add_argument('--anchor_sizes', type=str, help='default: "{!s}"'.format(Config.ANCHOR_SIZES)) 66 | parser.add_argument('--pooler_mode', type=str, choices=Pooler.OPTIONS, help='default: {.value:s}'.format(Config.POOLER_MODE)) 67 | parser.add_argument('--rpn_pre_nms_top_n', type=int, help='default: {:d}'.format(Config.RPN_PRE_NMS_TOP_N)) 68 | parser.add_argument('--rpn_post_nms_top_n', type=int, help='default: {:d}'.format(Config.RPN_POST_NMS_TOP_N)) 69 | parser.add_argument('input', type=str, help='path to input image') 70 | parser.add_argument('output', type=str, help='path to output result image') 71 | args = parser.parse_args() 72 | 73 | path_to_input_image = args.input 74 | path_to_output_image = args.output 75 | dataset_name = args.dataset 76 | backbone_name = args.backbone 77 | path_to_checkpoint = args.checkpoint 78 | prob_thresh = args.probability_threshold 79 | 80 | os.makedirs(os.path.join(os.path.curdir, os.path.dirname(path_to_output_image)), exist_ok=True) 81 | 82 | Config.setup(image_min_side=args.image_min_side, image_max_side=args.image_max_side, 83 | anchor_ratios=args.anchor_ratios, anchor_sizes=args.anchor_sizes, pooler_mode=args.pooler_mode, 84 | rpn_pre_nms_top_n=args.rpn_pre_nms_top_n, rpn_post_nms_top_n=args.rpn_post_nms_top_n) 85 | 86 | print('Arguments:') 87 | for k, v in vars(args).items(): 88 | print(f'\t{k} = {v}') 89 | print(Config.describe()) 90 | 91 | _infer(path_to_input_image, path_to_output_image, path_to_checkpoint, dataset_name, backbone_name, prob_thresh) 92 | 93 | main() -------------------------------------------------------------------------------- /infer_stream.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import
itertools 3 | import random 4 | import time 5 | import torch 6 | 7 | import cv2 8 | import numpy as np 9 | from PIL import ImageDraw, Image 10 | 11 | from backbone.base import Base as BackboneBase 12 | from config.eval_config import EvalConfig as Config 13 | from dataset.base import Base as DatasetBase 14 | from bbox import BBox 15 | from model import Model 16 | from roi.pooler_ import Pooler 17 | 18 | 19 | def _infer_stream(path_to_input_stream_endpoint: str, period_of_inference: int, path_to_checkpoint: str, dataset_name: str, backbone_name: str, prob_thresh: float): 20 | dataset_class = DatasetBase.from_name(dataset_name) 21 | backbone = BackboneBase.from_name(backbone_name)(pretrained=False) 22 | model = Model(backbone, dataset_class.num_classes(), pooler_mode=Config.POOLER_MODE, 23 | anchor_ratios=Config.ANCHOR_RATIOS, anchor_sizes=Config.ANCHOR_SIZES, 24 | rpn_pre_nms_top_n=Config.RPN_PRE_NMS_TOP_N, rpn_post_nms_top_n=Config.RPN_POST_NMS_TOP_N).cuda() 25 | model.load(path_to_checkpoint) 26 | 27 | if path_to_input_stream_endpoint.isdigit(): 28 | path_to_input_stream_endpoint = int(path_to_input_stream_endpoint) 29 | video_capture = cv2.VideoCapture(path_to_input_stream_endpoint) 30 | 31 | with torch.no_grad(): 32 | for sn in itertools.count(start=1): 33 | _, frame = video_capture.read() 34 | 35 | if sn % period_of_inference != 0: 36 | continue 37 | 38 | timestamp = time.time() 39 | 40 | image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 41 | image = Image.fromarray(image) 42 | image_tensor, scale = dataset_class.preprocess(image, Config.IMAGE_MIN_SIDE, Config.IMAGE_MAX_SIDE) 43 | 44 | detection_bboxes, detection_classes, detection_probs, _ = \ 45 | model.eval().forward(image_tensor.unsqueeze(dim=0).cuda()) 46 | detection_bboxes /= scale 47 | 48 | kept_indices = detection_probs > prob_thresh 49 | detection_bboxes = detection_bboxes[kept_indices] 50 | detection_classes = detection_classes[kept_indices] 51 | detection_probs = detection_probs[kept_indices] 52 | 53 | draw = ImageDraw.Draw(image) 54 | 55 | for bbox, cls, prob in zip(detection_bboxes.tolist(), detection_classes.tolist(), detection_probs.tolist()): 56 | color = random.choice(['red', 'green', 'blue', 'yellow', 'purple', 'white']) 57 | bbox = BBox(left=bbox[0], top=bbox[1], right=bbox[2], bottom=bbox[3]) 58 | category = dataset_class.LABEL_TO_CATEGORY_DICT[cls] 59 | 60 | draw.rectangle(((bbox.left, bbox.top), (bbox.right, bbox.bottom)), outline=color) 61 | draw.text((bbox.left, bbox.top), text=f'{category:s} {prob:.3f}', fill=color) 62 | 63 | image = np.array(image) 64 | frame = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) 65 | 66 | elapse = time.time() - timestamp 67 | fps = 1 / elapse 68 | cv2.putText(frame, f'FPS = {fps:.1f}', (20, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA) 69 | 70 | cv2.imshow('easy-faster-rcnn.pytorch', frame) 71 | if cv2.waitKey(10) == 27: 72 | break 73 | 74 | video_capture.release() 75 | cv2.destroyAllWindows() 76 | 77 | 78 | if __name__ == '__main__': 79 | def main(): 80 | parser = argparse.ArgumentParser() 81 | parser.add_argument('-s', '--dataset', type=str, choices=DatasetBase.OPTIONS, required=True, help='name of dataset') 82 | parser.add_argument('-b', '--backbone', type=str, choices=BackboneBase.OPTIONS, required=True, help='name of backbone model') 83 | parser.add_argument('-c', '--checkpoint', type=str, required=True, help='path to checkpoint') 84 | parser.add_argument('-p', '--probability_threshold', type=float, default=0.6, help='threshold of detection probability') 85 | 
parser.add_argument('--image_min_side', type=float, help='default: {:g}'.format(Config.IMAGE_MIN_SIDE)) 86 | parser.add_argument('--image_max_side', type=float, help='default: {:g}'.format(Config.IMAGE_MAX_SIDE)) 87 | parser.add_argument('--anchor_ratios', type=str, help='default: "{!s}"'.format(Config.ANCHOR_RATIOS)) 88 | parser.add_argument('--anchor_sizes', type=str, help='default: "{!s}"'.format(Config.ANCHOR_SIZES)) 89 | parser.add_argument('--pooler_mode', type=str, choices=Pooler.OPTIONS, help='default: {.value:s}'.format(Config.POOLER_MODE)) 90 | parser.add_argument('--rpn_pre_nms_top_n', type=int, help='default: {:d}'.format(Config.RPN_PRE_NMS_TOP_N)) 91 | parser.add_argument('--rpn_post_nms_top_n', type=int, help='default: {:d}'.format(Config.RPN_POST_NMS_TOP_N)) 92 | parser.add_argument('input', type=str, help='path to input stream endpoint') 93 | parser.add_argument('period', type=int, help='period of inference') 94 | args = parser.parse_args() 95 | 96 | path_to_input_stream_endpoint = args.input 97 | period_of_inference = args.period 98 | dataset_name = args.dataset 99 | backbone_name = args.backbone 100 | path_to_checkpoint = args.checkpoint 101 | prob_thresh = args.probability_threshold 102 | 103 | Config.setup(image_min_side=args.image_min_side, image_max_side=args.image_max_side, 104 | anchor_ratios=args.anchor_ratios, anchor_sizes=args.anchor_sizes, pooler_mode=args.pooler_mode, 105 | rpn_pre_nms_top_n=args.rpn_pre_nms_top_n, rpn_post_nms_top_n=args.rpn_post_nms_top_n) 106 | 107 | print('Arguments:') 108 | for k, v in vars(args).items(): 109 | print(f'\t{k} = {v}') 110 | print(Config.describe()) 111 | 112 | _infer_stream(path_to_input_stream_endpoint, period_of_inference, path_to_checkpoint, dataset_name, backbone_name, prob_thresh) 113 | 114 | main() 115 | -------------------------------------------------------------------------------- /infer_websocket.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | import json 4 | 5 | import numpy as np 6 | import torch 7 | import websockets 8 | from PIL import Image 9 | 10 | from backbone.base import Base as BackboneBase 11 | from bbox import BBox 12 | from config.eval_config import EvalConfig as Config 13 | from dataset.base import Base as DatasetBase 14 | from model import Model 15 | from roi.pooler_ import Pooler 16 | 17 | 18 | def _infer_websocket(path_to_checkpoint: str, dataset_name: str, backbone_name: str, prob_thresh: float): 19 | dataset_class = DatasetBase.from_name(dataset_name) 20 | backbone = BackboneBase.from_name(backbone_name)(pretrained=False) 21 | model = Model(backbone, dataset_class.num_classes(), pooler_mode=Config.POOLER_MODE, 22 | anchor_ratios=Config.ANCHOR_RATIOS, anchor_sizes=Config.ANCHOR_SIZES, 23 | rpn_pre_nms_top_n=Config.RPN_PRE_NMS_TOP_N, rpn_post_nms_top_n=Config.RPN_POST_NMS_TOP_N).cuda() 24 | model.load(path_to_checkpoint) 25 | 26 | async def handler(websocket, path): 27 | print('Connection established:', path) 28 | 29 | with torch.no_grad(): 30 | while True: 31 | frame = await websocket.recv() 32 | frame = np.frombuffer(frame, dtype=np.uint8).reshape(480, 640, 3) 33 | 34 | image = Image.fromarray(frame) 35 | image_tensor, scale = dataset_class.preprocess(image, Config.IMAGE_MIN_SIDE, Config.IMAGE_MAX_SIDE) 36 | 37 | detection_bboxes, detection_classes, detection_probs, _ = \ 38 | model.eval().forward(image_tensor.unsqueeze(dim=0).cuda()) 39 | detection_bboxes /= scale 40 | 41 | kept_indices = detection_probs > 
prob_thresh 42 | detection_bboxes = detection_bboxes[kept_indices] 43 | detection_classes = detection_classes[kept_indices] 44 | detection_probs = detection_probs[kept_indices] 45 | 46 | message = [] 47 | 48 | for bbox, cls, prob in zip(detection_bboxes.tolist(), detection_classes.tolist(), detection_probs.tolist()): 49 | bbox = BBox(left=bbox[0], top=bbox[1], right=bbox[2], bottom=bbox[3]) 50 | category = dataset_class.LABEL_TO_CATEGORY_DICT[cls] 51 | 52 | message.append({ 53 | 'left': int(bbox.left), 54 | 'top': int(bbox.top), 55 | 'right': int(bbox.right), 56 | 'bottom': int(bbox.bottom), 57 | 'category': category 58 | }) 59 | 60 | message = json.dumps(message) 61 | await websocket.send(message) 62 | 63 | server = websockets.serve(handler, host='*', port=8765, max_size=2 ** 32, compression=None) 64 | asyncio.get_event_loop().run_until_complete(server) 65 | print('Service is ready. Please navigate to http://127.0.0.1:8000/') 66 | asyncio.get_event_loop().run_forever() 67 | 68 | 69 | if __name__ == '__main__': 70 | def main(): 71 | parser = argparse.ArgumentParser() 72 | parser.add_argument('-s', '--dataset', type=str, choices=DatasetBase.OPTIONS, required=True, help='name of dataset') 73 | parser.add_argument('-b', '--backbone', type=str, choices=BackboneBase.OPTIONS, required=True, help='name of backbone model') 74 | parser.add_argument('-c', '--checkpoint', type=str, required=True, help='path to checkpoint') 75 | parser.add_argument('-p', '--probability_threshold', type=float, default=0.6, help='threshold of detection probability') 76 | parser.add_argument('--image_min_side', type=float, help='default: {:g}'.format(Config.IMAGE_MIN_SIDE)) 77 | parser.add_argument('--image_max_side', type=float, help='default: {:g}'.format(Config.IMAGE_MAX_SIDE)) 78 | parser.add_argument('--anchor_ratios', type=str, help='default: "{!s}"'.format(Config.ANCHOR_RATIOS)) 79 | parser.add_argument('--anchor_sizes', type=str, help='default: "{!s}"'.format(Config.ANCHOR_SIZES)) 80 | parser.add_argument('--pooler_mode', type=str, choices=Pooler.OPTIONS, help='default: {.value:s}'.format(Config.POOLER_MODE)) 81 | parser.add_argument('--rpn_pre_nms_top_n', type=int, help='default: {:d}'.format(Config.RPN_PRE_NMS_TOP_N)) 82 | parser.add_argument('--rpn_post_nms_top_n', type=int, help='default: {:d}'.format(Config.RPN_POST_NMS_TOP_N)) 83 | args = parser.parse_args() 84 | 85 | dataset_name = args.dataset 86 | backbone_name = args.backbone 87 | path_to_checkpoint = args.checkpoint 88 | prob_thresh = args.probability_threshold 89 | 90 | Config.setup(image_min_side=args.image_min_side, image_max_side=args.image_max_side, 91 | anchor_ratios=args.anchor_ratios, anchor_sizes=args.anchor_sizes, pooler_mode=args.pooler_mode, 92 | rpn_pre_nms_top_n=args.rpn_pre_nms_top_n, rpn_post_nms_top_n=args.rpn_post_nms_top_n) 93 | 94 | print('Arguments:') 95 | for k, v in vars(args).items(): 96 | print(f'\t{k} = {v}') 97 | print(Config.describe()) 98 | 99 | _infer_websocket(path_to_checkpoint, dataset_name, backbone_name, prob_thresh) 100 | 101 | main() 102 | -------------------------------------------------------------------------------- /logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | class Logger(object): 5 | Initialized = False 6 | 7 | @staticmethod 8 | def initialize(path_to_log_file): 9 | logging.basicConfig(level=logging.INFO, 10 | format='%(asctime)s %(levelname)-8s %(message)s', 11 | datefmt='%Y-%m-%d %H:%M:%S', 12 | 
handlers=[logging.FileHandler(path_to_log_file), 13 | logging.StreamHandler()]) 14 | Logger.Initialized = True 15 | 16 | @staticmethod 17 | def log(level, message): 18 | assert Logger.Initialized, 'Logger has not been initialized' 19 | logging.log(level, message) 20 | 21 | @staticmethod 22 | def d(message): 23 | Logger.log(logging.DEBUG, message) 24 | 25 | @staticmethod 26 | def i(message): 27 | Logger.log(logging.INFO, message) 28 | 29 | @staticmethod 30 | def w(message): 31 | Logger.log(logging.WARNING, message) 32 | 33 | @staticmethod 34 | def e(message): 35 | Logger.log(logging.ERROR, message) 36 | -------------------------------------------------------------------------------- /logs/events.out.tfevents.1555900792.aiuser-Z390-GAMING-X: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/logs/events.out.tfevents.1555900792.aiuser-Z390-GAMING-X -------------------------------------------------------------------------------- /logs/events.out.tfevents.1555900949.aiuser-Z390-GAMING-X: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/logs/events.out.tfevents.1555900949.aiuser-Z390-GAMING-X -------------------------------------------------------------------------------- /outputs/frames/blank.TXT: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/outputs/frames/blank.TXT -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | import numpy as np 8 | import cv2 9 | import matplotlib.pyplot as plt 10 | from util import count_parameters as count 11 | from util import convert2cpu as cpu 12 | from PIL import Image, ImageDraw 13 | 14 | 15 | def letterbox_image(img, inp_dim): 16 | '''resize image with unchanged aspect ratio using padding''' 17 | img_w, img_h = img.shape[1], img.shape[0] 18 | w, h = inp_dim 19 | new_w = int(img_w * min(w/img_w, h/img_h)) 20 | new_h = int(img_h * min(w/img_w, h/img_h)) 21 | resized_image = cv2.resize(img, (new_w,new_h), interpolation = cv2.INTER_CUBIC) 22 | 23 | canvas = np.full((inp_dim[1], inp_dim[0], 3), 128) 24 | 25 | canvas[(h-new_h)//2:(h-new_h)//2 + new_h,(w-new_w)//2:(w-new_w)//2 + new_w, :] = resized_image 26 | 27 | return canvas 28 | 29 | 30 | 31 | def prep_image(img, inp_dim): 32 | """ 33 | Prepare image for inputting to the neural network. 
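The frame is first letterboxed by letterbox_image() above: it is scaled to fit
inside inp_dim with its aspect ratio preserved, and the rest of the square canvas
is padded with gray (value 128). Illustrative example (hypothetical sizes): a
1280x720 frame with inp_dim = 416 is resized to 416x234 and centered on a
416x416 canvas.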
34 | 35 | Returns a Variable 36 | """ 37 | 38 | orig_im = cv2.imread(img) 39 | dim = orig_im.shape[1], orig_im.shape[0] 40 | img = (letterbox_image(orig_im, (inp_dim, inp_dim))) 41 | img_ = img[:,:,::-1].transpose((2,0,1)).copy() 42 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) 43 | return img_, orig_im, dim 44 | 45 | def prep_image_pil(img, network_dim): 46 | orig_im = Image.open(img) 47 | img = orig_im.convert('RGB') 48 | dim = img.size 49 | img = img.resize(network_dim) 50 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(img.tobytes())) 51 | img = img.view(*network_dim, 3).transpose(0,1).transpose(0,2).contiguous() 52 | img = img.view(1, 3,*network_dim) 53 | img = img.float().div(255.0) 54 | return (img, orig_im, dim) 55 | 56 | def inp_to_image(inp): 57 | inp = inp.cpu().squeeze() 58 | inp = inp*255 59 | try: 60 | inp = inp.data.numpy() 61 | except RuntimeError: 62 | inp = inp.numpy() 63 | inp = inp.transpose(1,2,0) 64 | 65 | inp = inp[:,:,::-1] 66 | return inp 67 | 68 | 69 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | opencv-python==4.2.0.32 2 | torch==1.0.0 3 | torchvision==0.2.1 4 | msgpack==0.6.1 5 | -------------------------------------------------------------------------------- /roi/pooler.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | import math 3 | import torch 4 | from torch import Tensor 5 | from torch.nn import functional as F 6 | 7 | from support.layer.roi_align import ROIAlign 8 | 9 | 10 | class Pooler(object): 11 | 12 | class Mode(Enum): 13 | POOLING = 'pooling' 14 | ALIGN = 'align' 15 | 16 | OPTIONS = ['pooling', 'align'] 17 | 18 | @staticmethod 19 | def apply(features: Tensor, proposal_bboxes: Tensor, proposal_batch_indices: Tensor, mode: Mode) -> Tensor: 20 | _, _, feature_map_height, feature_map_width = features.shape 21 | scale = 1 / 16 22 | output_size = (7, 7) 23 | # sure 2 24 | #print("proposal_batch_indices:",proposal_batch_indices) 25 | if mode == Pooler.Mode.POOLING: 26 | pool = [] 27 | #print("debug_pooling:",proposal_batch_indices.shape) 28 | for (proposal_bbox, proposal_batch_index) in zip(proposal_bboxes, proposal_batch_indices): 29 | start_x = max(min(round(proposal_bbox[0].item() * scale), feature_map_width - 1), 0) # [0, feature_map_width) 30 | start_y = max(min(round(proposal_bbox[1].item() * scale), feature_map_height - 1), 0) # (0, feature_map_height] 31 | end_x = max(min(round(proposal_bbox[2].item() * scale) + 1, feature_map_width), 1) # [0, feature_map_width) 32 | end_y = max(min(round(proposal_bbox[3].item() * scale) + 1, feature_map_height), 1) # (0, feature_map_height] 33 | # sure 3 34 | #print("position:",start_x,start_y,end_x,end_y) 35 | h=end_y-start_y 36 | w=end_x-start_x 37 | if h<7: 38 | change_h=math.ceil((7-h)/2) 39 | start_y=max(start_y-change_h,0) 40 | end_y=min(end_y+change_h,feature_map_height) 41 | if w<7: 42 | change_w=math.ceil((7-w)/2) 43 | start_x =max(start_x-change_w,0) 44 | end_x = min(end_x+change_w,feature_map_width) 45 | # sure 4 46 | #print("changed_position:", start_x, start_y, end_x, end_y) 47 | roi_feature_map = features[proposal_batch_index, :, start_y:end_y, start_x:end_x] 48 | pool.append(F.adaptive_max_pool2d(input=roi_feature_map, output_size=output_size)) 49 | shape=pool[-1].shape 50 | pool = torch.stack(pool, dim=0) 51 | elif mode == Pooler.Mode.ALIGN: 52 | pool = 
ROIAlign(output_size, spatial_scale=scale, sampling_ratio=0)( 53 | features, 54 | torch.cat([proposal_batch_indices.view(-1, 1).float(), proposal_bboxes], dim=1) 55 | ) 56 | else: 57 | raise ValueError 58 | 59 | pool = F.max_pool2d(input=pool, kernel_size=2, stride=2) 60 | return pool 61 | 62 | -------------------------------------------------------------------------------- /roi/pooler_.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | import torch 4 | from torch import Tensor 5 | from torch.nn import functional as F 6 | 7 | from support.layer.roi_align import ROIAlign 8 | 9 | 10 | class Pooler(object): 11 | 12 | class Mode(Enum): 13 | POOLING = 'pooling' 14 | ALIGN = 'align' 15 | 16 | OPTIONS = ['pooling', 'align'] 17 | 18 | # @staticmethod 19 | # def apply(features: Tensor, proposal_bboxes: Tensor, proposal_batch_indices: Tensor, mode: Mode) -> Tensor: 20 | # _, _,feature_map_t, feature_map_height, feature_map_width = features.shape 21 | # scale = 1 / 16 22 | # output_size = (feature_map_t,3 * 2, 3 * 2) 23 | # 24 | # if mode == Pooler.Mode.POOLING: 25 | # pool = [] 26 | # for (proposal_bbox, proposal_batch_index) in zip(proposal_bboxes, proposal_batch_indices): 27 | # start_x = max(min(round(proposal_bbox[0].item() * scale), feature_map_width - 1), 0) # [0, feature_map_width) 28 | # start_y = max(min(round(proposal_bbox[1].item() * scale), feature_map_height - 1), 0) # (0, feature_map_height] 29 | # end_x = max(min(round(proposal_bbox[2].item() * scale) + 1, feature_map_width), 1) # [0, feature_map_width) 30 | # end_y = max(min(round(proposal_bbox[3].item() * scale) + 1, feature_map_height), 1) # (0, feature_map_height] 31 | # roi_feature_map = features[proposal_batch_index, :,:, start_y:end_y, start_x:end_x] 32 | # pool.append(F.adaptive_max_pool3d(input=roi_feature_map,output_size=output_size)) 33 | # pool = torch.stack(pool, dim=0) 34 | # elif mode == Pooler.Mode.ALIGN: 35 | # pool = ROIAlign(output_size, spatial_scale=scale, sampling_ratio=0)( 36 | # features, 37 | # torch.cat([proposal_batch_indices.view(-1, 1).float(), proposal_bboxes], dim=1) 38 | # ) 39 | # else: 40 | # raise ValueError 41 | # 42 | # pool = F.max_pool3d(input=pool, kernel_size=(1,2,2), stride=(1,2,2)) 43 | # return pool 44 | 45 | @staticmethod 46 | def apply(features: Tensor, proposal_bboxes: Tensor, proposal_batch_indices: Tensor, mode: Mode) -> Tensor: 47 | _, _, feature_map_t, feature_map_height, feature_map_width = features.shape 48 | scale = 1 / 16 49 | output_size = (feature_map_t, 7, 7) 50 | 51 | if mode == Pooler.Mode.POOLING: 52 | pool = [] 53 | for (proposal_bbox, proposal_batch_index) in zip(proposal_bboxes, proposal_batch_indices): 54 | start_x = max(min(round(proposal_bbox[0].item() * scale), feature_map_width - 1), 55 | 0) # [0, feature_map_width) 56 | start_y = max(min(round(proposal_bbox[1].item() * scale), feature_map_height - 1), 57 | 0) # (0, feature_map_height] 58 | end_x = max(min(round(proposal_bbox[2].item() * scale) + 1, feature_map_width), 59 | 1) # [0, feature_map_width) 60 | end_y = max(min(round(proposal_bbox[3].item() * scale) + 1, feature_map_height), 61 | 1) # (0, feature_map_height] 62 | 63 | roi_feature_map = features[proposal_batch_index, :, :, start_y:end_y, start_x:end_x] 64 | pool.append(F.adaptive_max_pool3d(input=roi_feature_map, output_size=output_size)) 65 | pool = torch.stack(pool, dim=0) 66 | else: 67 | raise ValueError 68 | 69 | #pool = F.max_pool3d(input=pool, kernel_size=(1, 2, 2), stride=(1, 2, 2)) 
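# Worked example of the coordinate mapping above (numbers chosen only for
# illustration): with scale = 1/16, a proposal [32., 48., 128., 160.] gives
# start_x = 2, start_y = 3, end_x = round(128 / 16) + 1 = 9 and end_y = 11, so a
# 7x8 window of the feature map is cropped for every frame and adaptive-max-pooled
# to the fixed (feature_map_t, 7, 7) output regardless of the proposal's size.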
70 | return pool 71 | 72 | -------------------------------------------------------------------------------- /rpn/mkf.py: -------------------------------------------------------------------------------- 1 | def make_image_key(video_id, timestamp): 2 | """Returns a unique identifier for a video id & timestamp.""" 3 | return "%s,%04d" % (video_id, int(timestamp)) 4 | 5 | if __name__ == '__main__': 6 | video_id="aaaa" 7 | timestamp="930" 8 | print(make_image_key(video_id,timestamp)) -------------------------------------------------------------------------------- /runs/Apr15_19-42-07_aiuser-Z390-GAMING-XNet1/events.out.tfevents.1555328527.aiuser-Z390-GAMING-X: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/runs/Apr15_19-42-07_aiuser-Z390-GAMING-XNet1/events.out.tfevents.1555328527.aiuser-Z390-GAMING-X -------------------------------------------------------------------------------- /runs/Apr15_19-42-31_aiuser-Z390-GAMING-XNet1/events.out.tfevents.1555328551.aiuser-Z390-GAMING-X: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/runs/Apr15_19-42-31_aiuser-Z390-GAMING-XNet1/events.out.tfevents.1555328551.aiuser-Z390-GAMING-X -------------------------------------------------------------------------------- /runs/Apr15_19-42-47_aiuser-Z390-GAMING-XNet1/events.out.tfevents.1555328567.aiuser-Z390-GAMING-X: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/runs/Apr15_19-42-47_aiuser-Z390-GAMING-XNet1/events.out.tfevents.1555328567.aiuser-Z390-GAMING-X -------------------------------------------------------------------------------- /runs/Apr15_19-44-13_aiuser-Z390-GAMING-XNet1/events.out.tfevents.1555328653.aiuser-Z390-GAMING-X: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/runs/Apr15_19-44-13_aiuser-Z390-GAMING-XNet1/events.out.tfevents.1555328653.aiuser-Z390-GAMING-X -------------------------------------------------------------------------------- /runs/Apr15_19-47-03_aiuser-Z390-GAMING-XNet1/events.out.tfevents.1555328823.aiuser-Z390-GAMING-X: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/runs/Apr15_19-47-03_aiuser-Z390-GAMING-XNet1/events.out.tfevents.1555328823.aiuser-Z390-GAMING-X -------------------------------------------------------------------------------- /runs/Apr15_19-53-21_aiuser-Z390-GAMING-XNet1/events.out.tfevents.1555329201.aiuser-Z390-GAMING-X: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/runs/Apr15_19-53-21_aiuser-Z390-GAMING-XNet1/events.out.tfevents.1555329201.aiuser-Z390-GAMING-X -------------------------------------------------------------------------------- /runs/Apr15_19-56-51_aiuser-Z390-GAMING-XNet1/events.out.tfevents.1555329411.aiuser-Z390-GAMING-X: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/runs/Apr15_19-56-51_aiuser-Z390-GAMING-XNet1/events.out.tfevents.1555329411.aiuser-Z390-GAMING-X -------------------------------------------------------------------------------- /runs/Apr15_20-00-31_aiuser-Z390-GAMING-XNet1/events.out.tfevents.1555329631.aiuser-Z390-GAMING-X: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/runs/Apr15_20-00-31_aiuser-Z390-GAMING-XNet1/events.out.tfevents.1555329631.aiuser-Z390-GAMING-X -------------------------------------------------------------------------------- /runs/Apr15_20-12-31_aiuser-Z390-GAMING-XNet1/events.out.tfevents.1555330351.aiuser-Z390-GAMING-X: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/runs/Apr15_20-12-31_aiuser-Z390-GAMING-XNet1/events.out.tfevents.1555330351.aiuser-Z390-GAMING-X -------------------------------------------------------------------------------- /scripts/coco2017/eval.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BACKBONE=$1 3 | CHECKPOINT=$2 4 | if ! ([[ -n "${BACKBONE}" ]] && [[ -n "${CHECKPOINT}" ]]); then 5 | echo "Argument BACKBONE or CHECKPOINT is missing" 6 | exit 7 | fi 8 | 9 | python eval.py -s=coco2017 -b=${BACKBONE} --image_min_side=800 --image_max_side=1333 --anchor_sizes="[64, 128, 256, 512]" --rpn_post_nms_top_n=1000 ${CHECKPOINT} -------------------------------------------------------------------------------- /scripts/coco2017/infer.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BACKBONE=$1 3 | CHECKPOINT=$2 4 | INPUT_IMAGE=$3 5 | OUTPUT_IMAGE=$4 6 | if ! ([[ -n "${BACKBONE}" ]] && [[ -n "${CHECKPOINT}" ]] && [[ -n "${INPUT_IMAGE}" ]] && [[ -n "${OUTPUT_IMAGE}" ]]); then 7 | echo "Argument BACKBONE or CHECKPOINT or INPUT_IMAGE or OUTPUT_IMAGE is missing" 8 | exit 9 | fi 10 | 11 | python infer.py -s=coco2017 -b=${BACKBONE} -c=${CHECKPOINT} --image_min_side=800 --image_max_side=1333 --anchor_sizes="[64, 128, 256, 512]" --rpn_post_nms_top_n=1000 ${INPUT_IMAGE} ${OUTPUT_IMAGE} -------------------------------------------------------------------------------- /scripts/coco2017/train-bs1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BACKBONE=$1 3 | OUTPUTS_DIR=$2 4 | if ! ([[ -n "${BACKBONE}" ]] && [[ -n "${OUTPUTS_DIR}" ]]); then 5 | echo "Argument BACKBONE or OUTPUTS_DIR is missing" 6 | exit 7 | fi 8 | 9 | python train.py -s=coco2017 -b=${BACKBONE} -o=${OUTPUTS_DIR} --image_min_side=800 --image_max_side=1333 --anchor_sizes="[64, 128, 256, 512]" --anchor_smooth_l1_loss_beta=0.1111 --batch_size=1 --learning_rate=0.00125 --weight_decay=0.0001 --step_lr_sizes="[960000, 1280000]" --num_steps_to_snapshot=320000 --num_steps_to_finish=1440000 -------------------------------------------------------------------------------- /scripts/coco2017/train-bs16.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BACKBONE=$1 3 | OUTPUTS_DIR=$2 4 | if ! 
([[ -n "${BACKBONE}" ]] && [[ -n "${OUTPUTS_DIR}" ]]); then 5 | echo "Argument BACKBONE or OUTPUTS_DIR is missing" 6 | exit 7 | fi 8 | 9 | python train.py -s=coco2017 -b=${BACKBONE} -o=${OUTPUTS_DIR} --image_min_side=800 --image_max_side=1333 --anchor_sizes="[64, 128, 256, 512]" --anchor_smooth_l1_loss_beta=0.1111 --batch_size=16 --learning_rate=0.02 --weight_decay=0.0001 --step_lr_sizes="[60000, 80000]" --num_steps_to_snapshot=20000 --num_steps_to_finish=90000 -------------------------------------------------------------------------------- /scripts/coco2017/train-bs2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BACKBONE=$1 3 | OUTPUTS_DIR=$2 4 | if ! ([[ -n "${BACKBONE}" ]] && [[ -n "${OUTPUTS_DIR}" ]]); then 5 | echo "Argument BACKBONE or OUTPUTS_DIR is missing" 6 | exit 7 | fi 8 | 9 | python train.py -s=coco2017 -b=${BACKBONE} -o=${OUTPUTS_DIR} --image_min_side=800 --image_max_side=1333 --anchor_sizes="[64, 128, 256, 512]" --anchor_smooth_l1_loss_beta=0.1111 --batch_size=2 --learning_rate=0.0025 --weight_decay=0.0001 --step_lr_sizes="[480000, 640000]" --num_steps_to_snapshot=160000 --num_steps_to_finish=720000 -------------------------------------------------------------------------------- /scripts/coco2017/train-bs4.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BACKBONE=$1 3 | OUTPUTS_DIR=$2 4 | if ! ([[ -n "${BACKBONE}" ]] && [[ -n "${OUTPUTS_DIR}" ]]); then 5 | echo "Argument BACKBONE or OUTPUTS_DIR is missing" 6 | exit 7 | fi 8 | 9 | python train.py -s=coco2017 -b=${BACKBONE} -o=${OUTPUTS_DIR} --image_min_side=800 --image_max_side=1333 --anchor_sizes="[64, 128, 256, 512]" --anchor_smooth_l1_loss_beta=0.1111 --batch_size=4 --learning_rate=0.005 --weight_decay=0.0001 --step_lr_sizes="[240000, 320000]" --num_steps_to_snapshot=80000 --num_steps_to_finish=360000 -------------------------------------------------------------------------------- /scripts/coco2017/train-bs8.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BACKBONE=$1 3 | OUTPUTS_DIR=$2 4 | if ! ([[ -n "${BACKBONE}" ]] && [[ -n "${OUTPUTS_DIR}" ]]); then 5 | echo "Argument BACKBONE or OUTPUTS_DIR is missing" 6 | exit 7 | fi 8 | 9 | python train.py -s=coco2017 -b=${BACKBONE} -o=${OUTPUTS_DIR} --image_min_side=800 --image_max_side=1333 --anchor_sizes="[64, 128, 256, 512]" --anchor_smooth_l1_loss_beta=0.1111 --batch_size=8 --learning_rate=0.01 --weight_decay=0.0001 --step_lr_sizes="[120000, 160000]" --num_steps_to_snapshot=40000 --num_steps_to_finish=180000 -------------------------------------------------------------------------------- /scripts/voc2007/eval.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BACKBONE=$1 3 | CHECKPOINT=$2 4 | if ! ([[ -n "${BACKBONE}" ]] && [[ -n "${CHECKPOINT}" ]]); then 5 | echo "Argument BACKBONE or CHECKPOINT is missing" 6 | exit 7 | fi 8 | 9 | python eval.py -s=voc2007 -b=${BACKBONE} ${CHECKPOINT} -------------------------------------------------------------------------------- /scripts/voc2007/infer.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BACKBONE=$1 3 | CHECKPOINT=$2 4 | INPUT_IMAGE=$3 5 | OUTPUT_IMAGE=$4 6 | if ! 
([[ -n "${BACKBONE}" ]] && [[ -n "${CHECKPOINT}" ]] && [[ -n "${INPUT_IMAGE}" ]] && [[ -n "${OUTPUT_IMAGE}" ]]); then 7 | echo "Argument BACKBONE or CHECKPOINT or INPUT_IMAGE or OUTPUT_IMAGE is missing" 8 | exit 9 | fi 10 | 11 | python infer.py -s=voc2007 -b=${BACKBONE} -c=${CHECKPOINT} ${INPUT_IMAGE} ${OUTPUT_IMAGE} -------------------------------------------------------------------------------- /scripts/voc2007/train-bs1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BACKBONE=$1 3 | OUTPUTS_DIR=$2 4 | if ! ([[ -n "${BACKBONE}" ]] && [[ -n "${OUTPUTS_DIR}" ]]); then 5 | echo "Argument BACKBONE or OUTPUTS_DIR is missing" 6 | exit 7 | fi 8 | 9 | python train.py -s=voc2007 -b=${BACKBONE} -o=${OUTPUTS_DIR} --batch_size=1 --learning_rate=0.001 --step_lr_sizes="[50000, 70000]" --num_steps_to_snapshot=10000 --num_steps_to_finish=90000 -------------------------------------------------------------------------------- /scripts/voc2007/train-bs16.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BACKBONE=$1 3 | OUTPUTS_DIR=$2 4 | if ! ([[ -n "${BACKBONE}" ]] && [[ -n "${OUTPUTS_DIR}" ]]); then 5 | echo "Argument BACKBONE or OUTPUTS_DIR is missing" 6 | exit 7 | fi 8 | 9 | python train.py -s=voc2007 -b=${BACKBONE} -o=${OUTPUTS_DIR} --batch_size=16 --learning_rate=0.016 --step_lr_sizes="[3125, 4375]" --num_steps_to_snapshot=625 --num_steps_to_finish=5625 -------------------------------------------------------------------------------- /scripts/voc2007/train-bs2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BACKBONE=$1 3 | OUTPUTS_DIR=$2 4 | if ! ([[ -n "${BACKBONE}" ]] && [[ -n "${OUTPUTS_DIR}" ]]); then 5 | echo "Argument BACKBONE or OUTPUTS_DIR is missing" 6 | exit 7 | fi 8 | 9 | python train.py -s=voc2007 -b=${BACKBONE} -o=${OUTPUTS_DIR} --batch_size=2 --learning_rate=0.002 --step_lr_sizes="[25000, 35000]" --num_steps_to_snapshot=5000 --num_steps_to_finish=45000 -------------------------------------------------------------------------------- /scripts/voc2007/train-bs4.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BACKBONE=$1 3 | OUTPUTS_DIR=$2 4 | if ! ([[ -n "${BACKBONE}" ]] && [[ -n "${OUTPUTS_DIR}" ]]); then 5 | echo "Argument BACKBONE or OUTPUTS_DIR is missing" 6 | exit 7 | fi 8 | 9 | python train.py -s=voc2007 -b=${BACKBONE} -o=${OUTPUTS_DIR} --batch_size=4 --learning_rate=0.004 --step_lr_sizes="[12500, 17500]" --num_steps_to_snapshot=2500 --num_steps_to_finish=22500 -------------------------------------------------------------------------------- /scripts/voc2007/train-bs8.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | BACKBONE=$1 3 | OUTPUTS_DIR=$2 4 | if ! 
([[ -n "${BACKBONE}" ]] && [[ -n "${OUTPUTS_DIR}" ]]); then 5 | echo "Argument BACKBONE or OUTPUTS_DIR is missing" 6 | exit 7 | fi 8 | 9 | python train.py -s=voc2007 -b=${BACKBONE} -o=${OUTPUTS_DIR} --batch_size=8 --learning_rate=0.008 --step_lr_sizes="[6250, 8750]" --num_steps_to_snapshot=1250 --num_steps_to_finish=11250 -------------------------------------------------------------------------------- /sort/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vaib-saxena/Slow-Fast-pytorch-implementation/cd08ea735d501f0eedc2f2d4ed5e5c8b01a9f4a8/sort/__init__.py -------------------------------------------------------------------------------- /sort/detection.py: -------------------------------------------------------------------------------- 1 | # vim: expandtab:ts=4:sw=4 2 | import numpy as np 3 | 4 | 5 | class Detection(object): 6 | """ 7 | This class represents a bounding box detection in a single image. 8 | 9 | Parameters 10 | ---------- 11 | tlwh : array_like 12 | Bounding box in format `(x, y, w, h)`. 13 | confidence : float 14 | Detector confidence score. 15 | feature : array_like 16 | A feature vector that describes the object contained in this image. 17 | 18 | Attributes 19 | ---------- 20 | tlwh : ndarray 21 | Bounding box in format `(top left x, top left y, width, height)`. 22 | confidence : ndarray 23 | Detector confidence score. 24 | feature : ndarray | NoneType 25 | A feature vector that describes the object contained in this image. 26 | 27 | """ 28 | 29 | def __init__(self, tlwh, confidence, feature): 30 | self.tlwh = np.asarray(tlwh, dtype=np.float) 31 | self.confidence = float(confidence) 32 | self.feature = np.asarray(feature, dtype=np.float32) 33 | 34 | def to_tlbr(self): 35 | """Convert bounding box to format `(min x, min y, max x, max y)`, i.e., 36 | `(top left, bottom right)`. 37 | """ 38 | ret = self.tlwh.copy() 39 | ret[2:] += ret[:2] 40 | return ret 41 | 42 | def to_xyah(self): 43 | """Convert bounding box to format `(center x, center y, aspect ratio, 44 | height)`, where the aspect ratio is `width / height`. 45 | """ 46 | ret = self.tlwh.copy() 47 | ret[:2] += ret[2:] / 2 48 | ret[2] /= ret[3] 49 | return ret 50 | -------------------------------------------------------------------------------- /sort/iou_matching.py: -------------------------------------------------------------------------------- 1 | # vim: expandtab:ts=4:sw=4 2 | from __future__ import absolute_import 3 | import numpy as np 4 | from . import linear_assignment 5 | 6 | 7 | def iou(bbox, candidates): 8 | """Computer intersection over union. 9 | 10 | Parameters 11 | ---------- 12 | bbox : ndarray 13 | A bounding box in format `(top left x, top left y, width, height)`. 14 | candidates : ndarray 15 | A matrix of candidate bounding boxes (one per row) in the same format 16 | as `bbox`. 17 | 18 | Returns 19 | ------- 20 | ndarray 21 | The intersection over union in [0, 1] between the `bbox` and each 22 | candidate. A higher score means a larger fraction of the `bbox` is 23 | occluded by the candidate. 
24 | 25 | """ 26 | bbox_tl, bbox_br = bbox[:2], bbox[:2] + bbox[2:] 27 | candidates_tl = candidates[:, :2] 28 | candidates_br = candidates[:, :2] + candidates[:, 2:] 29 | 30 | tl = np.c_[np.maximum(bbox_tl[0], candidates_tl[:, 0])[:, np.newaxis], 31 | np.maximum(bbox_tl[1], candidates_tl[:, 1])[:, np.newaxis]] 32 | br = np.c_[np.minimum(bbox_br[0], candidates_br[:, 0])[:, np.newaxis], 33 | np.minimum(bbox_br[1], candidates_br[:, 1])[:, np.newaxis]] 34 | wh = np.maximum(0., br - tl) 35 | 36 | area_intersection = wh.prod(axis=1) 37 | area_bbox = bbox[2:].prod() 38 | area_candidates = candidates[:, 2:].prod(axis=1) 39 | return area_intersection / (area_bbox + area_candidates - area_intersection) 40 | 41 | 42 | def iou_cost(tracks, detections, track_indices=None, 43 | detection_indices=None): 44 | """An intersection over union distance metric. 45 | 46 | Parameters 47 | ---------- 48 | tracks : List[deep_sort.track.Track] 49 | A list of tracks. 50 | detections : List[deep_sort.detection.Detection] 51 | A list of detections. 52 | track_indices : Optional[List[int]] 53 | A list of indices to tracks that should be matched. Defaults to 54 | all `tracks`. 55 | detection_indices : Optional[List[int]] 56 | A list of indices to detections that should be matched. Defaults 57 | to all `detections`. 58 | 59 | Returns 60 | ------- 61 | ndarray 62 | Returns a cost matrix of shape 63 | len(track_indices), len(detection_indices) where entry (i, j) is 64 | `1 - iou(tracks[track_indices[i]], detections[detection_indices[j]])`. 65 | 66 | """ 67 | if track_indices is None: 68 | track_indices = np.arange(len(tracks)) 69 | if detection_indices is None: 70 | detection_indices = np.arange(len(detections)) 71 | 72 | cost_matrix = np.zeros((len(track_indices), len(detection_indices))) 73 | for row, track_idx in enumerate(track_indices): 74 | if tracks[track_idx].time_since_update > 1: 75 | cost_matrix[row, :] = linear_assignment.INFTY_COST 76 | continue 77 | 78 | bbox = tracks[track_idx].to_tlwh() 79 | candidates = np.asarray([detections[i].tlwh for i in detection_indices]) 80 | cost_matrix[row, :] = 1. - iou(bbox, candidates) 81 | return cost_matrix 82 | -------------------------------------------------------------------------------- /sort/nn_matching.py: -------------------------------------------------------------------------------- 1 | # vim: expandtab:ts=4:sw=4 2 | import numpy as np 3 | 4 | 5 | def _pdist(a, b): 6 | """Compute pair-wise squared distance between points in `a` and `b`. 7 | 8 | Parameters 9 | ---------- 10 | a : array_like 11 | An NxM matrix of N samples of dimensionality M. 12 | b : array_like 13 | An LxM matrix of L samples of dimensionality M. 14 | 15 | Returns 16 | ------- 17 | ndarray 18 | Returns a matrix of size len(a), len(b) such that eleement (i, j) 19 | contains the squared distance between `a[i]` and `b[j]`. 20 | 21 | """ 22 | a, b = np.asarray(a), np.asarray(b) 23 | if len(a) == 0 or len(b) == 0: 24 | return np.zeros((len(a), len(b))) 25 | a2, b2 = np.square(a).sum(axis=1), np.square(b).sum(axis=1) 26 | r2 = -2. * np.dot(a, b.T) + a2[:, None] + b2[None, :] 27 | r2 = np.clip(r2, 0., float(np.inf)) 28 | return r2 29 | 30 | 31 | def _cosine_distance(a, b, data_is_normalized=False): 32 | """Compute pair-wise cosine distance between points in `a` and `b`. 33 | 34 | Parameters 35 | ---------- 36 | a : array_like 37 | An NxM matrix of N samples of dimensionality M. 38 | b : array_like 39 | An LxM matrix of L samples of dimensionality M. 
40 | data_is_normalized : Optional[bool] 41 | If True, assumes rows in a and b are unit length vectors. 42 | Otherwise, a and b are explicitly normalized to lenght 1. 43 | 44 | Returns 45 | ------- 46 | ndarray 47 | Returns a matrix of size len(a), len(b) such that eleement (i, j) 48 | contains the squared distance between `a[i]` and `b[j]`. 49 | 50 | """ 51 | if not data_is_normalized: 52 | a = np.asarray(a) / np.linalg.norm(a, axis=1, keepdims=True) 53 | b = np.asarray(b) / np.linalg.norm(b, axis=1, keepdims=True) 54 | return 1. - np.dot(a, b.T) 55 | 56 | 57 | def _nn_euclidean_distance(x, y): 58 | """ Helper function for nearest neighbor distance metric (Euclidean). 59 | 60 | Parameters 61 | ---------- 62 | x : ndarray 63 | A matrix of N row-vectors (sample points). 64 | y : ndarray 65 | A matrix of M row-vectors (query points). 66 | 67 | Returns 68 | ------- 69 | ndarray 70 | A vector of length M that contains for each entry in `y` the 71 | smallest Euclidean distance to a sample in `x`. 72 | 73 | """ 74 | distances = _pdist(x, y) 75 | return np.maximum(0.0, distances.min(axis=0)) 76 | 77 | 78 | def _nn_cosine_distance(x, y): 79 | """ Helper function for nearest neighbor distance metric (cosine). 80 | 81 | Parameters 82 | ---------- 83 | x : ndarray 84 | A matrix of N row-vectors (sample points). 85 | y : ndarray 86 | A matrix of M row-vectors (query points). 87 | 88 | Returns 89 | ------- 90 | ndarray 91 | A vector of length M that contains for each entry in `y` the 92 | smallest cosine distance to a sample in `x`. 93 | 94 | """ 95 | distances = _cosine_distance(x, y) 96 | return distances.min(axis=0) 97 | 98 | 99 | class NearestNeighborDistanceMetric(object): 100 | """ 101 | A nearest neighbor distance metric that, for each target, returns 102 | the closest distance to any sample that has been observed so far. 103 | 104 | Parameters 105 | ---------- 106 | metric : str 107 | Either "euclidean" or "cosine". 108 | matching_threshold: float 109 | The matching threshold. Samples with larger distance are considered an 110 | invalid match. 111 | budget : Optional[int] 112 | If not None, fix samples per class to at most this number. Removes 113 | the oldest samples when the budget is reached. 114 | 115 | Attributes 116 | ---------- 117 | samples : Dict[int -> List[ndarray]] 118 | A dictionary that maps from target identities to the list of samples 119 | that have been observed so far. 120 | 121 | """ 122 | 123 | def __init__(self, metric, matching_threshold, budget=None): 124 | 125 | 126 | if metric == "euclidean": 127 | self._metric = _nn_euclidean_distance 128 | elif metric == "cosine": 129 | self._metric = _nn_cosine_distance 130 | else: 131 | raise ValueError( 132 | "Invalid metric; must be either 'euclidean' or 'cosine'") 133 | self.matching_threshold = matching_threshold 134 | self.budget = budget 135 | self.samples = {} 136 | 137 | def partial_fit(self, features, targets, active_targets): 138 | """Update the distance metric with new data. 139 | 140 | Parameters 141 | ---------- 142 | features : ndarray 143 | An NxM matrix of N features of dimensionality M. 144 | targets : ndarray 145 | An integer array of associated target identities. 146 | active_targets : List[int] 147 | A list of targets that are currently present in the scene. 
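If a budget was given to the constructor, only the most recent `budget` feature
vectors are kept per target, and samples of targets that are no longer active are
discarded.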
148 | 149 | """ 150 | for feature, target in zip(features, targets): 151 | self.samples.setdefault(target, []).append(feature) 152 | if self.budget is not None: 153 | self.samples[target] = self.samples[target][-self.budget:] 154 | self.samples = {k: self.samples[k] for k in active_targets} 155 | 156 | def distance(self, features, targets): 157 | """Compute distance between features and targets. 158 | 159 | Parameters 160 | ---------- 161 | features : ndarray 162 | An NxM matrix of N features of dimensionality M. 163 | targets : List[int] 164 | A list of targets to match the given `features` against. 165 | 166 | Returns 167 | ------- 168 | ndarray 169 | Returns a cost matrix of shape len(targets), len(features), where 170 | element (i, j) contains the closest squared distance between 171 | `targets[i]` and `features[j]`. 172 | 173 | """ 174 | cost_matrix = np.zeros((len(targets), len(features))) 175 | for i, target in enumerate(targets): 176 | cost_matrix[i, :] = self._metric(self.samples[target], features) 177 | return cost_matrix 178 | -------------------------------------------------------------------------------- /sort/preprocessing.py: -------------------------------------------------------------------------------- 1 | # vim: expandtab:ts=4:sw=4 2 | import numpy as np 3 | import cv2 4 | 5 | 6 | def non_max_suppression(boxes, max_bbox_overlap, scores=None): 7 | """Suppress overlapping detections. 8 | 9 | Original code from [1]_ has been adapted to include confidence score. 10 | 11 | .. [1] http://www.pyimagesearch.com/2015/02/16/ 12 | faster-non-maximum-suppression-python/ 13 | 14 | Examples 15 | -------- 16 | 17 | >>> boxes = [d.roi for d in detections] 18 | >>> scores = [d.confidence for d in detections] 19 | >>> indices = non_max_suppression(boxes, max_bbox_overlap, scores) 20 | >>> detections = [detections[i] for i in indices] 21 | 22 | Parameters 23 | ---------- 24 | boxes : ndarray 25 | Array of ROIs (x, y, width, height). 26 | max_bbox_overlap : float 27 | ROIs that overlap more than this values are suppressed. 28 | scores : Optional[array_like] 29 | Detector confidence score. 30 | 31 | Returns 32 | ------- 33 | List[int] 34 | Returns indices of detections that have survived non-maxima suppression. 35 | 36 | """ 37 | if len(boxes) == 0: 38 | return [] 39 | 40 | boxes = boxes.astype(np.float) 41 | pick = [] 42 | 43 | x1 = boxes[:, 0] 44 | y1 = boxes[:, 1] 45 | x2 = boxes[:, 2] + boxes[:, 0] 46 | y2 = boxes[:, 3] + boxes[:, 1] 47 | 48 | area = (x2 - x1 + 1) * (y2 - y1 + 1) 49 | if scores is not None: 50 | idxs = np.argsort(scores) 51 | else: 52 | idxs = np.argsort(y2) 53 | 54 | while len(idxs) > 0: 55 | last = len(idxs) - 1 56 | i = idxs[last] 57 | pick.append(i) 58 | 59 | xx1 = np.maximum(x1[i], x1[idxs[:last]]) 60 | yy1 = np.maximum(y1[i], y1[idxs[:last]]) 61 | xx2 = np.minimum(x2[i], x2[idxs[:last]]) 62 | yy2 = np.minimum(y2[i], y2[idxs[:last]]) 63 | 64 | w = np.maximum(0, xx2 - xx1 + 1) 65 | h = np.maximum(0, yy2 - yy1 + 1) 66 | 67 | overlap = (w * h) / area[idxs[:last]] 68 | 69 | idxs = np.delete( 70 | idxs, np.concatenate( 71 | ([last], np.where(overlap > max_bbox_overlap)[0]))) 72 | 73 | return pick 74 | -------------------------------------------------------------------------------- /sort/track.py: -------------------------------------------------------------------------------- 1 | # vim: expandtab:ts=4:sw=4 2 | 3 | 4 | class TrackState: 5 | """ 6 | Enumeration type for the single target track state. 
Newly created tracks are 7 | classified as `tentative` until enough evidence has been collected. Then, 8 | the track state is changed to `confirmed`. Tracks that are no longer alive 9 | are classified as `deleted` to mark them for removal from the set of active 10 | tracks. 11 | 12 | """ 13 | 14 | Tentative = 1 15 | Confirmed = 2 16 | Deleted = 3 17 | 18 | 19 | class Track: 20 | """ 21 | A single target track with state space `(x, y, a, h)` and associated 22 | velocities, where `(x, y)` is the center of the bounding box, `a` is the 23 | aspect ratio and `h` is the height. 24 | 25 | Parameters 26 | ---------- 27 | mean : ndarray 28 | Mean vector of the initial state distribution. 29 | covariance : ndarray 30 | Covariance matrix of the initial state distribution. 31 | track_id : int 32 | A unique track identifier. 33 | n_init : int 34 | Number of consecutive detections before the track is confirmed. The 35 | track state is set to `Deleted` if a miss occurs within the first 36 | `n_init` frames. 37 | max_age : int 38 | The maximum number of consecutive misses before the track state is 39 | set to `Deleted`. 40 | feature : Optional[ndarray] 41 | Feature vector of the detection this track originates from. If not None, 42 | this feature is added to the `features` cache. 43 | 44 | Attributes 45 | ---------- 46 | mean : ndarray 47 | Mean vector of the initial state distribution. 48 | covariance : ndarray 49 | Covariance matrix of the initial state distribution. 50 | track_id : int 51 | A unique track identifier. 52 | hits : int 53 | Total number of measurement updates. 54 | age : int 55 | Total number of frames since first occurrence. 56 | time_since_update : int 57 | Total number of frames since last measurement update. 58 | state : TrackState 59 | The current track state. 60 | features : List[ndarray] 61 | A cache of features. On each measurement update, the associated feature 62 | vector is added to this list. 63 | 64 | """ 65 | 66 | def __init__(self, mean, covariance, track_id, n_init, max_age, 67 | feature=None): 68 | self.mean = mean 69 | self.covariance = covariance 70 | self.track_id = track_id 71 | self.hits = 1 72 | self.age = 1 73 | self.time_since_update = 0 74 | 75 | self.state = TrackState.Tentative 76 | self.features = [] 77 | if feature is not None: 78 | self.features.append(feature) 79 | 80 | self._n_init = n_init 81 | self._max_age = max_age 82 | 83 | def to_tlwh(self): 84 | """Get current position in bounding box format `(top left x, top left y, 85 | width, height)`. 86 | 87 | Returns 88 | ------- 89 | ndarray 90 | The bounding box. 91 | 92 | """ 93 | ret = self.mean[:4].copy() 94 | ret[2] *= ret[3] 95 | ret[:2] -= ret[2:] / 2 96 | return ret 97 | 98 | def to_tlbr(self): 99 | """Get current position in bounding box format `(min x, min y, max x, 100 | max y)`. 101 | 102 | Returns 103 | ------- 104 | ndarray 105 | The bounding box. 106 | 107 | """ 108 | ret = self.to_tlwh() 109 | ret[2:] = ret[:2] + ret[2:] 110 | return ret 111 | 112 | def predict(self, kf): 113 | """Propagate the state distribution to the current time step using a 114 | Kalman filter prediction step. 115 | 116 | Parameters 117 | ---------- 118 | kf : kalman_filter.KalmanFilter 119 | The Kalman filter. 120 | 121 | """ 122 | self.mean, self.covariance = kf.predict(self.mean, self.covariance) 123 | self.age += 1 124 | self.time_since_update += 1 125 | 126 | def update(self, kf, detection): 127 | """Perform Kalman filter measurement update step and update the feature 128 | cache.
129 | 130 | Parameters 131 | ---------- 132 | kf : kalman_filter.KalmanFilter 133 | The Kalman filter. 134 | detection : Detection 135 | The associated detection. 136 | 137 | """ 138 | self.mean, self.covariance = kf.update( 139 | self.mean, self.covariance, detection.to_xyah()) 140 | self.features.append(detection.feature) 141 | 142 | self.hits += 1 143 | self.time_since_update = 0 144 | if self.state == TrackState.Tentative and self.hits >= self._n_init: 145 | self.state = TrackState.Confirmed 146 | 147 | def mark_missed(self): 148 | """Mark this track as missed (no association at the current time step). 149 | """ 150 | if self.state == TrackState.Tentative: 151 | self.state = TrackState.Deleted 152 | elif self.time_since_update > self._max_age: 153 | self.state = TrackState.Deleted 154 | 155 | def is_tentative(self): 156 | """Returns True if this track is tentative (unconfirmed). 157 | """ 158 | return self.state == TrackState.Tentative 159 | 160 | def is_confirmed(self): 161 | """Returns True if this track is confirmed.""" 162 | return self.state == TrackState.Confirmed 163 | 164 | def is_deleted(self): 165 | """Returns True if this track is dead and should be deleted.""" 166 | return self.state == TrackState.Deleted 167 | -------------------------------------------------------------------------------- /sort/tracker.py: -------------------------------------------------------------------------------- 1 | # vim: expandtab:ts=4:sw=4 2 | from __future__ import absolute_import 3 | import numpy as np 4 | from . import kalman_filter 5 | from . import linear_assignment 6 | from . import iou_matching 7 | from .track import Track 8 | 9 | 10 | class Tracker: 11 | """ 12 | This is the multi-target tracker. 13 | 14 | Parameters 15 | ---------- 16 | metric : nn_matching.NearestNeighborDistanceMetric 17 | A distance metric for measurement-to-track association. 18 | max_age : int 19 | Maximum number of consecutive misses before a track is deleted. 20 | n_init : int 21 | Number of consecutive detections before the track is confirmed. The 22 | track state is set to `Deleted` if a miss occurs within the first 23 | `n_init` frames. 24 | 25 | Attributes 26 | ---------- 27 | metric : nn_matching.NearestNeighborDistanceMetric 28 | The distance metric used for measurement to track association. 29 | max_age : int 30 | Maximum number of consecutive misses before a track is deleted. 31 | n_init : int 32 | Number of frames that a track remains in initialization phase. 33 | kf : kalman_filter.KalmanFilter 34 | A Kalman filter to filter target trajectories in image space. 35 | tracks : List[Track] 36 | The list of active tracks at the current time step. 37 | 38 | """ 39 | 40 | def __init__(self, metric, max_iou_distance=0.7, max_age=30, n_init=3): 41 | self.metric = metric 42 | self.max_iou_distance = max_iou_distance 43 | self.max_age = max_age 44 | self.n_init = n_init 45 | 46 | self.kf = kalman_filter.KalmanFilter() 47 | self.tracks = [] 48 | self._next_id = 1 49 | 50 | def predict(self): 51 | """Propagate track state distributions one time step forward. 52 | 53 | This function should be called once every time step, before `update`. 54 | """ 55 | for track in self.tracks: 56 | track.predict(self.kf) 57 | 58 | def update(self, detections): 59 | """Perform measurement update and track management. 60 | 61 | Parameters 62 | ---------- 63 | detections : List[deep_sort.detection.Detection] 64 | A list of detections at the current time step. 65 | 66 | """ 67 | # Run matching cascade.
68 | matches, unmatched_tracks, unmatched_detections = \ 69 | self._match(detections) 70 | ############################################## 71 | #print('match = {}'.format(matches)) 72 | ############################################### 73 | # Update track set. 74 | for track_idx, detection_idx in matches: 75 | self.tracks[track_idx].update( 76 | self.kf, detections[detection_idx]) 77 | for track_idx in unmatched_tracks: 78 | self.tracks[track_idx].mark_missed() 79 | for detection_idx in unmatched_detections: 80 | self._initiate_track(detections[detection_idx]) 81 | self.tracks = [t for t in self.tracks if not t.is_deleted()] 82 | 83 | # Update distance metric. 84 | active_targets = [t.track_id for t in self.tracks if t.is_confirmed()] 85 | features, targets = [], [] 86 | for track in self.tracks: 87 | if not track.is_confirmed(): 88 | continue 89 | features += track.features 90 | targets += [track.track_id for _ in track.features] 91 | track.features = [] 92 | self.metric.partial_fit( 93 | np.asarray(features), np.asarray(targets), active_targets) 94 | 95 | def _match(self, detections): 96 | 97 | def gated_metric(tracks, dets, track_indices, detection_indices): 98 | features = np.array([dets[i].feature for i in detection_indices]) 99 | targets = np.array([tracks[i].track_id for i in track_indices]) 100 | cost_matrix = self.metric.distance(features, targets) 101 | cost_matrix = linear_assignment.gate_cost_matrix( 102 | self.kf, cost_matrix, tracks, dets, track_indices, 103 | detection_indices) 104 | 105 | return cost_matrix 106 | 107 | # Split track set into confirmed and unconfirmed tracks. 108 | confirmed_tracks = [ 109 | i for i, t in enumerate(self.tracks) if t.is_confirmed()] 110 | unconfirmed_tracks = [ 111 | i for i, t in enumerate(self.tracks) if not t.is_confirmed()] 112 | 113 | # Associate confirmed tracks using appearance features. 114 | matches_a, unmatched_tracks_a, unmatched_detections = \ 115 | linear_assignment.matching_cascade( 116 | gated_metric, self.metric.matching_threshold, self.max_age, 117 | self.tracks, detections, confirmed_tracks) 118 | 119 | # Associate remaining tracks together with unconfirmed tracks using IOU. 120 | iou_track_candidates = unconfirmed_tracks + [ 121 | k for k in unmatched_tracks_a if 122 | self.tracks[k].time_since_update == 1] 123 | unmatched_tracks_a = [ 124 | k for k in unmatched_tracks_a if 125 | self.tracks[k].time_since_update != 1] 126 | matches_b, unmatched_tracks_b, unmatched_detections = \ 127 | linear_assignment.min_cost_matching( 128 | iou_matching.iou_cost, self.max_iou_distance, self.tracks, 129 | detections, iou_track_candidates, unmatched_detections) 130 | 131 | matches = matches_a + matches_b 132 | unmatched_tracks = list(set(unmatched_tracks_a + unmatched_tracks_b)) 133 | return matches, unmatched_tracks, unmatched_detections 134 | 135 | def _initiate_track(self, detection): 136 | mean, covariance = self.kf.initiate(detection.to_xyah()) 137 | self.tracks.append(Track( 138 | mean, covariance, self._next_id, self.n_init, self.max_age, 139 | detection.feature)) 140 | self._next_id += 1 141 | -------------------------------------------------------------------------------- /support/layer/nms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | 3 | from support import _C 4 | 5 | nms = _C.nms 6 | -------------------------------------------------------------------------------- /support/layer/roi_align.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | from torch import nn 4 | from torch.autograd import Function 5 | from torch.autograd.function import once_differentiable 6 | from torch.nn.modules.utils import _pair 7 | 8 | from support import _C 9 | 10 | 11 | class _ROIAlign(Function): 12 | @staticmethod 13 | def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio): 14 | ctx.save_for_backward(roi) 15 | ctx.output_size = _pair(output_size) 16 | ctx.spatial_scale = spatial_scale 17 | ctx.sampling_ratio = sampling_ratio 18 | ctx.input_shape = input.size() 19 | output = _C.roi_align_forward( 20 | input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio 21 | ) 22 | return output 23 | 24 | @staticmethod 25 | @once_differentiable 26 | def backward(ctx, grad_output): 27 | rois, = ctx.saved_tensors 28 | output_size = ctx.output_size 29 | spatial_scale = ctx.spatial_scale 30 | sampling_ratio = ctx.sampling_ratio 31 | bs, ch, h, w = ctx.input_shape 32 | grad_input = _C.roi_align_backward( 33 | grad_output, 34 | rois, 35 | spatial_scale, 36 | output_size[0], 37 | output_size[1], 38 | bs, 39 | ch, 40 | h, 41 | w, 42 | sampling_ratio, 43 | ) 44 | return grad_input, None, None, None, None 45 | 46 | 47 | roi_align = _ROIAlign.apply 48 | 49 | 50 | class ROIAlign(nn.Module): 51 | def __init__(self, output_size, spatial_scale, sampling_ratio): 52 | super(ROIAlign, self).__init__() 53 | self.output_size = output_size 54 | self.spatial_scale = spatial_scale 55 | self.sampling_ratio = sampling_ratio 56 | 57 | def forward(self, input, rois): 58 | return roi_align( 59 | input, rois, self.output_size, self.spatial_scale, self.sampling_ratio 60 | ) 61 | 62 | def __repr__(self): 63 | tmpstr = self.__class__.__name__ + "(" 64 | tmpstr += "output_size=" + str(self.output_size) 65 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 66 | tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) 67 | tmpstr += ")" 68 | return tmpstr 69 | -------------------------------------------------------------------------------- /support/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | 3 | import glob 4 | import os 5 | 6 | import torch 7 | from setuptools import setup 8 | from torch.utils.cpp_extension import CUDA_HOME 9 | from torch.utils.cpp_extension import CppExtension 10 | from torch.utils.cpp_extension import CUDAExtension 11 | 12 | requirements = ["torch", "torchvision"] 13 | 14 | 15 | def get_extensions(): 16 | this_dir = os.path.dirname(os.path.abspath(__file__)) 17 | extensions_dir = os.path.join(this_dir, "src") 18 | 19 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 20 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 21 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 22 | 23 | sources = main_file + source_cpu 24 | extension = CppExtension 25 | 26 | extra_compile_args = {"cxx": []} 27 | define_macros = [] 28 | 29 | if torch.cuda.is_available() and CUDA_HOME is not None: 30 | extension = CUDAExtension 31 | sources += source_cuda 32 | define_macros += [("WITH_CUDA", None)] 33 | extra_compile_args["nvcc"] = [ 34 | "-DCUDA_HAS_FP16=1", 35 | "-D__CUDA_NO_HALF_OPERATORS__", 36 | "-D__CUDA_NO_HALF_CONVERSIONS__", 37 | "-D__CUDA_NO_HALF2_OPERATORS__", 38 | ] 39 | 40 | sources = [os.path.join(extensions_dir, s) for s in sources] 41 | 42 | include_dirs = [extensions_dir] 43 | 44 | ext_modules = [ 45 | extension( 46 | "support._C", 47 | sources, 48 | include_dirs=include_dirs, 49 | define_macros=define_macros, 50 | extra_compile_args=extra_compile_args, 51 | ) 52 | ] 53 | 54 | return ext_modules 55 | 56 | 57 | setup( 58 | name="support", 59 | version="0.1", 60 | ext_modules=get_extensions(), 61 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 62 | ) 63 | -------------------------------------------------------------------------------- /support/src/ROIAlign.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | 4 | #include "cpu/vision.h" 5 | 6 | #ifdef WITH_CUDA 7 | #include "cuda/vision.h" 8 | #endif 9 | 10 | // Interface for Python 11 | at::Tensor ROIAlign_forward(const at::Tensor& input, 12 | const at::Tensor& rois, 13 | const float spatial_scale, 14 | const int pooled_height, 15 | const int pooled_width, 16 | const int sampling_ratio) { 17 | if (input.type().is_cuda()) { 18 | #ifdef WITH_CUDA 19 | return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); 20 | #else 21 | AT_ERROR("Not compiled with GPU support"); 22 | #endif 23 | } 24 | return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); 25 | } 26 | 27 | at::Tensor ROIAlign_backward(const at::Tensor& grad, 28 | const at::Tensor& rois, 29 | const float spatial_scale, 30 | const int pooled_height, 31 | const int pooled_width, 32 | const int batch_size, 33 | const int channels, 34 | const int height, 35 | const int width, 36 | const int sampling_ratio) { 37 | if (grad.type().is_cuda()) { 38 | #ifdef WITH_CUDA 39 | return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio); 40 | #else 41 | AT_ERROR("Not compiled with GPU support"); 42 | #endif 43 | } 44 | AT_ERROR("Not implemented on the CPU"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /support/src/cpu/nms_cpu.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. 
and its affiliates. All Rights Reserved. 2 | #include "cpu/vision.h" 3 | 4 | 5 | template <typename scalar_t> 6 | at::Tensor nms_cpu_kernel(const at::Tensor& dets, 7 | const at::Tensor& scores, 8 | const float threshold) { 9 | AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor"); 10 | AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor"); 11 | AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores"); 12 | 13 | if (dets.numel() == 0) { 14 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 15 | } 16 | 17 | auto x1_t = dets.select(1, 0).contiguous(); 18 | auto y1_t = dets.select(1, 1).contiguous(); 19 | auto x2_t = dets.select(1, 2).contiguous(); 20 | auto y2_t = dets.select(1, 3).contiguous(); 21 | 22 | at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1); 23 | 24 | auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); 25 | 26 | auto ndets = dets.size(0); 27 | at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU)); 28 | 29 | auto suppressed = suppressed_t.data<uint8_t>(); 30 | auto order = order_t.data<int64_t>(); 31 | auto x1 = x1_t.data<scalar_t>(); 32 | auto y1 = y1_t.data<scalar_t>(); 33 | auto x2 = x2_t.data<scalar_t>(); 34 | auto y2 = y2_t.data<scalar_t>(); 35 | auto areas = areas_t.data<scalar_t>(); 36 | 37 | for (int64_t _i = 0; _i < ndets; _i++) { 38 | auto i = order[_i]; 39 | if (suppressed[i] == 1) 40 | continue; 41 | auto ix1 = x1[i]; 42 | auto iy1 = y1[i]; 43 | auto ix2 = x2[i]; 44 | auto iy2 = y2[i]; 45 | auto iarea = areas[i]; 46 | 47 | for (int64_t _j = _i + 1; _j < ndets; _j++) { 48 | auto j = order[_j]; 49 | if (suppressed[j] == 1) 50 | continue; 51 | auto xx1 = std::max(ix1, x1[j]); 52 | auto yy1 = std::max(iy1, y1[j]); 53 | auto xx2 = std::min(ix2, x2[j]); 54 | auto yy2 = std::min(iy2, y2[j]); 55 | 56 | auto w = std::max(static_cast<scalar_t>(0), xx2 - xx1 + 1); 57 | auto h = std::max(static_cast<scalar_t>(0), yy2 - yy1 + 1); 58 | auto inter = w * h; 59 | auto ovr = inter / (iarea + areas[j] - inter); 60 | if (ovr >= threshold) 61 | suppressed[j] = 1; 62 | } 63 | } 64 | return at::nonzero(suppressed_t == 0).squeeze(1); 65 | } 66 | 67 | at::Tensor nms_cpu(const at::Tensor& dets, 68 | const at::Tensor& scores, 69 | const float threshold) { 70 | at::Tensor result; 71 | AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] { 72 | result = nms_cpu_kernel<scalar_t>(dets, scores, threshold); 73 | }); 74 | return result; 75 | } 76 | -------------------------------------------------------------------------------- /support/src/cpu/vision.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | #include <torch/extension.h> 4 | 5 | 6 | at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, 7 | const at::Tensor& rois, 8 | const float spatial_scale, 9 | const int pooled_height, 10 | const int pooled_width, 11 | const int sampling_ratio); 12 | 13 | 14 | at::Tensor nms_cpu(const at::Tensor& dets, 15 | const at::Tensor& scores, 16 | const float threshold); 17 | -------------------------------------------------------------------------------- /support/src/cuda/nms.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 | #include <ATen/ATen.h> 3 | #include <ATen/cuda/CUDAContext.h> 4 | 5 | #include <THC/THC.h> 6 | #include <THC/THCDeviceUtils.cuh> 7 | 8 | #include <vector> 9 | #include <iostream> 10 | 11 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 12 | 13 | __device__ inline float devIoU(float const * const a, float const * const b) { 14 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 15 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 16 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 17 | float interS = width * height; 18 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 19 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 20 | return interS / (Sa + Sb - interS); 21 | } 22 | 23 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 24 | const float *dev_boxes, unsigned long long *dev_mask) { 25 | const int row_start = blockIdx.y; 26 | const int col_start = blockIdx.x; 27 | 28 | // if (row_start > col_start) return; 29 | 30 | const int row_size = 31 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 32 | const int col_size = 33 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 34 | 35 | __shared__ float block_boxes[threadsPerBlock * 5]; 36 | if (threadIdx.x < col_size) { 37 | block_boxes[threadIdx.x * 5 + 0] = 38 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 39 | block_boxes[threadIdx.x * 5 + 1] = 40 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 41 | block_boxes[threadIdx.x * 5 + 2] = 42 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 43 | block_boxes[threadIdx.x * 5 + 3] = 44 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 45 | block_boxes[threadIdx.x * 5 + 4] = 46 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 47 | } 48 | __syncthreads(); 49 | 50 | if (threadIdx.x < row_size) { 51 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 52 | const float *cur_box = dev_boxes + cur_box_idx * 5; 53 | int i = 0; 54 | unsigned long long t = 0; 55 | int start = 0; 56 | if (row_start == col_start) { 57 | start = threadIdx.x + 1; 58 | } 59 | for (i = start; i < col_size; i++) { 60 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 61 | t |= 1ULL << i; 62 | } 63 | } 64 | const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock); 65 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 66 | } 67 | } 68 | 69 | // boxes is a N x 5 tensor 70 | at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) { 71 | using scalar_t = float; 72 | AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor"); 73 | auto scores = boxes.select(1, 4); 74 | auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); 75 | auto boxes_sorted = boxes.index_select(0, order_t); 76 | 77 | int boxes_num = boxes.size(0); 78 | 79 | const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock); 80 | 81 | scalar_t* boxes_dev = boxes_sorted.data<scalar_t>(); 82 | 83 | THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState 84 | 85 | unsigned long long* mask_dev = NULL; 86 | //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev, 87 | // boxes_num * col_blocks * sizeof(unsigned long long))); 88 | 89 | mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long)); 90 | 91 | dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock), 92 | THCCeilDiv(boxes_num, threadsPerBlock)); 93 | dim3 threads(threadsPerBlock); 94 | nms_kernel<<<blocks, threads>>>(boxes_num, 95 | nms_overlap_thresh, 96 | boxes_dev, 97 | mask_dev); 98 | 99 | std::vector<unsigned long long>
mask_host(boxes_num * col_blocks); 100 | THCudaCheck(cudaMemcpy(&mask_host[0], 101 | mask_dev, 102 | sizeof(unsigned long long) * boxes_num * col_blocks, 103 | cudaMemcpyDeviceToHost)); 104 | 105 | std::vector<unsigned long long> remv(col_blocks); 106 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 107 | 108 | at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU)); 109 | int64_t* keep_out = keep.data<int64_t>(); 110 | 111 | int num_to_keep = 0; 112 | for (int i = 0; i < boxes_num; i++) { 113 | int nblock = i / threadsPerBlock; 114 | int inblock = i % threadsPerBlock; 115 | 116 | if (!(remv[nblock] & (1ULL << inblock))) { 117 | keep_out[num_to_keep++] = i; 118 | unsigned long long *p = &mask_host[0] + i * col_blocks; 119 | for (int j = nblock; j < col_blocks; j++) { 120 | remv[j] |= p[j]; 121 | } 122 | } 123 | } 124 | 125 | THCudaFree(state, mask_dev); 126 | // TODO improve this part 127 | return std::get<0>(order_t.index({ 128 | keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to( 129 | order_t.device(), keep.scalar_type()) 130 | }).sort(0, false)); 131 | } 132 | -------------------------------------------------------------------------------- /support/src/cuda/vision.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | #include <torch/extension.h> 4 | 5 | 6 | at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, 7 | const at::Tensor& rois, 8 | const float spatial_scale, 9 | const int pooled_height, 10 | const int pooled_width, 11 | const int sampling_ratio); 12 | 13 | at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad, 14 | const at::Tensor& rois, 15 | const float spatial_scale, 16 | const int pooled_height, 17 | const int pooled_width, 18 | const int batch_size, 19 | const int channels, 20 | const int height, 21 | const int width, 22 | const int sampling_ratio); 23 | 24 | 25 | at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh); 26 | 27 | 28 | at::Tensor compute_flow_cuda(const at::Tensor& boxes, 29 | const int height, 30 | const int width); 31 | -------------------------------------------------------------------------------- /support/src/nms.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | 10 | at::Tensor nms(const at::Tensor& dets, 11 | const at::Tensor& scores, 12 | const float threshold) { 13 | 14 | if (dets.type().is_cuda()) { 15 | #ifdef WITH_CUDA 16 | // TODO raise error if not compiled with CUDA 17 | if (dets.numel() == 0) 18 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 19 | auto b = at::cat({dets, scores.unsqueeze(1)}, 1); 20 | return nms_cuda(b, threshold); 21 | #else 22 | AT_ERROR("Not compiled with GPU support"); 23 | #endif 24 | } 25 | 26 | at::Tensor result = nms_cpu(dets, scores, threshold); 27 | return result; 28 | } 29 | -------------------------------------------------------------------------------- /support/src/vision.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 | #include "nms.h" 3 | #include "ROIAlign.h" 4 | 5 | 6 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 7 | m.def("nms", &nms, "non-maximum suppression"); 8 | m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward"); 9 | m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward"); 10 | } 11 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import os 4 | from torch.utils.data import DataLoader, Dataset 5 | import numpy as np 6 | from scipy import interp 7 | import matplotlib.pyplot as plt 8 | import torch 9 | from config import params 10 | import torch.backends.cudnn as cudnn 11 | from lib import slowfastnet 12 | from Config import Config 13 | 14 | class Test_video(Dataset): 15 | def __init__(self,short_side): 16 | self.short_side=short_side 17 | def normalize(self, buffer): 18 | # Normalize the buffer 19 | # buffer = (buffer - 128)/128.0 20 | for i, frame in enumerate(buffer): 21 | frame = (frame - np.array([[[128.0, 128.0, 128.0]]]))/128.0 22 | buffer[i] = frame 23 | return buffer 24 | 25 | def to_tensor(self, buffer): 26 | # convert from [D, H, W, C] format to [C, D, H, W] (what PyTorch uses) 27 | # D = Depth (in this case, time), H = Height, W = Width, C = Channels 28 | return buffer.transpose((3, 0, 1, 2)) 29 | 30 | def crop(self, buffer, crop_size): 31 | # randomly select time index for temporal jittering 32 | # time_index = np.random.randint(buffer.shape[0] - clip_len) 33 | # Randomly select start indices in order to crop the video 34 | height_index = np.random.randint(buffer.shape[1] - crop_size) 35 | width_index = np.random.randint(buffer.shape[2] - crop_size) 36 | 37 | # crop and jitter the video using indexing. The spatial crop is performed on 38 | # the entire array, so each frame is cropped in the same location. 
The temporal 39 | # jitter takes place via the selection of consecutive frames 40 | buffer = buffer[:, 41 | height_index:height_index + crop_size, 42 | width_index:width_index + crop_size, :] 43 | 44 | return buffer 45 | 46 | def generate_video_clip(self,split_span,keep_num,fname="/home/aiuser/Desktop/_7oWZq_s_Sk.mkv"): 47 | capture = cv2.VideoCapture(fname) 48 | # Get the basic information of the video 49 | frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT)) 50 | frame_width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)) 51 | frame_height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)) 52 | fps = int(capture.get(cv2.CAP_PROP_FPS)) 53 | # Work out how many segments to cut and how many frames to keep per segment 54 | print(frame_count,frame_width) 55 | split_len=fps*split_span 56 | split_time=frame_count/split_len 57 | if frame_height < frame_width: 58 | resize_height = np.random.randint(self.short_side[0], self.short_side[1] + 1) 59 | resize_width = int(float(resize_height) / frame_height * frame_width) 60 | else: 61 | resize_width = np.random.randint(self.short_side[0], self.short_side[1] + 1) 62 | resize_height = int(float(resize_width) / frame_width * frame_height) 63 | start_idx = 0 64 | end_idx = start_idx + split_len 65 | skip_span = split_len // keep_num if end_idx // keep_num > 0 else 1 66 | rem = split_len - skip_span * keep_num if split_len - skip_span * keep_num >= 0 else 0 67 | while split_time>0: # number of segments to cut 68 | split_time=split_time-1 69 | start_idx = start_idx + rem // 2 70 | buffer = [] 71 | sample_count=0 72 | # Process each video segment 73 | while (start_idx<end_idx): 74 | retaining, frame = capture.read() 75 | start_idx=start_idx+1 76 | if(sample_count>=keep_num): 77 | continue 78 | if start_idx % skip_span != 0 and start_idx!=0: 79 | continue 80 | if retaining is False: 81 | break 82 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 83 | if (frame_height != resize_height) or (frame_width != resize_width): 84 | frame = cv2.resize(frame, (resize_width, resize_height)) 85 | buffer.append(frame) 86 | # if len(pa.isna(frame).nonzero()[1]) != 0 or np.max(frame) > 255: 87 | # print("discard:", buffer) 88 | sample_count=sample_count+1 89 | print(np.shape(buffer)) 90 | if len(buffer) %d' % (self.id(), time.time() - start, len(bboxes), len(kept_indices))) 16 | return kept_indices 17 | 18 | def test_nms_empty(self): 19 | bboxes = torch.tensor([], dtype=torch.float).cuda() 20 | scores = torch.tensor([], dtype=torch.float).cuda() 21 | kept_indices = self._run_nms(bboxes, scores) 22 | self.assertEqual(len(kept_indices), 0) 23 | 24 | def test_nms_single(self): 25 | bboxes = torch.tensor([[5, 5, 10, 10]], dtype=torch.float).cuda() 26 | scores = torch.tensor([0.8], dtype=torch.float).cuda() 27 | kept_indices = self._run_nms(bboxes, scores) 28 | self.assertEqual(len(kept_indices), 1) 29 | self.assertListEqual(kept_indices.tolist(), [0]) 30 | 31 | def test_nms_small(self): 32 | bboxes = torch.tensor([[5, 5, 10, 10], [5, 5, 10, 10], [5, 5, 30, 30]], dtype=torch.float).cuda() 33 | scores = torch.tensor([0.6, 0.9, 0.4], dtype=torch.float).cuda() 34 | kept_indices = self._run_nms(bboxes, scores) 35 | self.assertEqual(len(kept_indices), 2) 36 | self.assertListEqual(kept_indices.tolist(), [1, 2]) 37 | 38 | def test_nms_large(self): 39 | # detections format: [[left, top, right, bottom, score], ...], which (right, bottom) is included in area 40 | cur_dir = os.path.dirname(os.path.abspath(__file__)) 41 | detections = np.load(os.path.join(cur_dir, 'nms-large-input.npy')) 42 | detections = torch.tensor(detections, dtype=torch.float).cuda() 43 | bboxes = detections[:, 0:4] 44 | scores = detections[:, 4] 45 | 46 | kept_indices = self._run_nms(bboxes, scores) 47 | self.assertEqual(len(kept_indices), 1934) 48 |
49 | expect = np.load(os.path.join(cur_dir, 'nms-large-output.npy')) 50 | self.assertListEqual(sorted(kept_indices.tolist()), 51 | sorted(expect.tolist())) 52 | 53 | 54 | if __name__ == '__main__': 55 | assert torch.cuda.is_available(), 'NMS module requires CUDA support' 56 | torch.tensor([]).cuda() # dummy for initializing GPU 57 | unittest.main() 58 | -------------------------------------------------------------------------------- /test_con.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.tensor as tensor 3 | import torch 4 | import f 5 | # import numpy as np 6 | # loss=nn.CrossEntropyLoss() 7 | # a=tensor(([2,3],[4,5]),dtype=torch.float) 8 | # w=tensor(torch.ones(2,1),dtype=torch.float,requires_grad=True) 9 | # out=torch.mm(a,w) 10 | # print(out) 11 | # a=5 12 | # print(tensor(5,dtype=torch.float)) 13 | # out=torch.mul(out.float(),tensor(5).float()) 14 | # # print(out) 15 | # # print(tensor([1]).float) 16 | # sm=nn.Softmax(dim=0) 17 | # print(out.view(-1)) 18 | # smo=sm(out.view(-1)) 19 | # print(smo) 20 | # smo=torch.log(smo) 21 | # loss=nn.NLLLoss() 22 | # target=tensor([1]) 23 | # loss=loss(smo.unsqueeze(0),target) 24 | # print(loss) 25 | # loss.backward() 26 | # print(w.grad.data) 27 | 28 | def test_grad(): 29 | input=tensor(([1,2,3],[4,5,6],[7,8,9]),dtype=torch.float) 30 | #weight=tensor(([0.1,0.2,0.3,0.4],[0.1,0.2,0.3,0.4],[0.1,0.2,0.3,0.4]),requires_grad=True) 31 | weight=tensor(torch.rand(3, 4),requires_grad=True) 32 | #input=input.unsqueeze(0) 33 | print(input,weight) 34 | pre=torch.mm(input,weight) 35 | #loss1=f.multilabel_soft_margin_loss() 36 | loss2=nn.MultiLabelMarginLoss() 37 | lable1=tensor(([0, 1, 1,0],),dtype=torch.float) 38 | lable2 = tensor(([0, 1, 1,0], [1, 0, 0,0], [1, 0,1 ,1]), dtype=torch.long) 39 | print(pre,lable1) 40 | loss1=f.multilabel_soft_margin_loss(pre,lable1,reduction='sum') 41 | loss1.backward() 42 | print('weight.grad.data1:',weight.grad.data) 43 | 44 | # loss2 = loss2(pre, lable2) 45 | # loss2.backward() 46 | # print('weight.grad.data2:', weight.grad.data) 47 | if __name__ == '__main__': 48 | test_grad() -------------------------------------------------------------------------------- /test_daptice.py: -------------------------------------------------------------------------------- 1 | import torch as t 2 | import math 3 | import numpy as np 4 | 5 | alist = t.randn(2, 3, 9) 6 | 7 | inputsz = np.array(alist.shape[2:]) 8 | outputsz = np.array([9]) 9 | 10 | stridesz = np.floor(inputsz / outputsz).astype(np.int32) 11 | print("stridesz",stridesz) 12 | kernelsz = inputsz - (outputsz - 1) * stridesz 13 | print("kernelsz",kernelsz) 14 | 15 | adp = t.nn.AdaptiveMaxPool1d([10]) 16 | avg = t.nn.MaxPool1d(kernel_size=list(kernelsz), stride=list(stridesz)) 17 | adplist = adp(alist) 18 | avglist = avg(alist) 19 | 20 | print(alist) 21 | print(adplist) 22 | print(avglist) 23 | -------------------------------------------------------------------------------- /test_nms.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import unittest 4 | 5 | import numpy as np 6 | import torch 7 | 8 | from support.layer.nms import nms 9 | 10 | class TestNMS(unittest.TestCase): 11 | def _run_nms(self, bboxes, scores): 12 | start = time.time() 13 | threshold = 0.7 14 | kept_indices = nms(bboxes, scores, threshold) 15 | print('%s in %.3fs, %d -> %d' % (self.id(), time.time() - start, len(bboxes), len(kept_indices))) 16 | return kept_indices 17 
| 18 | def test_nms_empty(self): 19 | bboxes = torch.tensor([], dtype=torch.float).cuda() 20 | scores = torch.tensor([], dtype=torch.float).cuda() 21 | kept_indices = self._run_nms(bboxes, scores) 22 | self.assertEqual(len(kept_indices), 0) 23 | 24 | def test_nms_single(self): 25 | bboxes = torch.tensor([[5, 5, 10, 10]], dtype=torch.float).cuda() 26 | scores = torch.tensor([0.8], dtype=torch.float).cuda() 27 | kept_indices = self._run_nms(bboxes, scores) 28 | self.assertEqual(len(kept_indices), 1) 29 | self.assertListEqual(kept_indices.tolist(), [0]) 30 | 31 | def test_nms_small(self): 32 | bboxes = torch.tensor([[5, 5, 10, 10], [5, 5, 10, 10], [5, 5, 30, 30]], dtype=torch.float).cuda() 33 | scores = torch.tensor([0.6, 0.9, 0.4], dtype=torch.float).cuda() 34 | kept_indices = self._run_nms(bboxes, scores) 35 | self.assertEqual(len(kept_indices), 2) 36 | self.assertListEqual(kept_indices.tolist(), [1, 2]) 37 | 38 | def test_nms_large(self): 39 | # detections format: [[left, top, right, bottom, score], ...], which (right, bottom) is included in area 40 | cur_dir = os.path.dirname(os.path.abspath(__file__)) 41 | detections = np.load(os.path.join(cur_dir, 'nms-large-input.npy')) 42 | detections = torch.tensor(detections, dtype=torch.float).cuda() 43 | bboxes = detections[:, 0:4] 44 | scores = detections[:, 4] 45 | 46 | kept_indices = self._run_nms(bboxes, scores) 47 | self.assertEqual(len(kept_indices), 1934) 48 | 49 | expect = np.load(os.path.join(cur_dir, 'nms-large-output.npy')) 50 | self.assertListEqual(sorted(kept_indices.tolist()), 51 | sorted(expect.tolist())) 52 | 53 | 54 | if __name__ == '__main__': 55 | assert torch.cuda.is_available(), 'NMS module requires CUDA support' 56 | torch.tensor([]).cuda() # dummy for initializing GPU 57 | unittest.main() 58 | -------------------------------------------------------------------------------- /tiny-yolo-voc.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | batch=64 3 | subdivisions=8 4 | width=416 5 | height=416 6 | channels=3 7 | momentum=0.9 8 | decay=0.0005 9 | angle=0 10 | saturation = 1.5 11 | exposure = 1.5 12 | hue=.1 13 | 14 | learning_rate=0.001 15 | max_batches = 40200 16 | policy=steps 17 | steps=-1,100,20000,30000 18 | scales=.1,10,.1,.1 19 | 20 | [convolutional] 21 | batch_normalize=1 22 | filters=16 23 | size=3 24 | stride=1 25 | pad=1 26 | activation=leaky 27 | 28 | [maxpool] 29 | size=2 30 | stride=2 31 | 32 | [convolutional] 33 | batch_normalize=1 34 | filters=32 35 | size=3 36 | stride=1 37 | pad=1 38 | activation=leaky 39 | 40 | [maxpool] 41 | size=2 42 | stride=2 43 | 44 | [convolutional] 45 | batch_normalize=1 46 | filters=64 47 | size=3 48 | stride=1 49 | pad=1 50 | activation=leaky 51 | 52 | [maxpool] 53 | size=2 54 | stride=2 55 | 56 | [convolutional] 57 | batch_normalize=1 58 | filters=128 59 | size=3 60 | stride=1 61 | pad=1 62 | activation=leaky 63 | 64 | [maxpool] 65 | size=2 66 | stride=2 67 | 68 | [convolutional] 69 | batch_normalize=1 70 | filters=256 71 | size=3 72 | stride=1 73 | pad=1 74 | activation=leaky 75 | 76 | [maxpool] 77 | size=2 78 | stride=2 79 | 80 | [convolutional] 81 | batch_normalize=1 82 | filters=512 83 | size=3 84 | stride=1 85 | pad=1 86 | activation=leaky 87 | 88 | [maxpool] 89 | size=2 90 | stride=1 91 | 92 | [convolutional] 93 | batch_normalize=1 94 | filters=1024 95 | size=3 96 | stride=1 97 | pad=1 98 | activation=leaky 99 | 100 | ########### 101 | 102 | [convolutional] 103 | batch_normalize=1 104 | size=3 105 | stride=1 106 | pad=1 107 | 
filters=1024 108 | activation=leaky 109 | 110 | [convolutional] 111 | size=1 112 | stride=1 113 | pad=1 114 | filters=125 115 | activation=linear 116 | 117 | [region] 118 | anchors = 1.08,1.19, 3.42,4.41, 6.63,11.38, 9.42,5.11, 16.62,10.52 119 | bias_match=1 120 | classes=20 121 | coords=4 122 | num=5 123 | softmax=1 124 | jitter=.2 125 | rescore=1 126 | 127 | object_scale=5 128 | noobject_scale=1 129 | class_scale=1 130 | coord_scale=1 131 | 132 | absolute=1 133 | thresh = .6 134 | random=1 135 | -------------------------------------------------------------------------------- /yolo-voc.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=64 4 | subdivisions=8 5 | # Training 6 | # batch=64 7 | # subdivisions=8 8 | height=416 9 | width=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 80200 21 | policy=steps 22 | steps=-1,500,40000,60000 23 | scales=0.1,10,.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=64 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=128 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [convolutional] 58 | batch_normalize=1 59 | filters=64 60 | size=1 61 | stride=1 62 | pad=1 63 | activation=leaky 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=1 70 | pad=1 71 | activation=leaky 72 | 73 | [maxpool] 74 | size=2 75 | stride=2 76 | 77 | [convolutional] 78 | batch_normalize=1 79 | filters=256 80 | size=3 81 | stride=1 82 | pad=1 83 | activation=leaky 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=128 88 | size=1 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=256 96 | size=3 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [maxpool] 102 | size=2 103 | stride=2 104 | 105 | [convolutional] 106 | batch_normalize=1 107 | filters=512 108 | size=3 109 | stride=1 110 | pad=1 111 | activation=leaky 112 | 113 | [convolutional] 114 | batch_normalize=1 115 | filters=256 116 | size=1 117 | stride=1 118 | pad=1 119 | activation=leaky 120 | 121 | [convolutional] 122 | batch_normalize=1 123 | filters=512 124 | size=3 125 | stride=1 126 | pad=1 127 | activation=leaky 128 | 129 | [convolutional] 130 | batch_normalize=1 131 | filters=256 132 | size=1 133 | stride=1 134 | pad=1 135 | activation=leaky 136 | 137 | [convolutional] 138 | batch_normalize=1 139 | filters=512 140 | size=3 141 | stride=1 142 | pad=1 143 | activation=leaky 144 | 145 | [maxpool] 146 | size=2 147 | stride=2 148 | 149 | [convolutional] 150 | batch_normalize=1 151 | filters=1024 152 | size=3 153 | stride=1 154 | pad=1 155 | activation=leaky 156 | 157 | [convolutional] 158 | batch_normalize=1 159 | filters=512 160 | size=1 161 | stride=1 162 | pad=1 163 | activation=leaky 164 | 165 | [convolutional] 166 | batch_normalize=1 167 | filters=1024 168 | size=3 169 | stride=1 170 | pad=1 171 | activation=leaky 172 | 173 | [convolutional] 174 | batch_normalize=1 175 | filters=512 176 | size=1 177 | stride=1 178 | pad=1 179 | activation=leaky 180 | 181 | [convolutional] 182 | 
batch_normalize=1 183 | filters=1024 184 | size=3 185 | stride=1 186 | pad=1 187 | activation=leaky 188 | 189 | 190 | ####### 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | size=3 195 | stride=1 196 | pad=1 197 | filters=1024 198 | activation=leaky 199 | 200 | [convolutional] 201 | batch_normalize=1 202 | size=3 203 | stride=1 204 | pad=1 205 | filters=1024 206 | activation=leaky 207 | 208 | [route] 209 | layers=-9 210 | 211 | [convolutional] 212 | batch_normalize=1 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=64 217 | activation=leaky 218 | 219 | [reorg] 220 | stride=2 221 | 222 | [route] 223 | layers=-1,-4 224 | 225 | [convolutional] 226 | batch_normalize=1 227 | size=3 228 | stride=1 229 | pad=1 230 | filters=1024 231 | activation=leaky 232 | 233 | [convolutional] 234 | size=1 235 | stride=1 236 | pad=1 237 | filters=125 238 | activation=linear 239 | 240 | 241 | [region] 242 | anchors = 1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071 243 | bias_match=1 244 | classes=20 245 | coords=4 246 | num=5 247 | softmax=1 248 | jitter=.3 249 | rescore=1 250 | 251 | object_scale=5 252 | noobject_scale=1 253 | class_scale=1 254 | coord_scale=1 255 | 256 | absolute=1 257 | thresh = .6 258 | random=1 259 | -------------------------------------------------------------------------------- /yolo.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=8 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=64 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=128 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [convolutional] 58 | batch_normalize=1 59 | filters=64 60 | size=1 61 | stride=1 62 | pad=1 63 | activation=leaky 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=1 70 | pad=1 71 | activation=leaky 72 | 73 | [maxpool] 74 | size=2 75 | stride=2 76 | 77 | [convolutional] 78 | batch_normalize=1 79 | filters=256 80 | size=3 81 | stride=1 82 | pad=1 83 | activation=leaky 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=128 88 | size=1 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=256 96 | size=3 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [maxpool] 102 | size=2 103 | stride=2 104 | 105 | [convolutional] 106 | batch_normalize=1 107 | filters=512 108 | size=3 109 | stride=1 110 | pad=1 111 | activation=leaky 112 | 113 | [convolutional] 114 | batch_normalize=1 115 | filters=256 116 | size=1 117 | stride=1 118 | pad=1 119 | activation=leaky 120 | 121 | [convolutional] 122 | batch_normalize=1 123 | filters=512 124 | size=3 125 | stride=1 126 | pad=1 127 | activation=leaky 128 | 129 | [convolutional] 130 | batch_normalize=1 131 | filters=256 132 | size=1 133 | stride=1 134 | pad=1 135 | activation=leaky 
136 | 137 | [convolutional] 138 | batch_normalize=1 139 | filters=512 140 | size=3 141 | stride=1 142 | pad=1 143 | activation=leaky 144 | 145 | [maxpool] 146 | size=2 147 | stride=2 148 | 149 | [convolutional] 150 | batch_normalize=1 151 | filters=1024 152 | size=3 153 | stride=1 154 | pad=1 155 | activation=leaky 156 | 157 | [convolutional] 158 | batch_normalize=1 159 | filters=512 160 | size=1 161 | stride=1 162 | pad=1 163 | activation=leaky 164 | 165 | [convolutional] 166 | batch_normalize=1 167 | filters=1024 168 | size=3 169 | stride=1 170 | pad=1 171 | activation=leaky 172 | 173 | [convolutional] 174 | batch_normalize=1 175 | filters=512 176 | size=1 177 | stride=1 178 | pad=1 179 | activation=leaky 180 | 181 | [convolutional] 182 | batch_normalize=1 183 | filters=1024 184 | size=3 185 | stride=1 186 | pad=1 187 | activation=leaky 188 | 189 | 190 | ####### 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | size=3 195 | stride=1 196 | pad=1 197 | filters=1024 198 | activation=leaky 199 | 200 | [convolutional] 201 | batch_normalize=1 202 | size=3 203 | stride=1 204 | pad=1 205 | filters=1024 206 | activation=leaky 207 | 208 | [route] 209 | layers=-9 210 | 211 | [convolutional] 212 | batch_normalize=1 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=64 217 | activation=leaky 218 | 219 | [reorg] 220 | stride=2 221 | 222 | [route] 223 | layers=-1,-4 224 | 225 | [convolutional] 226 | batch_normalize=1 227 | size=3 228 | stride=1 229 | pad=1 230 | filters=1024 231 | activation=leaky 232 | 233 | [convolutional] 234 | size=1 235 | stride=1 236 | pad=1 237 | filters=425 238 | activation=linear 239 | 240 | 241 | [region] 242 | anchors = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828 243 | bias_match=1 244 | classes=80 245 | coords=4 246 | num=5 247 | softmax=1 248 | jitter=.3 249 | rescore=1 250 | 251 | object_scale=5 252 | noobject_scale=1 253 | class_scale=1 254 | coord_scale=1 255 | 256 | absolute=1 257 | thresh = .6 258 | random=1 259 | --------------------------------------------------------------------------------
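
Usage note: the sort/ modules listed above are designed to be driven frame by frame. The following is a minimal sketch of how they fit together, assuming the standard deep_sort constructor Detection(tlwh, confidence, feature) for sort/detection.py, which appears in the file tree but is not reproduced in this listing; everything else uses only the Tracker and NearestNeighborDistanceMetric APIs shown above.

# Minimal per-frame tracking loop (sketch). The Detection signature is an
# assumption taken from the usual deep_sort layout of sort/detection.py.
import numpy as np

from sort.detection import Detection
from sort.nn_matching import NearestNeighborDistanceMetric
from sort.tracker import Tracker

metric = NearestNeighborDistanceMetric("cosine", matching_threshold=0.2, budget=100)
tracker = Tracker(metric)

def step(raw_detections):
    # raw_detections: list of (tlwh, confidence, feature) tuples for one frame.
    detections = [Detection(np.asarray(tlwh, dtype=float), conf, feat)
                  for tlwh, conf, feat in raw_detections]
    tracker.predict()           # Kalman prediction for every active track
    tracker.update(detections)  # cascade matching, track creation and deletion
    return [(t.track_id, t.to_tlwh())
            for t in tracker.tracks if t.is_confirmed()]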