├── test ├── test_results │ └── readme.md ├── ckpts │ └── readme.md ├── util │ ├── __pycache__ │ │ ├── misc.cpython-38.pyc │ │ └── lr_sched.cpython-38.pyc │ └── lr_sched.py ├── robots │ ├── __pycache__ │ │ └── panda_robot.cpython-38.pyc │ ├── franka_description │ │ └── meshes │ │ │ └── collision │ │ │ ├── hand.stl │ │ │ ├── finger.stl │ │ │ ├── link0.stl │ │ │ ├── link1.stl │ │ │ ├── link2.stl │ │ │ ├── link3.stl │ │ │ ├── link4.stl │ │ │ ├── link5.stl │ │ │ ├── link6.stl │ │ │ ├── link7.stl │ │ │ └── finger.stl.convex.stl │ ├── panda_gripper.urdf │ └── panda_robot.py ├── llama │ ├── __init__.py │ ├── tokenizer.py │ ├── utils.py │ └── llama_adapter.py ├── colors.py ├── test.sh ├── clean_data.py ├── test_entireprocess_in_sapien.py ├── test_llama.py ├── processgen.py ├── cal_test_mani_succ_rate.py ├── camera.py ├── utils.py └── test_one_stick_clean.py ├── data_collection ├── data │ └── readme.md ├── asset │ └── readme.md ├── code │ ├── robots │ │ ├── misc │ │ │ ├── table_map.jpg │ │ │ └── cube.obj │ │ ├── franka_description │ │ │ └── meshes │ │ │ │ └── collision │ │ │ │ ├── finger.stl │ │ │ │ ├── hand.stl │ │ │ │ ├── link0.stl │ │ │ │ ├── link1.stl │ │ │ │ ├── link2.stl │ │ │ │ ├── link3.stl │ │ │ │ ├── link4.stl │ │ │ │ ├── link5.stl │ │ │ │ ├── link6.stl │ │ │ │ ├── link7.stl │ │ │ │ └── finger.stl.convex.stl │ │ ├── panda_gripper.urdf │ │ ├── panda_robot.py │ │ └── panda.urdf │ ├── colors.py │ ├── check_cat_balance.py │ ├── test_data_collect.py │ ├── scripts │ │ └── run_gen_offline_data.sh │ ├── transfer_dataset.py │ ├── train_test_split.py │ ├── gen_offline_data.py │ ├── datagen.py │ ├── camera.py │ └── utils.py └── stats │ ├── ins_cnt_46cats.txt │ └── test_id.txt ├── train ├── ckpts │ └── readme.md ├── exp │ └── readme.md ├── llama │ ├── __init__.py │ ├── tokenizer.py │ ├── utils.py │ └── llama_adapter.py ├── output │ └── events.out.tfevents.1718348142.di-20240614115853-6ppx6.1005317.0 ├── finetune.sh ├── util │ └── lr_sched.py ├── engine_finetune.py ├── data │ ├── dataset.py │ └── create_dataset_aff.py ├── main_finetune.py └── utils.py ├── requirements.txt └── README.md /test/test_results/readme.md: -------------------------------------------------------------------------------- 1 | The test results will be shown here 2 | -------------------------------------------------------------------------------- /data_collection/data/readme.md: -------------------------------------------------------------------------------- 1 | The generated train/test data will be shown here 2 | -------------------------------------------------------------------------------- /test/ckpts/readme.md: -------------------------------------------------------------------------------- 1 | Please place the downloaded checkpoints (CLIP, LlaMa, Llama-Adapter) in this folder 2 | -------------------------------------------------------------------------------- /train/ckpts/readme.md: -------------------------------------------------------------------------------- 1 | Please place the downloaded checkpoints here, including CLIP, LlaMa, and LlaMa-Adapter 2 | -------------------------------------------------------------------------------- /data_collection/asset/readme.md: -------------------------------------------------------------------------------- 1 | Please place the urdf from the official partnet-mobility website in this folder 2 | -------------------------------------------------------------------------------- /test/util/__pycache__/misc.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/clorislili/ManipLLM/HEAD/test/util/__pycache__/misc.cpython-38.pyc -------------------------------------------------------------------------------- /data_collection/code/robots/misc/table_map.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clorislili/ManipLLM/HEAD/data_collection/code/robots/misc/table_map.jpg -------------------------------------------------------------------------------- /test/util/__pycache__/lr_sched.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clorislili/ManipLLM/HEAD/test/util/__pycache__/lr_sched.cpython-38.pyc -------------------------------------------------------------------------------- /test/robots/__pycache__/panda_robot.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clorislili/ManipLLM/HEAD/test/robots/__pycache__/panda_robot.cpython-38.pyc -------------------------------------------------------------------------------- /test/robots/franka_description/meshes/collision/hand.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clorislili/ManipLLM/HEAD/test/robots/franka_description/meshes/collision/hand.stl -------------------------------------------------------------------------------- /train/exp/readme.md: -------------------------------------------------------------------------------- 1 | The training checkpoint will be saved here. 2 | Or if you want to test the checkpoint of ManipLLM, place the ManipLLM checkpoint in this folder. 
3 | -------------------------------------------------------------------------------- /test/llama/__init__.py: -------------------------------------------------------------------------------- 1 | from .llama import ModelArgs, Transformer 2 | from .tokenizer import Tokenizer 3 | from .llama_adapter import * 4 | from .utils import format_prompt -------------------------------------------------------------------------------- /test/robots/franka_description/meshes/collision/finger.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clorislili/ManipLLM/HEAD/test/robots/franka_description/meshes/collision/finger.stl -------------------------------------------------------------------------------- /test/robots/franka_description/meshes/collision/link0.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clorislili/ManipLLM/HEAD/test/robots/franka_description/meshes/collision/link0.stl -------------------------------------------------------------------------------- /test/robots/franka_description/meshes/collision/link1.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clorislili/ManipLLM/HEAD/test/robots/franka_description/meshes/collision/link1.stl -------------------------------------------------------------------------------- /test/robots/franka_description/meshes/collision/link2.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clorislili/ManipLLM/HEAD/test/robots/franka_description/meshes/collision/link2.stl -------------------------------------------------------------------------------- /test/robots/franka_description/meshes/collision/link3.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clorislili/ManipLLM/HEAD/test/robots/franka_description/meshes/collision/link3.stl -------------------------------------------------------------------------------- /test/robots/franka_description/meshes/collision/link4.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clorislili/ManipLLM/HEAD/test/robots/franka_description/meshes/collision/link4.stl -------------------------------------------------------------------------------- /test/robots/franka_description/meshes/collision/link5.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clorislili/ManipLLM/HEAD/test/robots/franka_description/meshes/collision/link5.stl -------------------------------------------------------------------------------- /test/robots/franka_description/meshes/collision/link6.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clorislili/ManipLLM/HEAD/test/robots/franka_description/meshes/collision/link6.stl -------------------------------------------------------------------------------- /test/robots/franka_description/meshes/collision/link7.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clorislili/ManipLLM/HEAD/test/robots/franka_description/meshes/collision/link7.stl -------------------------------------------------------------------------------- /train/llama/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .llama import ModelArgs, Transformer 2 | from .tokenizer import Tokenizer 3 | from .llama_adapter import * 4 | from .utils import format_prompt -------------------------------------------------------------------------------- /data_collection/code/robots/franka_description/meshes/collision/finger.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clorislili/ManipLLM/HEAD/data_collection/code/robots/franka_description/meshes/collision/finger.stl -------------------------------------------------------------------------------- /data_collection/code/robots/franka_description/meshes/collision/hand.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clorislili/ManipLLM/HEAD/data_collection/code/robots/franka_description/meshes/collision/hand.stl -------------------------------------------------------------------------------- /data_collection/code/robots/franka_description/meshes/collision/link0.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clorislili/ManipLLM/HEAD/data_collection/code/robots/franka_description/meshes/collision/link0.stl -------------------------------------------------------------------------------- /data_collection/code/robots/franka_description/meshes/collision/link1.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clorislili/ManipLLM/HEAD/data_collection/code/robots/franka_description/meshes/collision/link1.stl -------------------------------------------------------------------------------- /data_collection/code/robots/franka_description/meshes/collision/link2.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clorislili/ManipLLM/HEAD/data_collection/code/robots/franka_description/meshes/collision/link2.stl -------------------------------------------------------------------------------- /data_collection/code/robots/franka_description/meshes/collision/link3.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clorislili/ManipLLM/HEAD/data_collection/code/robots/franka_description/meshes/collision/link3.stl -------------------------------------------------------------------------------- /data_collection/code/robots/franka_description/meshes/collision/link4.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clorislili/ManipLLM/HEAD/data_collection/code/robots/franka_description/meshes/collision/link4.stl -------------------------------------------------------------------------------- /data_collection/code/robots/franka_description/meshes/collision/link5.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clorislili/ManipLLM/HEAD/data_collection/code/robots/franka_description/meshes/collision/link5.stl -------------------------------------------------------------------------------- /data_collection/code/robots/franka_description/meshes/collision/link6.stl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/clorislili/ManipLLM/HEAD/data_collection/code/robots/franka_description/meshes/collision/link6.stl -------------------------------------------------------------------------------- /data_collection/code/robots/franka_description/meshes/collision/link7.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clorislili/ManipLLM/HEAD/data_collection/code/robots/franka_description/meshes/collision/link7.stl -------------------------------------------------------------------------------- /train/output/events.out.tfevents.1718348142.di-20240614115853-6ppx6.1005317.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clorislili/ManipLLM/HEAD/train/output/events.out.tfevents.1718348142.di-20240614115853-6ppx6.1005317.0 -------------------------------------------------------------------------------- /test/colors.py: -------------------------------------------------------------------------------- 1 | colors = [[0.5, 0.5, 0.5], [0.8, 0, 0], [0, 0.8, 0], [0, 0, 0.8], \ 2 | [0.5, 0.5, 0], [0.5, 0, 0.5], [0, 0.5, 0.5], \ 3 | [0.3, 0.6, 0], [0.6, 0, 0.3], [0.3, 0, 0.6], \ 4 | [0.6, 0.3, 0], [0.3, 0, 0.6], [0.6, 0, 0.3], \ 5 | [0.8, 0.2, 0.5]] 6 | 7 | -------------------------------------------------------------------------------- /data_collection/code/colors.py: -------------------------------------------------------------------------------- 1 | colors = [[0.5, 0.5, 0.5], [0.8, 0, 0], [0, 0.8, 0], [0, 0, 0.8], \ 2 | [0.5, 0.5, 0], [0.5, 0, 0.5], [0, 0.5, 0.5], \ 3 | [0.3, 0.6, 0], [0.6, 0, 0.3], [0.3, 0, 0.6], \ 4 | [0.6, 0.3, 0], [0.3, 0, 0.6], [0.6, 0, 0.3], \ 5 | [0.8, 0.2, 0.5]] 6 | 7 | -------------------------------------------------------------------------------- /data_collection/code/check_cat_balance.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | data_dir = '../data/train_data' 5 | data_list = os.listdir(data_dir) 6 | cat_cal = dict() 7 | for data_name in data_list: 8 | cat = data_name.split('_')[1] 9 | if cat not in list(cat_cal.keys()): 10 | cat_cal[cat] = 1 11 | else: 12 | cat_cal[cat] += 1 13 | print(cat_cal) -------------------------------------------------------------------------------- /data_collection/code/test_data_collect.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import shutil 4 | cat_dict = dict() 5 | data_dir = '../data/train_data' 6 | data_list = os.listdir(data_dir) 7 | 8 | 9 | for data_name in data_list: 10 | cat = data_name.split('_')[1] 11 | 12 | if cat in list(cat_dict.keys()): 13 | 14 | cat_dict[cat] += 1 15 | else: 16 | cat_dict[cat] = 1 17 | print(cat_dict) 18 | -------------------------------------------------------------------------------- /test/test.sh: -------------------------------------------------------------------------------- 1 | #step1: model inference 2 | OUPUT_DIR='./test_results/result_ori' 3 | CUDA_VISIBLE_DEVICES=0 python test_llama.py \ 4 | --llama_dir ./ckpts/llama_model_weights \ 5 | --adapter_dir /PATH/TO/MANIPLLM/MODEL \ 6 | --data_dir ../data_collection/data/test_data \ 7 | --out_dir "$OUPUT_DIR" \ 8 | --action pulling 9 | 10 | #step2: test in simulator 11 | python test_entireprocess_in_sapien.py \ 12 | --data_dir ../data_collection/data/test_data \ 13 | --num_processes 10 \ 14 | --out_dir "$OUPUT_DIR" \ 15 | --no_gui \ 16 | --use_mask 
True 17 | 18 | # #step3: calculate success rate 19 | python cal_test_mani_succ_rate.py \ 20 | --primact_type pulling \ 21 | --data_dir "$OUPUT_DIR" 22 | -------------------------------------------------------------------------------- /data_collection/stats/ins_cnt_46cats.txt: -------------------------------------------------------------------------------- 1 | Safe 30 11 2 | Door 36 9 3 | Display 37 9 4 | Refrigerator 44 7 5 | Laptop 55 6 6 | Lighter 28 12 7 | Microwave 16 21 8 | Mouse 14 25 9 | Box 28 12 10 | TrashCan 70 5 11 | KitchenPot 25 14 12 | Suitcase 24 14 13 | Pliers 25 14 14 | StorageFurniture 346 1 15 | Remote 49 7 16 | Bottle 57 6 17 | FoldingChair 26 13 18 | Toaster 25 14 19 | Lamp 45 7 20 | Dispenser 57 6 21 | Toilet 69 5 22 | Scissors 47 7 23 | Table 101 3 24 | Stapler 23 15 25 | Kettle 29 12 26 | USB 51 6 27 | Switch 70 5 28 | WashingMachine 17 20 29 | Faucet 84 4 30 | Phone 18 19 31 | Bucket 36 12 32 | Dishwaher 48 7 33 | Window 58 6 34 | Oven 30 12 35 | Knife 44 8 36 | Fan 81 4 37 | Keyboard 37 10 38 | Printer 29 12 39 | Eyeglasses 65 5 40 | Globe 61 6 41 | Cart 61 6 42 | Pen 48 8 -------------------------------------------------------------------------------- /train/finetune.sh: -------------------------------------------------------------------------------- 1 | #step1: generate training json 2 | JSON_DIR='./data/train_json' 3 | python ./data/create_dataset_aff.py --folder_dir ../data_collection/data/train_data --output_dir "$JSON_DIR" --num_point 20 4 | 5 | #step2: train model 6 | OUTPUT_DIR='./exp/train_ckpts' 7 | mkdir -p "$OUTPUT_DIR" 8 | CUDA_VISIBLE_DEVICES=0 python -u -m torch.distributed.launch --master_port=11710 --nproc_per_node=1 --use_env main_finetune.py --batch_size 1 \ 9 | --epochs 10 --warmup_epochs 1 --blr 1e-3 --weight_decay 0.02 \ 10 | --output_dir "$OUTPUT_DIR" \ 11 | --pretrained_path ./ckpts/BIAS_LORA_NORM-336-Chinese-7B.pth \ 12 | --llama_path ./ckpts/llama_model_weights \ 13 | --bins True \ 14 | --mlm True\ 15 | --aff_prior \ 16 | --data_config "$JSON_DIR" 17 | -------------------------------------------------------------------------------- /data_collection/code/scripts/run_gen_offline_data.sh: -------------------------------------------------------------------------------- 1 | # generate around 20,000 training samples, then stop it manually 2 | python gen_offline_data.py \ 3 | --data_dir ../data/train_data\ 4 | --data_fn ../stats/train_id.txt\ 5 | --primact_types pulling \ 6 | --num_processes 40 \ 7 | --num_epochs 100 \ 8 | --starting_epoch 0 \ 9 | --ins_cnt_fn ../stats/ins_cnt_46cats.txt \ 10 | --mode train 11 | 12 | # delete the extra testing dataset, and remain around 1,500 testing samples. Make sure that each category has as least 50 samples. 
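# A quick way to verify the per-category sample counts after pruning (a minimal sketch; it assumes
# the "<shape-id>_<Category>_..." folder naming that check_cat_balance.py also relies on, and the
# ../data/test_data output directory used below):
# python -c "import os, collections; print(collections.Counter(n.split('_')[1] for n in os.listdir('../data/test_data')))"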
13 | python gen_offline_data.py \ 14 | --data_dir ../data/test_data\ 15 | --data_fn ../stats/test_id.txt\ 16 | --primact_types pulling \ 17 | --num_processes 10 \ 18 | --num_epochs 20 \ 19 | --starting_epoch 0 \ 20 | --ins_cnt_fn ../stats/ins_cnt_46cats.txt \ 21 | --mode test 22 | -------------------------------------------------------------------------------- /test/clean_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | data_dir = '/home/jiyao/mingxu/where2act-main/data/highreso996_1119' 4 | data_tar = '/home/jiyao/mingxu/where2act-main/data/highreso996_1119_rgnonly' 5 | data_list = os.listdir(data_dir) 6 | for data_id in data_list: 7 | # file_list = os.listdir(os.path.join(data_dir,data_id)) 8 | # for file_dir in file_list: 9 | # if file_dir != 'rgb.png': 10 | # os.remove(os.path.join(data_dir,data_id,file_dir)) 11 | # if not os.path.exists(os.path.join(data_dir,data_id,'result.json')): 12 | # shutil.rmtree(os.path.join(data_dir,data_id)) 13 | source_file = os.path.join(data_dir,data_id,'rgb.png') 14 | destination_directory = os.path.join(data_tar,data_id) 15 | if not os.path.exists(destination_directory): 16 | os.makedirs(destination_directory) 17 | shutil.copy(source_file, destination_directory) -------------------------------------------------------------------------------- /test/util/lr_sched.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import math 8 | 9 | def adjust_learning_rate(optimizer, epoch, args): 10 | """Decay the learning rate with half-cycle cosine after warmup""" 11 | if epoch < args.warmup_epochs: 12 | lr = args.lr * epoch / args.warmup_epochs 13 | else: 14 | lr = args.min_lr + (args.lr - args.min_lr) * 0.5 * \ 15 | (1. + math.cos(math.pi * (epoch - args.warmup_epochs) / (args.epochs - args.warmup_epochs))) 16 | for param_group in optimizer.param_groups: 17 | if "lr_scale" in param_group: 18 | param_group["lr"] = lr * param_group["lr_scale"] 19 | else: 20 | param_group["lr"] = lr 21 | return lr 22 | -------------------------------------------------------------------------------- /train/util/lr_sched.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import math 8 | 9 | def adjust_learning_rate(optimizer, epoch, args): 10 | """Decay the learning rate with half-cycle cosine after warmup""" 11 | if epoch < args.warmup_epochs: 12 | lr = args.lr * epoch / args.warmup_epochs 13 | else: 14 | lr = args.min_lr + (args.lr - args.min_lr) * 0.5 * \ 15 | (1. 
+ math.cos(math.pi * (epoch - args.warmup_epochs) / (args.epochs - args.warmup_epochs))) 16 | for param_group in optimizer.param_groups: 17 | if "lr_scale" in param_group: 18 | param_group["lr"] = lr * param_group["lr_scale"] 19 | else: 20 | param_group["lr"] = lr 21 | return lr 22 | -------------------------------------------------------------------------------- /test/test_entireprocess_in_sapien.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from argparse import ArgumentParser 4 | 5 | from processgen import Processgen 6 | import json 7 | parser = ArgumentParser() 8 | parser.add_argument('--no_gui', action='store_true', default=False, help='no_gui [default: False]') 9 | parser.add_argument('--data_dir', type=str, help='data directory') 10 | parser.add_argument('--num_processes', type=int, default=40, help='number of CPU cores to use') 11 | parser.add_argument('--out_dir', type=str, help='outdata directory') 12 | parser.add_argument('--use_mask', type=str, default=False, help='whether use movable mask') 13 | conf = parser.parse_args() 14 | 15 | if os.path.exists(conf.out_dir): 16 | pass 17 | else: 18 | print('NO infer directory') 19 | exit() 20 | 21 | processgen = Processgen(conf.num_processes) 22 | record_names = os.listdir(conf.out_dir) 23 | for record_name in record_names: 24 | processgen.add_one_test_job(record_name,conf) 25 | processgen.start_all() 26 | data_tuple_list = processgen.join_all() 27 | 28 | 29 | -------------------------------------------------------------------------------- /data_collection/code/transfer_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | file_path = '../stats/train_30cats_train_data_list.txt' 4 | 5 | # Open the file in read mode 6 | lines = [] 7 | with open(file_path, 'r') as file: 8 | # Iterate over each line in the file 9 | for line in file: 10 | # Process the line (for example, print it) 11 | # print(line.strip()) # .strip() removes leading/trailing whitespace, including the newline character 12 | lines.append(line.strip()) 13 | data_dir = '../data/train_data' 14 | target_dir = '../data/train_data0606' 15 | data_list = os.listdir(data_dir) 16 | cat_cal = dict() 17 | for data_name in data_list: 18 | data_id = data_name.split('_')[0] 19 | data_cat = data_name.split('_')[1] 20 | source_dir = os.path.join(data_dir,data_name) 21 | destination_dir = os.path.join(target_dir,data_name) 22 | try: 23 | for line in lines: 24 | if data_id in line and data_cat in line: 25 | shutil.copytree(source_dir, destination_dir) 26 | break 27 | except: 28 | continue 29 | -------------------------------------------------------------------------------- /data_collection/code/robots/misc/cube.obj: -------------------------------------------------------------------------------- 1 | # Blender v2.93.4 OBJ File: '' 2 | # www.blender.org 3 | mtllib untitled.mtl 4 | o Cube 5 | v 1.000000 1.000000 -1.000000 6 | v 1.000000 -1.000000 -1.000000 7 | v 1.000000 1.000000 1.000000 8 | v 1.000000 -1.000000 1.000000 9 | v -1.000000 1.000000 -1.000000 10 | v -1.000000 -1.000000 -1.000000 11 | v -1.000000 1.000000 1.000000 12 | v -1.000000 -1.000000 1.000000 13 | vt 1.000000 1.000000 14 | vt 0.000000 1.000000 15 | vt 0.000000 0.000000 16 | vt 1.000000 0.000000 17 | vt 1.000000 0.000000 18 | vt 1.000000 1.000000 19 | vt 0.000000 1.000000 20 | vt 0.000000 0.000000 21 | vt 1.000000 1.000000 22 | vt 1.000000 0.000000 23 | vt 0.000000 1.000000 24 | vt 
1.000000 1.000000 25 | vt 1.000000 0.000000 26 | vt 0.000000 1.000000 27 | vt 0.000000 0.000000 28 | vt 0.000000 0.000000 29 | vn 0.0000 1.0000 0.0000 30 | vn 0.0000 0.0000 1.0000 31 | vn -1.0000 0.0000 0.0000 32 | vn 0.0000 -1.0000 0.0000 33 | vn 1.0000 0.0000 0.0000 34 | vn 0.0000 0.0000 -1.0000 35 | usemtl Material 36 | s off 37 | f 1/1/1 5/2/1 7/3/1 3/4/1 38 | f 4/5/2 3/6/2 7/7/2 8/8/2 39 | f 8/8/3 7/7/3 5/9/3 6/10/3 40 | f 6/11/4 2/12/4 4/5/4 8/8/4 41 | f 2/13/5 1/1/5 3/14/5 4/15/5 42 | f 6/16/6 5/2/6 1/1/6 2/13/6 43 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | click==8.1.7 2 | cmake==3.28.1 3 | comm==0.2.1 4 | dash==2.17.0 5 | ffmpy==0.3.1 6 | ftfy==6.1.3 7 | future==0.18.3 8 | h5py==3.10.0 9 | huggingface-hub==0.23.0 10 | imageio==2.33.1 11 | Jinja2==3.1.3 12 | jupyter-client==7.3.4 13 | matplotlib==3.7.4 14 | matplotlib-inline==0.1.6 15 | mpmath==1.3.0 16 | multidict==6.0.5 17 | networkx==3.1 18 | numpy==1.24.4 19 | oauthlib==3.2.2 20 | open3d==0.18.0 21 | openai==0.28.0 22 | openai-clip==1.0.1 23 | opencv-python==4.9.0.80 24 | packaging==23.2 25 | pandas==2.0.3 26 | parso==0.8.3 27 | pexpect==4.9.0 28 | pickleshare==0.7.5 29 | pillow==10.2.0 30 | pip==24.0 31 | pkgutil_resolve_name==1.3.10 32 | platformdirs==4.2.0 33 | plotly==5.22.0 34 | prompt-toolkit==3.0.42 35 | protobuf==4.25.2 36 | PyYAML==6.0.1 37 | requests==2.31.0 38 | safetensors==0.4.2 39 | sapien==0.7.0.dev0 40 | scikit-learn==1.2.2 41 | scipy==1.10.1 42 | segment-anything==1.0 43 | semantic-version==2.10.0 44 | sentencepiece==0.1.99 45 | sentry-sdk==1.42.0 46 | service-identity==24.1.0 47 | setproctitle==1.3.3 48 | setuptools==68.2.2 49 | shapely==2.0.3 50 | shellingham==1.5.4 51 | shortuuid==1.0.13 52 | six==1.16.0 53 | tensorboard==2.14.0 54 | timm==0.6.13 55 | tensorboard-data-server==0.7.2 56 | tensorboardX==2.6.2.2 57 | tokenizers==0.13.3 58 | torch==2.0.1 59 | torchvision==0.15.2 60 | tornado==6.1 61 | tqdm==4.66.1 62 | transformers==4.31.0 63 | transforms3d==0.4.1 64 | urllib3==2.1.0 65 | wandb==0.16.4 66 | wheel==0.41.2 67 | -------------------------------------------------------------------------------- /test/llama/tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the GNU General Public License version 3. 
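# Example usage (a minimal sketch; it assumes the SentencePiece model shipped with the LLaMA
# weights, i.e. ./ckpts/llama_model_weights/tokenizer.model as laid out in the README):
#   tok = Tokenizer(model_path='./ckpts/llama_model_weights/tokenizer.model')
#   ids = tok.encode('Specify the contact point and gripper direction of manipulating the object.', bos=True, eos=False)
#   text = tok.decode(ids)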
3 | 4 | from sentencepiece import SentencePieceProcessor 5 | from logging import getLogger 6 | from typing import List 7 | import os 8 | 9 | 10 | logger = getLogger() 11 | 12 | 13 | class Tokenizer: 14 | def __init__(self, model_path: str): 15 | # reload tokenizer 16 | assert os.path.isfile(model_path), model_path 17 | self.sp_model = SentencePieceProcessor(model_file=model_path) 18 | logger.info(f"Reloaded SentencePiece model from {model_path}") 19 | 20 | # BOS / EOS token IDs 21 | self.n_words: int = self.sp_model.vocab_size() 22 | self.bos_id: int = self.sp_model.bos_id() 23 | self.eos_id: int = self.sp_model.eos_id() 24 | self.pad_id: int = self.sp_model.pad_id() 25 | logger.info( 26 | f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}" 27 | ) 28 | assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() 29 | 30 | def encode(self, s: str, bos: bool, eos: bool) -> List[int]: 31 | assert type(s) is str 32 | t = self.sp_model.encode(s) 33 | if bos: 34 | t = [self.bos_id] + t 35 | if eos: 36 | t = t + [self.eos_id] 37 | return t 38 | 39 | def decode(self, t: List[int]) -> str: 40 | return self.sp_model.decode(t) 41 | -------------------------------------------------------------------------------- /train/llama/tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the GNU General Public License version 3. 3 | 4 | from sentencepiece import SentencePieceProcessor 5 | from logging import getLogger 6 | from typing import List 7 | import os 8 | 9 | 10 | logger = getLogger() 11 | 12 | 13 | class Tokenizer: 14 | def __init__(self, model_path: str): 15 | # reload tokenizer 16 | assert os.path.isfile(model_path), model_path 17 | self.sp_model = SentencePieceProcessor(model_file=model_path) 18 | logger.info(f"Reloaded SentencePiece model from {model_path}") 19 | 20 | # BOS / EOS token IDs 21 | self.n_words: int = self.sp_model.vocab_size() 22 | self.bos_id: int = self.sp_model.bos_id() 23 | self.eos_id: int = self.sp_model.eos_id() 24 | self.pad_id: int = self.sp_model.pad_id() 25 | logger.info( 26 | f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}" 27 | ) 28 | assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() 29 | 30 | def encode(self, s: str, bos: bool, eos: bool) -> List[int]: 31 | assert type(s) is str 32 | t = self.sp_model.encode(s) 33 | if bos: 34 | t = [self.bos_id] + t 35 | if eos: 36 | t = t + [self.eos_id] 37 | return t 38 | 39 | def decode(self, t: List[int]) -> str: 40 | return self.sp_model.decode(t) 41 | -------------------------------------------------------------------------------- /data_collection/code/train_test_split.py: -------------------------------------------------------------------------------- 1 | import random 2 | cat_test = ['Toilet', 'Scissors','Table', 'Stapler','USB','WashingMachine', 'Oven','Faucet', 'Phone','Kettle','Window'] 3 | cat_train = ['Safe', 'Door','Display','Refrigerator' ,'Laptop','Lighter','Microwave','Mouse','Box','TrashCan','KitchenPot','Suitcase','Pliers','StorageFurniture','Remote','Bottle' 4 | , 'FoldingChair','Toaster','Lamp','Dispenser', 'Cart', 'Globe','Eyeglasses','Pen','Switch','Printer','Keyboard','Fan','Knife','Dishwaher'] 5 | file_path = '../stats/train_46cats_all_data_list.txt' 6 | 7 | 8 | test_lines = [] 9 | with open(file_path, 'r') as file: 10 | for line in file: 11 | cleaned_line = line.strip() 
12 | for cat in cat_test: 13 | if cat in cleaned_line: 14 | test_lines.append(cleaned_line) 15 | break 16 | train_lines = [] 17 | with open(file_path, 'r') as file: 18 | for line in file: 19 | cleaned_line = line.strip() 20 | for cat in cat_train: 21 | if cat in cleaned_line: 22 | train_lines.append(cleaned_line) 23 | break 24 | random.shuffle(train_lines) 25 | length = len(train_lines) 26 | 27 | 28 | train_lines_output = train_lines[:int((4*length)/5)] 29 | test_lines.extend(train_lines[int((4*length)/5):]) 30 | 31 | file_path1 = '../stats/test_id.txt' 32 | 33 | with open(file_path1, 'w') as file: 34 | for item in test_lines: 35 | file.write(f"{item}\n") 36 | 37 | file_path2 = '../stats/train_id.txt' 38 | with open(file_path2, 'w') as file: 39 | for item in train_lines_output: 40 | file.write(f"{item}\n") 41 | 42 | -------------------------------------------------------------------------------- /test/test_llama.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import torch 3 | import llama 4 | import os 5 | from PIL import Image, ImageDraw 6 | import cv2 7 | import json 8 | from tqdm import tqdm 9 | import numpy as np 10 | import torch.nn as nn 11 | parser = ArgumentParser() 12 | parser.add_argument('--llama_dir', type=str, help='llama directory') 13 | parser.add_argument('--adapter_dir', type=str, help='adapter directory') 14 | parser.add_argument('--data_dir', type=str) 15 | parser.add_argument('--out_dir', type=str) 16 | parser.add_argument('--action', type=str, help='llama directory') 17 | conf = parser.parse_args() 18 | device = 'cuda' if torch.cuda.is_available() else "cpu" 19 | llama_dir = conf.llama_dir 20 | # print(conf.adapter_dir, llama_dir, device) 21 | model, preprocess = llama.load(conf.adapter_dir, llama_dir, device) 22 | model.to(device) 23 | model.eval() 24 | if '-ori' in conf.adapter_dir: 25 | prompt = llama.format_prompt('Specify the contact point and orientation of pushing the object.') # though it is called pushing, but the prediction is the same as manipulating. 
It is just an old version of prompt naming during training 26 | else: 27 | prompt = llama.format_prompt('Specify the contact point and gripper direction of manipulating the object.') 28 | record_names = os.listdir(conf.data_dir) 29 | for record_name in tqdm(record_names): 30 | out_dir = os.path.join(conf.out_dir,record_name) 31 | 32 | if not os.path.exists(out_dir): 33 | os.makedirs(out_dir) 34 | 35 | record_dir = os.path.join(conf.data_dir,record_name) 36 | rgb_dir = os.path.join(record_dir,'original_rgb.png') 37 | if not os.path.exists(rgb_dir): 38 | continue 39 | start_pixel = 0 40 | size=336 41 | img_1 = Image.fromarray(np.array(Image.open(rgb_dir).convert('RGB'))[start_pixel:start_pixel+336,start_pixel:start_pixel+336,:]) 42 | img = preprocess(img_1).unsqueeze(0).to(device) 43 | with torch.no_grad(): 44 | result = model.generate(img, [prompt])[0] 45 | # print(result) 46 | with open(os.path.join(out_dir, 'prediction.json'), 'w') as fout: 47 | json.dump(result, fout) 48 | -------------------------------------------------------------------------------- /test/processgen.py: -------------------------------------------------------------------------------- 1 | """ 2 | to control multiprocess test in sapien 3 | """ 4 | 5 | import os 6 | import numpy as np 7 | import multiprocessing as mp 8 | from subprocess import call 9 | import time 10 | 11 | def printout(flog, strout): 12 | print(strout) 13 | if flog is not None: 14 | flog.write(strout + '\n') 15 | class Processgen(object): 16 | 17 | def __init__(self, num_processes, flog=None): 18 | self.num_processes = num_processes 19 | self.flog = flog 20 | 21 | self.todos = [] 22 | self.processes = [] 23 | self.is_running = False 24 | self.Q = mp.Queue() 25 | 26 | def __len__(self): 27 | return len(self.todos) 28 | 29 | def add_one_test_job(self,record_name,conf): 30 | if self.is_running: 31 | printout(self.flog, 'ERROR: cannot add a new job while Processgen is running!') 32 | exit(1) 33 | todo = (conf.data_dir,record_name,conf.out_dir,conf.use_mask) 34 | self.todos.append(todo) 35 | 36 | 37 | @staticmethod 38 | def job_func(pid, todos, Q): 39 | succ_todos = [] 40 | # print(todos) 41 | for todo in todos: 42 | cmd = 'xvfb-run -a python test_one_stick_clean.py --data_dir {} --record_name {} --out_dir {} --no_gui --use_mask {}' \ 43 | .format(todo[0], todo[1], todo[2], todo[-1]) 44 | folder_name_withjob = os.path.join(todo[2],todo[1]) 45 | # print(cmd) 46 | # exit() 47 | 48 | ret = call(cmd, shell=True) 49 | 50 | if ret == 0: 51 | succ_todos.append(folder_name_withjob) 52 | if ret == 2: 53 | succ_todos.append(None) 54 | Q.put(succ_todos) 55 | 56 | def start_all(self): 57 | if self.is_running: 58 | printout(self.flog, 'ERROR: cannot start all while Processgen is running!') 59 | exit(1) 60 | 61 | total_todos = len(self) 62 | num_todos_per_process = int(np.ceil(total_todos / self.num_processes)) 63 | np.random.shuffle(self.todos) 64 | for i in range(self.num_processes): 65 | todos = self.todos[i*num_todos_per_process: min(total_todos, (i+1)*num_todos_per_process)] 66 | p = mp.Process(target=self.job_func, args=(i, todos, self.Q)) 67 | p.start() 68 | self.processes.append(p) 69 | 70 | self.is_running = True 71 | 72 | def join_all(self): 73 | if not self.is_running: 74 | printout(self.flog, 'ERROR: cannot join all while Processgen is idle!') 75 | exit(1) 76 | 77 | ret = [] 78 | for p in self.processes: 79 | ret += self.Q.get() 80 | 81 | for p in self.processes: 82 | p.join() 83 | 84 | self.todos = [] 85 | self.processes = [] 86 | self.Q = mp.Queue() 87 | 
self.is_running=False 88 | return ret 89 | 90 | 91 | -------------------------------------------------------------------------------- /data_collection/code/gen_offline_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from argparse import ArgumentParser 4 | 5 | from datagen import DataGen 6 | 7 | parser = ArgumentParser() 8 | parser.add_argument('--data_dir', type=str, help='data directory') 9 | parser.add_argument('--data_fn', type=str, help='data file that indexs all shape-ids') 10 | parser.add_argument('--primact_types', type=str, help='list all primacts [separated by comma], default: None, meaning all', default=None) 11 | parser.add_argument('--num_processes', type=int, default=40, help='number of CPU cores to use') 12 | parser.add_argument('--num_epochs', type=int, default=160, help='control the data amount') 13 | parser.add_argument('--starting_epoch', type=int, default=0, help='help to resume. If previous generating does not generate the expected amount of data, when resuming, set this term to the previous epoch number to prevent from overlapping') 14 | parser.add_argument('--ins_cnt_fn', type=str, help='a file listing all category instance count, which is used to balance the interaction data amount to make sure that all categories have roughly same amount of data interaction, regardless of different shape counts in these categories') 15 | parser.add_argument('--mode', type=str, help='train or test; control the categories') 16 | conf = parser.parse_args() 17 | 18 | 19 | 20 | if conf.mode == 'train' and conf.primact_types == 'pulling': 21 | #set train categories 22 | conf.category_types = ['Safe', 'Door','Display','Refrigerator' ,'Laptop','Lighter','Microwave','Mouse','Box','TrashCan','KitchenPot','Suitcase','Pliers','StorageFurniture','Remote','Bottle' 23 | , 'FoldingChair','Toaster','Lamp','Dispenser', 'Cart', 'Globe','Eyeglasses','Pen','Switch','Printer','Keyboard','Fan','Knife','Dishwaher'] 24 | elif conf.mode == 'test' and conf.primact_types == 'pulling': 25 | #set test categories 26 | conf.category_types = ['Safe', 'Door','Display','Refrigerator' ,'Laptop','Lighter','Microwave','Mouse','Box','TrashCan','KitchenPot','Suitcase','Pliers','StorageFurniture','Remote','Bottle' 27 | , 'FoldingChair','Toaster','Lamp','Dispenser', 'Cart', 'Globe','Eyeglasses','Pen','Switch','Printer','Keyboard','Fan','Knife','Dishwaher','Toilet', 'Scissors','Table', 'Stapler','USB', 28 | 'WashingMachine', 'Oven','Faucet', 'Phone','Kettle','Window'] 29 | 30 | hard_train_cat = ['Dispenser','Globe','Remote','Cart','Fan','Knife'] 31 | easy_train_cat = ['StorageFurniture','Pen','Laptop','Microwave','Refrigerator','Safe'] 32 | 33 | cat2freq = dict() 34 | with open(conf.ins_cnt_fn, 'r') as fin: 35 | for l in fin.readlines(): 36 | cat, _, freq = l.rstrip().split() 37 | #hard categories are harder to collect success samples, therefore, increase the frequency of interacting with these categories to keep the category balance 38 | if cat in hard_train_cat: 39 | freq *= 2 40 | cat2freq[cat] = freq 41 | elif cat in easy_train_cat: 42 | freq = int(float(freq) / 1.2) 43 | cat2freq[cat] = freq 44 | cat2freq[cat] = int(freq) 45 | 46 | datagen = DataGen(conf.num_processes) 47 | primact_type = conf.primact_types 48 | with open(conf.data_fn, 'r') as fin: 49 | for l in fin.readlines(): 50 | shape_id, cat = l.rstrip().split() 51 | if cat in conf.category_types: 52 | for epoch in range(conf.starting_epoch, conf.starting_epoch+conf.num_epochs): 53 | for 
cnt_id in range(cat2freq[cat]): 54 | datagen.add_one_collect_job(conf.data_dir, shape_id, cat, cnt_id, primact_type, epoch) 55 | 56 | datagen.start_all() 57 | 58 | print('start generating data') 59 | -------------------------------------------------------------------------------- /train/engine_finetune.py: -------------------------------------------------------------------------------- 1 | import math 2 | import sys 3 | from typing import Iterable 4 | 5 | import torch 6 | 7 | import util.misc as misc 8 | import util.lr_sched as lr_sched 9 | 10 | from llama import LLaMA_adapter 11 | 12 | def train_one_epoch(model: LLaMA_adapter, 13 | data_loader: Iterable, optimizer: torch.optim.Optimizer, 14 | device: torch.device, epoch: int, loss_scaler, 15 | log_writer=None, 16 | args=None): 17 | model.train(True) 18 | # model.module.set_default_trainability() 19 | 20 | metric_logger = misc.MetricLogger(delimiter=" ") 21 | metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}')) 22 | header = 'Epoch: [{}]'.format(epoch) 23 | print_freq = 10 24 | 25 | accum_iter = args.accum_iter 26 | 27 | optimizer.zero_grad() 28 | 29 | if log_writer is not None: 30 | print('log_dir: {}'.format(log_writer.log_dir)) 31 | for data_iter_step, (examples, labels, example_mask, imgs) in enumerate(metric_logger.log_every(data_loader, print_freq, header)): 32 | # we use a per iteration (instead of per epoch) lr scheduler 33 | if data_iter_step % accum_iter == 0: 34 | lr_sched.adjust_learning_rate(optimizer, data_iter_step / len(data_loader) + epoch, args) 35 | 36 | #把张量移到同一设备上 37 | 38 | examples = examples.to(device) 39 | labels = labels.to(device) 40 | imgs = imgs.to(device, non_blocking=True) 41 | imgs = imgs.to(device, non_blocking=True) 42 | 43 | # print("________---------------66666") 44 | #with torch.no_grad():#不计算梯度减少内存占用 45 | with torch.cuda.amp.autocast(): 46 | c_loss, m_loss = model(examples, labels, imgs) 47 | loss = c_loss + m_loss * 0 48 | loss_value = loss.item() 49 | c_loss_value = c_loss.item() 50 | m_loss_value = m_loss 51 | if not math.isfinite(loss_value): 52 | print("Loss is {}, stopping training".format(loss_value)) 53 | sys.exit(1) 54 | 55 | loss /= accum_iter 56 | loss_scaler(loss, optimizer, parameters=model.parameters(), 57 | update_grad=(data_iter_step + 1) % accum_iter == 0) 58 | if (data_iter_step + 1) % accum_iter == 0: 59 | optimizer.zero_grad() 60 | 61 | torch.cuda.synchronize() 62 | 63 | metric_logger.update(closs=c_loss_value) 64 | metric_logger.update(mloss=m_loss_value) 65 | 66 | lr = optimizer.param_groups[0]["lr"] 67 | metric_logger.update(lr=lr) 68 | 69 | loss_value_reduce = misc.all_reduce_mean(loss_value) 70 | c_loss_value_reduce = misc.all_reduce_mean(c_loss_value) 71 | m_loss_value_reduce = misc.all_reduce_mean(m_loss_value) 72 | if log_writer is not None and (data_iter_step + 1) % accum_iter == 0: 73 | """ We use epoch_1000x as the x-axis in tensorboard. 74 | This calibrates different curves when batch size changes. 
75 | """ 76 | epoch_1000x = int((data_iter_step / len(data_loader) + epoch) * 1000) 77 | log_writer.add_scalar('c_train_loss', c_loss_value_reduce, epoch_1000x) 78 | log_writer.add_scalar('m_train_loss', m_loss_value_reduce, epoch_1000x) 79 | log_writer.add_scalar('lr', lr, epoch_1000x) 80 | # break 81 | 82 | 83 | # gather the stats from all processes 84 | metric_logger.synchronize_between_processes() 85 | print("Averaged stats:", metric_logger) 86 | return {k: meter.global_avg for k, meter in metric_logger.meters.items()} 87 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ManipLLM 2 | The official codebase for ManipLLM: Embodied Multimodal Large Language Model for Object-Centric Robotic Manipulation (CVPR 2024) 3 | 4 | ## Acknowledgement 5 | This repo benefits from [LLama_Adapter](https://github.com/OpenGVLab/LLaMA-Adapter) and [Where2act](https://github.com/daerduoCarey/where2act). Thanks for their wonderful works. 6 | 7 | ## Setup 8 | 1) conda create --name manipllm python=3.8 9 | 10 | 2) conda activate manipllm 11 | 12 | 3) pip install -r requirements.txt 13 | 14 | 15 | ## Data Collection 16 | 17 | 18 | - Collect data by your own: Download [partnet mobility](https://sapien.ucsd.edu/downloads) urdf from its official website and place under ./ManipLLM/data_collection/asset. 19 | ```bash 20 | ./asset/original_sapien_dataset 21 | ├── 148 22 | | └── mobility.urdf 23 | ├── 149 24 | | └── mobility.urdf 25 | ├── ... 26 | │ ... 27 | └── ... 28 | 29 | cd ./ManipLLM/data_collection/code 30 | 31 | bash scripts/run_gen_offline_data.sh 32 | 33 | This command will first generate training dataset and then generate the testing dataset. 34 | 35 | ## Model Training 36 | - Preparation: 37 | 38 | Download checkpoints for [CLIP](https://disk.pku.edu.cn/link/AA93FF7210CF0D4F428850C0F520C81453), [LLaMa-Adapter](https://disk.pku.edu.cn/link/AA682A19DB7FDA4028B112449D24BBC308). The downloaded checkpoints should be placed under /ManipLLM/train/ckpts. Obtain the LLaMA backbone weights using this [form](https://docs.google.com/forms/d/e/1FAIpQLSfqNECQnMkycAp2jP4Z9TFX0cGR4uf7b_fBxjY_OjhJILlKGA/viewform). Please note that checkpoints from unofficial sources (e.g., BitTorrent) may contain malicious code and should be used with care. Organize the downloaded checkpoints in the following structure: 39 | ```plaintext 40 | ./ckpts/llama_model_weights 41 | ├── 7B 42 | │ ├── checklist.chk 43 | │ ├── consolidated.00.pth 44 | │ └── params.json 45 | └── tokenizer.model 46 | ./ckpts/BIAS_LORA_NORM-336-Chinese-7B.pth 47 | ./ckpts/ViT-L-14-336px.pt 48 | - Model training: The training requires the server to has a least 40g memory. The command will first generate the training json, then start training 49 | 50 | 51 | ```bash 52 | cd ./ManipLLM/train 53 | 54 | bash finetune.sh 55 | 56 | ## Model Testing 57 | - The public code only infers on the final prompt without chain-of-thought, predicting the pose directly. 58 | 59 | - Remember to add the checkpoints of [CLIP](https://disk.pku.edu.cn/link/AA93FF7210CF0D4F428850C0F520C81453), [LLaMa](same with the process in training), and [LLaMa-Adapter](https://disk.pku.edu.cn/link/AA682A19DB7FDA4028B112449D24BBC308) under /ManipLLM/test/ckpts as well. 60 | 61 | - We release the checkpoint: checkpoint-9-ori.pth. 
Note that, due to the randomness in data collection, the provided testing dataset is different from the ones in paper, so you may result in slightly different but comparable results compared with the results in paper. Download the released [checkpoint-9-ori](https://pan.baidu.com/s/1kh_LO7W7TnnrpPzI4khw0Q?pwd=cipc) or use your own trained checkpoint. The link we provide is baiduyun downloading link. If you need a google drive download link, send your google account via email to xl3062@columbia.edu, then we will share the link with you. Remember to change the line5 in test.sh to the dir you placed the ckpts. 62 | 63 | - Download OUR [test data](https://disk.pku.edu.cn/link/AA103C5B00398E4E4089903CB06AC09D8C) or collect the test data by your own. The downloaded 'test_data' folder should be unziped under /ManipLLM/data_collection/data. Download [partnet mobility](https://sapien.ucsd.edu/downloads) urdf from its official website and place under /ManipLLM/data_collection/asset. 64 | 65 | - The testing requires the server to has a least 40g memory. This command will first use the model to infer on all the test samples, and then interact with object in the simulator (SAPIEN). 66 | 67 | ```bash 68 | cd ./ManipLLM/test 69 | 70 | bash test.sh 71 | 72 | -------------------------------------------------------------------------------- /test/cal_test_mani_succ_rate.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | from argparse import ArgumentParser 4 | import utils 5 | import os 6 | def calculate_succ_ration(data_list_for_cat,conf,out_dir): 7 | out_info={} 8 | for cat in conf.category_types: 9 | if cat in data_list_for_cat.keys(): 10 | succ_ration_list=[] 11 | for i in data_list_for_cat[cat]: 12 | try: 13 | with open(os.path.join(i, 'result.json'), 'r') as fin: 14 | result_data = json.load(fin) 15 | succ_ration_list.append(result_data['mani_succ']) 16 | except: 17 | continue 18 | 19 | succ_ration_list = np.array(succ_ration_list) 20 | out_info['number_of_%s'%cat]= len(succ_ration_list) 21 | mean_value = np.mean(succ_ration_list.astype(float)) 22 | out_info['mani_succ_ration_for_%s'%cat]= mean_value 23 | else: 24 | # print("there is no '%s' data "% cat) 25 | continue 26 | train_cat = ['Safe', 'Door','Display','Refrigerator' ,'Laptop','Lighter','Microwave','Mouse','Box','TrashCan','KitchenPot','Suitcase','Pliers','StorageFurniture','Remote','Bottle' 27 | , 'FoldingChair','Toaster','Lamp','Dispenser','Eyeglasses','Pen','Printer','Keyboard','Fan','Knife','Dishwaher'] 28 | 29 | count_train = 1e-6 30 | count_test = 1e-6 31 | osum_train = 0 32 | osum_test = 0 33 | print(out_info) 34 | for i in range(0,len(out_info.keys()),2): 35 | if list(out_info.keys())[i].split('_')[-1] in train_cat: 36 | if 0.0 <= out_info[list(out_info.keys())[i+1]] and out_info[list(out_info.keys())[i+1]] <= 1.0: 37 | osum_train += out_info[list(out_info.keys())[i]] * out_info[list(out_info.keys())[i+1]] 38 | # print(out_info[list(out_info.keys())[i]],out_info[list(out_info.keys())[i+1]]) 39 | count_train += out_info[list(out_info.keys())[i]] 40 | else: 41 | if 0.0 <= out_info[list(out_info.keys())[i+1]] and out_info[list(out_info.keys())[i+1]] <= 1.0: 42 | osum_test += out_info[list(out_info.keys())[i]] * out_info[list(out_info.keys())[i+1]] 43 | count_test += out_info[list(out_info.keys())[i]] 44 | 45 | print(f'test seen acc on {count_train} samples is {osum_train/count_train}, test unseen acc on {count_test} samples is {osum_test/count_test}') 46 | with 
open(os.path.join(out_dir, 'mani_succ_ration_for_cats.json'), 'w') as fout: 47 | json.dump(out_info, fout) 48 | 49 | 50 | 51 | if __name__ == '__main__': 52 | parser = ArgumentParser() 53 | parser.add_argument('--primact_type', type=str, help='primact_type:pushing,pulling,pushing left,pulling left') 54 | parser.add_argument('--data_dir', type=str, help='data_dir for whole test data') 55 | parser.add_argument('--out_dir', type=str, help='out_dir for calculate_info') 56 | conf = parser.parse_args() 57 | 58 | 59 | conf.category_types = ['Safe', 'Door','Display','Refrigerator' ,'Laptop','Lighter','Microwave','Mouse','Box','TrashCan','KitchenPot','Suitcase','Pliers','StorageFurniture','Remote','Bottle' 60 | , 'FoldingChair','Toaster','Lamp','Dispenser','Toilet', 'Scissors','Table','USB', 61 | 'WashingMachine', 'Oven','Faucet'] 62 | conf.out_dir = os.path.join(conf.data_dir,'calculate_info') 63 | if not os.path.exists(conf.out_dir): 64 | os.makedirs(conf.out_dir) 65 | 66 | data_list_for_cat={} 67 | record_names = os.listdir(conf.data_dir) 68 | 69 | for record_name in record_names: 70 | 71 | if '.png' in record_name or '.json' in record_name: 72 | continue 73 | else: 74 | 75 | category= record_name.rstrip().split('_')[1] 76 | data_list_for_cat.setdefault(category,[]).append(os.path.join(conf.data_dir, record_name.rstrip())) 77 | 78 | calculate_succ_ration(data_list_for_cat,conf,conf.out_dir) 79 | -------------------------------------------------------------------------------- /test/llama/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import urllib 3 | import hashlib 4 | import warnings 5 | 6 | from tqdm import tqdm 7 | import torch 8 | 9 | 10 | def sample_top_p(probs, p): 11 | probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True) 12 | probs_sum = torch.cumsum(probs_sort, dim=-1) 13 | mask = probs_sum - probs_sort > p 14 | probs_sort[mask] = 0.0 15 | probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True)) 16 | next_token = torch.multinomial(probs_sort, num_samples=1) 17 | next_token = torch.gather(probs_idx, -1, next_token) 18 | return next_token 19 | 20 | 21 | def format_prompt(instruction, input=None, lang_type='EN'): 22 | 23 | PROMPT_DICT = { 24 | "prompt_input": ( 25 | "Below is an instruction that describes a task, paired with an input that provides further context. " 26 | "Write a response that appropriately completes the request.\n\n" 27 | "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:" 28 | ), 29 | "prompt_no_input": ( 30 | "Below is an instruction that describes a task. " 31 | "Write a response that appropriately completes the request.\n\n" 32 | "### Instruction:\n{instruction}\n\n### Response:" 33 | ), 34 | } 35 | CH_PROMPT_DICT = { 36 | "prompt_input": ( 37 | "Below is a chinese instruction that describes a task, paired with a chinese input that provides further context. " 38 | "Write a chinese response that appropriately completes the request.\n\n" 39 | "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:" 40 | ), 41 | "prompt_no_input": ( 42 | "Below is a chinese instruction that describes a task. 
" 43 | "Write a chinese response that appropriately completes the request.\n\n" 44 | "### Instruction:\n{instruction}\n\n### Response:" 45 | ) 46 | } 47 | if input is None or input == '': 48 | if lang_type == 'EN': 49 | return PROMPT_DICT['prompt_no_input'].format_map({'instruction': instruction}) 50 | else: 51 | return CH_PROMPT_DICT['prompt_no_input'].format_map({'instruction': instruction}) 52 | else: 53 | if lang_type == 'EN': 54 | return PROMPT_DICT["prompt_input"].format_map({'instruction': instruction, 'input': input}) 55 | else: 56 | return CH_PROMPT_DICT["prompt_input"].format_map({'instruction': instruction, 'input': input}) 57 | 58 | 59 | def _download(url: str, root: str): 60 | os.makedirs(root, exist_ok=True) 61 | filename = os.path.basename(url) 62 | # assume the url is https://some/path/sha256_model.pth 63 | expected_sha256 = url.split("/")[-1].split('_')[0] 64 | # expected_sha256 = url.split("/")[-2] 65 | download_target = os.path.join(root, filename) 66 | 67 | if os.path.exists(download_target) and not os.path.isfile(download_target): 68 | raise RuntimeError(f"{download_target} exists and is not a regular file") 69 | 70 | if os.path.isfile(download_target): 71 | if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256: 72 | return download_target 73 | else: 74 | warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file") 75 | 76 | with urllib.request.urlopen(url) as source, open(download_target, "wb") as output: 77 | with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop: 78 | while True: 79 | buffer = source.read(8192) 80 | if not buffer: 81 | break 82 | 83 | output.write(buffer) 84 | loop.update(len(buffer)) 85 | 86 | if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256: 87 | raise RuntimeError("Model has been downloaded but the SHA256 checksum does not not match") 88 | 89 | return download_target 90 | -------------------------------------------------------------------------------- /train/llama/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import urllib 3 | import hashlib 4 | import warnings 5 | 6 | from tqdm import tqdm 7 | import torch 8 | 9 | 10 | def sample_top_p(probs, p): 11 | probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True) 12 | probs_sum = torch.cumsum(probs_sort, dim=-1) 13 | mask = probs_sum - probs_sort > p 14 | probs_sort[mask] = 0.0 15 | probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True)) 16 | next_token = torch.multinomial(probs_sort, num_samples=1) 17 | next_token = torch.gather(probs_idx, -1, next_token) 18 | return next_token 19 | 20 | 21 | def format_prompt(instruction, input=None, lang_type='EN'): 22 | 23 | PROMPT_DICT = { 24 | "prompt_input": ( 25 | "Below is an instruction that describes a task, paired with an input that provides further context. " 26 | "Write a response that appropriately completes the request.\n\n" 27 | "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:" 28 | ), 29 | "prompt_no_input": ( 30 | "Below is an instruction that describes a task. " 31 | "Write a response that appropriately completes the request.\n\n" 32 | "### Instruction:\n{instruction}\n\n### Response:" 33 | ), 34 | } 35 | CH_PROMPT_DICT = { 36 | "prompt_input": ( 37 | "Below is a chinese instruction that describes a task, paired with a chinese input that provides further context. 
" 38 | "Write a chinese response that appropriately completes the request.\n\n" 39 | "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:" 40 | ), 41 | "prompt_no_input": ( 42 | "Below is a chinese instruction that describes a task. " 43 | "Write a chinese response that appropriately completes the request.\n\n" 44 | "### Instruction:\n{instruction}\n\n### Response:" 45 | ) 46 | } 47 | if input is None or input == '': 48 | if lang_type == 'EN': 49 | return PROMPT_DICT['prompt_no_input'].format_map({'instruction': instruction}) 50 | else: 51 | return CH_PROMPT_DICT['prompt_no_input'].format_map({'instruction': instruction}) 52 | else: 53 | if lang_type == 'EN': 54 | return PROMPT_DICT["prompt_input"].format_map({'instruction': instruction, 'input': input}) 55 | else: 56 | return CH_PROMPT_DICT["prompt_input"].format_map({'instruction': instruction, 'input': input}) 57 | 58 | 59 | def _download(url: str, root: str): 60 | os.makedirs(root, exist_ok=True) 61 | filename = os.path.basename(url) 62 | # assume the url is https://some/path/sha256_model.pth 63 | expected_sha256 = url.split("/")[-1].split('_')[0] 64 | # expected_sha256 = url.split("/")[-2] 65 | download_target = os.path.join(root, filename) 66 | 67 | if os.path.exists(download_target) and not os.path.isfile(download_target): 68 | raise RuntimeError(f"{download_target} exists and is not a regular file") 69 | 70 | if os.path.isfile(download_target): 71 | if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256: 72 | return download_target 73 | else: 74 | warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file") 75 | 76 | with urllib.request.urlopen(url) as source, open(download_target, "wb") as output: 77 | with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop: 78 | while True: 79 | buffer = source.read(8192) 80 | if not buffer: 81 | break 82 | 83 | output.write(buffer) 84 | loop.update(len(buffer)) 85 | 86 | if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256: 87 | raise RuntimeError("Model has been downloaded but the SHA256 checksum does not not match") 88 | 89 | return download_target 90 | -------------------------------------------------------------------------------- /data_collection/code/datagen.py: -------------------------------------------------------------------------------- 1 | """ 2 | Batch-generate data 3 | """ 4 | 5 | import os 6 | import numpy as np 7 | import multiprocessing as mp 8 | from subprocess import call 9 | from utils import printout 10 | import time 11 | 12 | 13 | class DataGen(object): 14 | 15 | def __init__(self, num_processes, flog=None): 16 | self.num_processes = num_processes 17 | self.flog = flog 18 | 19 | self.todos = [] 20 | self.processes = [] 21 | self.is_running = False 22 | self.Q = mp.Queue() 23 | 24 | def __len__(self): 25 | return len(self.todos) 26 | 27 | def add_one_collect_job(self, data_dir, shape_id, category, cnt_id, primact_type, trial_id): 28 | if self.is_running: 29 | printout(self.flog, 'ERROR: cannot add a new job while DataGen is running!') 30 | exit(1) 31 | 32 | todo = ('COLLECT', shape_id, category, cnt_id, primact_type, data_dir, trial_id, np.random.randint(10000000)) 33 | self.todos.append(todo) 34 | 35 | def add_one_recollect_job(self, src_data_dir, dir1, dir2, recollect_record_name, tar_data_dir, x, y): 36 | if self.is_running: 37 | printout(self.flog, 'ERROR: cannot add a new job 
while DataGen is running!') 38 | exit(1) 39 | 40 | todo = ('RECOLLECT', src_data_dir, recollect_record_name, tar_data_dir, np.random.randint(10000000), x, y, dir1, dir2) 41 | self.todos.append(todo) 42 | 43 | def add_one_checkcollect_job(self, src_data_dir, dir1, dir2, recollect_record_name, tar_data_dir, x, y): 44 | if self.is_running: 45 | printout(self.flog, 'ERROR: cannot add a new job while DataGen is running!') 46 | exit(1) 47 | 48 | todo = ('CHECKCOLLECT', src_data_dir, recollect_record_name, tar_data_dir, np.random.randint(10000000), x, y, dir1, dir2) 49 | self.todos.append(todo) 50 | 51 | @staticmethod 52 | def job_func(pid, todos, Q): 53 | succ_todos = [] 54 | for todo in todos: 55 | if todo[0] == 'COLLECT': 56 | 57 | # the code is runned without gui 58 | cmd = 'xvfb-run -a python collect_data.py %s %s %d %s --out_dir %s --trial_id %d --random_seed %d --no_gui' \ 59 | % (todo[1], todo[2], todo[3], todo[4], todo[5], todo[6], todo[7]) 60 | # print(cmd) 61 | # assert(0) 62 | folder_name = todo[5] 63 | job_name = '%s_%s_%d_%s_%s' % (todo[1], todo[2], todo[3], todo[4], todo[6]) 64 | ret = call(cmd, shell=True) 65 | if ret == 0: 66 | succ_todos.append(os.path.join(folder_name, job_name)) 67 | if ret == 2: 68 | succ_todos.append(None) 69 | Q.put(succ_todos) 70 | 71 | def start_all(self): 72 | if self.is_running: 73 | printout(self.flog, 'ERROR: cannot start all while DataGen is running!') 74 | exit(1) 75 | 76 | total_todos = len(self) 77 | num_todos_per_process = int(np.ceil(total_todos / self.num_processes)) 78 | np.random.shuffle(self.todos) 79 | for i in range(self.num_processes): 80 | todos = self.todos[i*num_todos_per_process: min(total_todos, (i+1)*num_todos_per_process)] 81 | p = mp.Process(target=self.job_func, args=(i, todos, self.Q)) 82 | p.start() 83 | self.processes.append(p) 84 | 85 | self.is_running = True 86 | 87 | def join_all(self): 88 | if not self.is_running: 89 | printout(self.flog, 'ERROR: cannot join all while DataGen is idle!') 90 | exit(1) 91 | 92 | ret = [] 93 | for p in self.processes: 94 | ret += self.Q.get() 95 | 96 | for p in self.processes: 97 | p.join() 98 | 99 | self.todos = [] 100 | self.processes = [] 101 | self.Q = mp.Queue() 102 | self.is_running=False 103 | return ret 104 | 105 | 106 | -------------------------------------------------------------------------------- /test/robots/panda_gripper.urdf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | -------------------------------------------------------------------------------- /data_collection/code/robots/panda_gripper.urdf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 
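Note (added sketch, not a file from the repository): the DataGen class in data_collection/code/datagen.py above is the multiprocessing driver used by gen_offline_data.py. Below is a minimal, hedged usage sketch; the shape id, category, output folder and trial count are hypothetical placeholders, and each queued job shells out to collect_data.py, so it only runs inside the repo with the PartNet-Mobility assets in place.

from datagen import DataGen

datagen = DataGen(num_processes=4)          # spawn up to 4 worker processes
for trial_id in range(8):                   # queue a few COLLECT jobs
    datagen.add_one_collect_job(
        data_dir='../data/example_out',     # hypothetical output folder
        shape_id='7310',                    # hypothetical PartNet-Mobility shape id
        category='Microwave',
        cnt_id=0,
        primact_type='pulling',
        trial_id=trial_id)
datagen.start_all()                         # shuffles the todo list and forks the workers
succ_records = datagen.join_all()           # blocks until every worker finishes
print(len(succ_records), 'job results collected (None entries correspond to jobs that returned code 2)')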
44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | -------------------------------------------------------------------------------- /test/robots/franka_description/meshes/collision/finger.stl.convex.stl: -------------------------------------------------------------------------------- 1 | solid AssimpScene 2 | facet normal -nan -nan -nan 3 | outer loop 4 | vertex 0.00869277 -0.000132643 0.0501662 5 | vertex 0.0104486 0.00258331 0.000146801 6 | vertex 0.01036 0.0264034 0.000154629 7 | endloop 8 | endfacet 9 | 10 | facet normal -nan -nan -nan 11 | outer loop 12 | vertex 0.00869277 -0.000132643 0.0501662 13 | vertex 0.01036 0.0264034 0.000154629 14 | vertex 0.0104005 0.0252534 0.0190366 15 | endloop 16 | endfacet 17 | 18 | facet normal -nan -nan -nan 19 | outer loop 20 | vertex 0.0104005 0.0252534 0.0190366 21 | vertex 0.00861608 0.0139887 0.0513279 22 | vertex 0.00869277 -0.000132643 0.0501662 23 | endloop 24 | endfacet 25 | 26 | facet normal -nan -nan -nan 27 | outer loop 28 | vertex 0.0104486 0.00258331 0.000146801 29 | vertex -0.0103872 0.00253418 0.000131696 30 | vertex -0.0104013 0.0263094 0.00016651 31 | endloop 32 | endfacet 33 | 34 | facet normal -nan -nan -nan 35 | outer loop 36 | vertex 0.0104486 0.00258331 0.000146801 37 | vertex -0.0104013 0.0263094 0.00016651 38 | vertex 0.01036 0.0264034 0.000154629 39 | endloop 40 | endfacet 41 | 42 | facet normal -nan -nan -nan 43 | outer loop 44 | vertex -0.0103889 0.0252203 0.0191876 45 | vertex -0.00527792 0.0142931 0.053849 46 | vertex 0.00583983 0.0142743 0.0538034 47 | endloop 48 | endfacet 49 | 50 | facet normal -nan -nan -nan 51 | outer loop 52 | vertex -0.0103889 0.0252203 0.0191876 53 | vertex 0.00583983 0.0142743 0.0538034 54 | vertex 0.0104005 0.0252534 0.0190366 55 | endloop 56 | endfacet 57 | 58 | facet normal -nan -nan -nan 59 | outer loop 60 | vertex -0.0103889 0.0252203 0.0191876 61 | vertex 0.0104005 0.0252534 0.0190366 62 | vertex 0.01036 0.0264034 0.000154629 63 | endloop 64 | endfacet 65 | 66 | facet normal -nan -nan -nan 67 | outer loop 68 | vertex -0.0103889 0.0252203 0.0191876 69 | vertex 0.01036 0.0264034 0.000154629 70 | vertex -0.0104013 0.0263094 0.00016651 71 | endloop 72 | endfacet 73 | 74 | facet normal -nan -nan -nan 75 | outer loop 76 | vertex -0.0103872 0.00253418 0.000131696 77 | vertex -0.00862294 -5.68019e-05 0.0509528 78 | vertex -0.00884117 0.0139176 0.0505894 79 | endloop 80 | endfacet 81 | 82 | facet normal -nan -nan -nan 83 | outer loop 84 | vertex -0.0103872 0.00253418 0.000131696 85 | vertex -0.00884117 0.0139176 0.0505894 86 | vertex -0.0103889 0.0252203 0.0191876 87 | endloop 88 | endfacet 89 | 90 | facet normal -nan -nan -nan 91 | outer loop 92 | vertex -0.0103889 0.0252203 0.0191876 93 | vertex -0.0104013 0.0263094 0.00016651 94 | vertex -0.0103872 0.00253418 0.000131696 95 | endloop 96 | endfacet 97 | 98 | facet normal -nan -nan -nan 99 | outer loop 100 | vertex 0.00613802 -2.06026e-05 0.0535776 101 | vertex 0.00869277 -0.000132643 0.0501662 102 | vertex 0.00861608 0.0139887 0.0513279 103 | endloop 104 | endfacet 105 | 106 | facet normal -nan -nan -nan 107 | outer loop 108 | vertex 
-0.00884117 0.0139176 0.0505894 109 | vertex -0.00527792 0.0142931 0.053849 110 | vertex -0.0103889 0.0252203 0.0191876 111 | endloop 112 | endfacet 113 | 114 | facet normal -nan -nan -nan 115 | outer loop 116 | vertex -0.00884117 0.0139176 0.0505894 117 | vertex -0.00862294 -5.68019e-05 0.0509528 118 | vertex -0.00548142 -9.11208e-05 0.0537247 119 | endloop 120 | endfacet 121 | 122 | facet normal -nan -nan -nan 123 | outer loop 124 | vertex -0.00884117 0.0139176 0.0505894 125 | vertex -0.00548142 -9.11208e-05 0.0537247 126 | vertex -0.00527792 0.0142931 0.053849 127 | endloop 128 | endfacet 129 | 130 | facet normal -nan -nan -nan 131 | outer loop 132 | vertex 0.00583983 0.0142743 0.0538034 133 | vertex -0.00527792 0.0142931 0.053849 134 | vertex -0.00548142 -9.11208e-05 0.0537247 135 | endloop 136 | endfacet 137 | 138 | facet normal -nan -nan -nan 139 | outer loop 140 | vertex 0.00583983 0.0142743 0.0538034 141 | vertex -0.00548142 -9.11208e-05 0.0537247 142 | vertex 0.00613802 -2.06026e-05 0.0535776 143 | endloop 144 | endfacet 145 | 146 | facet normal -nan -nan -nan 147 | outer loop 148 | vertex 0.00583983 0.0142743 0.0538034 149 | vertex 0.00613802 -2.06026e-05 0.0535776 150 | vertex 0.00861608 0.0139887 0.0513279 151 | endloop 152 | endfacet 153 | 154 | facet normal -nan -nan -nan 155 | outer loop 156 | vertex 0.00583983 0.0142743 0.0538034 157 | vertex 0.00861608 0.0139887 0.0513279 158 | vertex 0.0104005 0.0252534 0.0190366 159 | endloop 160 | endfacet 161 | 162 | facet normal -nan -nan -nan 163 | outer loop 164 | vertex -0.00873039 -2.35252e-05 0.0361648 165 | vertex 0.00869277 -0.000132643 0.0501662 166 | vertex 0.00613802 -2.06026e-05 0.0535776 167 | endloop 168 | endfacet 169 | 170 | facet normal -nan -nan -nan 171 | outer loop 172 | vertex -0.00873039 -2.35252e-05 0.0361648 173 | vertex 0.00613802 -2.06026e-05 0.0535776 174 | vertex -0.00548142 -9.11208e-05 0.0537247 175 | endloop 176 | endfacet 177 | 178 | facet normal -nan -nan -nan 179 | outer loop 180 | vertex -0.00548142 -9.11208e-05 0.0537247 181 | vertex -0.00862294 -5.68019e-05 0.0509528 182 | vertex -0.00873039 -2.35252e-05 0.0361648 183 | endloop 184 | endfacet 185 | 186 | facet normal -nan -nan -nan 187 | outer loop 188 | vertex -0.00873039 -2.35252e-05 0.0361648 189 | vertex -0.00862294 -5.68019e-05 0.0509528 190 | vertex -0.0103872 0.00253418 0.000131696 191 | endloop 192 | endfacet 193 | 194 | facet normal -nan -nan -nan 195 | outer loop 196 | vertex -0.00873039 -2.35252e-05 0.0361648 197 | vertex -0.0103872 0.00253418 0.000131696 198 | vertex 0.0104486 0.00258331 0.000146801 199 | endloop 200 | endfacet 201 | 202 | facet normal -nan -nan -nan 203 | outer loop 204 | vertex -0.00873039 -2.35252e-05 0.0361648 205 | vertex 0.0104486 0.00258331 0.000146801 206 | vertex 0.00869277 -0.000132643 0.0501662 207 | endloop 208 | endfacet 209 | 210 | endsolid AssimpScene 211 | -------------------------------------------------------------------------------- /data_collection/code/robots/franka_description/meshes/collision/finger.stl.convex.stl: -------------------------------------------------------------------------------- 1 | solid AssimpScene 2 | facet normal -nan -nan -nan 3 | outer loop 4 | vertex 0.00869277 -0.000132643 0.0501662 5 | vertex 0.0104486 0.00258331 0.000146801 6 | vertex 0.01036 0.0264034 0.000154629 7 | endloop 8 | endfacet 9 | 10 | facet normal -nan -nan -nan 11 | outer loop 12 | vertex 0.00869277 -0.000132643 0.0501662 13 | vertex 0.01036 0.0264034 0.000154629 14 | vertex 0.0104005 0.0252534 0.0190366 15 | 
endloop 16 | endfacet 17 | 18 | facet normal -nan -nan -nan 19 | outer loop 20 | vertex 0.0104005 0.0252534 0.0190366 21 | vertex 0.00861608 0.0139887 0.0513279 22 | vertex 0.00869277 -0.000132643 0.0501662 23 | endloop 24 | endfacet 25 | 26 | facet normal -nan -nan -nan 27 | outer loop 28 | vertex 0.0104486 0.00258331 0.000146801 29 | vertex -0.0103872 0.00253418 0.000131696 30 | vertex -0.0104013 0.0263094 0.00016651 31 | endloop 32 | endfacet 33 | 34 | facet normal -nan -nan -nan 35 | outer loop 36 | vertex 0.0104486 0.00258331 0.000146801 37 | vertex -0.0104013 0.0263094 0.00016651 38 | vertex 0.01036 0.0264034 0.000154629 39 | endloop 40 | endfacet 41 | 42 | facet normal -nan -nan -nan 43 | outer loop 44 | vertex -0.0103889 0.0252203 0.0191876 45 | vertex -0.00527792 0.0142931 0.053849 46 | vertex 0.00583983 0.0142743 0.0538034 47 | endloop 48 | endfacet 49 | 50 | facet normal -nan -nan -nan 51 | outer loop 52 | vertex -0.0103889 0.0252203 0.0191876 53 | vertex 0.00583983 0.0142743 0.0538034 54 | vertex 0.0104005 0.0252534 0.0190366 55 | endloop 56 | endfacet 57 | 58 | facet normal -nan -nan -nan 59 | outer loop 60 | vertex -0.0103889 0.0252203 0.0191876 61 | vertex 0.0104005 0.0252534 0.0190366 62 | vertex 0.01036 0.0264034 0.000154629 63 | endloop 64 | endfacet 65 | 66 | facet normal -nan -nan -nan 67 | outer loop 68 | vertex -0.0103889 0.0252203 0.0191876 69 | vertex 0.01036 0.0264034 0.000154629 70 | vertex -0.0104013 0.0263094 0.00016651 71 | endloop 72 | endfacet 73 | 74 | facet normal -nan -nan -nan 75 | outer loop 76 | vertex -0.0103872 0.00253418 0.000131696 77 | vertex -0.00862294 -5.68019e-05 0.0509528 78 | vertex -0.00884117 0.0139176 0.0505894 79 | endloop 80 | endfacet 81 | 82 | facet normal -nan -nan -nan 83 | outer loop 84 | vertex -0.0103872 0.00253418 0.000131696 85 | vertex -0.00884117 0.0139176 0.0505894 86 | vertex -0.0103889 0.0252203 0.0191876 87 | endloop 88 | endfacet 89 | 90 | facet normal -nan -nan -nan 91 | outer loop 92 | vertex -0.0103889 0.0252203 0.0191876 93 | vertex -0.0104013 0.0263094 0.00016651 94 | vertex -0.0103872 0.00253418 0.000131696 95 | endloop 96 | endfacet 97 | 98 | facet normal -nan -nan -nan 99 | outer loop 100 | vertex 0.00613802 -2.06026e-05 0.0535776 101 | vertex 0.00869277 -0.000132643 0.0501662 102 | vertex 0.00861608 0.0139887 0.0513279 103 | endloop 104 | endfacet 105 | 106 | facet normal -nan -nan -nan 107 | outer loop 108 | vertex -0.00884117 0.0139176 0.0505894 109 | vertex -0.00527792 0.0142931 0.053849 110 | vertex -0.0103889 0.0252203 0.0191876 111 | endloop 112 | endfacet 113 | 114 | facet normal -nan -nan -nan 115 | outer loop 116 | vertex -0.00884117 0.0139176 0.0505894 117 | vertex -0.00862294 -5.68019e-05 0.0509528 118 | vertex -0.00548142 -9.11208e-05 0.0537247 119 | endloop 120 | endfacet 121 | 122 | facet normal -nan -nan -nan 123 | outer loop 124 | vertex -0.00884117 0.0139176 0.0505894 125 | vertex -0.00548142 -9.11208e-05 0.0537247 126 | vertex -0.00527792 0.0142931 0.053849 127 | endloop 128 | endfacet 129 | 130 | facet normal -nan -nan -nan 131 | outer loop 132 | vertex 0.00583983 0.0142743 0.0538034 133 | vertex -0.00527792 0.0142931 0.053849 134 | vertex -0.00548142 -9.11208e-05 0.0537247 135 | endloop 136 | endfacet 137 | 138 | facet normal -nan -nan -nan 139 | outer loop 140 | vertex 0.00583983 0.0142743 0.0538034 141 | vertex -0.00548142 -9.11208e-05 0.0537247 142 | vertex 0.00613802 -2.06026e-05 0.0535776 143 | endloop 144 | endfacet 145 | 146 | facet normal -nan -nan -nan 147 | outer loop 148 | vertex 
0.00583983 0.0142743 0.0538034 149 | vertex 0.00613802 -2.06026e-05 0.0535776 150 | vertex 0.00861608 0.0139887 0.0513279 151 | endloop 152 | endfacet 153 | 154 | facet normal -nan -nan -nan 155 | outer loop 156 | vertex 0.00583983 0.0142743 0.0538034 157 | vertex 0.00861608 0.0139887 0.0513279 158 | vertex 0.0104005 0.0252534 0.0190366 159 | endloop 160 | endfacet 161 | 162 | facet normal -nan -nan -nan 163 | outer loop 164 | vertex -0.00873039 -2.35252e-05 0.0361648 165 | vertex 0.00869277 -0.000132643 0.0501662 166 | vertex 0.00613802 -2.06026e-05 0.0535776 167 | endloop 168 | endfacet 169 | 170 | facet normal -nan -nan -nan 171 | outer loop 172 | vertex -0.00873039 -2.35252e-05 0.0361648 173 | vertex 0.00613802 -2.06026e-05 0.0535776 174 | vertex -0.00548142 -9.11208e-05 0.0537247 175 | endloop 176 | endfacet 177 | 178 | facet normal -nan -nan -nan 179 | outer loop 180 | vertex -0.00548142 -9.11208e-05 0.0537247 181 | vertex -0.00862294 -5.68019e-05 0.0509528 182 | vertex -0.00873039 -2.35252e-05 0.0361648 183 | endloop 184 | endfacet 185 | 186 | facet normal -nan -nan -nan 187 | outer loop 188 | vertex -0.00873039 -2.35252e-05 0.0361648 189 | vertex -0.00862294 -5.68019e-05 0.0509528 190 | vertex -0.0103872 0.00253418 0.000131696 191 | endloop 192 | endfacet 193 | 194 | facet normal -nan -nan -nan 195 | outer loop 196 | vertex -0.00873039 -2.35252e-05 0.0361648 197 | vertex -0.0103872 0.00253418 0.000131696 198 | vertex 0.0104486 0.00258331 0.000146801 199 | endloop 200 | endfacet 201 | 202 | facet normal -nan -nan -nan 203 | outer loop 204 | vertex -0.00873039 -2.35252e-05 0.0361648 205 | vertex 0.0104486 0.00258331 0.000146801 206 | vertex 0.00869277 -0.000132643 0.0501662 207 | endloop 208 | endfacet 209 | 210 | endsolid AssimpScene 211 | -------------------------------------------------------------------------------- /data_collection/code/camera.py: -------------------------------------------------------------------------------- 1 | """ 2 | an RGB-D camera 3 | """ 4 | import numpy as np 5 | from sapien.core import Pose 6 | 7 | 8 | class Camera(object): 9 | 10 | def __init__(self, env, near=0.1, far=100.0, image_size=336, dist=5.0, \ 11 | phi=np.pi/5, theta=np.pi, fov=35, random_position=False, fixed_position=False): 12 | builder = env.scene.create_actor_builder() 13 | camera_mount_actor = builder.build(is_kinematic=True) 14 | self.env = env 15 | 16 | # set camera intrinsics 17 | self.camera = env.scene.add_mounted_camera('camera', camera_mount_actor, Pose(), \ 18 | image_size, image_size, 0, np.deg2rad(fov), near, far) 19 | 20 | # set camera extrinsics 21 | if random_position: 22 | phi = (np.random.random()+1) * np.pi/6 23 | theta = np.random.uniform(low=0.7, high=1.3) * np.pi 24 | dist = 4.5 + np.random.random() 25 | if fixed_position: 26 | theta = np.pi 27 | phi = np.pi/10 28 | pos = np.array([dist*np.cos(phi)*np.cos(theta), \ 29 | dist*np.cos(phi)*np.sin(theta), \ 30 | dist*np.sin(phi)]) 31 | forward = -pos / np.linalg.norm(pos) 32 | left = np.cross([0, 0, 1], forward) 33 | left = left / np.linalg.norm(left) 34 | up = np.cross(forward, left) 35 | mat44 = np.eye(4) 36 | mat44[:3, :3] = np.vstack([forward, left, up]).T 37 | mat44[:3, 3] = pos # mat44 is cam2world 38 | mat44[0, 3] += env.object_position_offset 39 | self.mat44 = mat44 40 | camera_mount_actor.set_pose(Pose.from_transformation_matrix(mat44)) 41 | 42 | # log parameters 43 | self.near = near 44 | self.far = far 45 | self.dist = dist 46 | self.theta = theta 47 | self.phi = phi 48 | self.pos = pos 49 | 50 | def 
get_observation(self): 51 | self.camera.take_picture() 52 | rgba = self.camera.get_color_rgba() 53 | rgba = (rgba * 255).clip(0, 255).astype(np.float32) / 255 54 | white = np.ones((rgba.shape[0], rgba.shape[1], 3), dtype=np.float32) 55 | mask = np.tile(rgba[:, :, 3:4], [1, 1, 3]) 56 | rgb = rgba[:, :, :3] * mask + white * (1 - mask) 57 | depth = self.camera.get_depth().astype(np.float32) 58 | return rgb, depth 59 | 60 | def compute_camera_XYZA(self, depth): 61 | camera_matrix = self.camera.get_camera_matrix()[:3, :3] 62 | y, x = np.where(depth < 1) 63 | z = self.near * self.far / (self.far + depth * (self.near - self.far)) 64 | permutation = np.array([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) 65 | points = (permutation @ np.dot(np.linalg.inv(camera_matrix), \ 66 | np.stack([x, y, np.ones_like(x)] * z[y, x], 0))).T 67 | return y, x, points 68 | 69 | @staticmethod 70 | def compute_XYZA_matrix(id1, id2, pts, size1, size2): 71 | out = np.zeros((size1, size2, 4), dtype=np.float32) 72 | out[id1, id2, :3] = pts 73 | out[id1, id2, 3] = 1 74 | return out 75 | 76 | def get_normal_map(self): 77 | nor = self.camera.get_normal_rgba() 78 | # convert from PartNet-space (x-right, y-up, z-backward) to SAPIEN-space (x-front, y-left, z-up) 79 | new_nor = np.array(nor, dtype=np.float32) 80 | new_nor[:, :, 0] = -nor[:, :, 2] 81 | new_nor[:, :, 1] = -nor[:, :, 0] 82 | new_nor[:, :, 2] = nor[:, :, 1] 83 | return new_nor 84 | 85 | def get_movable_link_mask(self, link_ids): 86 | link_seg = self.camera.get_segmentation() 87 | link_mask = np.zeros((link_seg.shape[0], link_seg.shape[1])).astype(np.uint8) 88 | for idx, lid in enumerate(link_ids): 89 | cur_link_pixels = int(np.sum(link_seg==lid)) 90 | if cur_link_pixels > 0: 91 | link_mask[link_seg == lid] = idx+1 92 | return link_mask 93 | 94 | def get_handle_mask(self): 95 | # read part seg partid2renderids 96 | partid2renderids = dict() 97 | for k in self.env.scene.render_id_to_visual_name: 98 | if self.env.scene.render_id_to_visual_name[k].split('-')[0] == 'handle': 99 | part_id = int(self.env.scene.render_id_to_visual_name[k].split('-')[-1]) 100 | if part_id not in partid2renderids: 101 | partid2renderids[part_id] = [] 102 | partid2renderids[part_id].append(k) 103 | # generate 0/1 handle mask 104 | part_seg = self.camera.get_obj_segmentation() 105 | handle_mask = np.zeros((part_seg.shape[0], part_seg.shape[1])).astype(np.uint8) 106 | for partid in partid2renderids: 107 | cur_part_mask = np.isin(part_seg, partid2renderids[partid]) 108 | cur_part_mask_pixels = int(np.sum(cur_part_mask)) 109 | if cur_part_mask_pixels > 0: 110 | handle_mask[cur_part_mask] = 1 111 | return handle_mask 112 | 113 | def get_object_mask(self): 114 | rgba = self.camera.get_albedo_rgba() 115 | return rgba[:, :, 3] > 0.5 116 | 117 | # return camera parameters 118 | def get_metadata(self): 119 | return { 120 | 'pose': self.camera.get_pose(), 121 | 'near': self.camera.get_near(), 122 | 'far': self.camera.get_far(), 123 | 'width': self.camera.get_width(), 124 | 'height': self.camera.get_height(), 125 | 'fov': self.camera.get_fovy(), 126 | 'camera_matrix': self.camera.get_camera_matrix(), 127 | 'projection_matrix': self.camera.get_projection_matrix(), 128 | 'model_matrix': self.camera.get_model_matrix(), 129 | 'mat44': self.mat44, 130 | } 131 | 132 | # return camera parameters 133 | def get_metadata_json(self): 134 | return { 135 | 'dist': self.dist, 136 | 'theta': self.theta, 137 | 'phi': self.phi, 138 | 'near': self.camera.get_near(), 139 | 'far': self.camera.get_far(), 140 | 'width': 
self.camera.get_width(), 141 | 'height': self.camera.get_height(), 142 | 'fov': self.camera.get_fovy(), 143 | 'camera_matrix': self.camera.get_camera_matrix().tolist(), 144 | 'projection_matrix': self.camera.get_projection_matrix().tolist(), 145 | 'model_matrix': self.camera.get_model_matrix().tolist(), 146 | 'mat44': self.mat44.tolist(), 147 | } 148 | 149 | -------------------------------------------------------------------------------- /test/robots/panda_robot.py: -------------------------------------------------------------------------------- 1 | """ 2 | Franka Panda Robot Arm 3 | support panda.urdf, panda_gripper.urdf 4 | """ 5 | 6 | from __future__ import division 7 | import sapien.core as sapien 8 | from sapien.core import Pose, SceneConfig #,PxrMaterial, 9 | from transforms3d.quaternions import axangle2quat, qmult 10 | import numpy as np 11 | from utils import pose2exp_coordinate, adjoint_matrix 12 | 13 | 14 | class Robot(object): 15 | def __init__(self, env, urdf, material, open_gripper=False): 16 | self.env = env 17 | self.timestep = env.scene.get_timestep() 18 | 19 | # load robot 20 | loader = env.scene.create_urdf_loader() 21 | loader.scale = 1.2 22 | loader.fix_root_link = True 23 | self.robot = loader.load(urdf, {"material": material}) 24 | #self.robot = loader.load(urdf, material) 25 | self.robot.name = "robot" 26 | 27 | # hand (EE), two grippers, the rest arm joints (if any) 28 | self.end_effector_index, self.end_effector = \ 29 | [(i, l) for i, l in enumerate(self.robot.get_links()) if l.name == 'panda_hand'][0] 30 | self.hand_actor_id = self.end_effector.get_id() 31 | self.gripper_joints = [joint for joint in self.robot.get_joints() if 32 | joint.get_name().startswith("panda_finger_joint")] 33 | self.gripper_actor_ids = [joint.get_child_link().get_id() for joint in self.gripper_joints] 34 | self.arm_joints = [joint for joint in self.robot.get_joints() if 35 | joint.get_dof() > 0 and not joint.get_name().startswith("panda_finger")] 36 | 37 | # set drive joint property 38 | for joint in self.arm_joints: 39 | joint.set_drive_property(1000, 400) 40 | for joint in self.gripper_joints: 41 | joint.set_drive_property(200, 60) 42 | 43 | # open/close the gripper at start 44 | if open_gripper: 45 | joint_angles = [] 46 | for j in self.robot.get_joints(): 47 | if j.get_dof() == 1: 48 | if j.get_name().startswith("panda_finger_joint"): 49 | joint_angles.append(0.04) 50 | else: 51 | joint_angles.append(0) 52 | self.robot.set_qpos(joint_angles) 53 | 54 | def compute_joint_velocity_from_twist(self, twist: np.ndarray) -> np.ndarray: 55 | """ 56 | This function is a kinematic-level calculation which do not consider dynamics. 
57 | Pay attention to the frame of twist, is it spatial twist or body twist 58 | 59 | Jacobian is provided for your, so no need to compute the velocity kinematics 60 | ee_jacobian is the geometric Jacobian on account of only the joint of robot arm, not gripper 61 | Jacobian in SAPIEN is defined as the derivative of spatial twist with respect to joint velocity 62 | 63 | Args: 64 | twist: (6,) vector to represent the twist 65 | 66 | Returns: 67 | (7, ) vector for the velocity of arm joints (not include gripper) 68 | 69 | """ 70 | assert twist.size == 6 71 | # Jacobian define in SAPIEN use twist (v, \omega) which is different from the definition in the slides 72 | # So we perform the matrix block operation below 73 | dense_jacobian = self.robot.compute_spatial_twist_jacobian() # (num_link * 6, dof()) 74 | ee_jacobian = np.zeros([6, self.robot.dof - 2]) 75 | ee_jacobian[:3, :] = dense_jacobian[self.end_effector_index * 6 - 3: self.end_effector_index * 6, :self.robot.dof - 2] 76 | ee_jacobian[3:6, :] = dense_jacobian[(self.end_effector_index - 1) * 6: self.end_effector_index * 6 - 3, :self.robot.dof - 2] 77 | 78 | #numerical_small_bool = ee_jacobian < 1e-1 79 | #ee_jacobian[numerical_small_bool] = 0 80 | #inverse_jacobian = np.linalg.pinv(ee_jacobian) 81 | inverse_jacobian = np.linalg.pinv(ee_jacobian, rcond=1e-2) 82 | #inverse_jacobian[np.abs(inverse_jacobian) > 5] = 0 83 | #print(inverse_jacobian) 84 | return inverse_jacobian @ twist 85 | 86 | def internal_controller(self, qvel: np.ndarray) -> None: 87 | """Control the robot dynamically to execute the given twist for one time step 88 | 89 | This method will try to execute the joint velocity using the internal dynamics function in SAPIEN. 90 | 91 | Note that this function is only used for one time step, so you may need to call it multiple times in your code 92 | Also this controller is not perfect, it will still have some small movement even after you have finishing using 93 | it. 
Thus try to wait for some steps using self.wait_n_steps(n) like in the hw2.py after you call it multiple 94 | time to allow it to reach the target position 95 | 96 | Args: 97 | qvel: (7,) vector to represent the joint velocity 98 | 99 | """ 100 | assert qvel.size == len(self.arm_joints) 101 | target_qpos = qvel * self.timestep + self.robot.get_drive_target()[:-2] 102 | for i, joint in enumerate(self.arm_joints): 103 | joint.set_drive_velocity_target(qvel[i]) 104 | joint.set_drive_target(target_qpos[i]) 105 | passive_force = self.robot.compute_passive_force() 106 | self.robot.set_qf(passive_force) 107 | 108 | def calculate_twist(self, time_to_target, target_ee_pose): 109 | relative_transform = self.end_effector.get_pose().inv().to_transformation_matrix() @ target_ee_pose 110 | unit_twist, theta = pose2exp_coordinate(relative_transform) 111 | velocity = theta / time_to_target 112 | body_twist = unit_twist * velocity 113 | current_ee_pose = self.end_effector.get_pose().to_transformation_matrix() 114 | return adjoint_matrix(current_ee_pose) @ body_twist 115 | 116 | def move_to_target_pose(self, target_ee_pose: np.ndarray, num_steps: int) -> None: 117 | """ 118 | Move the robot hand dynamically to a given target pose 119 | Args: 120 | target_ee_pose: (4, 4) transformation of robot hand in robot base frame (ee2base) 121 | num_steps: how much steps to reach to target pose, 122 | each step correspond to self.scene.get_timestep() seconds 123 | in physical simulation 124 | """ 125 | executed_time = num_steps * self.timestep 126 | 127 | spatial_twist = self.calculate_twist(executed_time, target_ee_pose) 128 | for i in range(num_steps): 129 | if i % 100 == 0: 130 | spatial_twist = self.calculate_twist((num_steps - i) * self.timestep, target_ee_pose) 131 | qvel = self.compute_joint_velocity_from_twist(spatial_twist) 132 | self.internal_controller(qvel) 133 | self.env.step() 134 | self.env.render() 135 | return 136 | 137 | def close_gripper(self): 138 | for joint in self.gripper_joints: 139 | joint.set_drive_target(0.0) 140 | 141 | def open_gripper(self): 142 | for joint in self.gripper_joints: 143 | joint.set_drive_target(0.04) 144 | 145 | def clear_velocity_command(self): 146 | for joint in self.arm_joints: 147 | joint.set_drive_velocity_target(0) 148 | 149 | def wait_n_steps(self, n: int): 150 | self.clear_velocity_command() 151 | for i in range(n): 152 | passive_force = self.robot.compute_passive_force() 153 | self.robot.set_qf(passive_force) 154 | self.env.step() 155 | self.env.render() 156 | self.robot.set_qf([0] * self.robot.dof) 157 | 158 | -------------------------------------------------------------------------------- /data_collection/code/robots/panda_robot.py: -------------------------------------------------------------------------------- 1 | """ 2 | Franka Panda Robot Arm 3 | support panda.urdf, panda_gripper.urdf 4 | """ 5 | 6 | from __future__ import division 7 | import sapien.core as sapien 8 | from sapien.core import Pose, SceneConfig #,PxrMaterial, 9 | from transforms3d.quaternions import axangle2quat, qmult 10 | import numpy as np 11 | from utils import pose2exp_coordinate, adjoint_matrix 12 | 13 | 14 | class Robot(object): 15 | def __init__(self, env, urdf, material, open_gripper=False): 16 | self.env = env 17 | self.timestep = env.scene.get_timestep() 18 | 19 | # load robot 20 | loader = env.scene.create_urdf_loader() 21 | loader.scale = 1.3 22 | loader.fix_root_link = True 23 | self.robot = loader.load(urdf, {"material": material}) 24 | #self.robot = loader.load(urdf, material) 25 
| self.robot.name = "robot" 26 | 27 | # hand (EE), two grippers, the rest arm joints (if any) 28 | self.end_effector_index, self.end_effector = \ 29 | [(i, l) for i, l in enumerate(self.robot.get_links()) if l.name == 'panda_hand'][0] 30 | self.hand_actor_id = self.end_effector.get_id() 31 | self.gripper_joints = [joint for joint in self.robot.get_joints() if 32 | joint.get_name().startswith("panda_finger_joint")] 33 | self.gripper_actor_ids = [joint.get_child_link().get_id() for joint in self.gripper_joints] 34 | self.arm_joints = [joint for joint in self.robot.get_joints() if 35 | joint.get_dof() > 0 and not joint.get_name().startswith("panda_finger")] 36 | 37 | # set drive joint property 38 | for joint in self.arm_joints: 39 | joint.set_drive_property(1000, 400) 40 | for joint in self.gripper_joints: 41 | joint.set_drive_property(200, 60) 42 | 43 | # open/close the gripper at start 44 | if open_gripper: 45 | joint_angles = [] 46 | for j in self.robot.get_joints(): 47 | if j.get_dof() == 1: 48 | if j.get_name().startswith("panda_finger_joint"): 49 | joint_angles.append(0.04) 50 | else: 51 | joint_angles.append(0) 52 | self.robot.set_qpos(joint_angles) 53 | 54 | def compute_joint_velocity_from_twist(self, twist: np.ndarray) -> np.ndarray: 55 | """ 56 | This function is a kinematic-level calculation which do not consider dynamics. 57 | Pay attention to the frame of twist, is it spatial twist or body twist 58 | 59 | Jacobian is provided for your, so no need to compute the velocity kinematics 60 | ee_jacobian is the geometric Jacobian on account of only the joint of robot arm, not gripper 61 | Jacobian in SAPIEN is defined as the derivative of spatial twist with respect to joint velocity 62 | 63 | Args: 64 | twist: (6,) vector to represent the twist 65 | 66 | Returns: 67 | (7, ) vector for the velocity of arm joints (not include gripper) 68 | 69 | """ 70 | assert twist.size == 6 71 | # Jacobian define in SAPIEN use twist (v, \omega) which is different from the definition in the slides 72 | # So we perform the matrix block operation below 73 | dense_jacobian = self.robot.compute_spatial_twist_jacobian() # (num_link * 6, dof()) 74 | ee_jacobian = np.zeros([6, self.robot.dof - 2]) 75 | ee_jacobian[:3, :] = dense_jacobian[self.end_effector_index * 6 - 3: self.end_effector_index * 6, :self.robot.dof - 2] 76 | ee_jacobian[3:6, :] = dense_jacobian[(self.end_effector_index - 1) * 6: self.end_effector_index * 6 - 3, :self.robot.dof - 2] 77 | 78 | #numerical_small_bool = ee_jacobian < 1e-1 79 | #ee_jacobian[numerical_small_bool] = 0 80 | #inverse_jacobian = np.linalg.pinv(ee_jacobian) 81 | inverse_jacobian = np.linalg.pinv(ee_jacobian, rcond=1e-2) 82 | #inverse_jacobian[np.abs(inverse_jacobian) > 5] = 0 83 | #print(inverse_jacobian) 84 | return inverse_jacobian @ twist 85 | 86 | def internal_controller(self, qvel: np.ndarray) -> None: 87 | """Control the robot dynamically to execute the given twist for one time step 88 | 89 | This method will try to execute the joint velocity using the internal dynamics function in SAPIEN. 90 | 91 | Note that this function is only used for one time step, so you may need to call it multiple times in your code 92 | Also this controller is not perfect, it will still have some small movement even after you have finishing using 93 | it. 
Thus try to wait for some steps using self.wait_n_steps(n) like in the hw2.py after you call it multiple 94 | time to allow it to reach the target position 95 | 96 | Args: 97 | qvel: (7,) vector to represent the joint velocity 98 | 99 | """ 100 | assert qvel.size == len(self.arm_joints) 101 | target_qpos = qvel * self.timestep + self.robot.get_drive_target()[:-2] 102 | for i, joint in enumerate(self.arm_joints): 103 | joint.set_drive_velocity_target(qvel[i]) 104 | joint.set_drive_target(target_qpos[i]) 105 | passive_force = self.robot.compute_passive_force() 106 | self.robot.set_qf(passive_force) 107 | 108 | def calculate_twist(self, time_to_target, target_ee_pose): 109 | relative_transform = self.end_effector.get_pose().inv().to_transformation_matrix() @ target_ee_pose 110 | unit_twist, theta = pose2exp_coordinate(relative_transform) 111 | velocity = theta / time_to_target 112 | body_twist = unit_twist * velocity 113 | current_ee_pose = self.end_effector.get_pose().to_transformation_matrix() 114 | return adjoint_matrix(current_ee_pose) @ body_twist 115 | 116 | def move_to_target_pose(self, target_ee_pose: np.ndarray, num_steps: int) -> None: 117 | """ 118 | Move the robot hand dynamically to a given target pose 119 | Args: 120 | target_ee_pose: (4, 4) transformation of robot hand in robot base frame (ee2base) 121 | num_steps: how much steps to reach to target pose, 122 | each step correspond to self.scene.get_timestep() seconds 123 | in physical simulation 124 | """ 125 | executed_time = num_steps * self.timestep 126 | 127 | spatial_twist = self.calculate_twist(executed_time, target_ee_pose) 128 | for i in range(num_steps): 129 | if i % 100 == 0: 130 | spatial_twist = self.calculate_twist((num_steps - i) * self.timestep, target_ee_pose) 131 | qvel = self.compute_joint_velocity_from_twist(spatial_twist) 132 | self.internal_controller(qvel) 133 | self.env.step() 134 | self.env.render() 135 | return 136 | 137 | def close_gripper(self): 138 | for joint in self.gripper_joints: 139 | joint.set_drive_target(0.0) 140 | 141 | def open_gripper(self): 142 | for joint in self.gripper_joints: 143 | joint.set_drive_target(0.04) 144 | 145 | def clear_velocity_command(self): 146 | for joint in self.arm_joints: 147 | joint.set_drive_velocity_target(0) 148 | 149 | def wait_n_steps(self, n: int): 150 | self.clear_velocity_command() 151 | for i in range(n): 152 | passive_force = self.robot.compute_passive_force() 153 | self.robot.set_qf(passive_force) 154 | self.env.step() 155 | self.env.render() 156 | self.robot.set_qf([0] * self.robot.dof) 157 | 158 | -------------------------------------------------------------------------------- /test/camera.py: -------------------------------------------------------------------------------- 1 | """ 2 | an RGB-D camera 3 | """ 4 | import numpy as np 5 | from sapien.core import Pose 6 | import utils 7 | # import wandb 8 | # wandb.init(project="multi-view-0110") 9 | class Camera(object): 10 | 11 | def __init__(self, env, near=0.1, far=100.0, image_size=336, dist=5.0, \ 12 | phi=np.pi/5, theta=np.pi, fov=35, random_position=False, fixed_position=False): 13 | builder = env.scene.create_actor_builder() 14 | self.camera_mount_actor = builder.build(is_kinematic=True) 15 | self.env = env 16 | 17 | # set camera intrinsics 18 | self.camera = env.scene.add_mounted_camera('camera', self.camera_mount_actor, Pose(), \ 19 | image_size, image_size, 0, np.deg2rad(fov), near, far) 20 | 21 | # set camera extrinsics 22 | # if random_position: 23 | 24 | 25 | # theta = 
np.random.uniform(low=0.9, high=1.1) * np.pi 26 | # phi = phi 27 | # # dist = 4.8 + np.random.random() 28 | # pos = np.array([dist*np.cos(phi)*np.cos(theta), \ 29 | # dist*np.cos(phi)*np.sin(theta), \ 30 | # dist*np.sin(phi)]) 31 | # # print(pos) 32 | 33 | # b = pos[1] + np.random.random()*0.6 - 0.3 34 | 35 | # pos[1] = b 36 | # c = pos[2] + np.random.random()*0.4 - 0.2 37 | 38 | # pos[2] = c 39 | # # print(pos) 40 | # else: 41 | 42 | # #theta = -np.pi/10 43 | # #theta = -np.pi/8 44 | # theta = theta 45 | # phi = phi 46 | 47 | # pos = np.array([dist*np.cos(phi)*np.cos(theta), \ 48 | # dist*np.cos(phi)*np.sin(theta), \ 49 | # dist*np.sin(phi)]) 50 | if random_position: 51 | # theta = np.random.random() * np.pi*2 52 | # theta = np.random.uniform(low=0.9, high=1.1) * np.pi 53 | # phi = (np.random.random()+1) * np.pi/6 54 | theta = np.random.uniform(low=0.9, high=1.1) * np.pi 55 | phi = phi 56 | if fixed_position: 57 | #theta = -np.pi/10 58 | #theta = -np.pi/8 59 | theta = np.pi 60 | phi = np.pi/10 61 | pos = np.array([dist*np.cos(phi)*np.cos(theta), \ 62 | dist*np.cos(phi)*np.sin(theta), \ 63 | dist*np.sin(phi)]) 64 | # print(print('1',pos)pos) 65 | # print('2',pos) 66 | # pos = np.array([-3.54468498,-0.36440411,3.52577091]) 67 | 68 | # assert(0) 69 | forward = -pos / np.linalg.norm(pos) 70 | left = np.cross([0, 0, 1], forward) 71 | left = left / np.linalg.norm(left) 72 | up = np.cross(forward, left) 73 | mat44 = np.eye(4) 74 | mat44[:3, :3] = np.vstack([forward, left, up]).T 75 | mat44[:3, 3] = pos # mat44 is cam2world 76 | mat44[0, 3] += env.object_position_offset 77 | self.mat44 = mat44 78 | self.camera_mount_actor.set_pose(Pose.from_transformation_matrix(mat44)) 79 | 80 | # log parameters 81 | self.near = near 82 | self.far = far 83 | self.dist = dist 84 | self.theta = theta 85 | self.phi = phi 86 | self.pos = pos 87 | 88 | def change_pose_by_mat(self, mat44): 89 | self.mat44 = mat44 90 | self.camera_mount_actor.set_pose(Pose.from_transformation_matrix(mat44)) 91 | self.pos = mat44[:3, 3] 92 | self.dist = None 93 | self.theta = None 94 | self.phi = None 95 | 96 | def get_observation(self): 97 | self.camera.take_picture() 98 | rgba = self.camera.get_color_rgba() 99 | rgba = (rgba * 255).clip(0, 255).astype(np.float32) / 255 100 | white = np.ones((rgba.shape[0], rgba.shape[1], 3), dtype=np.float32) 101 | mask = np.tile(rgba[:, :, 3:4], [1, 1, 3]) 102 | rgb = rgba[:, :, :3] * mask + white * (1 - mask) 103 | depth = self.camera.get_depth().astype(np.float32) 104 | # depth = self.camera.get_depth() 105 | return rgb, depth 106 | 107 | def compute_camera_XYZA(self, depth): 108 | camera_matrix = self.camera.get_camera_matrix()[:3, :3] 109 | y, x = np.where(depth < 1) 110 | z = self.near * self.far / (self.far + depth * (self.near - self.far)) 111 | # z = depth 112 | permutation = np.array([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) 113 | points = (permutation @ np.dot(np.linalg.inv(camera_matrix), \ 114 | np.stack([x, y, np.ones_like(x)] * z[y, x], 0))).T 115 | # wandb.log({"point_cloud":wandb.Object3D(points)}) 116 | 117 | out = np.zeros((996, 996, 4), dtype=np.float32) 118 | out[y, x, :3] = points 119 | out[y, x, 3] = 1 120 | return y, x, points, out 121 | 122 | @staticmethod 123 | def compute_XYZA_matrix(id1, id2, pts, size1, size2): 124 | out = np.zeros((size1, size2, 4), dtype=np.float32) 125 | out[id1, id2, :3] = pts 126 | out[id1, id2, 3] = 1 127 | return out 128 | 129 | def get_normal_map(self): 130 | nor = self.camera.get_normal_rgba() 131 | # convert from PartNet-space (x-right, y-up, 
z-backward) to SAPIEN-space (x-front, y-left, z-up) 132 | new_nor = np.array(nor, dtype=np.float32) 133 | new_nor[:, :, 0] = -nor[:, :, 2] 134 | new_nor[:, :, 1] = -nor[:, :, 0] 135 | new_nor[:, :, 2] = nor[:, :, 1] 136 | return new_nor 137 | 138 | def get_movable_link_mask(self, link_ids): 139 | link_seg = self.camera.get_segmentation() 140 | link_mask = np.zeros((link_seg.shape[0], link_seg.shape[1])).astype(np.uint8) 141 | for idx, lid in enumerate(link_ids): 142 | cur_link_pixels = int(np.sum(link_seg==lid)) 143 | if cur_link_pixels > 0: 144 | 145 | link_mask[link_seg == lid] = idx+1 146 | return link_mask 147 | 148 | def get_handle_mask(self): 149 | # read part seg partid2renderids 150 | partid2renderids = dict() 151 | for k in self.env.scene.render_id_to_visual_name: 152 | if self.env.scene.render_id_to_visual_name[k].split('-')[0] == 'handle': 153 | part_id = int(self.env.scene.render_id_to_visual_name[k].split('-')[-1]) 154 | if part_id not in partid2renderids: 155 | partid2renderids[part_id] = [] 156 | partid2renderids[part_id].append(k) 157 | # generate 0/1 handle mask 158 | part_seg = self.camera.get_obj_segmentation() 159 | handle_mask = np.zeros((part_seg.shape[0], part_seg.shape[1])).astype(np.uint8) 160 | for partid in partid2renderids: 161 | cur_part_mask = np.isin(part_seg, partid2renderids[partid]) 162 | cur_part_mask_pixels = int(np.sum(cur_part_mask)) 163 | if cur_part_mask_pixels > 0: 164 | handle_mask[cur_part_mask] = 1 165 | return handle_mask 166 | 167 | def get_object_mask(self): 168 | rgba = self.camera.get_albedo_rgba() 169 | return rgba[:, :, 3] > 0.5 170 | 171 | # return camera parameters 172 | def get_metadata(self): 173 | return { 174 | 'pose': self.camera.get_pose(), 175 | 'near': self.camera.get_near(), 176 | 'far': self.camera.get_far(), 177 | 'width': self.camera.get_width(), 178 | 'height': self.camera.get_height(), 179 | 'fov': self.camera.get_fovy(), 180 | 'camera_matrix': self.camera.get_camera_matrix(), 181 | 'projection_matrix': self.camera.get_projection_matrix(), 182 | 'model_matrix': self.camera.get_model_matrix(), 183 | 'mat44': self.mat44, 184 | } 185 | 186 | # return camera parameters 187 | def get_metadata_json(self): 188 | return { 189 | 'dist': self.dist, 190 | 'theta': self.theta, 191 | 'phi': self.phi, 192 | 'near': self.camera.get_near(), 193 | 'far': self.camera.get_far(), 194 | 'width': self.camera.get_width(), 195 | 'height': self.camera.get_height(), 196 | 'fov': self.camera.get_fovy(), 197 | 'camera_matrix': self.camera.get_camera_matrix().tolist(), 198 | 'projection_matrix': self.camera.get_projection_matrix().tolist(), 199 | 'model_matrix': self.camera.get_model_matrix().tolist(), 200 | 'mat44': self.mat44.tolist(), 201 | } 202 | 203 | -------------------------------------------------------------------------------- /train/data/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import yaml 3 | from torch.utils.data import Dataset 4 | from PIL import Image 5 | import json 6 | import llama.utils 7 | from llama import Tokenizer 8 | import copy 9 | import torchvision.transforms as transforms 10 | import pandas as pd 11 | import random 12 | from random import randrange 13 | import os 14 | import numpy as np 15 | 16 | try: 17 | from torchvision.transforms import InterpolationMode 18 | BICUBIC = InterpolationMode.BICUBIC 19 | except ImportError: 20 | BICUBIC = Image.BICUBIC 21 | 22 | 23 | # create data 24 | transform_train = transforms.Compose([ 25 | 
transforms.RandomResizedCrop(size=(336, 336), scale=(0.9, 1.0), ratio=(0.75, 1.3333), interpolation=BICUBIC), # 3 is bicubic 26 | transforms.ToTensor(), 27 | transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])]) 28 | 29 | class FinetuneDataset(Dataset): 30 | def __init__(self, config_path, args, max_words=30, tokenizer_path=None): 31 | print(f"read dataset config from {config_path}") 32 | 33 | self.mlm = args.mlm 34 | self.bins = args.bins 35 | self.config = config_path 36 | self.aff_prior = args.aff_prior 37 | 38 | ann = [] 39 | for meta_name in os.listdir(self.config): 40 | 41 | meta_path = os.path.join(self.config, meta_name) 42 | 43 | ann.append(meta_path) 44 | with open(meta_path, 'r') as f: 45 | meta_data = json.load(f) 46 | 47 | self.ann = ann 48 | print(f"total length: {len(self)}") 49 | 50 | self.transform = transform_train 51 | self.max_words = max_words 52 | self.tokenizer = Tokenizer(model_path=tokenizer_path) 53 | 54 | 55 | 56 | def __len__(self): 57 | return len(self.ann) 58 | 59 | def __getitem__(self, index): 60 | 61 | with open(self.ann[index], 'r') as f: 62 | data_item = json.load(f) 63 | filename = data_item['input'] 64 | answer = data_item['conversations'][1]['gt']#value 65 | start_pixel = 0 66 | loc_tokens = [] 67 | 68 | if self.bins == 'True' and self.mlm == 'True' and self.aff_prior: 69 | words = answer.split(' ') 70 | for idx, word in enumerate(words): 71 | if '.' in word: 72 | if '[' in word: 73 | # print(word[1:-2]) 74 | words[idx] = '['+str(int(float(word[1:-2])//0.02)) + ',' 75 | elif ']' in word: 76 | words[idx] = str(int(float(word[:-2])//0.02)) + ']' 77 | else: 78 | words[idx] = str(int(float(word[:-2])//0.02)) + ',' 79 | loc_tokens.append(idx) 80 | elif '(' in word: 81 | loc_tokens.append(idx) 82 | words[idx] = '('+str(int(word[1:-1])-start_pixel)+ ',' 83 | elif ')' in word: 84 | loc_tokens.append(idx) 85 | words[idx] = str(int(word[:-2])-start_pixel)+ '),' 86 | answer = ' '.join([str(elem) for elem in words]) 87 | 88 | i = random.randint(0, 3) 89 | 90 | #mlm and aff 91 | if i % 4 == 0: 92 | #finetune 93 | question = data_item['conversations'][0]['prompt'] 94 | answer = answer 95 | elif i % 4 == 1: 96 | #mlm 97 | question_ori = answer.split(' ') 98 | i = random.sample(range(0, len(question_ori)-1), int(len(question_ori)*0.15)) 99 | mask_loc = [loc_tokens[random.randint(0, len(loc_tokens)-1)],loc_tokens[random.randint(0, len(loc_tokens)-1)],loc_tokens[random.randint(0, len(loc_tokens)-1)]] 100 | question_mask = [word if idx not in mask_loc else "" for idx, word in enumerate(question_ori)] 101 | question = ' '.join([str(elem) for elem in question_mask]) 102 | answer = answer 103 | elif i % 4 == 2: 104 | #affordance 105 | question = data_item['aff_question'] 106 | answer = data_item['aff_gt'] 107 | elif i % 4 == 3: 108 | #cat 109 | question = data_item['conversations'][0]['prompt'] 110 | answer = answer 111 | # question = data_item['cat_prompt'] 112 | # answer = data_item['cat_ans'] 113 | 114 | image = Image.fromarray(np.array(Image.open(filename).convert('RGB'))[start_pixel:start_pixel+336,start_pixel:start_pixel+336,:]) 115 | 116 | image = self.transform(image) 117 | format_instruction = question 118 | format_input = None 119 | 120 | input1 = llama.utils.format_prompt(format_instruction, format_input) 121 | input2 = input1 + answer 122 | 123 | input1 = torch.tensor(self.tokenizer.encode(input1, bos=True, eos=False), dtype=torch.int64) 124 | input2 = torch.tensor(self.tokenizer.encode(input2, bos=True, 
eos=True), dtype=torch.int64) 125 | padding = self.max_words - input2.shape[0] 126 | if padding > 0: 127 | input2 = torch.cat((input2, torch.zeros(padding, dtype=torch.int64) - 1)) 128 | elif padding < 0: 129 | input2 = input2[:self.max_words] 130 | labels = copy.deepcopy(input2) 131 | labels[:len(input1)] = -1 132 | input2_mask = input2.ge(0) 133 | label_mask = labels.ge(0) 134 | input2[~input2_mask] = 0 135 | labels[~label_mask] = 0 136 | input2_mask = input2_mask.float() 137 | label_mask = label_mask.float() 138 | 139 | return input2, labels, input2_mask, image 140 | 141 | 142 | 143 | 144 | 145 | class PretrainDataset(Dataset): 146 | def __init__(self, config_path, transform, max_words=30, tokenizer_path=None): 147 | print(f"read dataset config from {config_path}") 148 | with open(config_path, 'r') as f: 149 | self.config = yaml.load(f, Loader=yaml.FullLoader) 150 | print("DATASET CONFIG:") 151 | print(self.config) 152 | images, captions = [], [] 153 | for meta_path in self.config['META']: 154 | images_this_meta, captions_this_meta = [], [] 155 | for chunk in pd.read_csv(meta_path, sep='\t', lineterminator='\n', chunksize=10 ** 6): 156 | images_this_meta.extend(chunk['url'].tolist()) 157 | captions_this_meta.extend(chunk['caption'].tolist()) 158 | print(f"{meta_path}: len {len(images_this_meta)}") 159 | images.extend(images_this_meta) 160 | captions.extend(captions_this_meta) 161 | 162 | self.data_list = [] 163 | for x, y in zip(images, captions): 164 | self.data_list.append({'url': x, 'caption': y}) 165 | print(f"total length: {len(self)}") 166 | self.transform = transform 167 | self.max_words = max_words 168 | self.tokenizer = Tokenizer(model_path=tokenizer_path) 169 | 170 | def __len__(self): 171 | return len(self.data_list) 172 | 173 | def __getitem__(self, index): 174 | sample = self.data_list[index] 175 | image_path, caption = sample['url'], sample['caption'] 176 | if isinstance(caption, list): 177 | caption = random.choice(caption) 178 | caption = str(caption) 179 | 180 | image = Image.open(image_path).convert('RGB') 181 | image = self.transform(image) 182 | 183 | format_instruction = "Generate caption of this image" 184 | input1 = llama.utils.format_prompt(format_instruction, None) 185 | input2 = input1 + caption 186 | 187 | input1 = torch.tensor(self.tokenizer.encode(input1, bos=True, eos=False), dtype=torch.int64) 188 | input2 = torch.tensor(self.tokenizer.encode(input2, bos=True, eos=True), dtype=torch.int64) 189 | padding = self.max_words - input2.shape[0] 190 | if padding > 0: 191 | input2 = torch.cat((input2, torch.zeros(padding, dtype=torch.int64) - 1)) 192 | elif padding < 0: 193 | input2 = input2[:self.max_words] 194 | labels = copy.deepcopy(input2) 195 | labels[:len(input1)] = -1 196 | input2_mask = input2.ge(0) 197 | label_mask = labels.ge(0) 198 | input2[~input2_mask] = 0 199 | labels[~label_mask] = 0 200 | input2_mask = input2_mask.float() 201 | label_mask = label_mask.float() 202 | return input2, labels, input2_mask, image -------------------------------------------------------------------------------- /data_collection/code/robots/panda.urdf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 
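Note (added sketch, not a file from the repository): a minimal, hedged example of constructing the FinetuneDataset defined above in train/data/dataset.py and wrapping it in a PyTorch DataLoader, roughly mirroring what main_finetune.py below does. The paths and the argparse-style namespace are hypothetical placeholders; it assumes the per-sample JSON metas and the LLaMA tokenizer model are already in place.

import argparse
import torch
from data.dataset import FinetuneDataset

args = argparse.Namespace(mlm='True', bins='True', aff_prior=True)    # flags read by FinetuneDataset
dataset = FinetuneDataset(
    config_path='./data/train_json',                                  # folder of per-sample JSON files (hypothetical path)
    args=args,
    max_words=512,
    tokenizer_path='./ckpts/llama_model_weights/tokenizer.model')     # hypothetical checkpoint path

loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True, drop_last=True)
tokens, labels, token_mask, image = next(iter(loader))                # tensors ready for LLaMA_adapter
print(tokens.shape, image.shape)                                      # e.g. (4, 512) and (4, 3, 336, 336)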
| 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | -------------------------------------------------------------------------------- /train/main_finetune.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.backends.cudnn as cudnn 3 | from torch.utils.tensorboard import SummaryWriter 4 | from torch.utils.data import Dataset 5 | import torch.nn as nn 6 | import util.misc as misc 7 | from util.misc import NativeScalerWithGradNormCount as NativeScaler 8 | from llama.llama_adapter import LLaMA_adapter 9 | 10 | from data.dataset import FinetuneDataset, transform_train 11 | 12 | import argparse 13 | import datetime 14 | import json 15 | import numpy as np 16 | import os 17 | import time 18 | from pathlib import Path 19 | from engine_finetune import train_one_epoch 20 | 21 | #torch.cuda.set_device(4) 22 | def get_args_parser(): 23 | parser = argparse.ArgumentParser('imagebind-llm pre-training', add_help=False) 24 | parser.add_argument('--batch_size', default=32, type=int, 25 | help='Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus') 26 | parser.add_argument('--epochs', default=4, type=int) 27 | parser.add_argument('--accum_iter', default=1, type=int, 28 | help='Accumulate gradient iterations (for increasing the effective batch size under memory constraints)') 29 | 30 | # Model parameters 31 | parser.add_argument('--llama_type', default='7B', type=str, 32 | help='Type of LLaMA model') # 33 | parser.add_argument('--llama_path', default='./ckpts/llama_model_weights', type=str, 34 | help='path to LLaMA pretrained checkpoint') 35 | parser.add_argument('--pretrained_path', default='./ckpts/BIAS_LORA_NORM-336-Chinese-7B.pth ', type=str, 36 | help='path to checkpoint from pretrain stage') 37 | parser.add_argument('--max_words', default=512, type=int, 38 | help='max number of input words') 39 | 40 | # Optimizer parameters 41 | parser.add_argument('--weight_decay', type=float, default=0.05, 42 | help='weight decay (default: 0.05)') 43 | 44 | parser.add_argument('--lr', type=float, default=None, metavar='LR', 45 | help='learning rate (absolute lr)') 46 | parser.add_argument('--blr', type=float, default=1e-3, metavar='LR', 47 | help='base learning rate: absolute_lr = base_lr * total_batch_size / 256') 48 | parser.add_argument('--min_lr', type=float, default=0., metavar='LR', 49 | help='lower lr bound for cyclic schedulers that hit 0') 50 | 51 | parser.add_argument('--warmup_epochs', type=int, default=40, metavar='N', 52 | help='epochs to warmup LR') 53 | 54 | # Dataset parameters 55 | parser.add_argument('--data_config', default='./data/train_json', type=str, 56 | help='dataset config 
path') 57 | parser.add_argument('--num_workers', default=16, type=int) 58 | parser.add_argument('--pin_mem', action='store_true', 59 | help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.') 60 | parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem') 61 | parser.set_defaults(pin_mem=True) 62 | parser.add_argument('--mlm', default='False', type=str, help='if use mask language model') 63 | parser.add_argument('--bins', default='False', type=str, help='if use bin in orientation') 64 | parser.add_argument('--aff_prior', action='store_true', help='if learn from affordance') 65 | 66 | 67 | parser.add_argument('--output_dir', default='./exp/train_model', 68 | help='path where to save, empty for no saving') 69 | parser.add_argument('--log_dir', default='./output', 70 | help='path where to tensorboard log') 71 | parser.add_argument('--device', default='cuda', 72 | help='device to use for training / testing') 73 | parser.add_argument('--seed', default=0, type=int) 74 | 75 | 76 | parser.add_argument('--start_epoch', default=0, type=int, metavar='N', 77 | help='start epoch') 78 | 79 | # distributed training parameters 80 | parser.add_argument('--world_size', default=1, type=int, 81 | help='number of distributed processes') 82 | parser.add_argument('--local_rank', default=-1, type=int) 83 | parser.add_argument('--dist_on_itp', action='store_true') 84 | parser.add_argument('--dist_url', default='env://', 85 | help='url used to set up distributed training') 86 | 87 | return parser 88 | 89 | 90 | def main(args): 91 | 92 | print('job dir: {}'.format(os.path.dirname(os.path.realpath(__file__)))) 93 | print("{}".format(args).replace(', ', ',\n')) 94 | 95 | device = torch.device(args.device) 96 | 97 | # fix the seed for reproducibility 98 | seed = args.seed + misc.get_rank() 99 | torch.manual_seed(seed) 100 | np.random.seed(seed) 101 | cudnn.benchmark = True 102 | 103 | # define the model 104 | llama_type = args.llama_type 105 | llama_ckpt_dir = os.path.join(args.llama_path) 106 | llama_tokenzier_path = os.path.join(args.llama_path, 'tokenizer.model') 107 | 108 | model = LLaMA_adapter(llama_ckpt_dir, llama_tokenzier_path) 109 | 110 | 111 | model.to(device) 112 | 113 | model_without_ddp = model 114 | 115 | print("Trainable Params:") 116 | print([(key, val.shape) for key, val in model.named_parameters() if val.requires_grad]) 117 | 118 | 119 | 120 | # training detail 121 | eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size() 122 | 123 | if args.lr is None: # only base_lr is specified 124 | args.lr = args.blr * eff_batch_size / 256 125 | 126 | print("base lr: %.2e" % (args.lr * 256 / eff_batch_size)) 127 | print("actual lr: %.2e" % args.lr) 128 | 129 | print("accumulate grad iterations: %d" % args.accum_iter) 130 | print("effective batch size: %d" % eff_batch_size) 131 | 132 | # following timm: set wd as 0 for bias and norm layers 133 | param_groups = misc.add_weight_decay(model_without_ddp, args.weight_decay) 134 | optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95)) 135 | 136 | loss_scaler = NativeScaler() 137 | 138 | if args.pretrained_path != 'none': 139 | misc.load_model(model_without_ddp, args.pretrained_path) 140 | # print(args.mlm) 141 | dataset_train = FinetuneDataset(args.data_config, args, 142 | max_words=args.max_words, tokenizer_path=llama_tokenzier_path) 143 | 144 | num_tasks = misc.get_world_size() 145 | global_rank = misc.get_rank() 146 | sampler_train = torch.utils.data.DistributedSampler( 147 | 
dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True 148 | ) 149 | print("Sampler_train = %s" % str(sampler_train)) 150 | 151 | data_loader_train = torch.utils.data.DataLoader( 152 | dataset_train, sampler=sampler_train, 153 | batch_size=args.batch_size, 154 | num_workers=args.num_workers, 155 | pin_memory=args.pin_mem, 156 | drop_last=True, 157 | ) 158 | 159 | # SummaryWrite 160 | if global_rank == 0 and args.log_dir is not None: 161 | os.makedirs(args.log_dir, exist_ok=True) 162 | log_writer = SummaryWriter(log_dir=args.log_dir) 163 | else: 164 | log_writer = None 165 | 166 | 167 | print(f"Start training for {args.epochs} epochs") 168 | start_time = time.time() 169 | for epoch in range(args.start_epoch, args.epochs): 170 | # if args.distributed:#分布式训练 171 | # data_loader_train.sampler.set_epoch(epoch) 172 | 173 | train_stats = train_one_epoch( 174 | model, data_loader_train, 175 | optimizer, device, epoch, loss_scaler, 176 | log_writer=log_writer, 177 | args=args 178 | ) 179 | 180 | if args.output_dir and (epoch + 1 == args.epochs): 181 | misc.save_model( 182 | args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer, 183 | loss_scaler=loss_scaler, epoch=epoch) 184 | 185 | log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, 186 | 'epoch': epoch} 187 | 188 | if args.output_dir and misc.is_main_process(): 189 | if log_writer is not None: 190 | log_writer.flush() 191 | with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f: 192 | f.write(json.dumps(log_stats) + "\n") 193 | 194 | total_time = time.time() - start_time 195 | total_time_str = str(datetime.timedelta(seconds=int(total_time))) 196 | print('Training time {}'.format(total_time_str)) 197 | print('Training over!!!') 198 | 199 | if __name__ == '__main__': 200 | args = get_args_parser() 201 | args = args.parse_args() 202 | if args.output_dir: 203 | Path(args.output_dir).mkdir(parents=True, exist_ok=True) 204 | main(args) 205 | -------------------------------------------------------------------------------- /test/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import h5py 4 | import torch 5 | import numpy as np 6 | import importlib 7 | import random 8 | import shutil 9 | from PIL import Image 10 | BASE_DIR = os.path.dirname(os.path.abspath(__file__)) 11 | sys.path.append(os.path.join(BASE_DIR, '../utils')) 12 | from colors import colors 13 | colors = np.array(colors, dtype=np.float32) 14 | import matplotlib.pylab as plt 15 | from mpl_toolkits.mplot3d import Axes3D 16 | from subprocess import call 17 | 18 | 19 | def force_mkdir(folder): 20 | if os.path.exists(folder): 21 | shutil.rmtree(folder) 22 | os.mkdir(folder) 23 | 24 | def printout(flog, strout): 25 | print(strout) 26 | if flog is not None: 27 | flog.write(strout + '\n') 28 | 29 | def optimizer_to_device(optimizer, device): 30 | for state in optimizer.state.values(): 31 | for k, v in state.items(): 32 | if torch.is_tensor(v): 33 | state[k] = v.to(device) 34 | 35 | def get_model_module(model_version): 36 | importlib.invalidate_caches() 37 | return importlib.import_module('models.' + model_version) 38 | 39 | def collate_feats(b): 40 | return list(zip(*b)) 41 | 42 | def collate_feats_pass(b): 43 | return b 44 | 45 | def collate_feats_with_none(b): 46 | b = filter (lambda x:x is not None, b) 47 | return list(zip(*b)) 48 | 49 | def worker_init_fn(worker_id): 50 | """ The function is designed for pytorch multi-process dataloader. 
51 | Note that we use the pytorch random generator to generate a base_seed. 52 | Please try to be consistent. 53 | References: 54 | https://pytorch.org/docs/stable/notes/faq.html#dataloader-workers-random-seed 55 | """ 56 | base_seed = torch.IntTensor(1).random_().item() 57 | #print(worker_id, base_seed) 58 | np.random.seed(base_seed + worker_id) 59 | 60 | def viz_mask(ids): 61 | return colors[ids] 62 | 63 | def draw_dot(img, xy): 64 | out = np.array(img, dtype=np.uint8) 65 | x, y = xy[0], xy[1] 66 | neighbors = np.array([[0, 0, 0, 1, 1, 1, -1, -1, 1], \ 67 | [0, 1, -1, 0, 1, -1, 0, 1, -1]], dtype=np.int32) 68 | for i in range(neighbors.shape[1]): 69 | nx = x + neighbors[0, i] 70 | ny = y + neighbors[1, i] 71 | if nx >= 0 and nx < img.shape[0] and ny >= 0 and ny < img.shape[1]: 72 | out[nx, ny, 0] = 0 73 | out[nx, ny, 1] = 0 74 | out[nx, ny, 2] = 255 75 | 76 | return out 77 | 78 | def print_true_false(d): 79 | d = int(d) 80 | if d > 0.5: 81 | return 'True' 82 | return 'False' 83 | 84 | def img_resize(data): 85 | data = np.array(data, dtype=np.float32) 86 | mini, maxi = np.min(data), np.max(data) 87 | data -= mini 88 | data /= maxi - mini 89 | data = np.array(Image.fromarray((data*255).astype(np.uint8)).resize((224, 224)), dtype=np.float32) / 255 90 | data *= maxi - mini 91 | data += mini 92 | return data 93 | 94 | def export_pts(out, v): 95 | with open(out, 'w') as fout: 96 | for i in range(v.shape[0]): 97 | fout.write('%f %f %f\n' % (v[i, 0], v[i, 1], v[i, 2])) 98 | 99 | def export_label(out, l): 100 | with open(out, 'w') as fout: 101 | for i in range(l.shape[0]): 102 | fout.write('%f\n' % (l[i])) 103 | 104 | def export_pts_label(out, v, l): 105 | with open(out, 'w') as fout: 106 | for i in range(l.shape[0]): 107 | fout.write('%f %f %f %f\n' % (v[i, 0], v[i, 1], v[i, 2], l[i])) 108 | 109 | def render_pts_label_png(out, v, l): 110 | export_pts(out+'.pts', v) 111 | export_label(out+'.label', l) 112 | export_pts_label(out+'.feats', v, l) 113 | cmd = 'xvfb-run -a ~/thea/TheaDepsUnix/Source/TheaPrefix/bin/Thea/RenderShape %s.pts -f %s.feats %s.png 448 448 -v 1,0,0,-5,0,0,0,0,1 >> /dev/null' % (out, out, out) 114 | 115 | call(cmd, shell=True) 116 | print('save png') 117 | 118 | def export_pts_color_obj(out, v, c): 119 | with open(out+'.obj', 'w') as fout: 120 | for i in range(v.shape[0]): 121 | fout.write('v %f %f %f %f %f %f\n' % (v[i, 0], v[i, 1], v[i, 2], c[i, 0], c[i, 1], c[i, 2])) 122 | 123 | def export_pts_color_pts(out, v, c): 124 | with open(out+'.pts', 'w') as fout: 125 | for i in range(v.shape[0]): 126 | fout.write('%f %f %f %f %f %f\n' % (v[i, 0], v[i, 1], v[i, 2], c[i, 0], c[i, 1], c[i, 2])) 127 | 128 | def load_checkpoint(models, model_names, dirname, epoch=None, optimizers=None, optimizer_names=None, strict=True): 129 | if len(models) != len(model_names) or (optimizers is not None and len(optimizers) != len(optimizer_names)): 130 | raise ValueError('Number of models, model names, or optimizers does not match.') 131 | 132 | for model, model_name in zip(models, model_names): 133 | filename = f'net_{model_name}.pth' 134 | if epoch is not None: 135 | filename = f'{epoch}_' + filename 136 | model.load_state_dict(torch.load(os.path.join(dirname, filename)), strict=strict) 137 | 138 | start_epoch = 0 139 | if optimizers is not None: 140 | filename = os.path.join(dirname, 'checkpt.pth') 141 | if epoch is not None: 142 | filename = f'{epoch}_' + filename 143 | if os.path.exists(filename): 144 | checkpt = torch.load(filename) 145 | start_epoch = checkpt['epoch'] 146 | for opt, 
optimizer_name in zip(optimizers, optimizer_names): 147 | opt.load_state_dict(checkpt[f'opt_{optimizer_name}']) 148 | print(f'resuming from checkpoint {filename}') 149 | else: 150 | response = input(f'Checkpoint {filename} not found for resuming, refine saved models instead? (y/n) ') 151 | if response != 'y': 152 | sys.exit() 153 | 154 | return start_epoch 155 | 156 | def get_global_position_from_camera(camera, depth, x, y): 157 | """ 158 | This function is provided only to show how to convert camera observation to world space coordinates. 159 | It can be removed if not needed. 160 | 161 | camera: an camera agent 162 | depth: the depth obsrevation 163 | x, y: the horizontal, vertical index for a pixel, you would access the images by image[y, x] 164 | """ 165 | cm = camera.get_metadata() 166 | proj, model = cm['projection_matrix'], cm['model_matrix'] 167 | print('proj:', proj) 168 | print('model:', model) 169 | w, h = cm['width'], cm['height'] 170 | 171 | # get 0 to 1 coordinate for (x, y) coordinates 172 | xf, yf = (x + 0.5) / w, 1 - (y + 0.5) / h 173 | 174 | # get 0 to 1 depth value at (x,y) 175 | zf = depth[int(y), int(x)] 176 | 177 | # get the -1 to 1 (x,y,z) coordinate 178 | ndc = np.array([xf, yf, zf, 1]) * 2 - 1 179 | 180 | # transform from image space to view space 181 | v = np.linalg.inv(proj) @ ndc 182 | v /= v[3] 183 | 184 | # transform from view space to world space 185 | v = model @ v 186 | 187 | return v 188 | 189 | def rot2so3(rotation): 190 | assert rotation.shape == (3, 3) 191 | if np.isclose(rotation.trace(), 3): 192 | return np.zeros(3), 1 193 | if np.isclose(rotation.trace(), -1): 194 | raise RuntimeError 195 | theta = np.arccos((rotation.trace() - 1) / 2) 196 | omega = 1 / 2 / np.sin(theta) * np.array( 197 | [rotation[2, 1] - rotation[1, 2], rotation[0, 2] - rotation[2, 0], rotation[1, 0] - rotation[0, 1]]).T 198 | return omega, theta 199 | 200 | def skew(vec): 201 | return np.array([[0, -vec[2], vec[1]], 202 | [vec[2], 0, -vec[0]], 203 | [-vec[1], vec[0], 0]]) 204 | 205 | def adjoint_matrix(pose): 206 | adjoint = np.zeros([6, 6]) 207 | adjoint[:3, :3] = pose[:3, :3] 208 | adjoint[3:6, 3:6] = pose[:3, :3] 209 | adjoint[3:6, 0:3] = skew(pose[:3, 3]) @ pose[:3, :3] 210 | return adjoint 211 | 212 | def pose2exp_coordinate(pose): 213 | """ 214 | Compute the exponential coordinate corresponding to the given SE(3) matrix 215 | Note: unit twist is not a unit vector 216 | 217 | Args: 218 | pose: (4, 4) transformation matrix 219 | 220 | Returns: 221 | Unit twist: (6, ) vector represent the unit twist 222 | Theta: scalar represent the quantity of exponential coordinate 223 | """ 224 | 225 | omega, theta = rot2so3(pose[:3, :3]) 226 | ss = skew(omega) 227 | inv_left_jacobian = np.eye(3, dtype=np.float32) / theta - 0.5 * ss + ( 228 | 1.0 / theta - 0.5 / np.tan(theta / 2)) * ss @ ss 229 | v = inv_left_jacobian @ pose[:3, 3] 230 | return np.concatenate([omega, v]), theta 231 | 232 | def viz_mask(ids): 233 | return colors[ids] 234 | 235 | def process_angle_limit(x): 236 | if np.isneginf(x): 237 | x = -10 238 | if np.isinf(x): 239 | x = 10 240 | return x 241 | 242 | def get_random_number(l, r): 243 | return np.random.rand() * (r - l) + l 244 | 245 | def save_h5(fn, data): 246 | fout = h5py.File(fn, 'w') 247 | for d, n, t in data: 248 | fout.create_dataset(n, data=d, compression='gzip', compression_opts=4, dtype=t) 249 | fout.close() 250 | -------------------------------------------------------------------------------- /train/utils.py: 
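# --- Illustrative sketch (not from the original repository) ---
# The utils modules shipped with test/ (above) and train/ (below) both expose
# get_global_position_from_camera, which lifts a pixel (x, y) and its depth value into
# world coordinates. A self-contained restatement of that unprojection, assuming a 4x4
# projection matrix and a 4x4 camera-to-world (model) matrix as in the original helper:
import numpy as np

def pixel_depth_to_world(proj, model, depth, x, y):
    """Unproject pixel (x, y) of a depth buffer into homogeneous world coordinates."""
    h, w = depth.shape
    xf, yf = (x + 0.5) / w, 1.0 - (y + 0.5) / h    # pixel centre -> [0, 1] image coords
    zf = depth[int(y), int(x)]                     # depth buffer value in [0, 1]
    ndc = np.array([xf, yf, zf, 1.0]) * 2.0 - 1.0  # -> normalized device coordinates
    v = np.linalg.inv(proj) @ ndc                  # image space -> view space
    v /= v[3]
    return model @ v                               # view space -> world space (x, y, z, 1)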
-------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import h5py 4 | import torch 5 | import numpy as np 6 | import importlib 7 | import random 8 | import shutil 9 | from PIL import Image 10 | BASE_DIR = os.path.dirname(os.path.abspath(__file__)) 11 | sys.path.append(os.path.join(BASE_DIR, '../utils')) 12 | from colors import colors 13 | colors = np.array(colors, dtype=np.float32) 14 | import matplotlib.pylab as plt 15 | from mpl_toolkits.mplot3d import Axes3D 16 | from subprocess import call 17 | 18 | 19 | def force_mkdir(folder): 20 | if os.path.exists(folder): 21 | shutil.rmtree(folder) 22 | os.mkdir(folder) 23 | 24 | def printout(flog, strout): 25 | print(strout) 26 | if flog is not None: 27 | flog.write(strout + '\n') 28 | 29 | def optimizer_to_device(optimizer, device): 30 | for state in optimizer.state.values(): 31 | for k, v in state.items(): 32 | if torch.is_tensor(v): 33 | state[k] = v.to(device) 34 | 35 | def get_model_module(model_version): 36 | importlib.invalidate_caches() 37 | return importlib.import_module('models.' + model_version) 38 | 39 | def collate_feats(b): 40 | return list(zip(*b)) 41 | 42 | def collate_feats_pass(b): 43 | return b 44 | 45 | def collate_feats_with_none(b): 46 | b = filter (lambda x:x is not None, b) 47 | return list(zip(*b)) 48 | 49 | def worker_init_fn(worker_id): 50 | """ The function is designed for pytorch multi-process dataloader. 51 | Note that we use the pytorch random generator to generate a base_seed. 52 | Please try to be consistent. 53 | References: 54 | https://pytorch.org/docs/stable/notes/faq.html#dataloader-workers-random-seed 55 | """ 56 | base_seed = torch.IntTensor(1).random_().item() 57 | #print(worker_id, base_seed) 58 | np.random.seed(base_seed + worker_id) 59 | 60 | def viz_mask(ids): 61 | return colors[ids] 62 | 63 | def draw_dot(img, xy): 64 | out = np.array(img, dtype=np.uint8) 65 | x, y = xy[0], xy[1] 66 | neighbors = np.array([[0, 0, 0, 1, 1, 1, -1, -1, 1], \ 67 | [0, 1, -1, 0, 1, -1, 0, 1, -1]], dtype=np.int32) 68 | for i in range(neighbors.shape[1]): 69 | nx = x + neighbors[0, i] 70 | ny = y + neighbors[1, i] 71 | if nx >= 0 and nx < img.shape[0] and ny >= 0 and ny < img.shape[1]: 72 | out[nx, ny, 0] = 0 73 | out[nx, ny, 1] = 0 74 | out[nx, ny, 2] = 255 75 | 76 | return out 77 | 78 | def print_true_false(d): 79 | d = int(d) 80 | if d > 0.5: 81 | return 'True' 82 | return 'False' 83 | 84 | def img_resize(data): 85 | data = np.array(data, dtype=np.float32) 86 | mini, maxi = np.min(data), np.max(data) 87 | data -= mini 88 | data /= maxi - mini 89 | data = np.array(Image.fromarray((data*255).astype(np.uint8)).resize((224, 224)), dtype=np.float32) / 255 90 | data *= maxi - mini 91 | data += mini 92 | return data 93 | 94 | def export_pts(out, v): 95 | with open(out, 'w') as fout: 96 | for i in range(v.shape[0]): 97 | fout.write('%f %f %f\n' % (v[i, 0], v[i, 1], v[i, 2])) 98 | 99 | def export_label(out, l): 100 | with open(out, 'w') as fout: 101 | for i in range(l.shape[0]): 102 | fout.write('%f\n' % (l[i])) 103 | 104 | def export_pts_label(out, v, l): 105 | with open(out, 'w') as fout: 106 | for i in range(l.shape[0]): 107 | fout.write('%f %f %f %f\n' % (v[i, 0], v[i, 1], v[i, 2], l[i])) 108 | 109 | def render_pts_label_png(out, v, l): 110 | export_pts(out+'.pts', v) 111 | export_label(out+'.label', l) 112 | export_pts_label(out+'.feats', v, l) 113 | cmd = 'xvfb-run -a ~/thea/TheaDepsUnix/Source/TheaPrefix/bin/Thea/RenderShape %s.pts -f %s.feats %s.png 448 448 -v 
1,0,0,-5,0,0,0,0,1 >> /dev/null' % (out, out, out) 114 | 115 | call(cmd, shell=True) 116 | print('save png') 117 | 118 | def export_pts_color_obj(out, v, c): 119 | with open(out+'.obj', 'w') as fout: 120 | for i in range(v.shape[0]): 121 | fout.write('v %f %f %f %f %f %f\n' % (v[i, 0], v[i, 1], v[i, 2], c[i, 0], c[i, 1], c[i, 2])) 122 | 123 | def export_pts_color_pts(out, v, c): 124 | with open(out+'.pts', 'w') as fout: 125 | for i in range(v.shape[0]): 126 | fout.write('%f %f %f %f %f %f\n' % (v[i, 0], v[i, 1], v[i, 2], c[i, 0], c[i, 1], c[i, 2])) 127 | 128 | def load_checkpoint(models, model_names, dirname, epoch=None, optimizers=None, optimizer_names=None, strict=True): 129 | if len(models) != len(model_names) or (optimizers is not None and len(optimizers) != len(optimizer_names)): 130 | raise ValueError('Number of models, model names, or optimizers does not match.') 131 | 132 | for model, model_name in zip(models, model_names): 133 | filename = f'net_{model_name}.pth' 134 | if epoch is not None: 135 | filename = f'{epoch}_' + filename 136 | model.load_state_dict(torch.load(os.path.join(dirname, filename)), strict=strict) 137 | 138 | start_epoch = 0 139 | if optimizers is not None: 140 | filename = os.path.join(dirname, 'checkpt.pth') 141 | if epoch is not None: 142 | filename = f'{epoch}_' + filename 143 | if os.path.exists(filename): 144 | checkpt = torch.load(filename) 145 | start_epoch = checkpt['epoch'] 146 | for opt, optimizer_name in zip(optimizers, optimizer_names): 147 | opt.load_state_dict(checkpt[f'opt_{optimizer_name}']) 148 | print(f'resuming from checkpoint {filename}') 149 | else: 150 | response = input(f'Checkpoint {filename} not found for resuming, refine saved models instead? (y/n) ') 151 | if response != 'y': 152 | sys.exit() 153 | 154 | return start_epoch 155 | 156 | def get_global_position_from_camera(camera, depth, x, y): 157 | """ 158 | This function is provided only to show how to convert camera observation to world space coordinates. 159 | It can be removed if not needed. 
160 | 161 | camera: an camera agent 162 | depth: the depth obsrevation 163 | x, y: the horizontal, vertical index for a pixel, you would access the images by image[y, x] 164 | """ 165 | cm = camera.get_metadata() 166 | proj, model = cm['projection_matrix'], cm['model_matrix'] 167 | print('proj:', proj) 168 | print('model:', model) 169 | w, h = cm['width'], cm['height'] 170 | 171 | # get 0 to 1 coordinate for (x, y) coordinates 172 | xf, yf = (x + 0.5) / w, 1 - (y + 0.5) / h 173 | 174 | # get 0 to 1 depth value at (x,y) 175 | zf = depth[int(y), int(x)] 176 | 177 | # get the -1 to 1 (x,y,z) coordinate 178 | ndc = np.array([xf, yf, zf, 1]) * 2 - 1 179 | 180 | # transform from image space to view space 181 | v = np.linalg.inv(proj) @ ndc 182 | v /= v[3] 183 | 184 | # transform from view space to world space 185 | v = model @ v 186 | 187 | return v 188 | 189 | def rot2so3(rotation): 190 | assert rotation.shape == (3, 3) 191 | if np.isclose(rotation.trace(), 3): 192 | return np.zeros(3), 1 193 | if np.isclose(rotation.trace(), -1): 194 | raise RuntimeError 195 | theta = np.arccos((rotation.trace() - 1) / 2) 196 | omega = 1 / 2 / np.sin(theta) * np.array( 197 | [rotation[2, 1] - rotation[1, 2], rotation[0, 2] - rotation[2, 0], rotation[1, 0] - rotation[0, 1]]).T 198 | return omega, theta 199 | 200 | def skew(vec): 201 | return np.array([[0, -vec[2], vec[1]], 202 | [vec[2], 0, -vec[0]], 203 | [-vec[1], vec[0], 0]]) 204 | 205 | def adjoint_matrix(pose): 206 | adjoint = np.zeros([6, 6]) 207 | adjoint[:3, :3] = pose[:3, :3] 208 | adjoint[3:6, 3:6] = pose[:3, :3] 209 | adjoint[3:6, 0:3] = skew(pose[:3, 3]) @ pose[:3, :3] 210 | return adjoint 211 | 212 | def pose2exp_coordinate(pose): 213 | """ 214 | Compute the exponential coordinate corresponding to the given SE(3) matrix 215 | Note: unit twist is not a unit vector 216 | 217 | Args: 218 | pose: (4, 4) transformation matrix 219 | 220 | Returns: 221 | Unit twist: (6, ) vector represent the unit twist 222 | Theta: scalar represent the quantity of exponential coordinate 223 | """ 224 | 225 | omega, theta = rot2so3(pose[:3, :3]) 226 | ss = skew(omega) 227 | inv_left_jacobian = np.eye(3, dtype=np.float32) / theta - 0.5 * ss + ( 228 | 1.0 / theta - 0.5 / np.tan(theta / 2)) * ss @ ss 229 | v = inv_left_jacobian @ pose[:3, 3] 230 | return np.concatenate([omega, v]), theta 231 | 232 | def viz_mask(ids): 233 | return colors[ids] 234 | 235 | def process_angle_limit(x): 236 | if np.isneginf(x): 237 | x = -10 238 | if np.isinf(x): 239 | x = 10 240 | return x 241 | 242 | def get_random_number(l, r): 243 | return np.random.rand() * (r - l) + l 244 | 245 | def save_h5(fn, data): 246 | fout = h5py.File(fn, 'w') 247 | for d, n, t in data: 248 | fout.create_dataset(n, data=d, compression='gzip', compression_opts=4, dtype=t) 249 | fout.close() 250 | -------------------------------------------------------------------------------- /data_collection/code/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import h5py 4 | #import torch 5 | import numpy as np 6 | import importlib 7 | import random 8 | import shutil 9 | from PIL import Image 10 | BASE_DIR = os.path.dirname(os.path.abspath(__file__)) 11 | sys.path.append(os.path.join(BASE_DIR, '../utils')) 12 | from colors import colors 13 | colors = np.array(colors, dtype=np.float32) 14 | import matplotlib.pylab as plt 15 | from mpl_toolkits.mplot3d import Axes3D 16 | from subprocess import call 17 | 18 | 19 | def force_mkdir(folder): 20 | if 
os.path.exists(folder): 21 | shutil.rmtree(folder) 22 | os.mkdir(folder) 23 | 24 | def printout(flog, strout): 25 | print(strout) 26 | if flog is not None: 27 | flog.write(strout + '\n') 28 | 29 | def optimizer_to_device(optimizer, device): 30 | for state in optimizer.state.values(): 31 | for k, v in state.items(): 32 | if torch.is_tensor(v): 33 | state[k] = v.to(device) 34 | 35 | def get_model_module(model_version): 36 | importlib.invalidate_caches() 37 | return importlib.import_module('models.' + model_version) 38 | 39 | def collate_feats(b): 40 | return list(zip(*b)) 41 | 42 | def collate_feats_pass(b): 43 | return b 44 | 45 | def collate_feats_with_none(b): 46 | b = filter (lambda x:x is not None, b) 47 | return list(zip(*b)) 48 | 49 | def worker_init_fn(worker_id): 50 | """ The function is designed for pytorch multi-process dataloader. 51 | Note that we use the pytorch random generator to generate a base_seed. 52 | Please try to be consistent. 53 | References: 54 | https://pytorch.org/docs/stable/notes/faq.html#dataloader-workers-random-seed 55 | """ 56 | base_seed = torch.IntTensor(1).random_().item() 57 | #print(worker_id, base_seed) 58 | np.random.seed(base_seed + worker_id) 59 | 60 | def viz_mask(ids): 61 | return colors[ids] 62 | 63 | def draw_dot(img, xy): 64 | out = np.array(img, dtype=np.uint8) 65 | x, y = xy[0], xy[1] 66 | neighbors = np.array([[0, 0, 0, 1, 1, 1, -1, -1, 1], \ 67 | [0, 1, -1, 0, 1, -1, 0, 1, -1]], dtype=np.int32) 68 | for i in range(neighbors.shape[1]): 69 | nx = x + neighbors[0, i] 70 | ny = y + neighbors[1, i] 71 | if nx >= 0 and nx < img.shape[0] and ny >= 0 and ny < img.shape[1]: 72 | out[nx, ny, 0] = 0 73 | out[nx, ny, 1] = 0 74 | out[nx, ny, 2] = 255 75 | 76 | return out 77 | 78 | def print_true_false(d): 79 | d = int(d) 80 | if d > 0.5: 81 | return 'True' 82 | return 'False' 83 | 84 | def img_resize(data): 85 | data = np.array(data, dtype=np.float32) 86 | mini, maxi = np.min(data), np.max(data) 87 | data -= mini 88 | data /= maxi - mini 89 | data = np.array(Image.fromarray((data*255).astype(np.uint8)).resize((224, 224)), dtype=np.float32) / 255 90 | data *= maxi - mini 91 | data += mini 92 | return data 93 | 94 | def export_pts(out, v): 95 | with open(out, 'w') as fout: 96 | for i in range(v.shape[0]): 97 | fout.write('%f %f %f\n' % (v[i, 0], v[i, 1], v[i, 2])) 98 | 99 | def export_label(out, l): 100 | with open(out, 'w') as fout: 101 | for i in range(l.shape[0]): 102 | fout.write('%f\n' % (l[i])) 103 | 104 | def export_pts_label(out, v, l): 105 | with open(out, 'w') as fout: 106 | for i in range(l.shape[0]): 107 | fout.write('%f %f %f %f\n' % (v[i, 0], v[i, 1], v[i, 2], l[i])) 108 | 109 | def render_pts_label_png(out, v, l): 110 | export_pts(out+'.pts', v) 111 | export_label(out+'.label', l) 112 | export_pts_label(out+'.feats', v, l) 113 | cmd = 'RenderShape %s.pts -f %s.feats %s.png 448 448 -v 1,0,0,-5,0,0,0,0,1 >> /dev/null' % (out, out, out) 114 | call(cmd, shell=True) 115 | 116 | def export_pts_color_obj(out, v, c): 117 | with open(out+'.obj', 'w') as fout: 118 | for i in range(v.shape[0]): 119 | fout.write('v %f %f %f %f %f %f\n' % (v[i, 0], v[i, 1], v[i, 2], c[i, 0], c[i, 1], c[i, 2])) 120 | 121 | def export_pts_color_pts(out, v, c): 122 | with open(out+'.pts', 'w') as fout: 123 | for i in range(v.shape[0]): 124 | fout.write('%f %f %f %f %f %f\n' % (v[i, 0], v[i, 1], v[i, 2], c[i, 0], c[i, 1], c[i, 2])) 125 | 126 | def load_checkpoint(models, model_names, dirname, epoch=None, optimizers=None, optimizer_names=None, strict=True): 127 | if 
len(models) != len(model_names) or (optimizers is not None and len(optimizers) != len(optimizer_names)): 128 | raise ValueError('Number of models, model names, or optimizers does not match.') 129 | 130 | for model, model_name in zip(models, model_names): 131 | filename = f'net_{model_name}.pth' 132 | if epoch is not None: 133 | filename = f'{epoch}_' + filename 134 | model.load_state_dict(torch.load(os.path.join(dirname, filename)), strict=strict) 135 | 136 | start_epoch = 0 137 | if optimizers is not None: 138 | filename = os.path.join(dirname, 'checkpt.pth') 139 | if epoch is not None: 140 | filename = f'{epoch}_' + filename 141 | if os.path.exists(filename): 142 | checkpt = torch.load(filename) 143 | start_epoch = checkpt['epoch'] 144 | for opt, optimizer_name in zip(optimizers, optimizer_names): 145 | opt.load_state_dict(checkpt[f'opt_{optimizer_name}']) 146 | print(f'resuming from checkpoint {filename}') 147 | else: 148 | response = input(f'Checkpoint {filename} not found for resuming, refine saved models instead? (y/n) ') 149 | if response != 'y': 150 | sys.exit() 151 | 152 | return start_epoch 153 | 154 | def get_global_position_from_camera(camera, depth, x, y): 155 | """ 156 | This function is provided only to show how to convert camera observation to world space coordinates. 157 | It can be removed if not needed. 158 | 159 | camera: an camera agent 160 | depth: the depth obsrevation 161 | x, y: the horizontal, vertical index for a pixel, you would access the images by image[y, x] 162 | """ 163 | cm = camera.get_metadata() 164 | proj, model = cm['projection_matrix'], cm['model_matrix'] 165 | print('proj:', proj) 166 | print('model:', model) 167 | w, h = cm['width'], cm['height'] 168 | 169 | # get 0 to 1 coordinate for (x, y) coordinates 170 | xf, yf = (x + 0.5) / w, 1 - (y + 0.5) / h 171 | 172 | # get 0 to 1 depth value at (x,y) 173 | zf = depth[int(y), int(x)] 174 | 175 | # get the -1 to 1 (x,y,z) coordinate 176 | ndc = np.array([xf, yf, zf, 1]) * 2 - 1 177 | 178 | # transform from image space to view space 179 | v = np.linalg.inv(proj) @ ndc 180 | v /= v[3] 181 | 182 | # transform from view space to world space 183 | v = model @ v 184 | 185 | return v 186 | 187 | def rot2so3(rotation): 188 | assert rotation.shape == (3, 3) 189 | if np.isclose(rotation.trace(), 3): 190 | return np.zeros(3), 1 191 | if np.isclose(rotation.trace(), -1): 192 | raise RuntimeError 193 | theta = np.arccos((rotation.trace() - 1) / 2) 194 | omega = 1 / 2 / np.sin(theta) * np.array( 195 | [rotation[2, 1] - rotation[1, 2], rotation[0, 2] - rotation[2, 0], rotation[1, 0] - rotation[0, 1]]).T 196 | return omega, theta 197 | 198 | def skew(vec): 199 | return np.array([[0, -vec[2], vec[1]], 200 | [vec[2], 0, -vec[0]], 201 | [-vec[1], vec[0], 0]]) 202 | 203 | def adjoint_matrix(pose): 204 | adjoint = np.zeros([6, 6]) 205 | adjoint[:3, :3] = pose[:3, :3] 206 | adjoint[3:6, 3:6] = pose[:3, :3] 207 | adjoint[3:6, 0:3] = skew(pose[:3, 3]) @ pose[:3, :3] 208 | return adjoint 209 | 210 | def pose2exp_coordinate(pose): 211 | """ 212 | Compute the exponential coordinate corresponding to the given SE(3) matrix 213 | Note: unit twist is not a unit vector 214 | 215 | Args: 216 | pose: (4, 4) transformation matrix 217 | 218 | Returns: 219 | Unit twist: (6, ) vector represent the unit twist 220 | Theta: scalar represent the quantity of exponential coordinate 221 | """ 222 | 223 | omega, theta = rot2so3(pose[:3, :3]) 224 | ss = skew(omega) 225 | inv_left_jacobian = np.eye(3, dtype=np.float32) / theta - 0.5 * ss + ( 226 | 1.0 
/ theta - 0.5 / np.tan(theta / 2)) * ss @ ss 227 | v = inv_left_jacobian @ pose[:3, 3] 228 | return np.concatenate([omega, v]), theta 229 | 230 | def viz_mask(ids): 231 | return colors[ids] 232 | 233 | def process_angle_limit(x): 234 | if np.isneginf(x): 235 | x = -10 236 | if np.isinf(x): 237 | x = 10 238 | return x 239 | 240 | def get_random_number(l, r): 241 | return np.random.rand() * (r - l) + l 242 | 243 | def save_h5(fn, data): 244 | fout = h5py.File(fn, 'w') 245 | for d, n, t in data: 246 | fout.create_dataset(n, data=d, compression='gzip', compression_opts=4, dtype=t) 247 | fout.close() 248 | -------------------------------------------------------------------------------- /train/data/create_dataset_aff.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from PIL import Image, ImageDraw, ImageOps 4 | import numpy as np 5 | import argparse 6 | from tqdm import tqdm 7 | 8 | print('Start generating training json..............') 9 | count = 0 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--folder_dir', type=str, help='dataset dir') 12 | parser.add_argument('--output_dir', type=str, help='training json dir') 13 | parser.add_argument('--num_point', type=int, help='training json dir') 14 | args = parser.parse_args() 15 | 16 | folder_dir = args.folder_dir 17 | folder_names = os.listdir(folder_dir) 18 | output_dir = args.output_dir 19 | if not os.path.exists(output_dir): 20 | os.makedirs(output_dir) 21 | else: 22 | print('json files already exists, beginning training') 23 | exit() 24 | cal_cat = dict() 25 | 26 | for item in tqdm(folder_names): 27 | NUM_OF_POINTS = args.num_point 28 | cur_dir = os.path.join(folder_dir,str(item)) 29 | cat = item.split('_')[1] 30 | if os.path.exists(os.path.join(cur_dir, 'result.json')): 31 | with open(os.path.join(cur_dir, 'result.json'), 'r') as fin: 32 | data_inf = json.load(fin) 33 | if data_inf['mani_succ'] != 'True': 34 | continue 35 | 36 | aff_gt_dir = os.path.join(cur_dir, 'aff_gt_all.png') 37 | if not os.path.exists(aff_gt_dir): 38 | continue 39 | img_pil = Image.open(os.path.join(cur_dir, 'original_rgb.png')) 40 | intermask_pil = np.array(Image.open(os.path.join(cur_dir, 'interaction_mask.png'))) 41 | gray_image = ImageOps.grayscale(img_pil) 42 | threshold = 200 # Adjust the threshold value as needed 43 | object_mask = gray_image.point(lambda p: p < threshold and 255) 44 | object_mask.save(os.path.join(cur_dir, 'object_mask.png')) 45 | 46 | object_mask = np.array(object_mask)/255 47 | 48 | 49 | aff_gt_pil = Image.open(aff_gt_dir) 50 | aff_gt = np.array(aff_gt_pil)/255 51 | result_mask = np.where(aff_gt < 0.2, intermask_pil, 0).astype(np.uint8) 52 | object_mask = np.where(aff_gt < 0.2, object_mask, 0).astype(np.uint8) 53 | Image.fromarray((result_mask).astype(np.uint8)).save(os.path.join(cur_dir, 'result_mask.png')) 54 | Image.fromarray((object_mask*255).astype(np.uint8)).save(os.path.join(cur_dir, 'object_mask.png')) 55 | 56 | row_indices_pos, col_indices_pos = np.where(aff_gt > 0.8) 57 | if NUM_OF_POINTS > len(row_indices_pos): 58 | NUM_OF_POINTS = len(row_indices_pos) 59 | 60 | row_indices_neg1, col_indices_neg1 = np.where(result_mask > 0.8) 61 | 62 | 63 | if NUM_OF_POINTS > len(row_indices_neg1) and len(row_indices_neg1) != 0: 64 | NUM_OF_POINTS = len(row_indices_neg1) 65 | 66 | if NUM_OF_POINTS == 0: 67 | continue 68 | 69 | if len(row_indices_neg1) != 0 : 70 | indices_neg = np.random.choice(len(row_indices_neg1), size=NUM_OF_POINTS//2, replace=False) 71 | 
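# Illustrative helper (not part of the original script): the surrounding block samples
# pixel coordinates from several masks -- positives where the affordance ground truth
# aff_gt is high (> 0.8), negatives from the low-affordance interaction mask and object
# mask -- and shuffles them into one affordance question/answer pair below. The recurring
# "pick up to k random (row, col) locations above a threshold" pattern can be written as:
import numpy as np

def sample_mask_pixels(mask, threshold, k, rng=np.random):
    """Return up to k random (row, col) coordinates where mask > threshold."""
    rows, cols = np.where(mask > threshold)
    if len(rows) == 0:
        return np.empty((0, 2), dtype=np.int64)
    idx = rng.choice(len(rows), size=min(k, len(rows)), replace=False)
    return np.stack([rows[idx], cols[idx]], axis=1)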
selected_row_indices_neg = row_indices_neg1[indices_neg].reshape(-1, 1) 72 | selected_col_indices_neg = col_indices_neg1[indices_neg].reshape(-1, 1) 73 | top_indices_neg1 = np.hstack((selected_row_indices_neg, selected_col_indices_neg)) 74 | top_indices_neg1_gt = np.zeros(top_indices_neg1.shape[0]) 75 | 76 | row_indices_neg, col_indices_neg = np.where(object_mask > 0.8) 77 | 78 | if len(row_indices_neg) != 0 and len(row_indices_neg1) != 0: 79 | indices_neg = np.random.choice(len(row_indices_neg), size=NUM_OF_POINTS//2, replace=False) 80 | selected_row_indices_neg = row_indices_neg[indices_neg].reshape(-1, 1) 81 | selected_col_indices_neg = col_indices_neg[indices_neg].reshape(-1, 1) 82 | top_indices_neg2 = np.hstack((selected_row_indices_neg, selected_col_indices_neg)) 83 | top_indices_neg2_gt = np.zeros(top_indices_neg2.shape[0]) 84 | else: 85 | try: 86 | indices_neg = np.random.choice(len(row_indices_neg), size=NUM_OF_POINTS, replace=False) 87 | selected_row_indices_neg = row_indices_neg[indices_neg].reshape(-1, 1) 88 | selected_col_indices_neg = col_indices_neg[indices_neg].reshape(-1, 1) 89 | top_indices_neg2 = np.hstack((selected_row_indices_neg, selected_col_indices_neg)) 90 | top_indices_neg2_gt = np.zeros(top_indices_neg2.shape[0]) 91 | except: 92 | continue 93 | 94 | 95 | indices_pos = np.random.choice(len(row_indices_pos), size=NUM_OF_POINTS, replace=False) 96 | selected_row_indices_pos = row_indices_pos[indices_pos].reshape(-1, 1) 97 | selected_col_indices_pos = col_indices_pos[indices_pos].reshape(-1, 1) 98 | top_indices_pos = np.hstack((selected_row_indices_pos, selected_col_indices_pos)) 99 | top_indices_pos_gt = np.ones(top_indices_pos.shape[0]) 100 | 101 | if len(row_indices_neg1) == 0 : 102 | 103 | select_indices = np.vstack((top_indices_neg2, top_indices_pos)) 104 | select_indices_gt = np.concatenate((top_indices_neg2_gt, top_indices_pos_gt)) 105 | 106 | else: 107 | 108 | select_indices = np.vstack((top_indices_neg1, top_indices_neg2, top_indices_pos)) 109 | select_indices_gt = np.concatenate((top_indices_neg1_gt, top_indices_neg2_gt, top_indices_pos_gt)) 110 | 111 | permutation = np.random.permutation(len(select_indices_gt)) 112 | select_indices = select_indices[permutation] 113 | select_indices_gt = select_indices_gt[permutation] 114 | 115 | mapping = {0: "no", 1: "yes"} 116 | if len(select_indices_gt) == 0: 117 | continue 118 | select_string_gt = np.vectorize(mapping.get)(select_indices_gt) 119 | 120 | 121 | select_string = np.array2string(select_indices, separator=',', formatter={'all': lambda x: str(x)})[1:-1].strip().replace("\n", " ") 122 | select_string_gt = np.array2string(select_string_gt, separator=',', formatter={'all': lambda x: str(x)})[1:-1].strip().replace("\n", " ") 123 | 124 | aff_question = 'Determine if operating on each following point can effectively manipulate the object within the image: {}'.format(select_string) 125 | aff_gt = select_string_gt 126 | 127 | 128 | #draw the selected point in the image 129 | draw = ImageDraw.Draw(img_pil) 130 | if len(row_indices_neg1) != 0 : 131 | for index in range(top_indices_neg1.shape[0]): 132 | draw.point((top_indices_neg1[index][1],top_indices_neg1[index][0]),'blue') 133 | for index in range(top_indices_neg2.shape[0]): 134 | draw.point((top_indices_neg2[index][1],top_indices_neg2[index][0]),'blue') 135 | for index in range(top_indices_pos.shape[0]): 136 | draw.point((top_indices_pos[index][1],top_indices_pos[index][0]),'red') 137 | img_pil.save(os.path.join(cur_dir, 'select_point.png')) 138 | 139 | up_cam = 
data_inf['gripper_up_direction_camera'] 140 | forward_cam = data_inf['gripper_forward_direction_camera'] 141 | x,y = data_inf['pixel_locs'] 142 | data = { 143 | 144 | "conversations": [ 145 | { 146 | "prompt": "Specify the contact point and gripper direction of manipulating the object." 147 | }, 148 | { 149 | "gt": f"The contact point is ({int(x)}, {int(y)}), the gripper up direction is {up_cam}, the gripper forward direction is {forward_cam}." 150 | 151 | } 152 | ], 153 | 'cat_prompt': 'What is the category of the object in the image?', 154 | 'cat_ans': item.split('_')[1], 155 | "instruction": "Specify the contact point and gripper direction of manipulating the object.", 156 | "input": os.path.join(cur_dir, 'original_rgb.png'), 157 | 'aff_question': aff_question, 158 | 'aff_gt': aff_gt.strip() 159 | 160 | } 161 | if not os.path.exists(os.path.join(cur_dir, 'original_rgb.png')): 162 | continue 163 | 164 | json_data = json.dumps(data, indent=4) 165 | cat = item.split('_')[1] 166 | 167 | if cat not in list(cal_cat.keys()): 168 | cal_cat[cat] = 1 169 | else: 170 | if cal_cat[cat] > 900: 171 | continue 172 | else: 173 | cal_cat[cat] += 1 174 | 175 | 176 | with open(os.path.join(output_dir,'{}.json'.format(item)), "w") as file: 177 | file.write(json_data) 178 | 179 | print('Numbers of each training category: ', cal_cat) 180 | print('Finish generating training json..............') -------------------------------------------------------------------------------- /test/test_one_stick_clean.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import sys 4 | import shutil 5 | from argparse import ArgumentParser 6 | from PIL import Image, ImageDraw 7 | import numpy as np 8 | import torch 9 | import torch.nn.functional as F 10 | from sapien.core import Pose 11 | from env_ori import Env,ContactError 12 | from camera import Camera 13 | from robots.panda_robot import Robot 14 | import imageio 15 | import cv2 16 | import json 17 | import random 18 | import matplotlib.pyplot as plt 19 | import matplotlib as mpl 20 | from matplotlib.colors import ListedColormap, LinearSegmentedColormap 21 | import llama 22 | 23 | 24 | parser = ArgumentParser() 25 | parser.add_argument('--llama_dir', type=str, help='llama directory') 26 | parser.add_argument('--adapter_dir', type=str,default='./', help='adapter directory') 27 | parser.add_argument('--result_suffix', type=str, default='nothing') 28 | parser.add_argument('--device', type=str, default='cuda:0', help='cpu or cuda:x for using cuda on GPU number x') 29 | parser.add_argument('--overwrite', action='store_true', default=False, help='overwrite if out_dir exists [default: False]') 30 | 31 | parser.add_argument('--no_gui', action='store_true', default=False, help='no_gui [default: False]') 32 | parser.add_argument('--data_dir', type=str) 33 | parser.add_argument('--record_name', type=str) 34 | parser.add_argument('--out_dir', type=str) 35 | parser.add_argument('--use_mask', type=str, help='whether use movable mask') 36 | eval_conf = parser.parse_args() 37 | 38 | random.seed(0) 39 | np.random.seed(0) 40 | torch.manual_seed(0) 41 | 42 | 43 | 44 | 45 | #previous info are saved in result.json 46 | shape_id, category, cnt_id, primact_type, trial_id = eval_conf.record_name.split('_') 47 | 48 | out_dir = os.path.join(eval_conf.out_dir, '%s_%s_%s_%s_%d' % (shape_id, category, cnt_id, primact_type, int(trial_id))) 49 | 50 | 51 | flog = open(os.path.join(out_dir, 'log.txt'), 'w') 52 | out_info = dict() 53 | try: 54 | with 
open(os.path.join(eval_conf.data_dir, eval_conf.record_name, 'result.json'), 'r') as fin: 55 | replay_data = json.load(fin) 56 | except: 57 | print('no replay data') 58 | exit(1) 59 | 60 | 61 | env = Env(flog=flog, show_gui=(not eval_conf.no_gui)) 62 | 63 | # setup camera 64 | cam_theta = replay_data['camera_metadata']['theta'] 65 | cam_phi = replay_data['camera_metadata']['phi'] 66 | cam_dist = replay_data['camera_metadata']['dist'] 67 | cam = Camera(env, theta=cam_theta, phi=cam_phi, dist=cam_dist) 68 | out_info['camera_metadata_init'] = cam.get_metadata_json() 69 | 70 | 71 | if not eval_conf.no_gui: 72 | env.set_controller_camera_pose(cam.pos[0], cam.pos[1], cam.pos[2], np.pi+cam_theta, -cam_phi) 73 | 74 | 75 | 76 | # load shape 77 | object_urdf_fn = '../data_collection/asset/original_sapien_dataset/%s/mobility.urdf' % shape_id 78 | flog.write('object_urdf_fn: %s\n' % object_urdf_fn) 79 | object_material = env.get_material(4, 4, 0.01) 80 | state = replay_data['object_state'] 81 | flog.write('Object State: %s\n' % state) 82 | out_info['object_state'] = state 83 | scale = replay_data['scale'] 84 | env.load_object(scale, object_urdf_fn, object_material, state=state) 85 | joint_angles = replay_data['joint_angles'] 86 | env.set_object_joint_angles(joint_angles) 87 | out_info['joint_angles'] = joint_angles 88 | out_info['joint_angles_lower'] = env.joint_angles_lower 89 | out_info['joint_angles_upper'] = env.joint_angles_upper 90 | cur_qpos = env.get_object_qpos() 91 | 92 | # simulate some steps for the object to stay rest 93 | still_timesteps = 0 94 | wait_timesteps = 0 95 | while still_timesteps < 5000 and wait_timesteps < 20000: 96 | env.step() 97 | env.render() 98 | cur_new_qpos = env.get_object_qpos() 99 | invalid_contact = False 100 | for c in env.scene.get_contacts(): 101 | for p in c.points: 102 | if abs(p.impulse @ p.impulse) > 1e-4: 103 | invalid_contact = True 104 | break 105 | if invalid_contact: 106 | break 107 | if np.max(np.abs(cur_new_qpos - cur_qpos)) < 1e-6 and (not invalid_contact): 108 | still_timesteps += 1 109 | else: 110 | still_timesteps = 0 111 | cur_qpos = cur_new_qpos 112 | wait_timesteps += 1 113 | 114 | if still_timesteps < 5000: 115 | printout(flog, 'Object Not Still!') 116 | flog.close() 117 | env.close() 118 | exit(1) 119 | 120 | rgb, depth = cam.get_observation() 121 | Image.fromarray((rgb*255).astype(np.uint8)).save(os.path.join(out_dir, 'rgb_img.png')) 122 | img = Image.fromarray((rgb*255).astype(np.uint8)) 123 | 124 | gt_nor = cam.get_normal_map() 125 | Image.fromarray(((gt_nor+1)/2*255).astype(np.uint8)).save(os.path.join(out_dir, 'gt_nor.png')) 126 | 127 | object_link_ids = env.movable_link_ids 128 | gt_movable_link_mask = cam.get_movable_link_mask(object_link_ids) 129 | mask = (gt_movable_link_mask > 0) 130 | 131 | 132 | if os.path.exists(os.path.join(out_dir, 'prediction.json')): 133 | with open(os.path.join(out_dir, 'prediction.json'), 'r') as fin: 134 | result = json.load(fin) 135 | else: 136 | print('!!!!!!!!!!!!!!!!!!!!!!no prediction !!!!!!!!!!!!!!!!!!!!!!!!') 137 | flog.close() 138 | env.close() 139 | exit(2) 140 | 141 | 142 | print('answer from model: ', result) 143 | 144 | object_link_ids = env.movable_link_ids 145 | gt_movable_link_mask = cam.get_movable_link_mask(object_link_ids) 146 | x, y = result.split('(')[1].split(')')[0].split(', ') 147 | x = int(x) 148 | y = int(y) 149 | if eval_conf.use_mask == 'True': 150 | if gt_movable_link_mask[x,y] == 0: 151 | exit() 152 | 153 | norm_dir = gt_nor[x,y] 154 | 155 | gt_nor = cam.get_normal_map() 156 | 
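# Note (illustrative, not from the original script): the split-based parsing of `result`
# above (contact point) and below (gripper up / forward directions) assumes the model's
# answer exactly matches the training template, e.g.
#   "The contact point is (x, y), the gripper up direction is [a, b, c],
#    the gripper forward direction is [d, e, f]."
# with integer components, as implied by the int() casts. A slightly more defensive
# equivalent using regular expressions could look like this hypothetical helper:
import re

def parse_prediction(answer):
    point = re.search(r"\(\s*(-?\d+)\s*,\s*(-?\d+)\s*\)", answer)
    dirs = re.findall(r"\[\s*(-?\d+)\s*,\s*(-?\d+)\s*,\s*(-?\d+)\s*\]", answer)
    if point is None or len(dirs) < 2:
        raise ValueError("unexpected answer format: %r" % answer)
    x, y = (int(v) for v in point.groups())
    up, forward = [int(v) for v in dirs[0]], [int(v) for v in dirs[1]]
    return (x, y), up, forward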
Image.fromarray(((gt_nor+1)/2*255).astype(np.uint8)).save(os.path.join(out_dir, 'gt_nor.png')) 157 | 158 | d_x, d_y, d_z = result.split('[')[1].split(']')[0].split(', ') 159 | gripper_direction_camera = np.array([int(d_x)*0.02, int(d_y)*0.02, int(d_z)*0.02]) 160 | fd_x, fd_y, fd_z = result.split('[')[2].split(']')[0].split(', ') 161 | gripper_forward_direction_camera = np.array([int(fd_x)*0.02, int(fd_y)*0.02, int(fd_z)*0.02]) 162 | 163 | draw = ImageDraw.Draw(img) 164 | draw.point((y,x),'red') 165 | img.save(os.path.join(out_dir, 'contact_point.png')) 166 | 167 | cam_XYZA_id1, cam_XYZA_id2, cam_XYZA_pts,out = cam.compute_camera_XYZA(depth) 168 | cam_XYZA = cam.compute_XYZA_matrix(cam_XYZA_id1, cam_XYZA_id2, cam_XYZA_pts, depth.shape[0], depth.shape[1]) 169 | position_cam = cam_XYZA[x, y, :3] 170 | 171 | position_cam_xyz1 = np.ones((4), dtype=np.float32) 172 | position_cam_xyz1[:3] = position_cam 173 | position_world_xyz1 = cam.get_metadata()['mat44'] @ position_cam_xyz1 174 | position_world = position_world_xyz1[:3] 175 | target_part_id = object_link_ids[gt_movable_link_mask[x, y] - 1] 176 | env.set_target_object_part_actor_id(target_part_id) 177 | out_info['target_object_part_actor_id'] = env.target_object_part_actor_id 178 | out_info['target_object_part_joint_id'] = env.target_object_part_joint_id 179 | 180 | 181 | def plot_mani(cam,up, forward): 182 | # we use the norm of the contact point to correct tge z-axis of end-effector 183 | if (up @ norm_dir[:3] ) > 0: 184 | up = -up 185 | 186 | up /= np.linalg.norm(up) 187 | up = cam.get_metadata()['mat44'][:3,:3] @ up 188 | forward = cam.get_metadata()['mat44'][:3,:3] @ forward 189 | out_info['gripper_direction_world'] = up.tolist() 190 | 191 | up = np.array(up, dtype=np.float32) 192 | up /= np.linalg.norm(up) 193 | forward = np.array(forward, dtype=np.float32) 194 | forward /= np.linalg.norm(forward) 195 | left = np.cross(up, forward) 196 | left /= np.linalg.norm(left) 197 | forward = np.cross(left, up) 198 | forward /= np.linalg.norm(forward) 199 | 200 | rotmat = np.eye(4).astype(np.float32) 201 | rotmat[:3, 0] = forward 202 | rotmat[:3, 1] = left 203 | rotmat[:3, 2] = up 204 | 205 | final_rotmat = np.array(rotmat, dtype=np.float32) 206 | final_rotmat[:3, 3] = position_world - up * 0.1 207 | final_pose = Pose().from_transformation_matrix(final_rotmat) 208 | 209 | start_rotmat = np.array(rotmat, dtype=np.float32) 210 | start_rotmat[:3, 3] = position_world - up * 0.15 211 | # start_rotmat[:3, 3] = position_world 212 | start_pose = Pose().from_transformation_matrix(start_rotmat) 213 | 214 | pull_rotmat = np.array(rotmat, dtype=np.float32) 215 | pull_rotmat[:3, 3] = position_world - up * 0.5 216 | pull_pose = Pose().from_transformation_matrix(pull_rotmat) 217 | out_info['pull_rotmat_world'] = pull_rotmat.tolist() 218 | 219 | #load robot 220 | robot_urdf_fn = './robots/panda_gripper.urdf' 221 | robot_material = env.get_material(4, 4, 0.01) 222 | robot = Robot(env, robot_urdf_fn, robot_material, open_gripper=('pulling' in primact_type)) 223 | 224 | 225 | robot.robot.set_root_pose(start_pose) 226 | 227 | 228 | env.render() 229 | rgb_final_pose, _ = cam.get_observation() 230 | Image.fromarray((rgb_final_pose*255).astype(np.uint8)).save(os.path.join(out_dir, 'viz_start_pose.png')) 231 | 232 | 233 | 234 | out_info['start_target_part_qpos'],_,_ = env.get_target_part_qpos() 235 | 236 | 237 | success = True 238 | target_link_mat44 = env.get_target_part_pose().to_transformation_matrix() 239 | position_local_xyz1 = np.linalg.inv(target_link_mat44) @ 
position_world_xyz1 240 | 241 | 242 | robot.close_gripper() 243 | robot.wait_n_steps(2000) 244 | 245 | 246 | # approach 247 | robot.move_to_target_pose(final_rotmat, 2000) 248 | robot.wait_n_steps(2000) 249 | rgb_final_pose, _ = cam.get_observation() 250 | Image.fromarray((rgb_final_pose*255).astype(np.uint8)).save(os.path.join(out_dir, 'viz_mid_pose.png')) 251 | 252 | suction_drive = env.scene.create_drive( 253 | robot.robot.get_links()[-1], 254 | robot.robot.get_links()[-1].get_cmass_local_pose(), 255 | env.target_object_part_actor_link, 256 | env.target_object_part_actor_link.get_cmass_local_pose(), 257 | ) 258 | suction_drive.set_x_properties(stiffness=45000, damping=0) 259 | suction_drive.set_y_properties(stiffness=45000, damping=0) 260 | suction_drive.set_z_properties(stiffness=45000, damping=0) 261 | 262 | 263 | if primact_type == 'pulling': 264 | robot.move_to_target_pose(pull_rotmat, 2000) 265 | robot.wait_n_steps(2000) 266 | 267 | 268 | 269 | target_link_mat44 = env.get_target_part_pose().to_transformation_matrix() 270 | position_world_xyz1_end = target_link_mat44 @ position_local_xyz1 271 | out_info['touch_position_world_xyz_start'] = position_world_xyz1[:3].tolist() 272 | out_info['touch_position_world_xyz_end'] = position_world_xyz1_end[:3].tolist() 273 | if success==True: 274 | succ=True 275 | out_info['final_target_part_qpos'],_,_ = env.get_target_part_qpos() 276 | print(out_info['final_target_part_qpos'],out_info['start_target_part_qpos']) 277 | abs_motion = abs(out_info['final_target_part_qpos'] - out_info['start_target_part_qpos']) 278 | j = out_info['target_object_part_joint_id'] 279 | tot_motion = out_info['joint_angles_upper'][j] - out_info['joint_angles_lower'][j] + 1e-8 280 | mani_success = (abs_motion > 0.03) or (abs_motion / tot_motion > 0.5) 281 | else: 282 | mani_success = False 283 | if mani_success: 284 | if primact_type == 'pushing': 285 | mani_success = mani_success 286 | elif primact_type == 'pulling': 287 | mov_dir = np.array(out_info['touch_position_world_xyz_end'], dtype=np.float32) - \ 288 | np.array(out_info['touch_position_world_xyz_start'], dtype=np.float32) 289 | mov_dir /= np.linalg.norm(mov_dir) 290 | intended_dir = -np.array(out_info['gripper_direction_world'], dtype=np.float32) 291 | mani_success = (intended_dir @ mov_dir > 0.3) 292 | return success, mani_success 293 | 294 | success, mani_succ = plot_mani(cam,gripper_direction_camera, gripper_forward_direction_camera) 295 | out_info['succ'] = np.array(success, dtype=bool).tolist() 296 | 297 | out_info['mani_succ'] = np.array(mani_succ, dtype=bool).tolist() 298 | rgb_final_pose, _ = cam.get_observation() 299 | Image.fromarray((rgb_final_pose*255).astype(np.uint8)).save(os.path.join(out_dir, 'viz_target_pose.png')) 300 | 301 | print(success, mani_succ) 302 | with open(os.path.join(out_dir, 'result.json'), 'w') as fout: 303 | json.dump(out_info, fout) 304 | print(out_dir) 305 | flog.close() 306 | env.close() 307 | -------------------------------------------------------------------------------- /train/llama/llama_adapter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from pathlib import Path 4 | 5 | import clip 6 | import torch 7 | import torch.nn as nn 8 | from timm.models.vision_transformer import Block 9 | 10 | from .llama import ModelArgs, Transformer, BERTTransformer 11 | from .tokenizer import Tokenizer 12 | from .utils import sample_top_p, _download 13 | class RMSNorm(torch.nn.Module): 14 | def __init__(self, dim: int, 
eps: float = 1e-6): 15 | super().__init__() 16 | self.eps = eps 17 | self.weight = nn.Parameter(torch.ones(dim)) 18 | 19 | def _norm(self, x): 20 | return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) 21 | 22 | def forward(self, x): 23 | output = self._norm(x.float()).type_as(x) 24 | return output * self.weight 25 | 26 | class LLaMA_adapter(nn.Module): 27 | 28 | def __init__(self, llama_ckpt_dir, llama_tokenizer, 29 | max_seq_len=512, max_batch_size=1, 30 | clip_model='ViT-L/14@336px', 31 | v_embed_dim=1024, v_depth=16, 32 | v_num_heads=16, v_mlp_ratio=4.0, 33 | query_len=577, query_layer=32, phase="finetune"): 34 | super().__init__() 35 | # llama configs 36 | # with open(os.path.join(llama_ckpt_dir, "7B/params.json"), "r") as f: 37 | with open("./ckpts/llama_model_weights/7B/params.json", "r") as f: 38 | params = json.loads(f.read()) 39 | bias_lora = phase == "finetune" 40 | model_args: ModelArgs = ModelArgs( 41 | max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params 42 | ) # max_batch_size only affects inferenc 43 | 44 | # 1. clip and clip projector 45 | self.clip, self.clip_transform = clip.load(clip_model,download_root='./ckpts') 46 | 47 | clip_dim = self.clip.visual.proj.shape[1] 48 | self.clip_proj = nn.Linear(clip_dim, v_embed_dim) 49 | self.clip_proj_norm = nn.LayerNorm(v_embed_dim) 50 | 51 | self.query_len = query_len 52 | self.query_layer = query_layer 53 | 54 | # 2. visual query, blocks and projector 55 | 56 | visual_model_args = ModelArgs(dim=1024, n_layers=16, n_heads=8, max_seq_len=577) 57 | visual_model_args.vocab_size = 1024 58 | self.visual_blocks = BERTTransformer(visual_model_args) 59 | self.visual_proj = nn.Linear(v_embed_dim, model_args.dim) 60 | self.visual_proj_norm = nn.LayerNorm(model_args.dim) 61 | 62 | # 3. adapter query 63 | self.adapter_query = nn.Embedding( 64 | query_len * query_layer, model_args.dim) 65 | 66 | # 4. tokenizer 67 | self.tokenizer = Tokenizer(model_path=llama_tokenizer) 68 | 69 | # 5. llama 70 | model_args.w_bias = bias_lora 71 | model_args.w_lora = bias_lora 72 | model_args.vocab_size = self.tokenizer.n_words 73 | torch.set_default_tensor_type(torch.cuda.HalfTensor) 74 | self.llama = Transformer(model_args) 75 | torch.set_default_tensor_type(torch.FloatTensor) 76 | 77 | ckpts = ['./ckpts/llama_model_weights/7B/consolidated.00.pth'] 78 | 79 | for ckpt in ckpts: 80 | # print('load_ckpt_path:', ckpt) 81 | ckpt = torch.load(ckpt, map_location='cpu') 82 | self.llama.load_state_dict(ckpt, strict=False) 83 | 84 | 85 | for name, param in self.named_parameters(): 86 | param.requires_grad = False 87 | 88 | for name, para in self.llama.named_parameters(): 89 | if 'norm' in name: 90 | para.data = para.data.float() 91 | para.requires_grad = True 92 | if 'bias' in name: 93 | para.data = para.data.float() 94 | para.requires_grad = True 95 | if 'lora' in name: 96 | para.data = para.data.float() 97 | para.requires_grad = True 98 | count = 0 99 | for name, param in self.named_parameters(): 100 | if param.requires_grad: 101 | count += 1 102 | print(f"Trainable param: {name}, {param.shape}, {param.dtype}") 103 | 104 | 105 | # 6. 
training criterion 106 | self.criterion = torch.nn.CrossEntropyLoss(ignore_index=0) 107 | 108 | def clip_encode_image(self, x): 109 | 110 | # modified from CLIP 111 | x = self.clip.visual.conv1(x) # shape = [*, width, grid, grid] 112 | 113 | # shape = [*, width, grid ** 2] 114 | x = x.reshape(x.shape[0], x.shape[1], -1) 115 | x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] 116 | x = torch.cat([self.clip.visual.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, 117 | x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width] 118 | x = x + self.clip.visual.positional_embedding.to(x.dtype) 119 | x = self.clip.visual.ln_pre(x) 120 | 121 | x = x.permute(1, 0, 2) # NLD -> LND 122 | x = self.clip.visual.transformer(x) 123 | x = x.permute(1, 0, 2) # LND -> NLD 124 | 125 | # preserve all spatial tokens 126 | x = self.clip.visual.ln_post(x[:, :, :]) 127 | 128 | if self.clip.visual.proj is not None: 129 | x = x @ self.clip.visual.proj 130 | 131 | return x 132 | 133 | def forward_visual(self, imgs): 134 | clip_feats = self.clip_encode_image(imgs) 135 | clip_feats = self.clip_proj_norm(self.clip_proj(clip_feats.float())) 136 | 137 | visual_query = clip_feats 138 | visual_query = self.visual_blocks(visual_query, 0) 139 | 140 | visual_query = self.visual_proj(visual_query) 141 | visual_query = self.visual_proj_norm(visual_query) 142 | 143 | return visual_query 144 | 145 | def forward(self, tokens, labels, imgs): 146 | 147 | visual_proj = self.forward_visual(imgs) 148 | 149 | _bsz, seqlen = tokens.shape 150 | 151 | h = self.llama.tok_embeddings(tokens) 152 | freqs_cis = self.llama.freqs_cis.to(h.device) 153 | freqs_cis = freqs_cis[:seqlen] 154 | mask = None 155 | mask = torch.full((1, 1, seqlen, seqlen), float("-inf"), device=h.device) 156 | mask = torch.triu(mask, diagonal=0 + 1).type_as(h) 157 | 158 | adapter = self.adapter_query.weight.reshape(self.query_layer, self.query_len, -1).unsqueeze(1) 159 | adapter_index = 0 160 | for layer in self.llama.layers: 161 | h = layer(h, 0, freqs_cis, mask, visual_proj + adapter[adapter_index]) 162 | adapter_index = adapter_index + 1 163 | 164 | h = self.llama.norm(h) 165 | output = self.llama.output(h) 166 | output = output[:, :-1, :] 167 | labels = labels[:, 1:] 168 | 169 | if labels.sum() == 0: 170 | c_loss = output.mean() * 0 171 | else: 172 | assert self.llama.vocab_size == 32000 173 | c_loss = self.criterion(output.reshape(-1, self.llama.vocab_size), labels.flatten()) 174 | 175 | return c_loss, c_loss 176 | 177 | #@torch.inference_mode() 178 | @torch.no_grad() 179 | def forward_inference(self, visual_proj, tokens, start_pos: int): 180 | _bsz, seqlen = tokens.shape 181 | h = self.llama.tok_embeddings(tokens) 182 | freqs_cis = self.llama.freqs_cis.to(h.device) 183 | freqs_cis = freqs_cis[start_pos : start_pos + seqlen] 184 | mask = None 185 | mask = torch.full((1, 1, seqlen, seqlen), float("-inf"), device=h.device) 186 | mask = torch.triu(mask, diagonal=start_pos + 1).type_as(h) 187 | 188 | 189 | adapter = self.adapter_query.weight.reshape(self.query_layer, self.query_len, -1).unsqueeze(1) 190 | adapter_index = 0 191 | 192 | for layer in self.llama.layers: 193 | h = layer(h, start_pos, freqs_cis, mask, visual_proj + adapter[adapter_index].repeat(_bsz, 1, 1)) 194 | adapter_index = adapter_index + 1 195 | 196 | h = self.llama.norm(h) 197 | output = self.llama.output(h[:, -1, :]) 198 | 199 | return output.float() 200 | 201 | #@torch.inference_mode() 202 | @torch.no_grad() 203 | def generate( 204 | self, imgs, prompts, 
205 | max_gen_len: int = 256, 206 | temperature: float = 0.1, 207 | top_p: float = 0.75, 208 | ): 209 | bsz = len(imgs) 210 | params = self.llama.params 211 | assert bsz <= params.max_batch_size, (bsz, params.max_batch_size) 212 | assert len(imgs) == len(prompts) 213 | 214 | with torch.cuda.amp.autocast(): 215 | visual_query = self.forward_visual(imgs) 216 | 217 | if isinstance(prompts[0], str): 218 | prompts = [self.tokenizer.encode(x, bos=True, eos=False) for x in prompts] 219 | 220 | min_prompt_size = min([len(t) for t in prompts]) 221 | max_prompt_size = max([len(t) for t in prompts]) 222 | 223 | total_len = min(params.max_seq_len, max_gen_len + max_prompt_size) 224 | 225 | tokens = torch.full((bsz, total_len), self.tokenizer.pad_id).cuda().long() 226 | 227 | for k, t in enumerate(prompts): 228 | tokens[k, : len(t)] = torch.tensor(t).cuda().long() 229 | input_text_mask = tokens != self.tokenizer.pad_id 230 | start_pos = min_prompt_size 231 | prev_pos = 0 232 | for cur_pos in range(start_pos, total_len): 233 | with torch.cuda.amp.autocast(): 234 | logits = self.forward_inference(visual_query, tokens[:, prev_pos:cur_pos], prev_pos) 235 | if temperature > 0: 236 | probs = torch.softmax(logits / temperature, dim=-1) 237 | next_token = sample_top_p(probs, top_p) 238 | else: 239 | next_token = torch.argmax(logits, dim=-1) 240 | next_token = next_token.reshape(-1) 241 | 242 | next_token = torch.where( 243 | input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token 244 | ) 245 | tokens[:, cur_pos] = next_token 246 | # trick: early stop if bsz==1 247 | if bsz == 1 and next_token[0] == self.tokenizer.eos_id: 248 | break 249 | prev_pos = cur_pos 250 | 251 | decoded = [] 252 | for i, t in enumerate(tokens.tolist()): 253 | 254 | # cut to max gen len 255 | t = t[len(prompts[i]): len(prompts[i]) + max_gen_len] 256 | # cut to eos tok if any 257 | try: 258 | t = t[: t.index(self.tokenizer.eos_id)] 259 | except ValueError: 260 | pass 261 | decoded.append(self.tokenizer.decode(t)) 262 | 263 | return decoded 264 | 265 | 266 | _MODELS = { 267 | "BIAS-7B": "https://github.com/ZrrSkywalker/LLaMA-Adapter/releases/download/v.2.0.0/7fa55208379faf2dd862565284101b0e4a2a72114d6490a95e432cf9d9b6c813_BIAS-7B.pth", 268 | # "LORA16-7B": "", 269 | # "PARTIAL-7B": "" 270 | } 271 | 272 | def available_models(): 273 | return list(_MODELS.keys()) 274 | 275 | def load(name, llama_dir, device="cuda" if torch.cuda.is_available() else "cpu", download_root='ckpts', max_seq_len=512, 276 | phase="finetune"): 277 | if name in _MODELS: 278 | model_path = _download(_MODELS[name], download_root) 279 | elif os.path.isfile(name): 280 | model_path = name 281 | else: 282 | raise RuntimeError(f"Model {name} not found; available models = {available_models()}") 283 | 284 | ckpt = torch.load(model_path, map_location='cpu') 285 | 286 | # BIAS-7B or https://xxx/sha256_BIAS-7B.pth -> 7B 287 | llama_type = name.split('.')[0].split('-')[-1] 288 | llama_ckpt_dir = os.path.join(llama_dir, llama_type) 289 | llama_tokenzier_path = os.path.join(llama_dir, 'tokenizer.model') 290 | 291 | # load llama_adapter weights and model_cfg 292 | print(f'Loading LLaMA-Adapter from {model_path}') 293 | ckpt = torch.load(model_path, map_location='cpu') 294 | 295 | model = LLaMA_adapter( 296 | llama_ckpt_dir, llama_tokenzier_path, 297 | max_seq_len=max_seq_len, max_batch_size=1, 298 | clip_model='ViT-L/14@336px', 299 | v_embed_dim=1024, v_depth=16, 300 | v_num_heads=16, v_mlp_ratio=4.0, 301 | query_len=577, query_layer=32, 302 | phase=phase) 303 | 304 |
load_result = model.load_state_dict(ckpt['model'], strict=False) 305 | 306 | # assert len(load_result.unexpected_keys) == 0, f"Unexpected keys: {load_result.unexpected_keys}" 307 | return model.to(device), model.clip_transform -------------------------------------------------------------------------------- /test/llama/llama_adapter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from pathlib import Path 4 | 5 | import clip 6 | import torch 7 | import torch.nn as nn 8 | from timm.models.vision_transformer import Block 9 | 10 | from .llama import ModelArgs, Transformer, BERTTransformer 11 | from .tokenizer import Tokenizer 12 | from .utils import sample_top_p, _download 13 | class RMSNorm(torch.nn.Module): 14 | def __init__(self, dim: int, eps: float = 1e-6): 15 | super().__init__() 16 | self.eps = eps 17 | self.weight = nn.Parameter(torch.ones(dim)) 18 | 19 | def _norm(self, x): 20 | return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) 21 | 22 | def forward(self, x): 23 | output = self._norm(x.float()).type_as(x) 24 | return output * self.weight 25 | 26 | class LLaMA_adapter(nn.Module): 27 | 28 | def __init__(self, llama_ckpt_dir, llama_tokenizer, 29 | max_seq_len=512, max_batch_size=1, 30 | clip_model='ViT-L/14@336px', 31 | v_embed_dim=1024, v_depth=16, 32 | v_num_heads=16, v_mlp_ratio=4.0, 33 | query_len=577, query_layer=32, phase="finetune"): 34 | super().__init__() 35 | # llama configs 36 | with open(os.path.join(llama_ckpt_dir, "7B/params.json"), "r") as f: 37 | params = json.loads(f.read()) 38 | bias_lora = phase == "finetune" 39 | model_args: ModelArgs = ModelArgs( 40 | max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params 41 | ) # max_batch_size only affects inferenc 42 | 43 | # 1. clip and clip projector 44 | self.clip, self.clip_transform = clip.load(clip_model,download_root='./ckpts') 45 | 46 | clip_dim = self.clip.visual.proj.shape[1] 47 | self.clip_proj = nn.Linear(clip_dim, v_embed_dim) 48 | self.clip_proj_norm = nn.LayerNorm(v_embed_dim) 49 | 50 | self.query_len = query_len 51 | self.query_layer = query_layer 52 | 53 | # 2. visual query, blocks and projector 54 | 55 | visual_model_args = ModelArgs(dim=1024, n_layers=16, n_heads=8, max_seq_len=577) 56 | visual_model_args.vocab_size = 1024 57 | self.visual_blocks = BERTTransformer(visual_model_args) 58 | self.visual_proj = nn.Linear(v_embed_dim, model_args.dim) 59 | self.visual_proj_norm = nn.LayerNorm(model_args.dim) 60 | 61 | # 3. adapter query 62 | self.adapter_query = nn.Embedding( 63 | query_len * query_layer, model_args.dim) 64 | 65 | # 4. tokenizer 66 | self.tokenizer = Tokenizer(model_path=llama_tokenizer) 67 | 68 | # 5. 
llama 69 | model_args.w_bias = bias_lora 70 | model_args.w_lora = bias_lora 71 | model_args.vocab_size = self.tokenizer.n_words 72 | torch.set_default_tensor_type(torch.cuda.HalfTensor) 73 | self.llama = Transformer(model_args) 74 | torch.set_default_tensor_type(torch.FloatTensor) 75 | 76 | ckpts = ['./ckpts/llama_model_weights/7B/consolidated.00.pth'] 77 | #ckpts = sorted(Path().glob("*.pth")) 78 | for ckpt in ckpts: 79 | # print('load_ckpt_path:', ckpt) 80 | ckpt = torch.load(ckpt, map_location='cpu') 81 | self.llama.load_state_dict(ckpt, strict=False) 82 | # print(ckpt['layers.31.attention.wo.weight']) 83 | # assert(0) 84 | # for name, para in self.llama.named_parameters(): 85 | # if 'layers.31.attention.wo.weight' in name: 86 | # print(para.data) 87 | # assert(0) 88 | 89 | for name, param in self.named_parameters(): 90 | param.requires_grad = False 91 | 92 | for name, para in self.llama.named_parameters(): 93 | if 'norm' in name: 94 | para.data = para.data.float() 95 | para.requires_grad = True 96 | if 'bias' in name: 97 | para.data = para.data.float() 98 | para.requires_grad = True 99 | if 'lora' in name: 100 | para.data = para.data.float() 101 | para.requires_grad = True 102 | count = 0 103 | for name, param in self.named_parameters(): 104 | if param.requires_grad: 105 | count += 1 106 | print(f"Trainable param: {name}, {param.shape}, {param.dtype}") 107 | 108 | 109 | # 6. training criterion 110 | self.criterion = torch.nn.CrossEntropyLoss(ignore_index=0) 111 | 112 | def clip_encode_image(self, x): 113 | # print(x.dtype) 114 | # print(self.clip.visual.conv1.weight.dtype) 115 | # assert(0) 116 | # modified from CLIP 117 | x = self.clip.visual.conv1(x) # shape = [*, width, grid, grid] 118 | 119 | # shape = [*, width, grid ** 2] 120 | x = x.reshape(x.shape[0], x.shape[1], -1) 121 | x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] 122 | x = torch.cat([self.clip.visual.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, 123 | x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width] 124 | x = x + self.clip.visual.positional_embedding.to(x.dtype) 125 | x = self.clip.visual.ln_pre(x) 126 | 127 | x = x.permute(1, 0, 2) # NLD -> LND 128 | x = self.clip.visual.transformer(x) 129 | x = x.permute(1, 0, 2) # LND -> NLD 130 | 131 | # preserve all spatial tokens 132 | x = self.clip.visual.ln_post(x[:, :, :]) 133 | 134 | if self.clip.visual.proj is not None: 135 | x = x @ self.clip.visual.proj 136 | 137 | return x 138 | 139 | def forward_visual(self, imgs): 140 | clip_feats = self.clip_encode_image(imgs) 141 | clip_feats = self.clip_proj_norm(self.clip_proj(clip_feats.float())) 142 | 143 | visual_query = clip_feats 144 | visual_query = self.visual_blocks(visual_query, 0) 145 | 146 | visual_query = self.visual_proj(visual_query) 147 | visual_query = self.visual_proj_norm(visual_query) 148 | 149 | return visual_query 150 | 151 | def forward(self, tokens, labels, imgs): 152 | 153 | visual_proj = self.forward_visual(imgs) 154 | 155 | _bsz, seqlen = tokens.shape 156 | 157 | h = self.llama.tok_embeddings(tokens) 158 | freqs_cis = self.llama.freqs_cis.to(h.device) 159 | freqs_cis = freqs_cis[:seqlen] 160 | mask = None 161 | mask = torch.full((1, 1, seqlen, seqlen), float("-inf"), device=h.device) 162 | mask = torch.triu(mask, diagonal=0 + 1).type_as(h) 163 | 164 | adapter = self.adapter_query.weight.reshape(self.query_layer, self.query_len, -1).unsqueeze(1) 165 | adapter_index = 0 166 | for layer in self.llama.layers: 167 | h = layer(h, 0, freqs_cis, mask, 
visual_proj + adapter[adapter_index]) 168 | adapter_index = adapter_index + 1 169 | 170 | h = self.llama.norm(h) 171 | output = self.llama.output(h) 172 | output = output[:, :-1, :] 173 | labels = labels[:, 1:] 174 | 175 | if labels.sum() == 0: 176 | c_loss = output.mean() * 0 177 | else: 178 | assert self.llama.vocab_size == 32000 179 | c_loss = self.criterion(output.reshape(-1, self.llama.vocab_size), labels.flatten()) 180 | 181 | return c_loss, c_loss 182 | 183 | #@torch.inference_mode() 184 | @torch.no_grad() 185 | def forward_inference(self, visual_proj, tokens, start_pos: int): 186 | _bsz, seqlen = tokens.shape 187 | h = self.llama.tok_embeddings(tokens) 188 | freqs_cis = self.llama.freqs_cis.to(h.device) 189 | freqs_cis = freqs_cis[start_pos : start_pos + seqlen] 190 | mask = None 191 | mask = torch.full((1, 1, seqlen, seqlen), float("-inf"), device=h.device) 192 | mask = torch.triu(mask, diagonal=start_pos + 1).type_as(h) 193 | 194 | 195 | adapter = self.adapter_query.weight.reshape(self.query_layer, self.query_len, -1).unsqueeze(1) 196 | adapter_index = 0 197 | 198 | for layer in self.llama.layers: 199 | h = layer(h, start_pos, freqs_cis, mask, visual_proj + adapter[adapter_index].repeat(_bsz, 1, 1)) 200 | adapter_index = adapter_index + 1 201 | 202 | h = self.llama.norm(h) 203 | output = self.llama.output(h[:, -1, :]) 204 | 205 | return output.float() 206 | 207 | #@torch.inference_mode() 208 | @torch.no_grad() 209 | def generate( 210 | self, imgs, prompts, 211 | max_gen_len: int = 256, 212 | temperature: float = 0.1, 213 | top_p: float = 0.75, 214 | ): 215 | bsz = len(imgs) 216 | params = self.llama.params 217 | assert bsz <= params.max_batch_size, (bsz, params.max_batch_size) 218 | assert len(imgs) == len(prompts) 219 | 220 | with torch.cuda.amp.autocast(): 221 | visual_query = self.forward_visual(imgs) 222 | 223 | if isinstance(prompts[0], str): 224 | prompts = [self.tokenizer.encode(x, bos=True, eos=False) for x in prompts] 225 | 226 | min_prompt_size = min([len(t) for t in prompts]) 227 | max_prompt_size = max([len(t) for t in prompts]) 228 | 229 | total_len = min(params.max_seq_len, max_gen_len + max_prompt_size) 230 | 231 | tokens = torch.full((bsz, total_len), self.tokenizer.pad_id).cuda().long() 232 | 233 | for k, t in enumerate(prompts): 234 | tokens[k, : len(t)] = torch.tensor(t).cuda().long() 235 | input_text_mask = tokens != self.tokenizer.pad_id 236 | start_pos = min_prompt_size 237 | prev_pos = 0 238 | for cur_pos in range(start_pos, total_len): 239 | with torch.cuda.amp.autocast(): 240 | logits = self.forward_inference(visual_query, tokens[:, prev_pos:cur_pos], prev_pos) 241 | if temperature > 0: 242 | probs = torch.softmax(logits / temperature, dim=-1) 243 | next_token = sample_top_p(probs, top_p) 244 | else: 245 | next_token = torch.argmax(logits, dim=-1) 246 | next_token = next_token.reshape(-1) 247 | 248 | next_token = torch.where( 249 | input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token 250 | ) 251 | tokens[:, cur_pos] = next_token 252 | # trick: early stop if bsz==1 253 | if bsz == 1 and next_token[0] == self.tokenizer.eos_id: 254 | break 255 | prev_pos = cur_pos 256 | 257 | decoded = [] 258 | for i, t in enumerate(tokens.tolist()): 259 | 260 | # cut to max gen len 261 | t = t[len(prompts[i]): len(prompts[i]) + max_gen_len] 262 | # cut to eos tok if any 263 | try: 264 | t = t[: t.index(self.tokenizer.eos_id)] 265 | except ValueError: 266 | pass 267 | decoded.append(self.tokenizer.decode(t)) 268 | 269 | return decoded 270 | 271 | 272 | _MODELS = { 273 
| "BIAS-7B": "https://github.com/ZrrSkywalker/LLaMA-Adapter/releases/download/v.2.0.0/7fa55208379faf2dd862565284101b0e4a2a72114d6490a95e432cf9d9b6c813_BIAS-7B.pth", 274 | # "LORA16-7B": "", 275 | # "PARTIAL-7B": "" 276 | } 277 | 278 | def available_models(): 279 | return list(_MODELS.keys()) 280 | 281 | def load(name, llama_dir, device="cuda" if torch.cuda.is_available() else "cpu", download_root='ckpts', max_seq_len=512, 282 | phase="finetune"): 283 | if name in _MODELS: 284 | model_path = _download(_MODELS[name], download_root) 285 | elif os.path.isfile(name): 286 | model_path = name 287 | else: 288 | return RuntimeError(f"Model {name} not found; available models = {available_models()}") 289 | 290 | ckpt = torch.load(model_path, map_location='cpu') 291 | 292 | # BIAS-7B or https://xxx/sha256_BIAS-7B.pth -> 7B 293 | llama_type = name.split('.')[0].split('-')[-1] 294 | llama_ckpt_dir = os.path.join(llama_dir, llama_type) 295 | llama_tokenzier_path = os.path.join(llama_dir, 'tokenizer.model') 296 | 297 | # load llama_adapter weights and model_cfg 298 | print(f'Loading checkpoint from {model_path}') 299 | 300 | 301 | model = LLaMA_adapter( 302 | llama_dir, llama_tokenzier_path, 303 | max_seq_len=max_seq_len, max_batch_size=1, 304 | clip_model='ViT-L/14@336px', 305 | v_embed_dim=1024, v_depth=16, 306 | v_num_heads=16, v_mlp_ratio=4.0, 307 | query_len=577, query_layer=32, 308 | phase=phase) 309 | 310 | load_result = model.load_state_dict(ckpt['model'], strict=False) 311 | 312 | # assert len(load_result.unexpected_keys) == 0, f"Unexpected keys: {load_result.unexpected_keys}" 313 | return model.to(device), model.clip_transform 314 | -------------------------------------------------------------------------------- /data_collection/stats/test_id.txt: -------------------------------------------------------------------------------- 1 | 24931 Table 2 | 103319 Window 3 | 26387 Table 4 | 100082 USB 5 | 2095 Faucet 6 | 34178 Table 7 | 28164 Table 8 | 102625 Toilet 9 | 102018 Oven 10 | 102001 Oven 11 | 102765 Kettle 12 | 1712 Faucet 13 | 102678 Toilet 14 | 102984 Window 15 | 30666 Table 16 | 103051 Window 17 | 100065 USB 18 | 102669 Toilet 19 | 102037 USB 20 | 148 Faucet 21 | 27619 Table 22 | 100513 USB 23 | 102663 Toilet 24 | 103813 Phone 25 | 1788 Faucet 26 | 7332 Oven 27 | 103285 Phone 28 | 149 Faucet 29 | 102732 Kettle 30 | 1653 Faucet 31 | 103305 Stapler 32 | 11040 Scissors 33 | 7290 Oven 34 | 10960 Scissors 35 | 1626 Faucet 36 | 25308 Table 37 | 2035 Faucet 38 | 100061 USB 39 | 102641 Toilet 40 | 102629 Toilet 41 | 103238 Window 42 | 102690 Toilet 43 | 27267 Table 44 | 153 Faucet 45 | 33810 Table 46 | 102703 Toilet 47 | 1444 Faucet 48 | 25160 Table 49 | 2054 Faucet 50 | 102687 Toilet 51 | 1556 Faucet 52 | 102805 Window 53 | 103040 Window 54 | 103340 Window 55 | 2115 Faucet 56 | 102621 Toilet 57 | 1931 Faucet 58 | 34610 Table 59 | 26886 Table 60 | 101924 Oven 61 | 32174 Table 62 | 100095 USB 63 | 100116 USB 64 | 11111 Scissors 65 | 33914 Table 66 | 3971 Kettle 67 | 7347 Oven 68 | 102654 Toilet 69 | 1596 Faucet 70 | 102675 Toilet 71 | 102055 Oven 72 | 1028 Faucet 73 | 103293 Stapler 74 | 103814 Phone 75 | 102677 Toilet 76 | 103312 Window 77 | 103276 Stapler 78 | 103518 WashingMachine 79 | 101931 Oven 80 | 1721 Faucet 81 | 10569 Scissors 82 | 20985 Table 83 | 101943 Oven 84 | 102801 Window 85 | 102647 Toilet 86 | 103917 Phone 87 | 102756 Kettle 88 | 32625 Table 89 | 26525 Table 90 | 102620 Toilet 91 | 100106 USB 92 | 103935 Phone 93 | 102634 Toilet 94 | 1401 Faucet 95 | 102707 Toilet 96 | 102657 Toilet 
97 | 32746 Table 98 | 103095 Stapler 99 | 19179 Table 100 | 103235 Window 101 | 102679 Toilet 102 | 1646 Faucet 103 | 1488 Faucet 104 | 11047 Scissors 105 | 102684 Toilet 106 | 102738 Kettle 107 | 102977 Window 108 | 10902 Scissors 109 | 100031 Kettle 110 | 23782 Table 111 | 10559 Scissors 112 | 26503 Table 113 | 103480 WashingMachine 114 | 32259 Table 115 | 103332 Window 116 | 102724 Kettle 117 | 10889 Scissors 118 | 7179 Oven 119 | 102021 USB 120 | 100283 WashingMachine 121 | 19740 Table 122 | 103892 Phone 123 | 929 Faucet 124 | 10622 Scissors 125 | 20411 Table 126 | 103425 WashingMachine 127 | 102981 Window 128 | 10499 Scissors 129 | 103775 WashingMachine 130 | 101983 USB 131 | 10564 Scissors 132 | 102665 Toilet 133 | 102676 Toilet 134 | 102903 Window 135 | 32213 Table 136 | 33457 Table 137 | 29921 Table 138 | 1925 Faucet 139 | 103099 Stapler 140 | 100068 USB 141 | 2084 Faucet 142 | 101947 Oven 143 | 101311 Kettle 144 | 103320 Window 145 | 103201 Kettle 146 | 103208 Kettle 147 | 7220 Oven 148 | 27189 Table 149 | 102704 Toilet 150 | 102688 Toilet 151 | 10962 Scissors 152 | 26657 Table 153 | 1427 Faucet 154 | 23472 Table 155 | 2113 Faucet 156 | 22339 Table 157 | 25493 Table 158 | 102804 Window 159 | 11077 Scissors 160 | 1741 Faucet 161 | 22508 Table 162 | 102730 Kettle 163 | 101319 Toilet 164 | 1794 Faucet 165 | 24644 Table 166 | 103828 Phone 167 | 103301 Stapler 168 | 20279 Table 169 | 23807 Table 170 | 101999 USB 171 | 102024 USB 172 | 23372 Table 173 | 102736 Kettle 174 | 102062 USB 175 | 101917 Oven 176 | 11013 Scissors 177 | 1903 Faucet 178 | 103236 Window 179 | 102667 Toilet 180 | 1280 Faucet 181 | 102906 Window 182 | 103941 Phone 183 | 102710 Toilet 184 | 102628 Toilet 185 | 100064 USB 186 | 2108 Faucet 187 | 102708 Toilet 188 | 102063 USB 189 | 34617 Table 190 | 103792 Stapler 191 | 100071 USB 192 | 1795 Faucet 193 | 30869 Table 194 | 100079 USB 195 | 103255 Window 196 | 2083 Faucet 197 | 26608 Table 198 | 22241 Table 199 | 25959 Table 200 | 103275 Stapler 201 | 103253 Window 202 | 102896 Window 203 | 101948 USB 204 | 102658 Toilet 205 | 102697 Toilet 206 | 31601 Table 207 | 103351 WashingMachine 208 | 32324 Table 209 | 19898 Table 210 | 29557 Table 211 | 103050 Window 212 | 102025 USB 213 | 7187 Oven 214 | 1896 Faucet 215 | 102660 Toilet 216 | 103052 Window 217 | 103369 WashingMachine 218 | 102753 Kettle 219 | 102631 Toilet 220 | 100133 USB 221 | 102692 Toilet 222 | 26670 Table 223 | 102662 Toilet 224 | 103113 Stapler 225 | 866 Faucet 226 | 103058 Window 227 | 27044 Table 228 | 103032 Window 229 | 167 Faucet 230 | 103227 Toilet 231 | 101909 Oven 232 | 101315 Kettle 233 | 102649 Toilet 234 | 10557 Scissors 235 | 102786 Kettle 236 | 102639 Toilet 237 | 19825 Table 238 | 102060 Oven 239 | 103063 Window 240 | 100086 USB 241 | 10561 Scissors 242 | 102622 Toilet 243 | 103044 Window 244 | 103593 Phone 245 | 102016 USB 246 | 102698 Toilet 247 | 24152 Table 248 | 1885 Faucet 249 | 26073 Table 250 | 21467 Table 251 | 101952 USB 252 | 32086 Table 253 | 100073 USB 254 | 103042 Window 255 | 102798 Window 256 | 22301 Table 257 | 100113 USB 258 | 32932 Table 259 | 103325 Window 260 | 102739 Kettle 261 | 931 Faucet 262 | 10562 Scissors 263 | 1961 Faucet 264 | 103251 Phone 265 | 102651 Toilet 266 | 103015 Window 267 | 102726 Kettle 268 | 102664 Toilet 269 | 102636 Toilet 270 | 20453 Table 271 | 102905 Window 272 | 10973 Scissors 273 | 101921 Oven 274 | 22367 Table 275 | 103271 Stapler 276 | 33930 Table 277 | 1785 Faucet 278 | 10502 Scissors 279 | 1823 Faucet 280 | 152 Faucet 281 | 2140 Faucet 282 
| 102773 Kettle 283 | 101930 Oven 284 | 102648 Toilet 285 | 103233 Toilet 286 | 1802 Faucet 287 | 1935 Faucet 288 | 26692 Table 289 | 1034 Faucet 290 | 2170 Faucet 291 | 101960 USB 292 | 102619 Toilet 293 | 1011 Faucet 294 | 32354 Table 295 | 10567 Scissors 296 | 103297 Stapler 297 | 10686 Scissors 298 | 103361 WashingMachine 299 | 11089 Scissors 300 | 103778 WashingMachine 301 | 10537 Scissors 302 | 23511 Table 303 | 103234 Toilet 304 | 30663 Table 305 | 100085 USB 306 | 26899 Table 307 | 103776 WashingMachine 308 | 103350 Phone 309 | 26800 Table 310 | 102763 Kettle 311 | 102009 USB 312 | 32052 Table 313 | 103329 Window 314 | 103781 WashingMachine 315 | 156 Faucet 316 | 857 Faucet 317 | 30857 Table 318 | 101320 Toilet 319 | 101808 Oven 320 | 11080 Scissors 321 | 11029 Scissors 322 | 1435 Faucet 323 | 102052 USB 324 | 103230 Toilet 325 | 102666 Toilet 326 | 103927 Phone 327 | 10894 Scissors 328 | 10844 Scissors 329 | 1901 Faucet 330 | 10558 Scissors 331 | 11021 Scissors 332 | 7130 Oven 333 | 1466 Faucet 334 | 20745 Table 335 | 103077 Window 336 | 21473 Table 337 | 103273 Stapler 338 | 1817 Faucet 339 | 101950 USB 340 | 101946 Oven 341 | 103315 Window 342 | 103283 Stapler 343 | 103070 Window 344 | 100128 USB 345 | 960 Faucet 346 | 102646 Toilet 347 | 103239 Window 348 | 101940 Oven 349 | 1380 Faucet 350 | 100072 USB 351 | 1633 Faucet 352 | 1986 Faucet 353 | 103223 Kettle 354 | 102670 Toilet 355 | 1053 Faucet 356 | 102689 Toilet 357 | 103684 Window 358 | 822 Faucet 359 | 103333 Window 360 | 103207 Kettle 361 | 21718 Table 362 | 102715 Kettle 363 | 100087 USB 364 | 103299 Stapler 365 | 102042 USB 366 | 7201 Oven 367 | 103311 Window 368 | 11099 Scissors 369 | 10968 Scissors 370 | 103452 WashingMachine 371 | 811 Faucet 372 | 862 Faucet 373 | 103540 Window 374 | 1370 Faucet 375 | 103149 Window 376 | 103303 Stapler 377 | 102652 Toilet 378 | 102068 USB 379 | 100511 USB 380 | 26545 Table 381 | 100078 USB 382 | 100103 USB 383 | 103056 Window 384 | 22870 Table 385 | 102701 Toilet 386 | 32601 Table 387 | 101323 Toilet 388 | 2017 Faucet 389 | 1288 Faucet 390 | 27478 Table 391 | 103886 Phone 392 | 23724 Table 393 | 101886 USB 394 | 20077 Table 395 | 101305 Kettle 396 | 1479 Faucet 397 | 11036 Scissors 398 | 103925 Phone 399 | 102761 Kettle 400 | 102985 Window 401 | 103347 Phone 402 | 19855 Table 403 | 1668 Faucet 404 | 154 Faucet 405 | 103321 Window 406 | 11026 Scissors 407 | 1528 Faucet 408 | 100108 USB 409 | 102768 Kettle 410 | 31249 Table 411 | 103339 Window 412 | 7120 Oven 413 | 102720 Kettle 414 | 103669 Window 415 | 885 Faucet 416 | 1667 Faucet 417 | 28594 Table 418 | 11052 Scissors 419 | 32566 Table 420 | 101971 Oven 421 | 1343 Faucet 422 | 101773 Oven 423 | 11113 Scissors 424 | 102065 USB 425 | 103318 Window 426 | 29525 Table 427 | 102803 Window 428 | 102714 Kettle 429 | 100982 Window 430 | 2082 Faucet 431 | 908 Faucet 432 | 103292 Stapler 433 | 10495 Scissors 434 | 103242 Window 435 | 101994 USB 436 | 103521 WashingMachine 437 | 26875 Table 438 | 11020 Scissors 439 | 1492 Faucet 440 | 102702 Toilet 441 | 1386 Faucet 442 | 103280 Stapler 443 | 10893 Scissors 444 | 1052 Faucet 445 | 10449 Scissors 446 | 103268 Window 447 | 168 Faucet 448 | 103222 Kettle 449 | 10907 Scissors 450 | 22692 Table 451 | 103323 Window 452 | 100123 USB 453 | 102100 Fan 454 | 101539 Dispenser 455 | 41452 StorageFurniture 456 | 101531 Dispenser 457 | 47686 StorageFurniture 458 | 46768 StorageFurniture 459 | 45523 StorageFurniture 460 | 12727 Keyboard 461 | 9148 Door 462 | 101457 Fan 463 | 45759 StorageFurniture 464 | 46440 
StorageFurniture 465 | 46057 StorageFurniture 466 | 45747 StorageFurniture 467 | 100550 Suitcase 468 | 101611 Safe 469 | 100740 Globe 470 | 101117 Remote 471 | 101010 Remote 472 | 45134 StorageFurniture 473 | 103372 Dispenser 474 | 46380 StorageFurniture 475 | 101332 Eyeglasses 476 | 100794 Globe 477 | 100162 Box 478 | 10612 Refrigerator 479 | 101467 Fan 480 | 100526 FoldingChair 481 | 100760 Globe 482 | 15084 Lamp 483 | 102939 Pen 484 | 100426 Box 485 | 102588 Eyeglasses 486 | 102617 Eyeglasses 487 | 100498 Cart 488 | 46230 StorageFurniture 489 | 102095 Fan 490 | 48746 StorageFurniture 491 | 12477 TrashCan 492 | 3571 Bottle 493 | 9032 Door 494 | 102996 TrashCan 495 | 101793 Pen 496 | 46616 StorageFurniture 497 | 48876 StorageFurniture 498 | 100038 KitchenPot 499 | 14205 Lamp 500 | 3933 Bottle 501 | 101068 Knife 502 | 100756 Globe 503 | 12923 Keyboard 504 | 100924 Switch 505 | 103755 Suitcase 506 | 45001 StorageFurniture 507 | 101054 Knife 508 | 100172 Pliers 509 | 101541 Dispenser 510 | 102556 Cart 511 | 7366 Microwave 512 | 100999 Remote 513 | 49038 StorageFurniture 514 | 46108 StorageFurniture 515 | 102209 TrashCan 516 | 45690 StorageFurniture 517 | 11030 Laptop 518 | 103866 Printer 519 | 13928 Lamp 520 | 44962 StorageFurniture 521 | 45219 StorageFurniture 522 | 101565 Dispenser 523 | 46549 StorageFurniture 524 | 11211 Refrigerator 525 | 103010 TrashCan 526 | 100214 Box 527 | 100750 Globe 528 | 103582 Knife 529 | 103353 Dispenser 530 | 100189 Box 531 | 104040 Remote 532 | 46966 StorageFurniture 533 | 102186 TrashCan 534 | 102966 Pen 535 | 102423 Safe 536 | 48010 StorageFurniture 537 | 45132 StorageFurniture 538 | 101845 Eyeglasses 539 | 46825 StorageFurniture 540 | 100590 FoldingChair 541 | 100914 Switch 542 | 103770 Suitcase 543 | 103408 Dispenser 544 | 103996 Printer 545 | 45135 StorageFurniture 546 | 102177 TrashCan 547 | 10040 Laptop 548 | 100191 Box 549 | 48013 StorageFurniture 550 | 10144 Refrigerator 551 | 100792 Globe 552 | 10068 Refrigerator 553 | 46801 StorageFurniture 554 | 45677 StorageFurniture 555 | 102155 TrashCan 556 | 4094 Display 557 | 48740 StorageFurniture 558 | 103728 Knife 559 | 10707 Laptop 560 | 103740 Knife 561 | 100408 Remote 562 | 100056 KitchenPot 563 | 46466 StorageFurniture 564 | 100292 Lighter 565 | 104030 Printer 566 | 102922 Pen 567 | 13525 Lamp 568 | 45194 StorageFurniture 569 | 101440 Fan 570 | 3854 Bottle 571 | 10280 Laptop 572 | 102314 FoldingChair 573 | 102254 TrashCan 574 | 45423 StorageFurniture 575 | 100013 Remote 576 | 101593 Safe 577 | 48379 StorageFurniture 578 | 102171 TrashCan 579 | 100974 Switch 580 | 44817 StorageFurniture 581 | 47926 StorageFurniture 582 | 46439 StorageFurniture 583 | 10849 Refrigerator 584 | 3616 Bottle 585 | 4681 Display 586 | 46060 StorageFurniture 587 | 35059 StorageFurniture 588 | 101476 Fan 589 | 100767 Suitcase 590 | 45622 StorageFurniture 591 | 104004 Printer 592 | 101060 Cart 593 | 14306 Lamp 594 | 101605 Safe 595 | 4541 Display 596 | 4542 Display 597 | 102210 TrashCan 598 | 4592 Display 599 | 103967 Globe 600 | 4500 Bottle 601 | 9117 Door 602 | 46700 StorageFurniture 603 | 12851 Keyboard 604 | 101260 Knife 605 | 100948 Switch 606 | 10270 Laptop 607 | 102946 Pen 608 | 101115 Knife 609 | 100021 KitchenPot 610 | 101493 Fan 611 | 10101 Laptop 612 | 45822 StorageFurniture 613 | 45908 StorageFurniture 614 | 47645 Box 615 | 10697 Laptop 616 | 101375 Fan 617 | 101524 Fan 618 | 48479 StorageFurniture 619 | 101090 Cart 620 | 9263 Door 621 | 9968 Laptop 622 | 103762 Suitcase 623 | 9016 Door 624 | 101139 Remote 625 | 5688 
Bottle 626 | 103194 Eyeglasses 627 | 10638 Refrigerator 628 | 102608 Eyeglasses 629 | 11141 Laptop 630 | 4590 Display 631 | 103989 Printer 632 | 45427 StorageFurniture 633 | 103358 Dispenser 634 | 46403 StorageFurniture 635 | 48686 StorageFurniture 636 | 100279 Printer 637 | 3593 Bottle 638 | 7119 Microwave 639 | 100491 Cart 640 | 100828 Remote 641 | 12059 Refrigerator 642 | 15423 Lamp 643 | 16047 Lamp 644 | 6040 Bottle 645 | 101860 Eyeglasses 646 | 3519 Bottle 647 | 100057 KitchenPot 648 | 101450 Fan 649 | 100310 Lighter 650 | 46180 StorageFurniture 651 | 102318 Safe 652 | 100853 Cart 653 | 101416 Mouse 654 | 102160 TrashCan 655 | 45780 StorageFurniture 656 | 104044 Remote 657 | 12447 TrashCan 658 | 100075 Cart 659 | 101861 Eyeglasses 660 | 48413 StorageFurniture 661 | 100234 Box 662 | 45403 StorageFurniture 663 | 100350 Lighter 664 | 16675 Lamp 665 | 100968 Switch 666 | 102292 Pliers 667 | 101066 Cart 668 | 45397 StorageFurniture 669 | 103013 TrashCan 670 | 46653 StorageFurniture 671 | 100221 Box 672 | 103380 Dispenser 673 | 45092 StorageFurniture 674 | 100759 Globe 675 | 100557 FoldingChair 676 | 100825 Suitcase 677 | 7263 Microwave 678 | 100243 Box 679 | 11242 Laptop 680 | 48381 StorageFurniture 681 | 102943 Pen 682 | 103978 Printer 683 | 48243 StorageFurniture 684 | 103583 Knife 685 | 102193 TrashCan 686 | 45699 StorageFurniture 687 | 100202 Box 688 | 9987 Laptop 689 | 101420 Fan 690 | 46084 StorageFurniture 691 | 100032 KitchenPot 692 | 46033 StorageFurniture 693 | 101099 Cart 694 | 100965 Switch 695 | 100801 Globe 696 | 101419 Fan 697 | 102173 TrashCan 698 | 48467 StorageFurniture 699 | 102573 Eyeglasses 700 | 100720 Globe 701 | 3615 Bottle 702 | 101432 Fan 703 | 45213 StorageFurniture 704 | 47238 StorageFurniture 705 | 100776 Suitcase 706 | 101055 Cart 707 | 41086 StorageFurniture 708 | 45420 StorageFurniture 709 | 41083 StorageFurniture 710 | 13004 Keyboard 711 | 12902 Keyboard 712 | 101489 Dispenser 713 | 102130 Remote 714 | --------------------------------------------------------------------------------
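Usage note (not part of the repository files above): a minimal sketch of how the load/generate interface defined in test/llama/llama_adapter.py might be driven end to end. It assumes a CUDA device, the LLaMA weights and tokenizer placed under ./ckpts/llama_model_weights as the code expects, and that it is run from the test/ directory so the llama package is importable; the adapter checkpoint path, image file, and prompt string below are placeholders, not values taken from the repository.

# Hypothetical driver for the LLaMA-Adapter wrapper shown above (sketch only).
# Assumptions: CUDA is available, LLaMA weights sit under ./ckpts/llama_model_weights,
# and 'ckpts/manipllm_adapter.pth', 'example_rgb.png', and the prompt text are placeholders.
import torch
from PIL import Image
from llama.llama_adapter import load

# load() returns the adapter-wrapped model and the CLIP ViT-L/14@336px preprocessing transform.
model, preprocess = load("ckpts/manipllm_adapter.pth", llama_dir="./ckpts/llama_model_weights", phase="finetune")
model.eval()

# Batch of one RGB image, preprocessed to the 336x336 resolution CLIP expects.
img = preprocess(Image.open("example_rgb.png").convert("RGB")).unsqueeze(0).cuda()
prompt = "Predict the contact point and gripper direction for manipulating the object."  # placeholder instruction
with torch.no_grad():
    outputs = model.generate(img, [prompt], max_gen_len=256, temperature=0.1, top_p=0.75)
print(outputs[0])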