├── tma ├── __init__.py ├── imageqa │ ├── __init__.py │ ├── scene_graph │ │ ├── __init__.py │ │ └── single_image_task.py │ ├── sticker_2d │ │ ├── __init__.py │ │ └── utils.py │ ├── tabletop_3d │ │ ├── __init__.py │ │ ├── single_image_task.py │ │ ├── utils.py │ │ ├── run_blender.py │ │ ├── size_single_image_task.py │ │ └── distance_single_image_task.py │ └── metadata.py ├── videoqa │ ├── __init__.py │ ├── scene_graph │ │ ├── __init__.py │ │ └── single_video_task.py │ ├── tabletop_3d │ │ ├── __init__.py │ │ ├── single_video_task.py │ │ ├── utils.py │ │ ├── movement_single_video_task.py │ │ └── run_blender.py │ └── metadata.py ├── models │ ├── __init__.py │ └── qa_model │ │ ├── __init__.py │ │ ├── prompt.py │ │ ├── base_qa_model.py │ │ └── videoqa_model.py ├── constant.py ├── metadata.py ├── base.py └── task_store.py ├── teaser.png ├── assets ├── 2024-imageqa-result.png ├── 2024-videoqa-result.png ├── 2024vsrandom-imageqa.png ├── 2024vsrandom-videoqa.png ├── random-imageqa-result.png └── random-videoqa-result.png ├── requirements.txt ├── .gitignore ├── annotations ├── relation_to_type.json └── attribute_category.json ├── LICENSE └── README.md /tma/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tma/imageqa/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tma/videoqa/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tma/models/__init__.py: -------------------------------------------------------------------------------- 1 | class Model: 2 | model_name: str 3 | -------------------------------------------------------------------------------- /tma/imageqa/scene_graph/__init__.py: -------------------------------------------------------------------------------- 1 | from .single_image_task import * 2 | -------------------------------------------------------------------------------- /tma/imageqa/sticker_2d/__init__.py: -------------------------------------------------------------------------------- 1 | from .single_image_task import * 2 | -------------------------------------------------------------------------------- /tma/videoqa/scene_graph/__init__.py: -------------------------------------------------------------------------------- 1 | from .single_video_task import * 2 | -------------------------------------------------------------------------------- /teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JieyuZ2/TaskMeAnything/HEAD/teaser.png -------------------------------------------------------------------------------- /assets/2024-imageqa-result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JieyuZ2/TaskMeAnything/HEAD/assets/2024-imageqa-result.png -------------------------------------------------------------------------------- /assets/2024-videoqa-result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JieyuZ2/TaskMeAnything/HEAD/assets/2024-videoqa-result.png -------------------------------------------------------------------------------- /assets/2024vsrandom-imageqa.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/JieyuZ2/TaskMeAnything/HEAD/assets/2024vsrandom-imageqa.png -------------------------------------------------------------------------------- /assets/2024vsrandom-videoqa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JieyuZ2/TaskMeAnything/HEAD/assets/2024vsrandom-videoqa.png -------------------------------------------------------------------------------- /assets/random-imageqa-result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JieyuZ2/TaskMeAnything/HEAD/assets/random-imageqa-result.png -------------------------------------------------------------------------------- /assets/random-videoqa-result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JieyuZ2/TaskMeAnything/HEAD/assets/random-videoqa-result.png -------------------------------------------------------------------------------- /tma/videoqa/tabletop_3d/__init__.py: -------------------------------------------------------------------------------- 1 | from .movement_single_video_task import * 2 | from .rotation_single_video_task import * 3 | -------------------------------------------------------------------------------- /tma/imageqa/tabletop_3d/__init__.py: -------------------------------------------------------------------------------- 1 | from .distance_single_image_task import * 2 | from .single_image_task import * 3 | from .size_single_image_task import * 4 | -------------------------------------------------------------------------------- /tma/constant.py: -------------------------------------------------------------------------------- 1 | NUM_OPTIONS = 4 2 | 3 | # ImageQA 4 | 5 | IMAGE_H = 512 6 | IMAGE_W = 512 7 | 8 | # VideoQA 9 | 10 | VIDEO_H = 224 11 | VIDEO_W = 224 12 | VIDEO_FPS = 4 13 | VIDEO_NUM_FRAMES = 16 14 | -------------------------------------------------------------------------------- /tma/models/qa_model/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_qa_model import QAModel, QAModelInstance 2 | from .imageqa_model import ImageQAModel, list_imageqa_models, set_imageqa_model_key 3 | from .videoqa_model import ImageQAModel4Video, VideoQAModel, list_videoqa_models 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | sentence-transformers==2.5.1 2 | transformers==4.38.1 3 | accelerate==0.27.2 4 | diskcache 5 | networkx 6 | openai 7 | pyarrow 8 | scikit-learn 9 | pandas 10 | matplotlib 11 | tiktoken 12 | einops 13 | transformers_stream_generator 14 | prefixspan 15 | dashscope 16 | oss2 17 | google.generativeai 18 | replicate 19 | decord 20 | opencv-python -------------------------------------------------------------------------------- /tma/metadata.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import networkx as nx 4 | 5 | 6 | class MetaData: 7 | """ 8 | Abstract class for metadata 9 | """ 10 | 11 | 12 | class CategoryMetaData(MetaData): 13 | def __init__(self): 14 | super().__init__() 15 | 16 | self.taxonomy = None 17 | self.categories = None 18 | self.category_info = None 19 | 20 | def check_category_exists(self, cateid): 
21 | return cateid in self.categories 22 | 23 | def get_surfacename(self, node): 24 | return self.category_info[node]['surface_name'][0] 25 | 26 | def get_relevant_categories(self, cateid): 27 | return set(nx.descendants(self.taxonomy, cateid)) | set(nx.ancestors(self.taxonomy, cateid)) | {cateid} 28 | 29 | def get_irrelevant_categories(self, cateid): 30 | if isinstance(cateid, List): 31 | relevant_categories = set() 32 | for c in cateid: 33 | relevant_categories |= self.get_relevant_categories(c) 34 | else: 35 | relevant_categories = self.get_relevant_categories(cateid) 36 | return set(self.categories) - relevant_categories 37 | -------------------------------------------------------------------------------- /tma/videoqa/metadata.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pickle 4 | 5 | from ..imageqa.metadata import Objaverse3DMetaData 6 | from ..metadata import MetaData 7 | 8 | 9 | class ObjaverseVideoMetaData(Objaverse3DMetaData): 10 | pass 11 | 12 | 13 | def load_video_scene_graph(video_scene_graph_folder): 14 | video_folder = os.path.join(video_scene_graph_folder, "Charades_v1_480") 15 | scene_graphs = json.load(open(os.path.join(video_scene_graph_folder, "video_scene_graph/video_scene_graph.json"))) 16 | idx2name = pickle.load(open(os.path.join(video_scene_graph_folder, "video_scene_graph/idx2name.pkl"), "rb")) 17 | objects = pickle.load(open(os.path.join(video_scene_graph_folder, "video_scene_graph/objects.pkl"), "rb")) 18 | actions = pickle.load(open(os.path.join(video_scene_graph_folder, "video_scene_graph/actions.pkl"), "rb")) 19 | spatial_relations = pickle.load(open(os.path.join(video_scene_graph_folder, "video_scene_graph/spatial_relations.pkl"), "rb")) 20 | contact_relations = pickle.load(open(os.path.join(video_scene_graph_folder, "video_scene_graph/contact_relations.pkl"), "rb")) 21 | return video_folder, scene_graphs, idx2name, objects, actions, spatial_relations, contact_relations 22 | 23 | 24 | class VideoSceneGraphMetaData(MetaData): 25 | def __init__(self, path_to_metadata, video_scene_graph_folder): 26 | super().__init__() 27 | # Video scene graphs use indices (e.g. r1, o1) to represent relations, objects, and actions; idx2name maps each index to its name. 
28 | self.image_folder, self.video_scene_graphs, self.idx2name, self.objects, self.actions, self.spatial_relations, self.contact_relations = ( 29 | load_video_scene_graph(video_scene_graph_folder)) 30 | 31 | def get_video_path(self, video_scene_graph_id): 32 | return os.path.join(self.image_folder, video_scene_graph_id + ".mp4") 33 | -------------------------------------------------------------------------------- /tma/base.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import numpy as np 4 | 5 | from .constant import NUM_OPTIONS 6 | from .metadata import MetaData 7 | from .task_store import TaskStore 8 | 9 | 10 | class TaskGenerator: 11 | schema = {} 12 | 13 | def __init__(self, metadata: MetaData, seed=42): 14 | self.metadata = metadata 15 | self.rng = np.random.default_rng(seed=seed) 16 | 17 | def _compose_options(self, answer, negatives): 18 | if len(negatives) > NUM_OPTIONS - 1: 19 | negatives = self.rng.choice(negatives, NUM_OPTIONS - 1, replace=False).tolist() 20 | options = [answer] + negatives 21 | return options 22 | 23 | def _task_plan_to_str(self, task_plan) -> str: 24 | "(Abstract method) convert task plan to string for task embedding" 25 | 26 | def enumerate_task_plans(self, task_store: TaskStore): 27 | "(Abstract method) enumerate task plan" 28 | 29 | def generate(self, task_plan, return_data=True, seed=None): 30 | "(Abstract method) enumerate task" 31 | 32 | 33 | class JointTaskGenerator: 34 | def __init__(self, metadata: MetaData, generators: Dict, seed=42): 35 | self.generators = { 36 | k: v(metadata, seed=seed) for k, v in generators.items() 37 | } 38 | self.stats = {generator_type: 0 for generator_type in generators} 39 | self.schema = {} 40 | for generator_type, generator in self.generators.items(): 41 | self.schema.update(generator.schema) 42 | 43 | def enumerate_task_plans(self, task_store: TaskStore): 44 | for generator_type, generator in self.generators.items(): 45 | before = len(task_store) 46 | generator.enumerate_task_plans(task_store) 47 | self.stats[generator_type] = len(task_store) - before 48 | print(f"Generated [{self.stats[generator_type]}] {generator_type} tasks") 49 | task_store.dump() 50 | 51 | def generate(self, task_plan, return_data=True, seed=None): 52 | return self.generators[task_plan['task type']].generate(task_plan, return_data=return_data, seed=seed) 53 | -------------------------------------------------------------------------------- /tma/models/qa_model/prompt.py: -------------------------------------------------------------------------------- 1 | def succinct_prompt(question, choices=[]): 2 | if len(choices) == 0: 3 | prompt = question 4 | else: 5 | choices = '\n'.join(choices) 6 | prompt = (f"{question}\n" 7 | f"Select from the following choices.\n" 8 | f"{choices}") 9 | 10 | return prompt 11 | 12 | 13 | #################################################################################################### 14 | # videoqa 15 | #################################################################################################### 16 | 17 | 18 | def detailed_videoqa_prompt(question, choices=[]): 19 | if len(choices) == 0: 20 | prompt = f"Based on the video, answer the question. 
Question: {question} Answer:" 21 | else: 22 | prompt = (f"Based on the video, output the best option for the question.\n" 23 | f"You must only output the option.\n" 24 | f"Question: {question}\nOptions: {' '.join(choices)}\nBest option:(") 25 | return prompt 26 | 27 | 28 | def detailed_video2imageqa_prompt(question, choices=[]): 29 | if len(choices) == 0: 30 | prompt = f"This is a series of images sampled at equal intervals from the beginning to the end of a video, based on the series of images, answer the question. Question: {question} Answer:" 31 | else: 32 | prompt = (f"This is a series of images sampled at equal intervals from the beginning to the end of a video, based on the series of images, output the best option for the question.\n" 33 | f"You must only output the option.\n" 34 | f"Question: {question}\nOptions: {' '.join(choices)}\nBest option:(") 35 | return prompt 36 | 37 | 38 | #################################################################################################### 39 | # imageqa 40 | #################################################################################################### 41 | 42 | def detailed_imageqa_prompt(question, choices=[]): 43 | if len(choices) == 0: 44 | prompt = f"Based on the image, answer the question. Question: {question} Answer:" 45 | else: 46 | prompt = (f"Based on the image, output the best option for the question.\n" 47 | f"You must only output the option.\n" 48 | f"Question: {question}\nOptions: {' '.join(choices)}\nBest option:(") 49 | return prompt 50 | -------------------------------------------------------------------------------- /tma/task_store.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pyarrow as pa 3 | import pyarrow.parquet as pq 4 | 5 | pa_schema_map = { 6 | 'str' : pa.string(), 7 | 'int' : pa.int64(), 8 | 'list': pa.list_(pa.string()), 9 | } 10 | 11 | pd_schema_map = { 12 | 'str' : 'string', 13 | 'int' : 'Int64', 14 | 'list': 'object', 15 | } 16 | 17 | 18 | def get_pa_schema(schema): 19 | return pa.schema([(k, pa_schema_map[v]) for k, v in schema.items()]) 20 | 21 | 22 | def get_pd_schema(schema): 23 | return {k: pd_schema_map[v] for k, v in schema.items()} 24 | 25 | 26 | class TaskStore: 27 | 28 | def __init__(self, schema, output_file=None, buffer_size=1e8): 29 | self.columns = list(schema.keys()) 30 | self.dtypes = list(schema.values()) 31 | self.buffer = [] 32 | self.buffer_size = buffer_size 33 | self.output_file = output_file 34 | if output_file is None: 35 | self.schema = get_pd_schema(schema) 36 | self.task_plan_df = pd.DataFrame({k: pd.Series(dtype=v) for k, v in self.schema.items()}) 37 | else: 38 | print(f'Writing to {output_file}') 39 | self.counter = 0 40 | self.schema = get_pa_schema(schema) 41 | self.parquet_writer = pq.ParquetWriter(output_file, schema=self.schema) 42 | 43 | def _update_buffer(self): 44 | if len(self.buffer) > self.buffer_size: 45 | self.dump() 46 | 47 | def dump(self): 48 | if len(self.buffer) > 0: 49 | if self.output_file is None: 50 | self.task_plan_df = pd.concat( 51 | [self.task_plan_df, pd.DataFrame(self.buffer, columns=self.columns).astype(self.schema, errors='ignore')], 52 | ignore_index=True, 53 | sort=False 54 | ) 55 | else: 56 | self.parquet_writer.write_table(pa.Table.from_pylist(self.buffer, schema=self.schema)) 57 | self.counter += len(self.buffer) 58 | self.buffer = [] 59 | 60 | def add_many(self, xs): 61 | self.buffer.extend(xs) 62 | self._update_buffer() 63 | 64 | def add(self, x): 65 | 
self.buffer.append(x) 66 | self._update_buffer() 67 | 68 | def __len__(self): 69 | if self.output_file is None: 70 | return len(self.task_plan_df) + len(self.buffer) 71 | else: 72 | return self.counter + len(self.buffer) 73 | 74 | def return_df(self): 75 | self.dump() 76 | return self.task_plan_df 77 | 78 | def close(self): 79 | if self.output_file is not None: 80 | self.dump() 81 | self.parquet_writer.close() 82 | -------------------------------------------------------------------------------- /tma/imageqa/tabletop_3d/single_image_task.py: -------------------------------------------------------------------------------- 1 | from .utils import grid_mappings, grid_options, make_image, relative_grid, relative_position_phrase, relative_positions 2 | from ..metadata import Objaverse3DMetaData, ObjaverseMetaData 3 | from ..sticker_2d import GridTaskGenerator, HowManyGridTaskGenerator, WhatAttributeGridTaskGenerator, WhatGridTaskGenerator, WhereAttributeGridTaskGenerator, WhereGridTaskGenerator 4 | from ...constant import IMAGE_H, IMAGE_W 5 | 6 | 7 | class _3DGridTaskGenerator(GridTaskGenerator): 8 | metadata: Objaverse3DMetaData 9 | 10 | def __init__(self, metadata: ObjaverseMetaData, seed=42): 11 | super().__init__(metadata, seed=seed) 12 | self.grid_mappings = grid_mappings 13 | self.grid_options = grid_options 14 | self.relative_positions = relative_positions 15 | self.relative_position_phrase = relative_position_phrase 16 | 17 | def _make_image_metadata(self, grid_size, grids, queries, remaining_query=...): 18 | objects = [self.metadata.sample(self.rng, 1, "object", q) for q in queries] 19 | 20 | remaining_grids = [g for g in range(grid_size ** 2) if g not in grids] 21 | for _ in remaining_grids: 22 | uid = self.metadata.sample(self.rng, 1, "object", remaining_query) 23 | objects.append(uid) 24 | 25 | object_path = {k: self.metadata.get_object_path(k) for k in objects} 26 | angles = [self.metadata.sample_object_angle(self.rng, obj) for obj in objects] 27 | 28 | image_metadata = { 29 | 'grid number' : grid_size, 30 | 'objects' : objects, 31 | 'object_path' : object_path, 32 | 'object_angles' : angles, 33 | 'grids' : grids + remaining_grids, 34 | 'blender_config': self.metadata.sample_blender_configuration(self.rng) 35 | } 36 | 37 | return image_metadata 38 | 39 | def make_image(self, image_metadata): 40 | return make_image(image_metadata, self.metadata, IMAGE_H, IMAGE_W) 41 | 42 | def _relative_grid(self, grid_size, grid, reference_pos): 43 | return relative_grid(grid_size, grid, reference_pos) 44 | 45 | 46 | class What3DGridTaskGenerator(_3DGridTaskGenerator, WhatGridTaskGenerator): 47 | metadata: Objaverse3DMetaData 48 | 49 | 50 | class Where3DGridTaskGenerator(_3DGridTaskGenerator, WhereGridTaskGenerator): 51 | metadata: Objaverse3DMetaData 52 | 53 | 54 | class WhatAttribute3DGridTaskGenerator(_3DGridTaskGenerator, WhatAttributeGridTaskGenerator): 55 | metadata: Objaverse3DMetaData 56 | 57 | 58 | class WhereAttribute3DGridTaskGenerator(_3DGridTaskGenerator, WhereAttributeGridTaskGenerator): 59 | metadata: Objaverse3DMetaData 60 | 61 | 62 | class HowMany3DGridTaskGenerator(_3DGridTaskGenerator, HowManyGridTaskGenerator): 63 | metadata: Objaverse3DMetaData 64 | -------------------------------------------------------------------------------- /tma/videoqa/tabletop_3d/single_video_task.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from typing import Dict, List, Tuple 3 | 4 | import numpy as np 5 | 6 | from .utils import 
grid_mappings, grid_options, make_video, relative_grid 7 | from ..metadata import ObjaverseVideoMetaData 8 | from ...base import TaskGenerator 9 | from ...constant import VIDEO_H, VIDEO_W 10 | 11 | 12 | def check_video(video): 13 | from decord import VideoReader, cpu 14 | with tempfile.NamedTemporaryFile(delete=True, suffix=".mp4") as tmp: 15 | try: 16 | with open(tmp.name, 'wb') as file: 17 | file.write(video) 18 | with open(tmp.name, 'rb') as f: 19 | VideoReader(f, ctx=cpu(0)) 20 | except Exception as e: 21 | return False 22 | return True 23 | 24 | 25 | class GridVideoTaskGenerator(TaskGenerator): 26 | metadata: ObjaverseVideoMetaData 27 | 28 | def __init__(self, metadata: ObjaverseVideoMetaData, seed=42): 29 | super().__init__(metadata, seed=seed) 30 | self.grid_options = grid_options 31 | self.grid_mappings = grid_mappings 32 | 33 | def _relative_grid(self, grid_size, grid, reference_pos): 34 | return relative_grid(grid_size, grid, reference_pos) 35 | 36 | def _get_target_object_query(self, task_plan): 37 | if 'attribute type' in task_plan: 38 | return self.metadata.and_query([("category", task_plan['target category'], True), (task_plan['attribute type'], task_plan['attribute value'], True)]) 39 | else: 40 | return self.metadata.and_query([("category", task_plan['target category'], True)]) 41 | 42 | def _task_plan_to_str(self, task_plan): 43 | t = [] 44 | for k, v in task_plan.items(): 45 | if self.metadata.check_category_exists(v): 46 | t.append(f'{k}: {self.metadata.get_surfacename(v)}') 47 | else: 48 | t.append(f'{k}: {v}') 49 | return '\n'.join(t) 50 | 51 | def make_video(self, video_metadata): 52 | return make_video(video_metadata, self.metadata, VIDEO_H, VIDEO_W) 53 | 54 | def _generate_task(self, task_plan) -> Tuple[str, str, List[str], Dict]: 55 | "(Abstract method) generate task" 56 | 57 | def generate(self, task_plan, return_data=True, seed=None): 58 | if seed is not None: 59 | self.rng = np.random.default_rng(seed=seed) 60 | 61 | retry = 0 62 | while True: 63 | question, answer, options, video_metadata = self._generate_task(task_plan) 64 | task = { 65 | 'question' : question.replace('_', ' '), 66 | 'answer' : answer.replace('_', ' '), 67 | 'options' : [o.replace('_', ' ') for o in options], 68 | 'task_plan' : self._task_plan_to_str(task_plan), 69 | 'video_metadata': video_metadata, 70 | 'video' : self.make_video(video_metadata) if return_data else None 71 | } 72 | if return_data: 73 | if check_video(task['video']): 74 | break 75 | else: 76 | retry -= 1 77 | if retry <= 0: 78 | raise Exception("Failed to generate video") 79 | else: 80 | break 81 | 82 | return task 83 | -------------------------------------------------------------------------------- /tma/videoqa/tabletop_3d/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import subprocess 4 | 5 | from ..metadata import ObjaverseVideoMetaData 6 | 7 | grid_options = [2, 3] 8 | 9 | grid_mappings = { 10 | 2: 11 | { 12 | 'back left' : 0, 13 | 'back right' : 1, 14 | 'front left' : 2, 15 | 'front right': 3 16 | }, 17 | 3: 18 | { 19 | 'back left' : 0, 20 | 'back middle' : 1, 21 | 'back right' : 2, 22 | 'middle left' : 3, 23 | 'middle' : 4, 24 | 'middle right': 5, 25 | 'front left' : 6, 26 | 'front middle': 7, 27 | 'front right' : 8 28 | } 29 | } 30 | 31 | relative_positions = ['left', 'right', 'back', 'front', 'back left', 'back right', 'front left', 'front right'] 32 | relative_position_phrase = { 33 | 'left' : 'to the left of', 34 | 'right' : 'to the right 
of', 35 | 'back' : 'behind', 36 | 'front' : 'in front of', 37 | 'back left' : 'behind and to the left of', 38 | 'back right' : 'behind and to the right of', 39 | 'front left' : 'in front and to the left of', 40 | 'front right': 'in front and to the right of' 41 | } 42 | reverse_relative_positions = { 43 | 'left' : 'right', 44 | 'right' : 'left', 45 | 'back' : 'front', 46 | 'front' : 'back', 47 | 'front left' : 'back right', 48 | 'front right': 'back left', 49 | 'back left' : 'front right', 50 | 'back right' : 'front left' 51 | } 52 | 53 | 54 | def relative_grid(grid_size, grid, reference_pos): 55 | if 'right' in reference_pos: 56 | if grid % grid_size == 0: return -1 57 | grid = grid - 1 58 | if 'left' in reference_pos: 59 | if grid % grid_size == grid_size - 1: return -1 60 | grid = grid + 1 61 | if 'back' in reference_pos: 62 | if grid + grid_size >= grid_size * grid_size: return -1 63 | grid = grid + grid_size 64 | if 'front' in reference_pos: 65 | if grid - grid_size < 0: return -1 66 | grid = grid - grid_size 67 | return grid 68 | 69 | 70 | import tempfile 71 | import diskcache 72 | 73 | run_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "run_blender.py") 74 | 75 | 76 | def make_video(scene_json, metadata: ObjaverseVideoMetaData, VIDEO_H, VIDEO_W): 77 | device = metadata.render_device 78 | blender_cache = metadata.blender_cache 79 | assert len(scene_json["objects"]) <= (scene_json["grid number"] ** 2) 80 | scene_json["VIDEO_H"] = VIDEO_H 81 | scene_json["VIDEO_W"] = VIDEO_W 82 | 83 | with diskcache.Cache(blender_cache, size_limit=100 * (2 ** 30)) as cache: 84 | key = json.dumps(scene_json, sort_keys=True) 85 | video = cache.get(key, None) 86 | if video is None: 87 | with (tempfile.NamedTemporaryFile(delete=True, suffix=".mp4") as tmp_video, 88 | tempfile.NamedTemporaryFile(delete=True, suffix=".json") as tmp_json): 89 | json.dump(scene_json, open(tmp_json.name, 'w')) 90 | 91 | env = dict(os.environ, CUDA_VISIBLE_DEVICES=str(device)) 92 | command = ( 93 | f"{metadata.blender_path} -b -noaudio --python {run_script_path} -- " 94 | f"--save_video_path {tmp_video.name} " 95 | f"--json_file {tmp_json.name}" 96 | ) 97 | 98 | subprocess.run(command, shell=True, env=env, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT) 99 | 100 | with open(tmp_video.name, 'rb') as video_file: 101 | video = video_file.read() # save video to a binary files 102 | cache.set(key, video) 103 | 104 | return video 105 | -------------------------------------------------------------------------------- /tma/imageqa/sticker_2d/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | from PIL import Image 5 | 6 | grid_options = [2, 3] 7 | grid_mappings = { 8 | 2: 9 | { 10 | 'top left' : 0, 11 | 'top right' : 1, 12 | 'bottom left' : 2, 13 | 'bottom right': 3 14 | }, 15 | 3: 16 | { 17 | 'top left' : 0, 18 | 'top middle' : 1, 19 | 'top right' : 2, 20 | 'middle left' : 3, 21 | 'middle' : 4, 22 | 'middle right' : 5, 23 | 'bottom left' : 6, 24 | 'bottom middle': 7, 25 | 'bottom right' : 8 26 | } 27 | } 28 | 29 | relative_positions = ['left', 'right', 'top', 'bottom', 'top left', 'top right', 'bottom left', 'bottom right'] 30 | relative_position_phrase = { 31 | 'left' : 'to the left of', 32 | 'right' : 'to the right of', 33 | 'top' : 'above', 34 | 'bottom' : 'below', 35 | 'top left' : 'above and to the left of', 36 | 'top right' : 'above and to the right of', 37 | 'bottom left' : 'below and to the left of', 38 | 'bottom 
right': 'below and to the right of' 39 | } 40 | 41 | 42 | def relative_grid(grid_size, grid, reference_pos): 43 | if 'right' in reference_pos: 44 | if grid % grid_size == 0: return -1 45 | grid = grid - 1 46 | if 'left' in reference_pos: 47 | if grid % grid_size == grid_size - 1: return -1 48 | grid = grid + 1 49 | if 'top' in reference_pos: 50 | if grid + grid_size >= grid_size * grid_size: return -1 51 | grid = grid + grid_size 52 | if 'bottom' in reference_pos: 53 | if grid - grid_size < 0: return -1 54 | grid = grid - grid_size 55 | return grid 56 | 57 | 58 | def does_overlap(box1, box2): 59 | # Returns True if box1 and box2 overlap, False otherwise 60 | x1, y1, x2, y2 = box1 61 | x3, y3, x4, y4 = box2 62 | return not (x2 < x3 or x4 < x1 or y2 < y3 or y4 < y1) 63 | 64 | 65 | def sample_bounding_boxes(num_objects, H, W, size_range=(0.3, 0.45)): 66 | while True: 67 | frac = random.uniform(*size_range) 68 | boxes = [] 69 | count = 0 70 | num_chances = 5 71 | while len(boxes) < num_objects and count < num_chances: 72 | box_w = int(frac * W) 73 | box_h = int(frac * H) 74 | box_x = random.randint(0, W - box_w) 75 | box_y = random.randint(0, H - box_h) 76 | new_box = (box_x, box_y, box_x + box_w, box_y + box_h) 77 | if not any(does_overlap(new_box, box) for box in boxes): 78 | boxes.append(new_box) 79 | count += 1 80 | if count >= num_chances: 81 | continue 82 | return boxes 83 | 84 | 85 | def grid_to_box(H, W, grid_size, grid_index, grid_H, grid_W): 86 | grid_height = H // grid_size 87 | grid_width = W // grid_size 88 | 89 | # grid_x, grid_y = np.unravel_index(grid_index, (grid_size, grid_size)) 90 | grid_y, grid_x = np.unravel_index(grid_index, (grid_size, grid_size)) 91 | 92 | box_x = grid_x * grid_width 93 | box_y = grid_y * grid_height 94 | box_w = grid_W * grid_width 95 | box_h = grid_H * grid_height 96 | return (box_x, box_y, box_x + box_w, box_y + box_h) 97 | 98 | 99 | def paste_image(background, obj, box): 100 | obj = obj.resize((box[2] - box[0], box[3] - box[1])) 101 | background.paste(obj, box=box, mask=obj) 102 | 103 | 104 | def make_image(metadata, H=512, W=512): 105 | # sample bounding boxes 106 | grid_size = metadata["grid number"] 107 | object_paths = metadata["object paths"] 108 | assert len(metadata["objects"]) <= (grid_size ** 2) 109 | boxes = [grid_to_box(H, W, grid_size, x, 1, 1) for x in metadata["grids"]] 110 | 111 | im_target = Image.new("RGBA", (W, H), 'WHITE') # you can load this as a background image if you want 112 | 113 | for view, box in zip(object_paths, boxes): 114 | obj = Image.open(view) 115 | paste_image(im_target, obj, box) 116 | 117 | return im_target.convert('RGB') 118 | -------------------------------------------------------------------------------- /tma/imageqa/tabletop_3d/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import subprocess 3 | 4 | from ..metadata import Objaverse3DMetaData 5 | 6 | grid_options = [2, 3] 7 | 8 | grid_mappings = { 9 | 2: 10 | { 11 | 'back left' : 0, 12 | 'back right' : 1, 13 | 'front left' : 2, 14 | 'front right': 3 15 | }, 16 | 3: 17 | { 18 | 'back left' : 0, 19 | 'back middle' : 1, 20 | 'back right' : 2, 21 | 'middle left' : 3, 22 | 'middle' : 4, 23 | 'middle right': 5, 24 | 'front left' : 6, 25 | 'front middle': 7, 26 | 'front right' : 8 27 | } 28 | } 29 | 30 | relative_positions = ['left', 'right', 'back', 'front', 'back left', 'back right', 'front left', 'front right'] 31 | relative_position_phrase = { 32 | 'left' : 'to the left of', 33 | 'right' : 'to the 
right of', 34 | 'back' : 'behind', 35 | 'front' : 'in front of', 36 | 'back left' : 'behind and to the left of', 37 | 'back right' : 'behind and to the right of', 38 | 'front left' : 'in front and to the left of', 39 | 'front right': 'in front and to the right of' 40 | } 41 | reverse_relative_positions = { 42 | 'left' : 'right', 43 | 'right' : 'left', 44 | 'back' : 'front', 45 | 'front' : 'back', 46 | 'front left' : 'back right', 47 | 'front right': 'back left', 48 | 'back left' : 'front right', 49 | 'back right' : 'front left' 50 | } 51 | 52 | 53 | def relative_grid(grid_size, grid, reference_pos): 54 | if 'right' in reference_pos: 55 | if grid % grid_size == 0: return -1 56 | grid = grid - 1 57 | if 'left' in reference_pos: 58 | if grid % grid_size == grid_size - 1: return -1 59 | grid = grid + 1 60 | if 'back' in reference_pos: 61 | if grid + grid_size >= grid_size * grid_size: return -1 62 | grid = grid + grid_size 63 | if 'front' in reference_pos: 64 | if grid - grid_size < 0: return -1 65 | grid = grid - grid_size 66 | return grid 67 | 68 | 69 | import os 70 | import tempfile 71 | import io, base64 72 | from PIL import Image 73 | import diskcache 74 | 75 | run_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "run_blender.py") 76 | 77 | 78 | def image_to_base64(pil_image): 79 | import io 80 | import base64 81 | img_byte_arr = io.BytesIO() 82 | pil_image.save(img_byte_arr, format='PNG') 83 | img_byte_arr = img_byte_arr.getvalue() 84 | base64_str = base64.b64encode(img_byte_arr).decode('utf-8') 85 | return base64_str 86 | 87 | 88 | def make_image(scene_json, metadata: Objaverse3DMetaData, H=512, W=512): 89 | device = metadata.render_device 90 | blender_cache = metadata.blender_cache 91 | assert len(scene_json["objects"]) <= (scene_json["grid number"] ** 2) 92 | scene_json["H"] = H 93 | scene_json["W"] = W 94 | 95 | with diskcache.Cache(blender_cache, size_limit=100 * (2 ** 30)) as cache: 96 | key = json.dumps(scene_json, sort_keys=True) 97 | base64_str = cache.get(key, None) 98 | if base64_str is None: 99 | with (tempfile.NamedTemporaryFile(delete=True, suffix=".png") as tmp_image, 100 | tempfile.NamedTemporaryFile(delete=True, suffix=".json") as tmp_json): 101 | json.dump(scene_json, open(tmp_json.name, 'w')) 102 | 103 | env = dict(os.environ, CUDA_VISIBLE_DEVICES=str(device)) 104 | command = ( 105 | f"{metadata.blender_path} -b -noaudio --python {run_script_path} -- " 106 | f"--save_image_path {tmp_image.name} " 107 | f"--json_file {tmp_json.name}" 108 | ) 109 | 110 | subprocess.run(command, shell=True, env=env, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT) 111 | 112 | img = Image.open(tmp_image.name).convert("RGB") 113 | cache.set(key, image_to_base64(img)) 114 | else: 115 | img = Image.open(io.BytesIO(base64.decodebytes(bytes(base64_str, "utf-8")))) 116 | 117 | return img 118 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # 
before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | 162 | **/.DS_STORE 163 | 164 | 165 | 166 | output/ -------------------------------------------------------------------------------- /tma/models/qa_model/base_qa_model.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | from typing import Callable 4 | 5 | import diskcache 6 | import numpy as np 7 | import sentence_transformers 8 | import torch 9 | 10 | from .. import Model 11 | 12 | 13 | def make_options(choices, format='letter'): 14 | assert format in ['numeric', 'letter'] 15 | if format == 'numeric': 16 | prefix1 = [str(i + 1) for i in range(len(choices))] 17 | else: 18 | prefix1 = [chr(ord("a") + i).upper() for i in range(len(choices))] 19 | prefix2 = [f"({p})" for p in prefix1] 20 | return prefix1, prefix2, [f'{p} {c}' for p, c in zip(prefix2, choices)] 21 | 22 | 23 | def check_contain(answer, options): 24 | contains = [option in answer for option in options] 25 | if sum(contains) == 1: 26 | return contains.index(True) 27 | else: 28 | return -1 29 | 30 | 31 | class QAModelInstance: 32 | def qa(self, data, prompt): 33 | "(Abstract method) abstract QA method" 34 | 35 | 36 | class QAModel(Model): 37 | def __init__( 38 | self, 39 | model_name: str, 40 | prompt_name: str, 41 | prompt_func: Callable, 42 | choice_format='letter', 43 | enable_choice_search: bool = False, 44 | cache_path: str = None, 45 | ): 46 | self.model = None 47 | self.model_name = f'{model_name} ({prompt_name})' 48 | self.prompt_func = prompt_func 49 | self.format = choice_format 50 | self.cache_path = cache_path 51 | 52 | if self.cache_path is None: 53 | print("[IMPORTANT] model cache is disabled") 54 | else: 55 | print(f"[IMPORTANT] model cache is enabled, cache path: {cache_path}") 56 | 57 | self.enable_choice_search = enable_choice_search 58 | if enable_choice_search: 59 | # use SBERT to find the closest choice 60 | self.sentence_transformer = sentence_transformers.SentenceTransformer("all-mpnet-base-v2", device='cpu') 61 | 62 | @torch.no_grad() 63 | def choice_search(self, free_form_answer, choices): 64 | query_embedding = self.sentence_transformer.encode([free_form_answer], normalize_embeddings=True) 65 | choices_embedding = self.sentence_transformer.encode(choices, normalize_embeddings=True) 66 | top_choice_index = np.argmax(np.dot(choices_embedding, query_embedding.T)) 67 | return choices[top_choice_index] 68 | 69 | def _data_to_str(self, data): 70 | """ abstract method """ 71 | 72 | @torch.no_grad() 73 | def _qa(self, data, prompt): 74 | if self.cache_path is None: 75 | return self.model.qa(data, prompt) 76 | else: 77 | with diskcache.Cache(self.cache_path, size_limit=10 * (2 ** 30)) as cache: 78 | key = json.dumps([self.model_name, self._data_to_str(data), prompt]) 79 | response = cache.get(key, None) 80 | if response is None: 81 | response = self.model.qa(data, prompt) 82 | cache.set(key, response) 83 | return response 84 | 85 | @torch.no_grad() 86 | def qa(self, data, question): 87 | prompt = self.prompt_func(question) 88 | return self._qa(data, prompt) 89 | 90 | @torch.no_grad() 91 | def multiple_choice_qa(self, data, question, choices, answer=None): 92 | # Get VQA model's answer 93 | prefix1, prefix2, options = make_options(choices, self.format) 94 | prompt = self.prompt_func(question, options) 95 | free_form_answer = self._qa(data, prompt) 96 | free_form_answer = free_form_answer.strip() 97 | 98 | # Limit the answer to the choices 99 | if free_form_answer in choices: 100 | multiple_choice_answer = free_form_answer 101 
| elif free_form_answer in options: 102 | multiple_choice_answer = choices[options.index(free_form_answer)] 103 | elif free_form_answer in prefix1: 104 | multiple_choice_answer = choices[prefix1.index(free_form_answer)] 105 | elif free_form_answer in prefix2: 106 | multiple_choice_answer = choices[prefix2.index(free_form_answer)] 107 | elif self.enable_choice_search: 108 | multiple_choice_answer = self.choice_search(free_form_answer, choices) 109 | else: 110 | multiple_choice_answer = "" 111 | for to_check in [choices, options, prefix1, prefix2]: 112 | idx = check_contain(free_form_answer, to_check) 113 | if idx != -1: 114 | multiple_choice_answer = choices[idx] 115 | break 116 | 117 | result = { 118 | "free_form_answer" : free_form_answer, 119 | "multiple_choice_answer": multiple_choice_answer, 120 | "choices" : choices.copy(), 121 | } 122 | if answer is not None: 123 | result["accuracy"] = int(answer == multiple_choice_answer) 124 | return result 125 | 126 | @torch.no_grad() 127 | def multiple_choice_qa_random_ordering(self, data, question, choices, answer=None, n_trials=3): 128 | results = {} 129 | accuracy = 0 130 | for i in range(n_trials): 131 | choices_i = choices.copy() 132 | random.shuffle(choices_i) 133 | results[i] = self.multiple_choice_qa(data, question, choices_i, answer) 134 | accuracy += results[i]["accuracy"] 135 | results["accuracy"] = accuracy / n_trials 136 | return results 137 | -------------------------------------------------------------------------------- /tma/models/qa_model/videoqa_model.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from typing import Callable, Union 3 | 4 | import numpy as np 5 | import torch 6 | from PIL import Image, ImageDraw, ImageFont 7 | 8 | from .base_qa_model import QAModel, QAModelInstance 9 | from .imageqa_model import ImageQAModel 10 | 11 | videoqa_models = { 12 | 13 | } 14 | 15 | 16 | def list_videoqa_models(): 17 | return list(videoqa_models.keys()) 18 | 19 | 20 | class VideoQAModel(QAModel): 21 | def __init__( 22 | self, 23 | model_name, 24 | prompt_name: str, 25 | prompt_func: Callable, 26 | model: QAModelInstance = None, 27 | torch_device: Union[int, str] = -1, 28 | precision=torch.bfloat16, 29 | choice_format='letter', 30 | enable_choice_search: bool = False, 31 | ): 32 | super().__init__(model_name, prompt_name, prompt_func, choice_format, enable_choice_search) 33 | 34 | if isinstance(torch_device, str): 35 | torch_device = torch.device(torch_device) 36 | else: 37 | if torch_device == -1: 38 | torch_device = torch.device("cuda") if torch.cuda.is_available() else "cpu" 39 | else: 40 | torch_device = torch.device(f"cuda:{torch_device}") 41 | 42 | if model is None: 43 | print(f"Loading {model_name}...") 44 | class_name, ckpt = videoqa_models[model_name] 45 | self.model_precision = precision 46 | self.model = eval(class_name)(ckpt, torch_device, self.model_precision) 47 | print(f"Finish loading {model_name}") 48 | else: 49 | print(f"Using provided self.model...") 50 | self.model = model 51 | 52 | @torch.no_grad() 53 | def _qa(self, data, prompt): 54 | if isinstance(data, str): 55 | return self.model.qa(data, prompt) 56 | else: 57 | with tempfile.NamedTemporaryFile(delete=True, suffix=".mp4") as tmp: 58 | with open(tmp.name, 'wb') as file: 59 | file.write(data) 60 | video_path = tmp.name 61 | answer = self.model.qa(video_path, prompt) 62 | return answer 63 | 64 | 65 | def sample_frames(video_path, n): 66 | import cv2 67 | # Open the video file 68 | cap = 
cv2.VideoCapture(video_path) 69 | if not cap.isOpened(): 70 | print("Error: Could not open video.") 71 | return [] 72 | 73 | # Calculate total number of frames and video FPS 74 | total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) 75 | 76 | # Calculate interval in terms of frames 77 | interval = max(1, total_frames // n) 78 | 79 | # Sample frames 80 | sampled_frames = [] 81 | for i in range(0, total_frames, interval): 82 | # Set the current frame position 83 | cap.set(cv2.CAP_PROP_POS_FRAMES, i) 84 | 85 | # Read the frame 86 | ret, frame = cap.read() 87 | if not ret: 88 | print(f"Error: Could not read frame {i}.") 89 | break 90 | 91 | # Convert the frame to PIL Image 92 | frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 93 | pil_img = Image.fromarray(frame_rgb) 94 | sampled_frames.append(pil_img) 95 | 96 | # Stop if we have collected n frames 97 | if len(sampled_frames) >= n: 98 | break 99 | 100 | # Release the video capture object 101 | cap.release() 102 | 103 | return sampled_frames 104 | 105 | 106 | def get_contrasting_color(image, x, y, width, height): 107 | """ 108 | Determine a contrasting color (black or white) based on the average color of a specified area in the image. 109 | """ 110 | # Crop the relevant part of the image 111 | cropped_image = image.crop((x, y, x + width, y + height)) 112 | # Convert to numpy array for analysis 113 | np_image = np.array(cropped_image) 114 | # Calculate the average color 115 | average_color = np.mean(np_image, axis=(0, 1)) 116 | # Brightness calculation based on perceived luminance 117 | brightness = np.sqrt(0.299 * average_color[0] ** 2 + 0.587 * average_color[1] ** 2 + 0.114 * average_color[2] ** 2) 118 | # Return white for dark backgrounds and black for light backgrounds 119 | return 'white' if brightness < 128 else 'black' 120 | 121 | 122 | def concatenate_image(images, rows, columns, separator_width=10): 123 | # Ensure we have the exact number of images needed 124 | if len(images) != rows * columns: 125 | raise ValueError(f"Expected {rows * columns} images, but got {len(images)}.") 126 | 127 | # Calculate the max width and height of images to standardize sizes 128 | max_width = max(img.width for img in images) 129 | max_height = max(img.height for img in images) 130 | 131 | # Resize images to the max width and height 132 | resized_images = [img.resize((max_width, max_height), Image.Resampling.LANCZOS) for img in images] 133 | 134 | # Calculate the total width and height for the combined image 135 | total_width = max_width * columns + separator_width * (columns - 1) 136 | total_height = max_height * rows + separator_width * (rows - 1) 137 | combined_image = Image.new('RGB', (total_width, total_height), color='white') 138 | 139 | # Place images in the specified grid 140 | x_offset = 0 141 | y_offset = 0 142 | for i, img in enumerate(resized_images): 143 | combined_image.paste(img, (x_offset, y_offset)) 144 | if (i + 1) % columns == 0: # Move to the next row after the last column 145 | x_offset = 0 146 | y_offset += img.height + separator_width 147 | else: # Move to the next column 148 | x_offset += img.width + separator_width 149 | 150 | # Add numbers to each image for identification 151 | draw = ImageDraw.Draw(combined_image) 152 | try: 153 | font_size = (max_width + max_height) // 2 // 12 154 | font = ImageFont.load_default(size=font_size) 155 | except IOError: 156 | font = ImageFont.truetype("arial", 20) 157 | 158 | x_offset = 0 159 | y_offset = 0 160 | for i, img in enumerate(resized_images): 161 | text = str(i + 1) 162 | text_x = 
x_offset + 10 163 | text_y = y_offset + 10 164 | text_width, text_height = font_size, font_size 165 | font_color = get_contrasting_color(combined_image, text_x, text_y, text_width, text_height) 166 | draw.text((text_x, text_y), text, fill=font_color, font=font) 167 | if (i + 1) % columns == 0: 168 | x_offset = 0 169 | y_offset += img.height + separator_width 170 | else: 171 | x_offset += img.width + separator_width 172 | 173 | return combined_image 174 | 175 | 176 | def video_to_concat_image(video_path, num_rows, num_columns): 177 | return concatenate_image(sample_frames(video_path, num_rows * num_columns), num_rows, num_columns) 178 | 179 | 180 | class ImageQAModel4Video(VideoQAModel): 181 | def __init__( 182 | self, 183 | model: ImageQAModel, 184 | prompt_name: str, 185 | prompt_func: Callable, 186 | num_rows: int = 2, 187 | num_columns: int = 2, 188 | choice_format='letter', 189 | enable_choice_search: bool = False, 190 | ): 191 | super(VideoQAModel, self).__init__(model.model_name, prompt_name, prompt_func, choice_format, enable_choice_search) 192 | self.num_rows = num_rows 193 | self.num_columns = num_columns 194 | self.num_frames = self.num_rows * self.num_columns 195 | self.model = model 196 | 197 | @torch.no_grad() 198 | def _qa(self, data, prompt): 199 | if isinstance(data, Image.Image): 200 | return self.model._qa(data, prompt) 201 | elif isinstance(data, str): 202 | return self.model._qa(video_to_concat_image(data, self.num_rows, self.num_columns), prompt) 203 | else: 204 | with tempfile.NamedTemporaryFile(delete=True, suffix=".mp4") as tmp: 205 | with open(tmp.name, 'wb') as file: 206 | file.write(data) 207 | video_path = tmp.name 208 | answer = self.model._qa(video_to_concat_image(video_path, self.num_rows, self.num_columns), prompt) 209 | return answer 210 | -------------------------------------------------------------------------------- /annotations/relation_to_type.json: -------------------------------------------------------------------------------- 1 | { 2 | "standing behind": "spatial", 3 | "displayed in": "interactional", 4 | "jumping on": "interactional", 5 | "sitting next to": "interactional", 6 | "moving": "interactional", 7 | "exiting": "interactional", 8 | "sitting with": "social", 9 | "drinking from": "interactional", 10 | "herding": "interactional", 11 | "larger than": "spatial", 12 | "tied around": "spatial", 13 | "covered with": "spatial", 14 | "lying inside": "interactional", 15 | "growing behind": "interactional", 16 | "reflecting in": "functional", 17 | "on": "spatial", 18 | "sitting atop": "spatial", 19 | "topped with": "interactional", 20 | "brushing": "interactional", 21 | "sitting in": "spatial", 22 | "pushed by": "interactional", 23 | "walking up": "spatial", 24 | "tossing": "interactional", 25 | "sitting under": "spatial", 26 | "entering": "interactional", 27 | "by": "spatial", 28 | "sitting in front of": "spatial", 29 | "standing against": "spatial", 30 | "about to hit": "interactional", 31 | "buying": "interactional", 32 | "tying": "interactional", 33 | "reflected in": "spatial", 34 | "lying next to": "interactional", 35 | "cutting": "interactional", 36 | "surrounding": "spatial", 37 | "pushing": "interactional", 38 | "skiing on": "interactional", 39 | "walking in": "spatial", 40 | "with": "spatial", 41 | "looking toward": "spatial", 42 | "lying on": "spatial", 43 | "grazing in": "interactional", 44 | "drawn on": "spatial", 45 | "connected to": "spatial", 46 | "taller than": "spatial", 47 | "longer than": "spatial", 48 | "pouring": "interactional", 
49 | "sitting by": "spatial", 50 | "smaller than": "spatial", 51 | "on the side of": "spatial", 52 | "jumping off": "interactional", 53 | "sitting beside": "spatial", 54 | "throwing": "interactional", 55 | "pulling": "interactional", 56 | "waiting for": "interactional", 57 | "running through": "spatial", 58 | "contain": "interactional", 59 | "hitting": "interactional", 60 | "at": "spatial", 61 | "smoking": "interactional", 62 | "growing by": "spatial", 63 | "drinking": "interactional", 64 | "hanging from": "spatial", 65 | "hugging": "interactional", 66 | "sleeping in": "interactional", 67 | "towing": "interactional", 68 | "walking across": "spatial", 69 | "parked in front of": "spatial", 70 | "growing along": "interactional", 71 | "resting on": "interactional", 72 | "looking over": "interactional", 73 | "parked along": "spatial", 74 | "beside": "spatial", 75 | "driving": "interactional", 76 | "sewn on": "interactional", 77 | "looking into": "interactional", 78 | "eating in": "spatial", 79 | "traveling down": "spatial", 80 | "close to": "spatial", 81 | "slicing": "interactional", 82 | "bigger than": "spatial", 83 | "underneath": "spatial", 84 | "leading": "interactional", 85 | "talking to": "interactional", 86 | "getting on": "spatial", 87 | "growing from": "interactional", 88 | "swimming in": "interactional", 89 | "talking on": "interactional", 90 | "hung on": "interactional", 91 | "catching": "interactional", 92 | "sprinkled on": "interactional", 93 | "opening": "interactional", 94 | "mounted to": "spatial", 95 | "standing in front of": "spatial", 96 | "seen through": "spatial", 97 | "going into": "spatial", 98 | "growing in": "spatial", 99 | "licking": "interactional", 100 | "full of": "interactional", 101 | "hanging out of": "spatial", 102 | "next to": "spatial", 103 | "hanging above": "spatial", 104 | "standing on top of": "spatial", 105 | "cooking": "interactional", 106 | "looking through": "interactional", 107 | "between": "spatial", 108 | "riding": "interactional", 109 | "playing with": "interactional", 110 | "eating from": "interactional", 111 | "going through": "spatial", 112 | "leaning against": "spatial", 113 | "scattered on": "spatial", 114 | "parked behind": "spatial", 115 | "flying in": "spatial", 116 | "worn on": "interactional", 117 | "surrounded by": "spatial", 118 | "feeding": "interactional", 119 | "standing under": "spatial", 120 | "floating on": "spatial", 121 | "walking down": "spatial", 122 | "skating on": "interactional", 123 | "under": "spatial", 124 | "playing in": "interactional", 125 | "lying on top of": "spatial", 126 | "on the bottom of": "spatial", 127 | "inside": "spatial", 128 | "kissing": "interactional", 129 | "playing at": "interactional", 130 | "standing at": "spatial", 131 | "helping": "interactional", 132 | "riding in": "interactional", 133 | "chained to": "spatial", 134 | "parked in": "spatial", 135 | "on top of": "spatial", 136 | "kept in": "spatial", 137 | "covering": "spatial", 138 | "grazing on": "interactional", 139 | "approaching": "interactional", 140 | "climbing": "interactional", 141 | "covered in": "spatial", 142 | "growing next to": "spatial", 143 | "in between": "spatial", 144 | "behind": "spatial", 145 | "growing near": "spatial", 146 | "painted on": "spatial", 147 | "driving down": "spatial", 148 | "parked next to": "spatial", 149 | "touching": "interactional", 150 | "parked by": "interactional", 151 | "walking to": "spatial", 152 | "posing with": "social", 153 | "standing beside": "spatial", 154 | "standing on": "spatial", 155 | 
"using": "interactional", 156 | "mounted on": "spatial", 157 | "walking by": "spatial", 158 | "playing on": "interactional", 159 | "blowing out": "interactional", 160 | "sitting near": "interactional", 161 | "crossing": "spatial", 162 | "to the left of": "spatial", 163 | "cooked in": "functional", 164 | "eating at": "interactional", 165 | "walking towards": "interactional", 166 | "floating in": "spatial", 167 | "hang from": "spatial", 168 | "photographing": "interactional", 169 | "sniffing": "interactional", 170 | "stuck on": "interactional", 171 | "walking toward": "interactional", 172 | "looking down at": "interactional", 173 | "traveling on": "spatial", 174 | "typing on": "interactional", 175 | "guiding": "interactional", 176 | "shining through": "spatial", 177 | "jumping over": "interactional", 178 | "following": "interactional", 179 | "dragging": "interactional", 180 | "on the front of": "spatial", 181 | "standing next to": "interactional", 182 | "reflected on": "spatial", 183 | "on the other side of": "spatial", 184 | "lying in": "spatial", 185 | "boarding": "interactional", 186 | "pointing at": "interactional", 187 | "draped over": "spatial", 188 | "observing": "interactional", 189 | "working in": "interactional", 190 | "followed by": "interactional", 191 | "chasing": "interactional", 192 | "wrapped in": "spatial", 193 | "leaning on": "spatial", 194 | "sitting at": "spatial", 195 | "parked on": "spatial", 196 | "piled on": "spatial", 197 | "walking with": "interactional", 198 | "carrying": "interactional", 199 | "beneath": "spatial", 200 | "served on": "functional", 201 | "wading in": "interactional", 202 | "walking into": "spatial", 203 | "sitting inside": "spatial", 204 | "holding": "interactional", 205 | "enclosing": "spatial", 206 | "looking out": "interactional", 207 | "standing near": "interactional", 208 | "of": "spatial", 209 | "to the right of": "spatial", 210 | "walking next to": "interactional", 211 | "petting": "interactional", 212 | "driving on": "spatial", 213 | "standing in": "spatial", 214 | "hidden by": "spatial", 215 | "flying through": "spatial", 216 | "hanging over": "spatial", 217 | "playing": "interactional", 218 | "covered by": "spatial", 219 | "stuck in": "spatial", 220 | "attached to": "spatial", 221 | "facing": "interactional", 222 | "stacked on": "interactional", 223 | "walking near": "spatial", 224 | "wrapped around": "spatial", 225 | "higher than": "spatial", 226 | "chewing": "interactional", 227 | "parked near": "spatial", 228 | "preparing": "interactional", 229 | "skiing in": "interactional", 230 | "jumping in": "interactional", 231 | "flying": "interactional", 232 | "leaning over": "interactional", 233 | "picking up": "interactional", 234 | "walking through": "interactional", 235 | "in front of": "spatial", 236 | "decorated by": "functional", 237 | "growing on": "interactional", 238 | "standing around": "spatial", 239 | "standing by": "spatial", 240 | "going down": "spatial", 241 | "grabbing": "interactional", 242 | "eating": "interactional", 243 | "walking behind": "interactional", 244 | "in": "spatial", 245 | "mixed with": "interactional", 246 | "coming down": "spatial", 247 | "cleaning": "interactional", 248 | "adjusting": "interactional", 249 | "perched on": "interactional", 250 | "riding on": "interactional", 251 | "sitting on": "spatial", 252 | "parked alongside": "spatial", 253 | "working on": "interactional", 254 | "hanging on": "spatial", 255 | "pulled by": "interactional", 256 | "splashing": "interactional", 257 | "hanging in": "spatial", 
258 | "tied to": "spatial", 259 | "plugged into": "interactional", 260 | "printed on": "spatial", 261 | "decorated with": "interactional", 262 | "on the back of": "spatial", 263 | "on the edge of": "spatial", 264 | "below": "spatial", 265 | "sleeping on": "interactional", 266 | "walking along": "spatial", 267 | "hanging off": "spatial", 268 | "walking on": "spatial", 269 | "around": "spatial", 270 | "looking in": "interactional", 271 | "looking at": "interactional", 272 | "near": "spatial", 273 | "parked at": "spatial", 274 | "staring at": "interactional", 275 | "reading": "interactional", 276 | "swinging": "interactional", 277 | "wearing": "interactional", 278 | "falling off": "interactional", 279 | "selling": "interactional", 280 | "above": "spatial", 281 | "holding onto": "interactional", 282 | "biting": "interactional", 283 | "running on": "spatial", 284 | "decorating": "interactional", 285 | "leaving": "spatial", 286 | "making": "interactional", 287 | "balancing on": "interactional", 288 | "running in": "spatial", 289 | "flying above": "spatial", 290 | "sitting around": "spatial", 291 | "coming out of": "spatial", 292 | "washing": "interactional", 293 | "worn around": "interactional", 294 | "sitting on top of": "spatial", 295 | "skiing down": "interactional", 296 | "kicking": "interactional", 297 | "running across": "spatial", 298 | "parked beside": "spatial", 299 | "walking past": "interactional", 300 | "reaching for": "interactional", 301 | "displayed on": "interactional", 302 | "serving": "interactional", 303 | "smiling at": "emotional", 304 | "trying to catch": "interactional", 305 | "flying over": "spatial", 306 | "watching": "interactional", 307 | "shorter than": "spatial", 308 | "smelling": "interactional", 309 | "coming from": "spatial", 310 | "sitting behind": "spatial", 311 | "filled with": "interactional", 312 | "writing on": "interactional", 313 | "wiping": "interactional", 314 | "having it on the back": "spatial", 315 | "twisting": "interactional" 316 | } -------------------------------------------------------------------------------- /tma/imageqa/metadata.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from itertools import product 4 | from math import radians 5 | from typing import List, Tuple 6 | 7 | import networkx as nx 8 | import pandas as pd 9 | 10 | from ..metadata import CategoryMetaData 11 | 12 | ambiguous_colors = [ 13 | ["red", "pink", "purple"], 14 | ["yellow", "orange", "brown", "gold", "beige"], 15 | ] 16 | 17 | 18 | def get_confusing_colors(color): 19 | for colors in ambiguous_colors: 20 | if color in colors: 21 | return colors 22 | return [color] 23 | 24 | 25 | def remove_skip_edge(edges): 26 | G = nx.DiGraph() 27 | G.add_edges_from(edges) 28 | new_edges = [] 29 | for source, target in edges: 30 | G.remove_edge(source, target) 31 | if not nx.has_path(G, source, target): 32 | G.add_edge(source, target) 33 | new_edges.append((source, target)) 34 | return new_edges 35 | 36 | 37 | def remove_nodes(G, nodes): 38 | for node in nodes: 39 | successors = list(G.successors(node)) 40 | predecessors = list(G.predecessors(node)) 41 | G.remove_node(node) 42 | for s in successors: 43 | for p in predecessors: 44 | G.add_edge(p, s) 45 | return G 46 | 47 | 48 | def build_taxonomy(path_to_metadata, mode): 49 | assert mode in ['objaverse', 'scene_graph'] 50 | 51 | cateid_to_concept = json.load(open(os.path.join(path_to_metadata, 'cateid_to_concept.json'))) 52 | taxonomy = 
json.load(open(os.path.join(path_to_metadata, 'taxonomy.json'))) 53 | edges, nodes = taxonomy['edges'], taxonomy['nodes'] 54 | G = nx.DiGraph() 55 | G.add_edges_from(remove_skip_edge(edges)) 56 | 57 | nodes_to_remove = [] 58 | categories_with_object = set([k for k, v in cateid_to_concept.items() if len(v[mode]) > 0]) 59 | for node in G.nodes(): 60 | if node not in categories_with_object and len(nx.descendants(G, node) & categories_with_object) == 0: 61 | nodes_to_remove.append(node) 62 | G = remove_nodes(G, nodes_to_remove) 63 | G.add_nodes_from(categories_with_object) 64 | 65 | categories, category_info = [], {} 66 | for node in G.nodes(): 67 | categories.append(node) 68 | if node in cateid_to_concept: 69 | category_info[node] = cateid_to_concept[node] 70 | else: 71 | category_info[node] = nodes[node] 72 | categories = sorted(categories) 73 | 74 | return G, categories, category_info, categories_with_object 75 | 76 | 77 | class ObjaverseMetaData(CategoryMetaData): 78 | def __init__(self, path_to_metadata): 79 | super().__init__() 80 | 81 | self.taxonomy, self.categories, self.category_info, categories_with_object = \ 82 | build_taxonomy(path_to_metadata, 'objaverse') 83 | 84 | cateid_to_objects = json.load(open(os.path.join(path_to_metadata, 'cateid_to_objects.json'))) 85 | 86 | def get_category_objects(category): 87 | if category in cateid_to_objects: 88 | return list(cateid_to_objects[category].keys()) 89 | else: 90 | return [] 91 | 92 | cateid_to_objid = {} 93 | for cateid in self.categories: 94 | objs = get_category_objects(cateid) 95 | for c in nx.descendants(self.taxonomy, cateid) & categories_with_object: 96 | objs.extend(get_category_objects(c)) 97 | cateid_to_objid[cateid] = objs 98 | assert len(objs) > 0 99 | assert len(objs) == len(set(objs)) 100 | 101 | self.attribute_vocab, objid_to_attribute = {}, {} 102 | for cateid in cateid_to_objects: 103 | for objid in cateid_to_objects[cateid]: 104 | objid_to_attribute[objid] = cateid_to_objects[cateid][objid]["attributes"] 105 | for attr, values in cateid_to_objects[cateid][objid]["attributes"].items(): 106 | if attr not in self.attribute_vocab: 107 | self.attribute_vocab[attr] = set() 108 | self.attribute_vocab[attr].update(values) 109 | 110 | data = [] 111 | for cateid, objs in cateid_to_objid.items(): 112 | for objid in objs: 113 | attribute_data = [] 114 | for attr in self.attribute_vocab: 115 | values = objid_to_attribute[objid].get(attr, []) 116 | if len(values) == 0: 117 | values = [None] 118 | attribute_data.append(values) 119 | 120 | for attribute_combination in product(*attribute_data): 121 | data.append([objid, cateid] + list(attribute_combination)) 122 | 123 | self.df = pd.DataFrame(data, columns=['object', 'category'] + list(self.attribute_vocab.keys())) 124 | 125 | def check_object_attribute(self, objid, attributes): 126 | for attr, values in attributes.items(): 127 | for value in values: 128 | if value not in self.df[self.df['object'] == objid][attr].unique(): 129 | return False 130 | return True 131 | 132 | def and_query(self, conditions: List[Tuple]) -> str: 133 | q = set() 134 | for k, v, i in conditions: 135 | # k: column name; v: value; i: is equal 136 | if v is None: 137 | if i: 138 | q.add(f'{k} in [None]') 139 | else: 140 | q.add(f'{k} not in [None]') 141 | else: 142 | if i: 143 | q.add(f'{k} == {repr(v)}') 144 | else: 145 | if k == 'category': 146 | # exclude all relevant categories 147 | for c in self.get_relevant_categories(v): 148 | q.add(f'{k} != {repr(c)}') 149 | elif k == 'color': 150 | # exclude all 
confusing colors 151 | for c in get_confusing_colors(v): 152 | q.add(f'{k} != {repr(c)}') 153 | else: 154 | q.add(f'{k} != {repr(v)}') 155 | return ' and '.join(q) 156 | 157 | def or_query(self, conditions: List[str]) -> str: 158 | conditions = [f'({c})' for c in conditions if len(c) > 0] 159 | return ' or '.join(conditions) 160 | 161 | def query_metadata(self, target, query: str): 162 | if len(query) == 0: 163 | return sorted(self.df[target].dropna().unique()) 164 | else: 165 | return sorted(self.df.query(query)[target].dropna().unique().tolist()) 166 | 167 | def sample(self, rng, n, target, query: str): 168 | if n == 1: 169 | return rng.choice(self.query_metadata(target, query)) 170 | else: 171 | candidates = self.query_metadata(target, query) 172 | return rng.choice(candidates, n, replace=len(candidates) < n).tolist() 173 | 174 | def sample_category_for_object(self, rng, objid, exclude_category=None): 175 | candidates = self.query_metadata("category", self.and_query([("object", objid, True)])) 176 | if exclude_category is not None: 177 | exclude_category = self.get_relevant_categories(exclude_category) 178 | candidates = [c for c in candidates if c not in exclude_category] 179 | return rng.choice(candidates) 180 | 181 | def get_category_attribute_dict(self, cateid): 182 | attribute_dict = {} 183 | for attr in self.attribute_vocab: 184 | attribute_dict[attr] = self.query_metadata(attr, self.and_query([("category", cateid, True)])) 185 | return attribute_dict 186 | 187 | 188 | class Objaverse2DMetaData(ObjaverseMetaData): 189 | def __init__(self, path_to_metadata, image_folder): 190 | super().__init__(path_to_metadata) 191 | 192 | self.image_folder = image_folder 193 | cateid_to_objects = json.load(open(os.path.join(path_to_metadata, 'cateid_to_objects.json'))) 194 | 195 | self.objid_to_images = {} 196 | for cateid in cateid_to_objects: 197 | for objid in cateid_to_objects[cateid]: 198 | self.objid_to_images[objid] = [os.path.join(image_folder, cateid, objid, i) 199 | for i in cateid_to_objects[cateid][objid]["images"]] 200 | 201 | def sample_image(self, rng, objid): 202 | return rng.choice(self.objid_to_images[objid]) 203 | 204 | 205 | class Objaverse3DMetaData(ObjaverseMetaData): 206 | def __init__(self, path_to_metadata, blender_path, assets_path, render_device='cpu', blender_cache='./blender_cache'): 207 | super().__init__(path_to_metadata) 208 | self.assets_path = assets_path 209 | self.blender_path = blender_path 210 | self.blender_cache = blender_cache 211 | self.render_device = render_device 212 | plane_dir = os.path.join(assets_path, "plane_glbs") 213 | self.plane_texture_path = [os.path.join(plane_dir, f) for f in os.listdir(plane_dir) if f.endswith(".glb")] 214 | hdri_dir = os.path.join(assets_path, "hdri") 215 | self.hdri_path = [os.path.join(hdri_dir, f) for f in os.listdir(hdri_dir) if f.endswith(".exr")] 216 | 217 | cateid_to_objects = json.load(open(os.path.join(path_to_metadata, 'cateid_to_objects.json'))) 218 | self.object_to_angles = {objid: cateid_to_objects[cateid][objid]['angles'] 219 | for cateid in cateid_to_objects for objid in cateid_to_objects[cateid]} 220 | 221 | def get_object_path(self, objid): 222 | return os.path.join(self.assets_path, "objects", objid + ".glb") 223 | 224 | def sample_object_angle(self, rng, objid): 225 | angles = self.object_to_angles[objid] 226 | return angles[rng.choice(len(angles))] 227 | 228 | def sample_blender_configuration(self, rng): 229 | orientation = rng.choice([-1, 1]) 230 | key_light_horizontal_angle = orientation * 
radians(rng.uniform(15, 45)) 231 | fill_light_horizontal_angle = - orientation * radians(rng.uniform(15, 60)) 232 | key_light_vertical_angle = -radians(rng.uniform(15, 45)) 233 | fill_light_vertical_angle = -radians(rng.uniform(0, 30)) 234 | 235 | sun_x, sun_y = radians(rng.uniform(0, 45)), radians(rng.uniform(0, 45)) 236 | sun_energy = rng.uniform(1.0, 6.0) 237 | 238 | plane_texture_path = rng.choice(self.plane_texture_path) 239 | hdri_path = rng.choice(self.hdri_path) 240 | 241 | return { 242 | "key_light_horizontal_angle" : key_light_horizontal_angle, 243 | "fill_light_horizontal_angle": fill_light_horizontal_angle, 244 | "key_light_vertical_angle" : key_light_vertical_angle, 245 | "fill_light_vertical_angle" : fill_light_vertical_angle, 246 | "sun_x" : sun_x, 247 | "sun_y" : sun_y, 248 | "sun_energy" : sun_energy, 249 | "plane_texture_path" : plane_texture_path, 250 | "hdri_path" : hdri_path 251 | } 252 | 253 | 254 | def load_scene_graph(scene_graph_folder): 255 | image_folder = os.path.join(scene_graph_folder, "images/images") 256 | sg_json_folder = os.path.join(scene_graph_folder, "sceneGraphs") 257 | # train_scene_graphs = json.load(open(os.path.join(sg_json_folder, "train_sceneGraphs.json"))) 258 | val_scene_graphs = json.load(open(os.path.join(sg_json_folder, "val_sceneGraphs.json"))) 259 | scene_graphs = val_scene_graphs # TODO: first only use val_scene_graphs 260 | return image_folder, scene_graphs 261 | 262 | 263 | class SceneGraphMetaData(CategoryMetaData): 264 | def __init__(self, path_to_metadata, scene_graph_folder): 265 | super().__init__() 266 | self.taxonomy, self.categories, self.category_info, self.categories_with_object = \ 267 | build_taxonomy(path_to_metadata, 'scene_graph') 268 | 269 | self.type_to_attribute = json.load(open(os.path.join(path_to_metadata, 'attribute_category.json'))) 270 | self.attribute_to_type = {attr: k for k, vs in self.type_to_attribute.items() for attr in vs} 271 | 272 | self.image_folder, self.scene_graphs = load_scene_graph(scene_graph_folder) 273 | self.scene_graphs_list = list(self.scene_graphs.keys()) 274 | self.sg_object_to_cateid = {} 275 | for k, v in self.category_info.items(): 276 | if k in self.categories_with_object: 277 | for sg_object in v['scene_graph']: 278 | self.sg_object_to_cateid[sg_object] = k 279 | 280 | relations = set() 281 | for sg in self.scene_graphs.values(): 282 | for obj in sg['objects'].values(): 283 | for rel in obj['relations']: 284 | relations.add(rel['name']) 285 | self.relations = list(relations) 286 | 287 | def check_object_in_category(self, object_name): 288 | return object_name in self.sg_object_to_cateid 289 | 290 | def object_name_to_cateid(self, object_name): 291 | return self.sg_object_to_cateid[object_name] 292 | 293 | def get_attribute_type(self, attribute): 294 | return self.attribute_to_type.get(attribute, "other") 295 | 296 | def get_image_path(self, scene_graph_id): 297 | return os.path.join(self.image_folder, scene_graph_id + ".jpg") 298 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /annotations/attribute_category.json: -------------------------------------------------------------------------------- 1 | { 2 | "color": [ 3 | "white", 4 | "yellow", 5 | "maroon", 6 | "navy", 7 | "purple", 8 | "light brown", 9 | "green", 10 | "pink", 11 | "blue", 12 | "light blue", 13 | "red", 14 | "dark", 15 | "black and white", 16 | "dark colored", 17 | "brunette", 18 | "dark brown", 19 | "transparent", 20 | "bronze", 21 | "gold", 22 | "beige", 23 | "gray", 24 | "brown", 25 | "opaque", 26 | "copper", 27 | "dark blue", 28 | "orange", 29 | "neon", 30 | "cream colored", 31 | "rainbow colored", 32 | "silver", 33 | "blond", 34 | "khaki", 35 | "black", 36 | "tan", 37 | "translucent" 38 | ], 39 | "other": [ 40 | "sweet", 41 | "clear", 42 | "wii", 43 | "tinted", 44 | "analog", 45 | "powerful", 46 | "made", 47 | "scarce", 48 | "power", 49 | "electric", 50 | "christmas", 51 | "public", 52 | "wine", 53 | "tennis", 54 | "urban", 55 | "roman", 56 | "abundant", 57 | "commercial", 58 | "deciduous", 59 | "bright", 60 | "toy", 61 | "cordless", 62 | "real", 63 | "tail", 64 | "computer", 65 | "mixed", 66 | "evergreen", 67 | "portable", 68 | "fluorescent", 69 | "strong", 70 | "regular", 71 | "kitchen", 72 | "digital", 73 | "exterior", 74 | "oriental", 75 | "abstract", 76 | "adidas", 77 | "telephone", 78 | "baseball", 79 | "support", 80 | "chinese", 81 | "soccer", 82 | "wireless", 83 | "asian", 84 | "tropical", 85 | "railroad", 86 | "wired", 87 | "rustic", 88 | "professional", 89 | "toilet", 90 | "military", 91 | "simple", 92 | "bathroom", 93 | "safety", 94 | "disposable", 95 | "license", 96 | "calico", 97 | "birthday", 98 | "directional", 99 | "fancy", 100 | "nike", 101 | "sharp", 102 | "industrial", 103 | "ski", 104 | "american", 105 | "office", 106 | "capital", 107 | "garbage", 108 | "assorted", 109 | "electronic", 110 | "tasty", 111 | "ocean", 112 | "artificial", 113 | "caucasian", 114 | "protective", 115 | "foreign", 116 | "double decker", 117 | "french", 118 | "fake", 119 | "formal", 120 | "designed", 121 | "tabby", 122 | "delicious", 123 | "polar", 124 | "typical", 125 | "trash", 126 | "wrist", 127 | "street", 128 | "park", 129 | "wild", 130 | "sparse", 131 | "wedding", 132 | "ugly", 133 | "winter", 134 | "polo", 135 | "sturdy", 136 | "traffic", 137 | "new", 138 | "burning", 139 | "lined", 140 | "intricate", 141 | "on", 142 | "dangling", 143 | "breaking", 144 | "paved", 145 | "loose", 146 | "high", 147 | "beautiful", 148 | "short", 149 | "long", 150 | "outdoor", 151 | "crouched", 152 | "mature", 153 | "checkered", 154 | "chain-link", 155 | "gloomy", 156 | "attached", 157 | "pastel", 158 | "wide", 159 | "slanted", 160 | "fine", 161 | "weathered", 162 | "healthy", 163 | "cracked", 164 | "heavy", 165 | "athletic", 166 | "used", 167 | "rocky", 168 | "floating", 169 | "plain", 170 | "lush", 171 | "halved", 172 | "pointing", 173 | "outstretched", 174 | "still", 175 | "old fashioned", 176 | "shallow", 177 | "cut", 178 | "chocolate", 179 | "off", 180 | "young", 181 | "eaten", 182 | 
"ivory", 183 | "discolored", 184 | "light", 185 | "decorative", 186 | "dense", 187 | "baby", 188 | "low", 189 | "pulled back", 190 | "teal", 191 | "alert", 192 | "spread", 193 | "perched", 194 | "immature", 195 | "textured", 196 | "outdoors", 197 | "collared", 198 | "shaped", 199 | "inflatable", 200 | "elevated", 201 | "strawberry", 202 | "narrow", 203 | "reflected", 204 | "thin", 205 | "vanilla", 206 | "parked", 207 | "indoors", 208 | "sheer", 209 | "rippling", 210 | "pale", 211 | "hard", 212 | "antique", 213 | "warm", 214 | "dull", 215 | "pretty", 216 | "comfortable", 217 | "wooded", 218 | "funny", 219 | "colorful", 220 | "handmade", 221 | "curly", 222 | "groomed", 223 | "displayed", 224 | "corded", 225 | "straight", 226 | "uneven", 227 | "tilted", 228 | "complete", 229 | "modern", 230 | "vibrant", 231 | "homemade", 232 | "vintage", 233 | "rippled", 234 | "balding", 235 | "adult", 236 | "forested", 237 | "deep", 238 | "tall", 239 | "tangled", 240 | "wavy", 241 | "elderly", 242 | "sandy", 243 | "thick", 244 | "manicured", 245 | "ornamental", 246 | "light colored", 247 | "old" 248 | ], 249 | "size": [ 250 | "huge", 251 | "miniature", 252 | "tiny", 253 | "giant", 254 | "little", 255 | "massive", 256 | "oversized", 257 | "large", 258 | "small", 259 | "skinny", 260 | "chubby", 261 | "vast", 262 | "fat" 263 | ], 264 | "activity": [ 265 | "walking", 266 | "sliding", 267 | "having meeting", 268 | "posing", 269 | "skiing", 270 | "sitting", 271 | "squatting", 272 | "hitting", 273 | "blowing", 274 | "drinking", 275 | "waving", 276 | "looking up", 277 | "blooming", 278 | "driving", 279 | "crashing", 280 | "staring", 281 | "laughing", 282 | "standing", 283 | "cooking", 284 | "riding", 285 | "skating", 286 | "performing trick", 287 | "snowboarding", 288 | "kneeling", 289 | "crouching", 290 | "talking", 291 | "batting", 292 | "smiling", 293 | "looking down", 294 | "bending", 295 | "hanging", 296 | "playing", 297 | "skateboarding", 298 | "running", 299 | "flying", 300 | "eating", 301 | "grazing", 302 | "waiting", 303 | "jumping", 304 | "splashing", 305 | "spinning", 306 | "resting", 307 | "swinging", 308 | "reading", 309 | "spraying", 310 | "surfing", 311 | "sleeping", 312 | "lying", 313 | "swimming" 314 | ], 315 | "state": [ 316 | "sliced", 317 | "lighted", 318 | "toasted", 319 | "ripe", 320 | "fried", 321 | "shaved", 322 | "abandoned", 323 | "trimmed", 324 | "fenced", 325 | "painted", 326 | "dried", 327 | "juicy", 328 | "diced", 329 | "barefoot", 330 | "bunched", 331 | "drawn", 332 | "suspended", 333 | "seasoned", 334 | "shirtless", 335 | "rolled", 336 | "potted", 337 | "uncomfortable", 338 | "overcast", 339 | "grated", 340 | "stained", 341 | "chopped", 342 | "messy", 343 | "crowded", 344 | "raised", 345 | "vacant", 346 | "crossed", 347 | "cushioned", 348 | "faded", 349 | "decorated", 350 | "shadowed", 351 | "piled", 352 | "powdered", 353 | "padded", 354 | "shredded", 355 | "wrapped", 356 | "sealed", 357 | "mowed", 358 | "barren", 359 | "clean", 360 | "turned", 361 | "overgrown", 362 | "framed", 363 | "breakable", 364 | "chipped", 365 | "damaged", 366 | "crumbled", 367 | "hazy", 368 | "edged", 369 | "sunny", 370 | "partly cloudy", 371 | "cloudy", 372 | "gloved", 373 | "clumped", 374 | "patched", 375 | "dirty", 376 | "full", 377 | "inflated", 378 | "snowy", 379 | "short sleeved", 380 | "packed", 381 | "sunlit", 382 | "uncooked", 383 | "roasted", 384 | "rotten", 385 | "glazed", 386 | "scattered", 387 | "bald", 388 | "grouped", 389 | "torn", 390 | "glowing", 391 | "unoccupied", 392 | "hollow", 393 | 
"scrambled", 394 | "illuminated", 395 | "rimmed", 396 | "tied", 397 | "leafless", 398 | "peeled", 399 | "sculpted", 400 | "fallen", 401 | "upholstered", 402 | "fresh", 403 | "unpeeled", 404 | "half full", 405 | "packaged", 406 | "open", 407 | "melting", 408 | "closed", 409 | "unripe", 410 | "covered", 411 | "mounted", 412 | "worn", 413 | "sprinkled", 414 | "foggy", 415 | "sleeveless", 416 | "unlit", 417 | "cluttered", 418 | "carved", 419 | "grilled", 420 | "frozen", 421 | "baked", 422 | "iced", 423 | "incomplete", 424 | "steamed", 425 | "blurry", 426 | "boiled", 427 | "stormy", 428 | "lit", 429 | "shut", 430 | "written", 431 | "unhealthy", 432 | "blank", 433 | "neat", 434 | "bare", 435 | "connected", 436 | "folding", 437 | "wet", 438 | "shaded", 439 | "peeling", 440 | "folded", 441 | "muscular", 442 | "filled", 443 | "stuffed", 444 | "tight", 445 | "empty", 446 | "shining", 447 | "long sleeved", 448 | "stacked", 449 | "browned", 450 | "cloudless", 451 | "printed", 452 | "busy", 453 | "misty", 454 | "rainy", 455 | "murky", 456 | "raw", 457 | "burnt", 458 | "recessed", 459 | "choppy", 460 | "melted", 461 | "cooked", 462 | "broken", 463 | "docked" 464 | ], 465 | "material": [ 466 | "water", 467 | "rock", 468 | "bamboo", 469 | "soap", 470 | "paper", 471 | "wood", 472 | "metal", 473 | "hardwood", 474 | "cardboard", 475 | "cheese", 476 | "tomato", 477 | "apple", 478 | "gas", 479 | "tin", 480 | "aluminum", 481 | "cotton", 482 | "asphalt", 483 | "mesh", 484 | "styrofoam", 485 | "silk", 486 | "banana", 487 | "granite", 488 | "wicker", 489 | "steel", 490 | "crystal", 491 | "vinyl", 492 | "concrete", 493 | "leather", 494 | "porcelain", 495 | "beer", 496 | "plastic", 497 | "diamond", 498 | "straw", 499 | "rubber", 500 | "fire", 501 | "iron", 502 | "pine", 503 | "glass", 504 | "palm", 505 | "wire", 506 | "cobblestone", 507 | "wool", 508 | "jeans", 509 | "gravel", 510 | "soda", 511 | "cloth", 512 | "stainless steel", 513 | "denim", 514 | "brick", 515 | "pepper", 516 | "coffee", 517 | "lace", 518 | "brass", 519 | "ceramic", 520 | "clay", 521 | "chrome", 522 | "marble", 523 | "chalk", 524 | "pizza", 525 | "snow", 526 | "stone" 527 | ], 528 | "texture": [ 529 | "plaid", 530 | "frosted", 531 | "crumpled", 532 | "braided", 533 | "quilted", 534 | "wrinkled", 535 | "paneled", 536 | "knotted", 537 | "crispy", 538 | "crusty", 539 | "beaded", 540 | "muddy", 541 | "barbed", 542 | "foamy", 543 | "reflective", 544 | "unpaved", 545 | "bushy", 546 | "creamy", 547 | "ruffled", 548 | "furry", 549 | "carpeted", 550 | "flowered", 551 | "polished", 552 | "jagged", 553 | "coarse", 554 | "fuzzy", 555 | "dusty", 556 | "soft", 557 | "puffy", 558 | "dry", 559 | "wrinkly", 560 | "glossy", 561 | "wispy", 562 | "tiled", 563 | "shaggy", 564 | "greasy", 565 | "patchy", 566 | "hairy", 567 | "fluffy", 568 | "plush", 569 | "woven", 570 | "floral", 571 | "shiny", 572 | "shingled", 573 | "rugged", 574 | "ridged", 575 | "rusty", 576 | "dotted", 577 | "spiky", 578 | "patterned", 579 | "speckled", 580 | "grassy", 581 | "feathered", 582 | "smooth", 583 | "crisp", 584 | "floppy", 585 | "ornate", 586 | "knit", 587 | "leafy", 588 | "rough", 589 | "striped" 590 | ], 591 | "mood": [ 592 | "sad", 593 | "angry", 594 | "sleepy", 595 | "happy", 596 | "unhappy", 597 | "curious", 598 | "calm" 599 | ], 600 | "shape": [ 601 | "crooked", 602 | "triangular", 603 | "pointy", 604 | "elongated", 605 | "oblong", 606 | "octagonal", 607 | "sloped", 608 | "curved", 609 | "round", 610 | "domed", 611 | "rounded", 612 | "bent", 613 | "curled", 614 | "winding", 615 | 
"angled", 616 | "spiral", 617 | "rectangular", 618 | "twisted", 619 | "irregular", 620 | "steep", 621 | "square", 622 | "flat", 623 | "cylindrical", 624 | "arched", 625 | "curvy" 626 | ], 627 | "orientation": [ 628 | "horizontal", 629 | "down", 630 | "upside down", 631 | "overhead", 632 | "upper", 633 | "vertical", 634 | "up", 635 | "lower" 636 | ], 637 | "gender": [ 638 | "female", 639 | "male" 640 | ] 641 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TaskMeAnything 2 | 3 | 4 | 5 | 6 | 7 |

Task Me Anything

8 | 9 |

10 | 11 |

12 | 13 |

🌐 Website | 📑 Paper | 🤗 Huggingface | 💻 Interface

14 | 15 |
If you like our project, please give us a star ⭐ on GitHub for the latest updates.
16 | 17 | 18 | 19 | ## 🔔News 20 | **🔥[2024-09-26]: Task Me Anything was accepted to the NeurIPS 2024 Datasets & Benchmarks track!** 21 | 22 | **🔥[2024-08-03]: TaskMeAnything-v1-2024 released! A benchmark that reflects the current progress of MLMs by `automatically` finding tasks that popular MLMs struggle with, using the `TaskMeAnything Top-K query and query approximation algorithms`. This includes [12,270 ImageQA](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-2024) and [3,567 VideoQA](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-videoqa-2024) questions that TaskMeAnything automatically approximated as challenging.** 23 | 24 | **🔥[2024-07-04]: Demo for TaskMeAnything released! Check out our demos for [generating customized ImageQA, VideoQA benchmarks](https://github.com/JieyuZ2/TaskMeAnything/tree/main/demo/generate) and [model evaluation query](https://github.com/JieyuZ2/TaskMeAnything/tree/main/demo/query)!** 25 | 26 | **🔥[2024-06-17]: Paper arXived!** 27 | 28 | **🔥[2024-06-01]: Code released!** 29 | 30 | ## What's TaskMeAnything? 31 | TaskMeAnything is a benchmark generation engine that produces a benchmark for large multimodal language models (MLMs) tailored to a user's needs. 32 | In particular, TaskMeAnything maintains an extendable taxonomy of visual assets and can programmatically generate a vast number of task instances. 33 | Additionally, it algorithmically addresses user queries regarding MLM performance efficiently within a computational budget. 34 | The current version can generate > 750M image/video question-answering pairs, which focus on evaluating MLM perceptual capabilities. 35 | 36 | :exclamation: **TaskMeAnything does NOT involve any AI model during image/video, question, and answer generation, so the generated tasks do NOT suffer from model imperfection or hallucinations.** 37 | 38 | We release the following resources: 39 | 1. [**TaskMeAnything-v1**](https://github.com/JieyuZ2/TaskMeAnything): the first version of TaskMeAnything, which includes 28 task generators that can generate over 750M VQA tasks. 40 | 2. **TaskMeAnything-v1-Random**[[ImageQA](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-random)|[VideoQA](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-videoqa-random)]: A randomly selected subset of TaskMeAnything-v1, including 5,700 ImageQA and 1,800 VideoQA task instances. 41 | 3. **TaskMeAnything-v1-2024**[[ImageQA](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-2924)|[VideoQA](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-videoqa-2024)]: A benchmark that reflects the current progress of MLMs by `automatically` finding tasks that popular MLMs struggle with, using the TaskMeAnything Top-K query and query approximation algorithms. This includes [12,270 ImageQA](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-2024) and [3,567 VideoQA](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-videoqa-2024) questions that TaskMeAnything automatically approximated as challenging for over 20 popular MLMs. 42 | 4. [**TaskMeAnything-DB**](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-eval-db): A database for TaskMeAnything, which stores the evaluation results of 13 open-source MLMs over 1M VQA task instances. 43 | 5. [**TaskMeAnything-UI**](): An interactive graphical interface built upon TaskMeAnything-DB, which allows users to interact with the performance of models on TaskMeAnything-v1 in an intuitive way.
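For a sense of how these resources fit together, here is a minimal evaluation sketch over the TaskMeAnything-v1-Random ImageQA benchmark listed above. It is illustrative only: it combines the Hugging Face `datasets` library with the repo's `ImageQAModel` wrapper (introduced later in this README), and it assumes the ImageQA splits expose `image`, `question`, `options`, and `answer` fields and that `model.qa` returns free-form text; check the dataset cards and the demo notebooks for the exact schema and the official option-matching logic.

```python
# Minimal evaluation sketch (assumptions: the split exposes 'image', 'question',
# 'options', and 'answer' fields, and ImageQAModel.qa returns a text response).
import datasets

from tma.models.qa_model import ImageQAModel
from tma.models.qa_model.prompt import detailed_imageqa_prompt

# One task-generator split of the random benchmark, e.g. '2d_how_many'.
dataset = datasets.load_dataset('weikaih/TaskMeAnything-v1-imageqa-random', split='2d_how_many')

model = ImageQAModel(
    model_name="llava-v1.5-7b",
    prompt_name="detailed",
    prompt_func=detailed_imageqa_prompt
)

correct = 0
for example in dataset:
    # Fold the multiple-choice options into the question text.
    choices = ', '.join(example['options'])
    prompt = f"{example['question']} Choose one of: {choices}."
    prediction = model.qa(example['image'], prompt)
    # Naive string match; the demo notebooks use a more careful option matcher.
    correct += int(example['answer'].lower() in prediction.lower())

print(f"accuracy: {correct / len(dataset):.3f}")
```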
44 | 45 | 46 | 47 | ## TaskMeAnything-v1 48 | 49 | ### Usage 50 | Demo for TaskMeAnything released! Check out our demos for 51 | * [generating customized ImageQA, VideoQA benchmarks](https://github.com/JieyuZ2/TaskMeAnything/tree/main/demo/generate) 52 | * [model evaluation query](https://github.com/JieyuZ2/TaskMeAnything/tree/main/demo/query) 53 | 54 | Note: if you want to evaluate VideoQA models, please check our [videoqa model branch](https://github.com/JieyuZ2/TaskMeAnything/tree/videoqa_model) 55 | 56 | 57 | ### Installation 58 | You can download the repo and set up the environment via: 59 | ``` 60 | git clone https://github.com/JieyuZ2/TaskMeAnything.git 61 | cd ./TaskMeAnything 62 | 63 | pip install -r requirements.txt 64 | ``` 65 | 66 | Note: if you want to render 3D images/videos with `Blender` locally, or use `Internvl-chat-v1.5-24B`, which requires `flash-attn` (hard to install via pip), you can use the Docker image we provide. 67 | You can pull the Docker image from DockerHub; it includes all the dependencies, such as `Blender`, `flash-attn`, the `cuda driver`, `nvcc`, etc. 68 | ``` 69 | docker pull weikaih/ubuntu20.4_internvl_blender_v1.2:latest 70 | docker run --gpus all -it weikaih/ubuntu20.4_internvl_blender_v1.2:latest /bin/bash # run the docker image with GPU support 71 | 72 | git clone https://github.com/JieyuZ2/TaskMeAnything.git 73 | cd ./TaskMeAnything 74 | 75 | pip install -r requirements.txt 76 | ``` 77 | 78 | 79 | ### Source data 80 | Source data is stored on [HuggingFace](https://huggingface.co/datasets/jieyuz2/TaskMeAnything-v1-source). It includes `3d_assets`, `agqa_video`, and `object_images`. 81 | 82 | For real images with scene graphs, please download the images and scene graphs from the following links: [SceneGraph](https://downloads.cs.stanford.edu/nlp/data/gqa/sceneGraphs.zip), [Image](https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip). 83 | After downloading, move the scene graphs and images into the source data folder and arrange them in the format below. 84 | ``` 85 | TaskMeAnything-v1-source/vg/sceneGraphs: move scene graph files to this folder (e.g. TaskMeAnything-v1-source/vg/sceneGraphs/train_sceneGraphs.json). 86 | TaskMeAnything-v1-source/vg/images/images: move all the images to this folder (e.g. TaskMeAnything-v1-source/vg/images/images/2323739.jpg). 87 | ``` 88 | 89 | 90 | ### Task Generator 91 | We have 28 task generators in TaskMeAnything-v1, across 5 scenarios: 92 | 1. `2D Sticker Image`: grid-how-many, grid-what, grid-where, grid-what-attribute, grid-where-attribute 93 | 2. `3D Tabletop Image`: 3d-what, 3d-where, 3d-what-attribute, 3d-where-attribute, 3d-how-many, 3d-what-size, 3d-where-size, 3d-what-attribute-size, 3d-what-distance, 3d-where-distance, 3d-what-attribute-distance 94 | 3. `3D Tabletop Video`: video-3d-what-move, video-3d-where-move, video-3d-what-attribute-move, video-3d-what-rotate, video-3d-where-rotate, video-3d-what-attribute-rotate 95 | 4. `Real Images`: sg-what-object, sg-what-relation, sg-what-attribute 96 | 5. 
`Real Videos`: video-sg-what-object, video-sg-what-relation, video-sg-what-action 97 | 98 | ### Tested Models 99 | We support the following ImageQA and VideoQA models: 100 | - `ImageQA`: qwenvl-chat, qwenvl, llavav1.5-7b, llavav1.5-13b, instructblip-vicuna7b, instructblip-vicuna13b, internvl-chat-v1.5, gemini-vision-pro, qwen-vl-max, gpt4v, gpt4o 101 | - `VideoQA`: video-llama2-7b, video-llama2-13b, video-llava-7b, chat-univi-7b, chat-univi-13b, video-chatgpt-7b, video-chat2-7b 102 | 103 | 104 | 105 | 106 | You can also use our unified VQA interface for inference: 107 | ```python 108 | from PIL import Image 109 | from tma.models.qa_model import ImageQAModel 110 | # from tma.models.qa_model.prompt import succinct_prompt 111 | from tma.models.qa_model.prompt import detailed_imageqa_prompt 112 | 113 | model = ImageQAModel( 114 |     model_name="llava-v1.5-7b", 115 |     prompt_name="detailed", 116 |     prompt_func=detailed_imageqa_prompt 117 | ) 118 | 119 | image = './path/to/image.jpg' 120 | # or image = Image.open(image_path) 121 | question = "Describe the image." 122 | 123 | model.qa(image, question) 124 | ``` 125 | Or check the [videoqa model branch](https://github.com/JieyuZ2/TaskMeAnything/tree/videoqa_model) for VideoQA model inference. 126 | 127 | ## TaskMeAnything-v1 Benchmark 128 | Currently, we provide two versions of the TaskMeAnything-v1 benchmark: 129 | * TaskMeAnything-v1-Random: [[ImageQA](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-random)|[VideoQA](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-videoqa-random)]: A randomly selected subset of TaskMeAnything-v1, including 5,700 ImageQA and 1,800 VideoQA task instances. 130 | * TaskMeAnything-v1-2024: [[ImageQA](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-2924)|[VideoQA](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-videoqa-2024)]: A benchmark that reflects the current progress of MLMs by `automatically` finding tasks that popular MLMs struggle with, using the TaskMeAnything Top-K query and query approximation algorithms. 131 | 132 | ### Load TaskMeAnything-v1 ImageQA Dataset 133 | ```python 134 | import datasets 135 | dataset_name = 'weikaih/TaskMeAnything-v1-imageqa-random' 136 | # dataset_name = 'weikaih/TaskMeAnything-v1-imageqa-2024' 137 | dataset = datasets.load_dataset(dataset_name, split=TASK_GENERATOR_SPLIT) 138 | ``` 139 | where `TASK_GENERATOR_SPLIT` is one of the task generators, e.g., `2d_how_many`. 140 | 141 | 142 | ### Load TaskMeAnything-v1 VideoQA Dataset and Convert Video Binary Stream to mp4 143 | * Since Hugging Face does not support storing .mp4 files in datasets, we save videos as binary streams. After loading, you can convert a video binary stream to .mp4 as follows. 144 | ```python 145 | import datasets 146 | 147 | dataset_name = 'weikaih/TaskMeAnything-v1-videoqa-random' 148 | # dataset_name = 'weikaih/TaskMeAnything-v1-videoqa-2024' 149 | dataset = datasets.load_dataset(dataset_name, split=TASK_GENERATOR_SPLIT) 150 | 151 | # example: convert a binary stream in the dataset to an .mp4 file 152 | video_binary = dataset[0]['video'] 153 | with open('/path/save/video.mp4', 'wb') as f: 154 |     f.write(video_binary) 155 | ``` 156 | 157 | ### Evaluation results on the TaskMeAnything-v1 benchmark 158 | * ImageQA in Random 159 | 

160 | 161 |

162 | 163 | * VideoQA in Random 164 |

165 | 166 |

167 | 168 | * ImageQA in 2024 169 |

170 | 171 |

172 | 173 | * VideoQA in 2024 174 |

175 | 176 |

177 | 178 | * TaskMeAnything-v1-2024 v.s TaskMeAnything-v1-Random 179 |

180 | 181 |

182 |

183 | 184 |

185 | 186 | * we can see that the performance drops are more significant in the 2024 version, which indicates that the 2024 version is more challenging for the models. 187 | 188 | 189 | For more details, please check out the [paper](https://arxiv.org/abs/2406.11775). 190 | 191 | ## TaskMeAnything-DB 192 | **TaskMeAnything-DB** are stored in [HuggingFace](https://huggingface.co/datasets/jieyuz2/TaskMeAnything-v1-db) 193 | 194 | ## TaskMeAnything-UI 195 | **TaskMeAnything-UI** are hosted in [HuggingFace](todo), check out our interactive interface to explore the performance of models on TaskMeAnything-v1 in your own way! 196 | 197 | ## Disclaimers 198 | **TaskMeAnything** and its associated resources are provided for research and educational purposes only. 199 | The authors and contributors make no warranties regarding the accuracy or reliability of the data and software. 200 | Users are responsible for ensuring their use complies with applicable laws and regulations. 201 | The project is not liable for any damages or losses resulting from the use of these resources. 202 | 203 | 204 | ## Contact 205 | 206 | - Jieyu Zhang: jieyuz2@cs.washington.edu 207 | 208 | ## Citation 209 | 210 | **BibTeX:** 211 | 212 | ```bibtex 213 | @inproceedings{zhang2024task, 214 | title={Task Me Anything}, 215 | author={Zhang, Jieyu and Huang, Weikai and Ma, Zixian and Michel, Oscar and He, Dong and Gupta, Tanmay and Ma, Wei-Chiu and Farhadi, Ali and Kembhavi, Aniruddha and Krishna, Ranjay}, 216 | booktitle={Thirty-Eighth Annual Conference on Neural Information Processing Systems Datasets and Benchmarks Track}, 217 | year={2024} 218 | } 219 | ``` 220 | 221 | -------------------------------------------------------------------------------- /tma/videoqa/scene_graph/single_video_task.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import numpy as np 4 | from tqdm import tqdm 5 | 6 | from ..metadata import VideoSceneGraphMetaData 7 | from ...base import TaskGenerator 8 | from ...task_store import TaskStore 9 | 10 | 11 | def load_mp4_video(video_path): 12 | with open(video_path, "rb") as file: 13 | mp4_data = file.read() 14 | return mp4_data 15 | 16 | 17 | def enumerate_target_relation_to_possible_reference_actions(video_scene_graph, relation_type, temporal_reference_type): 18 | relation_to_actions = {} 19 | video_scene_graph_keyframes = list(video_scene_graph.keys()) 20 | 21 | if temporal_reference_type == "before": 22 | for idx, keyframe_name in enumerate(video_scene_graph_keyframes[:-1]): 23 | next_keyframe_name = video_scene_graph_keyframes[idx + 1] 24 | for relation, obj in video_scene_graph[keyframe_name][relation_type].items(): 25 | if relation not in video_scene_graph[next_keyframe_name][relation_type]: 26 | if relation not in relation_to_actions: 27 | relation_to_actions[(relation, obj)] = set() 28 | for after_keyframe in video_scene_graph_keyframes[idx + 1:]: 29 | for action in video_scene_graph[after_keyframe]['actions']: 30 | if action not in video_scene_graph[keyframe_name]['actions']: 31 | relation_to_actions[(relation, obj)].add(action) 32 | 33 | elif temporal_reference_type == "after": 34 | for idx, keyframe_name in enumerate(video_scene_graph_keyframes[1:], start=1): 35 | previous_keyframe_name = video_scene_graph_keyframes[idx - 1] 36 | for relation, obj in video_scene_graph[keyframe_name][relation_type].items(): 37 | if relation not in video_scene_graph[previous_keyframe_name][relation_type]: 38 | if relation not in 
relation_to_actions: 39 | relation_to_actions[(relation, obj)] = set() 40 | for before_keyframe in video_scene_graph_keyframes[:idx]: 41 | for action in video_scene_graph[before_keyframe]['actions']: 42 | if action not in video_scene_graph[keyframe_name]['actions']: 43 | relation_to_actions[(relation, obj)].add(action) 44 | 45 | elif temporal_reference_type == "while": 46 | for idx, keyframe_name in enumerate(video_scene_graph_keyframes): 47 | for relation, obj in video_scene_graph[keyframe_name][relation_type].items(): 48 | if relation not in relation_to_actions: 49 | relation_to_actions[(relation, obj)] = set() 50 | for action in video_scene_graph[keyframe_name]['actions']: 51 | relation_to_actions[(relation, obj)].add(action) 52 | 53 | # Convert sets to lists for the output 54 | relation_to_actions = {k: list(v) for k, v in relation_to_actions.items()} 55 | return relation_to_actions 56 | 57 | 58 | def enumerate_target_action_to_possible_reference_actions(video_scene_graph, temporal_reference_type): 59 | action_to_actions = {} 60 | video_scene_graph_keyframes = list(video_scene_graph.keys()) 61 | 62 | if temporal_reference_type == "before": 63 | for idx, keyframe_name in enumerate(video_scene_graph_keyframes[:-1]): 64 | next_keyframe_name = video_scene_graph_keyframes[idx + 1] 65 | for action in video_scene_graph[keyframe_name]['actions']: 66 | if action not in video_scene_graph[next_keyframe_name]['actions']: 67 | if action not in action_to_actions: 68 | action_to_actions[action] = set() 69 | for after_keyframe in video_scene_graph_keyframes[idx + 1:]: 70 | for reference_action in video_scene_graph[after_keyframe]['actions']: 71 | if reference_action not in video_scene_graph[keyframe_name]['actions'] and reference_action != action: 72 | action_to_actions[action].add(reference_action) 73 | 74 | elif temporal_reference_type == "after": 75 | for idx, keyframe_name in enumerate(video_scene_graph_keyframes[1:], start=1): 76 | previous_keyframe_name = video_scene_graph_keyframes[idx - 1] 77 | for action in video_scene_graph[keyframe_name]['actions']: 78 | if action not in video_scene_graph[previous_keyframe_name]['actions']: 79 | if action not in action_to_actions: 80 | action_to_actions[action] = set() 81 | for before_keyframe in video_scene_graph_keyframes[:idx]: 82 | for reference_action in video_scene_graph[before_keyframe]['actions']: 83 | if reference_action not in video_scene_graph[keyframe_name]['actions'] and reference_action != action: 84 | action_to_actions[action].add(reference_action) 85 | 86 | elif temporal_reference_type == "while": 87 | for idx, keyframe_name in enumerate(video_scene_graph_keyframes): 88 | for action in video_scene_graph[keyframe_name]['actions']: 89 | if action not in action_to_actions: 90 | action_to_actions[action] = set() 91 | for reference_action in video_scene_graph[keyframe_name]['actions']: 92 | if reference_action != action: 93 | action_to_actions[action].add(reference_action) 94 | 95 | # Convert sets to lists for the output 96 | action_to_actions = {k: list(v) for k, v in action_to_actions.items()} 97 | return action_to_actions 98 | 99 | 100 | def get_all_spatial_relations(video_scene_graph): 101 | relations = set() 102 | for keyframe_name, keyframe in video_scene_graph.items(): 103 | relations.update(keyframe['spatial']) 104 | return relations 105 | 106 | 107 | def get_all_contact_relations(video_scene_graph): 108 | relations = set() 109 | for keyframe_name, keyframe in video_scene_graph.items(): 110 | relations.update(keyframe['contact']) 111 | 
return relations 112 | 113 | 114 | def get_all_objects(video_scene_graph): 115 | objects = set() 116 | for keyframe_name, keyframe in video_scene_graph.items(): 117 | for relation in keyframe['spatial']: 118 | objects.add(keyframe['spatial'][relation]) 119 | for relation in keyframe['contact']: 120 | objects.add(keyframe['contact'][relation]) 121 | return objects 122 | 123 | 124 | def get_all_actions(video_scene_graph): 125 | actions = set() 126 | for keyframe_name, keyframe in video_scene_graph.items(): 127 | actions.update(keyframe['actions']) 128 | return actions 129 | 130 | 131 | class VideoSceneGraphTaskGenerator(TaskGenerator): 132 | metadata: VideoSceneGraphMetaData 133 | 134 | embed_schema = [ 135 | "task type", 136 | "object", 137 | "relation", 138 | "action", 139 | "reference action", 140 | "relation type", 141 | "temporal reference type", 142 | ] 143 | 144 | def __init__(self, metadata: VideoSceneGraphMetaData, seed=42): 145 | super().__init__(metadata, seed=seed) 146 | 147 | def _generate_task(self, task_plan) -> Tuple[str, str, List[str], str]: 148 | "(Abstract method) generate task" 149 | 150 | def _task_plan_to_str(self, task_plan) -> str: 151 | t = [] 152 | for k, v in task_plan.items(): 153 | if k in self.embed_schema: 154 | assert isinstance(v, str) 155 | t.append(f'{k}: {v}') 156 | return '\n'.join(t) 157 | 158 | def generate(self, task_plan, return_data=True, seed=None): 159 | if seed is not None: 160 | self.rng = np.random.default_rng(seed=seed) 161 | 162 | question, answer, options, video_scene_graph_id = self._generate_task(task_plan) 163 | 164 | task = { 165 | "question" : question, 166 | "answer" : answer, 167 | "options" : options, 168 | "task_plan" : self._task_plan_to_str(task_plan), 169 | "video_scene_graph_id": video_scene_graph_id, 170 | 'video' : load_mp4_video(self.metadata.get_video_path(video_scene_graph_id)) if return_data else None 171 | } 172 | return task 173 | 174 | 175 | class WhatObjectVideoSceneGraphTaskGenerator(VideoSceneGraphTaskGenerator): 176 | schema = { 177 | "task type" : "str", 178 | "object" : "str", 179 | "relation" : "str", 180 | "reference action" : "str", 181 | "relation type" : "str", 182 | "temporal reference type": "str", 183 | "video scene graph id" : "str", 184 | } 185 | 186 | def enumerate_task_plans(self, task_store: TaskStore): 187 | for video_scene_graph_id, video_scene_graph in tqdm(self.metadata.video_scene_graphs.items(), desc="enumerating [what object video] task"): 188 | for relation_type in ["spatial", "contact"]: 189 | for temporal_reference_type in ["before", "after", "while"]: 190 | target_relation_to_possible_reference_actions = enumerate_target_relation_to_possible_reference_actions(video_scene_graph, relation_type, temporal_reference_type) 191 | for (target_relation, target_object), possible_reference_actions in target_relation_to_possible_reference_actions.items(): 192 | for reference_action in possible_reference_actions: 193 | task_plan = { 194 | "task type" : "what object video", 195 | "video scene graph id" : video_scene_graph_id, 196 | "object" : self.metadata.idx2name[target_object], 197 | "relation" : self.metadata.idx2name[target_relation], 198 | 'relation type' : relation_type, 199 | "reference action" : self.metadata.idx2name[reference_action], 200 | "temporal reference type": temporal_reference_type, 201 | } 202 | task_store.add(task_plan) 203 | 204 | def _generate_task(self, task_plan): 205 | question = f"What is the object that the person is {task_plan['relation']} {task_plan['temporal reference 
type']} the person {task_plan['reference action']}?" 206 | 207 | answer = task_plan["object"] 208 | negatives = list(set(self.metadata.objects) - get_all_objects(self.metadata.video_scene_graphs[task_plan["video scene graph id"]])) 209 | negatives = [self.metadata.idx2name[neg] for neg in negatives] 210 | 211 | options = self._compose_options(answer, negatives) 212 | return question, answer, options, task_plan["video scene graph id"] 213 | 214 | 215 | class WhatRelationVideoSceneGraphTaskGenerator(VideoSceneGraphTaskGenerator): 216 | schema = { 217 | "task type" : "str", 218 | "object" : "str", 219 | "relation" : "str", 220 | "reference action" : "str", 221 | "relation type" : "str", 222 | "temporal reference type": "str", 223 | "video scene graph id" : "str", 224 | } 225 | 226 | def enumerate_task_plans(self, task_store: TaskStore): 227 | for video_scene_graph_id, video_scene_graph in tqdm(self.metadata.video_scene_graphs.items(), desc="enumerating [what relation video] task"): 228 | for relation_type in ["spatial", "contact"]: 229 | for temporal_reference_type in ["before", "after", "while"]: 230 | target_relation_to_possible_reference_actions = enumerate_target_relation_to_possible_reference_actions(video_scene_graph, relation_type, temporal_reference_type) 231 | for (target_relation, target_object), possible_reference_actions in target_relation_to_possible_reference_actions.items(): 232 | for reference_action in possible_reference_actions: 233 | task_plan = { 234 | "task type" : "what relation video", 235 | "video scene graph id" : video_scene_graph_id, 236 | "object" : self.metadata.idx2name[target_object], 237 | "relation" : self.metadata.idx2name[target_relation], 238 | 'relation type' : relation_type, 239 | "reference action" : self.metadata.idx2name[reference_action], 240 | "temporal reference type": temporal_reference_type, 241 | } 242 | task_store.add(task_plan) 243 | 244 | def _generate_task(self, task_plan): 245 | if task_plan["relation type"] == "spatial": 246 | question = f"What is the spatial relation of the person to the {task_plan['object']} {task_plan['temporal reference type']} the person {task_plan['reference action']}?" 247 | negatives = list(set(self.metadata.spatial_relations) - get_all_spatial_relations(self.metadata.video_scene_graphs[task_plan["video scene graph id"]])) 248 | elif task_plan["relation type"] == "contact": 249 | question = f"What is the person doing to the {task_plan['object']} {task_plan['temporal reference type']} the person {task_plan['reference action']}?" 
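            # Illustrative example (hypothetical values): with object "cup", temporal reference type
            # "before", and reference action "sitting down", the spatial branch above renders
            # "What is the spatial relation of the person to the cup before the person sitting down?"
            # while this contact branch renders
            # "What is the person doing to the cup before the person sitting down?".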
250 | negatives = list(set(self.metadata.contact_relations) - get_all_contact_relations(self.metadata.video_scene_graphs[task_plan["video scene graph id"]])) 251 | else: 252 | raise ValueError(f"Unknown relation type: {task_plan['relation type']}") 253 | 254 | answer = task_plan['relation'] 255 | negatives = [self.metadata.idx2name[neg] for neg in negatives] 256 | 257 | options = self._compose_options(answer, negatives) 258 | return question, answer, options, task_plan["video scene graph id"] 259 | 260 | 261 | class WhatActionVideoSceneGraphTaskGenerator(VideoSceneGraphTaskGenerator): 262 | schema = { 263 | "task type" : "str", 264 | "action" : "str", 265 | "reference action" : "str", 266 | "relation type" : "str", 267 | "temporal reference type": "str", 268 | "video scene graph id" : "str", 269 | } 270 | 271 | def enumerate_task_plans(self, task_store: TaskStore): 272 | for video_scene_graph_id, video_scene_graph in tqdm(self.metadata.video_scene_graphs.items(), desc="enumerating [what action video] task"): 273 | for temporal_reference_type in ["before", "after", "while"]: 274 | target_action_to_possible_reference_actions = enumerate_target_action_to_possible_reference_actions(video_scene_graph, temporal_reference_type) 275 | for target_action, possible_reference_actions in target_action_to_possible_reference_actions.items(): 276 | for reference_action in possible_reference_actions: 277 | task_plan = { 278 | "task type" : "what action video", 279 | "video scene graph id" : video_scene_graph_id, 280 | "action" : self.metadata.idx2name[target_action], 281 | "reference action" : self.metadata.idx2name[reference_action], 282 | "temporal reference type": temporal_reference_type, 283 | } 284 | task_store.add(task_plan) 285 | 286 | def _generate_task(self, task_plan): 287 | question = f"What action is the person doing {task_plan['temporal reference type']} the person {task_plan['reference action']}?" 
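        # Illustrative example (hypothetical values): with temporal reference type "after" and
        # reference action "opening a door", the question renders as
        # "What action is the person doing after the person opening a door?".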
288 | 289 | answer = task_plan["action"] 290 | negatives = list(set(self.metadata.actions) - get_all_actions(self.metadata.video_scene_graphs[task_plan["video scene graph id"]])) 291 | negatives = [self.metadata.idx2name[neg] for neg in negatives] 292 | 293 | options = self._compose_options(answer, negatives) 294 | return question, answer, options, task_plan["video scene graph id"] 295 | -------------------------------------------------------------------------------- /tma/videoqa/tabletop_3d/movement_single_video_task.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | 3 | from .single_video_task import GridVideoTaskGenerator 4 | from .utils import relative_positions 5 | from ..metadata import ObjaverseVideoMetaData as MetaData 6 | from ...constant import NUM_OPTIONS, VIDEO_FPS, VIDEO_NUM_FRAMES 7 | from ...task_store import TaskStore 8 | 9 | grid_options = [2] 10 | DEFAULT_OBJECT_SIZE_MULTIPLIER = 1.3 11 | 12 | moving_options = {'left', 'right', 'up', 'down'} 13 | 14 | 15 | def direction_to_keyframes(direction): 16 | if direction == 'left': 17 | return [{}, {}, {}, {}, {'movement': (0, 0.35)}] 18 | elif direction == 'right': 19 | return [{}, {}, {}, {}, {'movement': (0, -0.35)}] 20 | elif direction == 'up': 21 | return [{}, {}, {}, {}, {'movement': (0.45, 0)}] 22 | elif direction == 'down': 23 | return [{}, {}, {}, {}, {'movement': (-0.45, 0)}] 24 | 25 | 26 | class MovementVideoGridTaskGenerator(GridVideoTaskGenerator): 27 | def _make_video_metadata(self, grid_size, grids, queries, remaining_query=..., target_object_moving_direction='left', are_other_objects_moving="No", object_size_multiplier=DEFAULT_OBJECT_SIZE_MULTIPLIER): 28 | objects = [self.metadata.sample(self.rng, 1, "object", q) for q in queries] 29 | remaining_grids = [g for g in range(grid_size ** 2) if g not in grids] 30 | for _ in remaining_grids: 31 | uid = self.metadata.sample(self.rng, 1, "object", remaining_query) 32 | objects.append(uid) 33 | 34 | remaining_moving_direction = list(moving_options - {target_object_moving_direction}) 35 | keyframes = [direction_to_keyframes(target_object_moving_direction)] 36 | if are_other_objects_moving == "Yes": 37 | remaining_keyframes = [direction_to_keyframes(self.rng.choice(remaining_moving_direction, size=1)) for _ in range(len(remaining_grids))] 38 | else: 39 | remaining_keyframes = [[{}, {}, {}, {}, {}] for _ in range(len(remaining_grids))] 40 | 41 | object_path = {k: self.metadata.get_object_path(k) for k in objects} 42 | angles = [self.metadata.sample_object_angle(self.rng, obj) for obj in objects] 43 | 44 | video_metadata = { 45 | 'grid number' : grid_size, 46 | 'objects' : objects, 47 | 'object_path' : object_path, 48 | 'object_angles' : angles, 49 | 'grids' : grids + remaining_grids, 50 | 'blender_config' : self.metadata.sample_blender_configuration(self.rng), 51 | 'fps' : VIDEO_FPS, 52 | 'total_num_frames': VIDEO_NUM_FRAMES, 53 | 'sizes' : [object_size_multiplier for _ in objects], 54 | 'keyframes' : keyframes + remaining_keyframes, 55 | } 56 | return video_metadata 57 | 58 | 59 | class WhatMovementVideoGridTaskGenerator(MovementVideoGridTaskGenerator): 60 | schema = { 61 | 'task type' : 'str', 62 | 'grid number' : 'int', 63 | 'target category' : 'str', 64 | 'absolute position' : 'str', 65 | 'attribute type' : 'str', 66 | 'attribute value' : 'str', 67 | 'moving direction' : 'str', 68 | 'are other objects moving': 'str' 69 | } 70 | 71 | def enumerate_task_plans(self, task_store: TaskStore): 72 | for 
target_category in tqdm(self.metadata.categories, desc="enumerating [what move video] task"): 73 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 74 | for attribute_type, attribute_values in attribute_dict.items(): 75 | for attribute_value in attribute_values: 76 | for grid_size in grid_options: 77 | for absolute_pos in self.grid_mappings[grid_size]: 78 | for target_object_moving_direction in moving_options: 79 | task_plan = { 80 | 'task type' : 'what move video', 81 | 'grid number' : grid_size, 82 | 'target category' : target_category, 83 | 'absolute position' : absolute_pos, 84 | 'attribute type' : attribute_type, 85 | 'attribute value' : attribute_value, 86 | 'moving direction' : target_object_moving_direction, 87 | 'are other objects moving': "Yes" 88 | } 89 | task_store.add(task_plan) 90 | 91 | task_plan = { 92 | 'task type' : 'what move video', 93 | 'grid number' : grid_size, 94 | 'target category' : target_category, 95 | 'absolute position' : absolute_pos, 96 | 'attribute type' : attribute_type, 97 | 'attribute value' : attribute_value, 98 | 'moving direction' : target_object_moving_direction, 99 | 'are other objects moving': "No" 100 | } 101 | task_store.add(task_plan) 102 | 103 | for grid_size in grid_options: 104 | for absolute_pos in self.grid_mappings[grid_size]: 105 | for target_object_moving_direction in moving_options: 106 | task_plan = { 107 | 'task type' : 'what move video', 108 | 'grid number' : grid_size, 109 | 'target category' : target_category, 110 | 'absolute position' : absolute_pos, 111 | 'moving direction' : target_object_moving_direction, 112 | 'are other objects moving': "Yes" 113 | } 114 | task_store.add(task_plan) 115 | 116 | task_plan = { 117 | 'task type' : 'what move video', 118 | 'grid number' : grid_size, 119 | 'target category' : target_category, 120 | 'absolute position' : absolute_pos, 121 | 'moving direction' : target_object_moving_direction, 122 | 'are other objects moving': "No" 123 | } 124 | task_store.add(task_plan) 125 | 126 | def _generate_task(self, task_plan): 127 | grid_size = task_plan['grid number'] 128 | target_category = task_plan['target category'] 129 | absolute_pos = task_plan['absolute position'] 130 | grids = [self.grid_mappings[grid_size][absolute_pos]] 131 | target_object_moving_direction = task_plan['moving direction'] 132 | 133 | if task_plan['are other objects moving'] == "Yes": 134 | question = f"What is the object that is moving {target_object_moving_direction} in the video?" 135 | else: 136 | question = f"What is the moving object in the video?" 
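        # Illustrative note: the 'are other objects moving' flag decides how specific the question
        # must be. When distractor objects also move, the target is disambiguated by its direction,
        # e.g. (hypothetically) "What is the object that is moving left in the video?"; when only
        # the target moves, the generic phrasing above is sufficient.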
137 | 138 | queries = [self._get_target_object_query(task_plan)] 139 | 140 | remaining_query = self.metadata.and_query([("category", target_category, False)]) 141 | 142 | video_metadata = self._make_video_metadata( 143 | grid_size, 144 | grids, 145 | queries=queries, 146 | remaining_query=remaining_query, 147 | target_object_moving_direction=target_object_moving_direction, 148 | are_other_objects_moving=task_plan['are other objects moving'], 149 | object_size_multiplier=DEFAULT_OBJECT_SIZE_MULTIPLIER 150 | ) 151 | 152 | answer = self.metadata.get_surfacename(target_category) 153 | negatives = [self.metadata.get_surfacename(self.metadata.sample_category_for_object(self.rng, o, target_category)) 154 | for o in video_metadata['objects'][1:]] 155 | options = self._compose_options(answer, negatives) 156 | 157 | return question, answer, options, video_metadata 158 | 159 | 160 | class WhatAttributeMovementVideoGridTaskGenerator(MovementVideoGridTaskGenerator): 161 | schema = { 162 | 'task type' : 'str', 163 | 'grid number' : 'int', 164 | 'target category' : 'str', 165 | 'absolute position' : 'str', 166 | 'attribute type' : 'str', 167 | 'attribute value' : 'str', 168 | 'moving direction' : 'str', 169 | 'are other objects moving': 'str' 170 | } 171 | 172 | def enumerate_task_plans(self, task_store: TaskStore): 173 | for target_category in tqdm(self.metadata.categories, desc="enumerating [what attribute move video] task"): 174 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 175 | for attribute_type, attribute_values in attribute_dict.items(): 176 | for attribute_value in attribute_values: 177 | for grid_size in grid_options: 178 | for absolute_pos in self.grid_mappings[grid_size]: 179 | for target_object_moving_direction in moving_options: 180 | task_plan = { 181 | 'task type' : 'what attribute move video', 182 | 'grid number' : grid_size, 183 | 'target category' : target_category, 184 | 'absolute position' : absolute_pos, 185 | 'attribute type' : attribute_type, 186 | 'attribute value' : attribute_value, 187 | 'moving direction' : target_object_moving_direction, 188 | 'are other objects moving': "Yes" 189 | } 190 | task_store.add(task_plan) 191 | 192 | task_plan = { 193 | 'task type' : 'what attribute move video', 194 | 'grid number' : grid_size, 195 | 'target category' : target_category, 196 | 'absolute position' : absolute_pos, 197 | 'attribute type' : attribute_type, 198 | 'attribute value' : attribute_value, 199 | 'moving direction' : target_object_moving_direction, 200 | 'are other objects moving': "No" 201 | } 202 | task_store.add(task_plan) 203 | 204 | def _generate_task(self, task_plan): 205 | grid_size = task_plan['grid number'] 206 | 207 | attribute_type = task_plan['attribute type'] 208 | absolute_pos = task_plan['absolute position'] 209 | target_object_moving_direction = task_plan['moving direction'] 210 | grids = [self.grid_mappings[grid_size][absolute_pos]] 211 | 212 | queries = [self._get_target_object_query(task_plan)] 213 | if task_plan['are other objects moving'] == "Yes": 214 | question = f"What is the {attribute_type} of the object that is moving {target_object_moving_direction} in the video?" 215 | else: 216 | question = f"What is the {attribute_type} of the moving object in the video?" 
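        # Illustrative note (hypothetical attribute type): for attribute_type "color" and moving
        # direction "down", the disambiguated variant reads
        # "What is the color of the object that is moving down in the video?".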
217 | 218 | video_metadata = self._make_video_metadata( 219 | grid_size, 220 | grids, 221 | queries=queries, 222 | target_object_moving_direction=target_object_moving_direction, 223 | are_other_objects_moving=task_plan['are other objects moving'], 224 | object_size_multiplier=DEFAULT_OBJECT_SIZE_MULTIPLIER 225 | ) 226 | 227 | answer = task_plan['attribute value'] 228 | target_object = video_metadata['objects'][0] 229 | negative_query = self.metadata.and_query([ 230 | (attribute_type, a, False) for a in self.metadata.query_metadata(attribute_type, self.metadata.and_query([("object", target_object, True)])) 231 | ]) 232 | negatives = self.metadata.sample( 233 | self.rng, 234 | NUM_OPTIONS - 1, 235 | attribute_type, 236 | query=negative_query, 237 | ) 238 | options = [answer] + negatives 239 | return question, answer, options, video_metadata 240 | 241 | 242 | class WhereMovementVideoGridTaskGenerator(MovementVideoGridTaskGenerator): 243 | schema = { 244 | 'task type' : 'str', 245 | 'grid number' : 'int', 246 | 'target category' : 'str', 247 | 'absolute position' : 'str', 248 | 'attribute type' : 'str', 249 | 'attribute value' : 'str', 250 | 'moving direction' : 'str', 251 | 'are other objects moving': 'str' 252 | } 253 | 254 | def __init__(self, metadata: MetaData, seed=42): 255 | super().__init__(metadata, seed=seed) 256 | self.relative_positions = relative_positions 257 | 258 | def enumerate_task_plans(self, task_store: TaskStore): 259 | for target_category in tqdm(self.metadata.categories, desc="enumerating [where move video] task"): 260 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 261 | for attribute_type, attribute_values in attribute_dict.items(): 262 | for attribute_value in attribute_values: 263 | for grid_size in grid_options: 264 | for absolute_pos in self.grid_mappings[grid_size]: 265 | for target_object_moving_direction in moving_options: 266 | task_plan = { 267 | 'task type' : 'where move video', 268 | 'grid number' : grid_size, 269 | 'target category' : target_category, 270 | 'absolute position' : absolute_pos, 271 | 'attribute type' : attribute_type, 272 | 'attribute value' : attribute_value, 273 | 'moving direction' : target_object_moving_direction, 274 | 'are other objects moving': "Yes" 275 | } 276 | task_store.add(task_plan) 277 | 278 | task_plan = { 279 | 'task type' : 'where move video', 280 | 'grid number' : grid_size, 281 | 'target category' : target_category, 282 | 'absolute position' : absolute_pos, 283 | 'attribute type' : attribute_type, 284 | 'attribute value' : attribute_value, 285 | 'moving direction' : target_object_moving_direction, 286 | 'are other objects moving': "No" 287 | } 288 | task_store.add(task_plan) 289 | 290 | for grid_size in grid_options: 291 | for absolute_pos in self.grid_mappings[grid_size]: 292 | for target_object_moving_direction in moving_options: 293 | task_plan = { 294 | 'task type' : 'where move video', 295 | 'grid number' : grid_size, 296 | 'target category' : target_category, 297 | 'absolute position' : absolute_pos, 298 | 'moving direction' : target_object_moving_direction, 299 | 'are other objects moving': "Yes" 300 | } 301 | task_store.add(task_plan) 302 | 303 | task_plan = { 304 | 'task type' : 'where move video', 305 | 'grid number' : grid_size, 306 | 'target category' : target_category, 307 | 'absolute position' : absolute_pos, 308 | 'moving direction' : target_object_moving_direction, 309 | 'are other objects moving': "No" 310 | } 311 | task_store.add(task_plan) 312 | 313 | def 
_generate_task(self, task_plan): 314 | grid_size = task_plan['grid number'] 315 | 316 | target_category = task_plan['target category'] 317 | categories = [target_category] 318 | queries = [self._get_target_object_query(task_plan)] 319 | absolute_pos = task_plan['absolute position'] 320 | grids = [self.grid_mappings[grid_size][absolute_pos]] 321 | target_object_moving_direction = task_plan['moving direction'] 322 | 323 | if task_plan['are other objects moving'] == "Yes": 324 | question = f"Where is the object that is moving {target_object_moving_direction} located in the video?" 325 | else: 326 | question = f"Where is the moving object located in the video?" 327 | answer = absolute_pos 328 | negatives = [o for o in self.grid_mappings[grid_size].keys() if o != answer] 329 | 330 | options = self._compose_options(answer, negatives) 331 | video_metadata = self._make_video_metadata( 332 | grid_size, 333 | grids, 334 | queries=queries, 335 | remaining_query=self.metadata.and_query([("category", c, False) for c in categories]), 336 | target_object_moving_direction=target_object_moving_direction, 337 | are_other_objects_moving=task_plan['are other objects moving'], 338 | object_size_multiplier=DEFAULT_OBJECT_SIZE_MULTIPLIER 339 | ) 340 | 341 | return question, answer, options, video_metadata 342 | -------------------------------------------------------------------------------- /tma/imageqa/tabletop_3d/run_blender.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import math 4 | import os 5 | import sys 6 | import urllib.request 7 | from math import radians 8 | 9 | try: 10 | import bpy 11 | from mathutils import Vector, Matrix, Quaternion, Euler 12 | except ImportError: 13 | pass 14 | 15 | 16 | def rotate(obj, degree): 17 | """Rotates around the z axis by theta""" 18 | degree = -degree 19 | bpy.ops.object.select_all(action='DESELECT') 20 | obj.select_set(True) 21 | bpy.context.view_layer.objects.active = obj 22 | radian = radians(degree) 23 | bpy.context.object.rotation_mode = 'XYZ' 24 | rot_x, rot_y, rot_z = obj.rotation_euler 25 | obj.rotation_euler = Euler((rot_x, rot_y, rot_z + radian)) 26 | freeze_transformation(obj) 27 | 28 | 29 | def reset_scene(): 30 | # delete everything that isn't part of a camera or a light 31 | bpy.ops.object.select_all(action="SELECT") 32 | for obj in bpy.data.objects: 33 | bpy.data.objects.remove(obj, do_unlink=True) 34 | bpy.ops.ptcache.free_bake_all() 35 | 36 | 37 | def select_hierarchy(obj): 38 | """Recursively select an object and all of its descendants.""" 39 | obj.select_set(True) 40 | for child in obj.children: 41 | select_hierarchy(child) 42 | 43 | 44 | def load_object(object_path: str) -> None: 45 | """Loads a glb model into the scene.""" 46 | bpy.ops.object.select_all(action='DESELECT') 47 | if object_path.endswith(".glb"): 48 | bpy.ops.import_scene.gltf(filepath=object_path, merge_vertices=True) 49 | elif object_path.endswith(".fbx"): 50 | bpy.ops.import_scene.fbx(filepath=object_path) 51 | else: 52 | raise ValueError(f"Unsupported file type: {object_path}") 53 | 54 | base_name = os.path.basename(object_path) 55 | object_name, _ = os.path.splitext(base_name) 56 | bpy.context.view_layer.objects.active.name = object_name 57 | bpy.ops.object.select_all(action='DESELECT') 58 | 59 | obj = bpy.data.objects.get(object_name) 60 | # bpy.context.view_layer.objects.active = obj 61 | select_hierarchy(obj) 62 | bpy.ops.object.transform_apply(location=True, rotation=True, scale=True) 63 | meshes = 
[obj for obj in bpy.context.selected_objects if obj.type == "MESH"] 64 | non_meshes = [obj for obj in bpy.context.selected_objects if obj.type != "MESH"] 65 | bpy.ops.object.select_all(action="DESELECT") 66 | 67 | # delete non-mesh and consolidate 68 | 69 | for obj in non_meshes: 70 | obj.select_set(True) 71 | bpy.ops.object.delete() 72 | bpy.ops.object.select_all(action="DESELECT") 73 | for obj in meshes: 74 | obj.select_set(True) 75 | bpy.context.view_layer.objects.active = meshes[0] 76 | bpy.ops.object.join() 77 | bpy.context.view_layer.objects.active.name = object_name 78 | bpy.ops.object.origin_set(type='GEOMETRY_ORIGIN', center='BOUNDS') 79 | bpy.ops.object.select_all(action="DESELECT") 80 | 81 | return object_name 82 | 83 | 84 | def scene_meshes(): 85 | for obj in bpy.context.scene.objects.values(): 86 | if isinstance(obj.data, (bpy.types.Mesh)): 87 | yield obj 88 | 89 | 90 | def download_uid(uid_path, save_dir): 91 | return download_object(uid_path, save_dir) 92 | 93 | 94 | def download_object(object_url, save_dir) -> str: 95 | """Download the object and return the path.""" 96 | # uid = uuid.uuid4() 97 | uid = object_url.split("/")[-1].split(".")[0] 98 | tmp_local_path = os.path.join(save_dir, f"{uid}.glb" + ".tmp") 99 | local_path = os.path.join(save_dir, f"{uid}.glb") 100 | # wget the file and put it in local_path 101 | os.makedirs(os.path.dirname(tmp_local_path), exist_ok=True) 102 | urllib.request.urlretrieve(object_url, tmp_local_path) 103 | os.rename(tmp_local_path, local_path) 104 | # get the absolute path 105 | local_path = os.path.abspath(local_path) 106 | return local_path 107 | 108 | 109 | def scene_bbox(single_obj=None, ignore_matrix=False): 110 | bbox_min = (math.inf,) * 3 111 | bbox_max = (-math.inf,) * 3 112 | found = False 113 | for obj in scene_meshes() if single_obj is None else [single_obj]: 114 | found = True 115 | for coord in obj.bound_box: 116 | coord = Vector(coord) 117 | if not ignore_matrix: 118 | coord = obj.matrix_world @ coord 119 | bbox_min = tuple(min(x, y) for x, y in zip(bbox_min, coord)) 120 | bbox_max = tuple(max(x, y) for x, y in zip(bbox_max, coord)) 121 | if not found: 122 | raise RuntimeError("no objects in scene to compute bounding box for") 123 | return Vector(bbox_min), Vector(bbox_max) 124 | 125 | 126 | def scene_root_objects(): 127 | for obj in bpy.context.scene.objects.values(): 128 | if not obj.parent: 129 | yield obj 130 | 131 | 132 | def freeze_transformation(obj): 133 | bpy.context.view_layer.objects.active = obj 134 | obj.select_set(True) 135 | bpy.ops.object.transform_apply(location=True, rotation=True, scale=True) 136 | bpy.ops.object.select_all(action='DESELECT') 137 | 138 | 139 | def scale(obj, scale_factor): 140 | bpy.ops.object.select_all(action='DESELECT') 141 | obj.select_set(True) 142 | bpy.ops.transform.resize(value=(scale_factor, scale_factor, scale_factor)) 143 | bpy.ops.object.select_all(action='DESELECT') 144 | freeze_transformation(obj) 145 | 146 | 147 | def get_3d_dimensions(obj): 148 | # pdb.set_trace() 149 | max_x, max_y, max_z = float("-inf"), float("-inf"), float("-inf") 150 | min_x, min_y, min_z = float("inf"), float("inf"), float("inf") 151 | 152 | for vertex in obj.data.vertices: 153 | v_world = obj.matrix_world @ vertex.co 154 | max_x, max_y, max_z = max(max_x, v_world.x), max(max_y, v_world.y), max(max_z, v_world.z) 155 | min_x, min_y, min_z = min(min_x, v_world.x), min(min_y, v_world.y), min(min_z, v_world.z) 156 | 157 | return (max_x - min_x, max_y - min_y, max_z - min_z) 158 | 159 | 160 | def 
normalize_object(obj, factor=1.0): 161 | max_dimension = max(get_3d_dimensions(obj)) 162 | scale_factor = factor * (1 / max_dimension) 163 | scale(obj, scale_factor) 164 | 165 | 166 | def move_to_xy(obj, x, y): 167 | min_z = float('inf') 168 | for vertex in obj.data.vertices: 169 | z = obj.matrix_world @ vertex.co 170 | min_z = min(min_z, z.z) 171 | obj.location -= Vector((0, 0, min_z)) 172 | freeze_transformation(obj) 173 | 174 | # move location x,y to sampled box center 175 | new_location = Vector((x, y, obj.location[2])) 176 | obj.location = new_location 177 | freeze_transformation(obj) 178 | 179 | 180 | def normalize_scene(): 181 | bbox_min, bbox_max = scene_bbox() 182 | scale = 1 / max(bbox_max - bbox_min) 183 | for obj in scene_root_objects(): 184 | obj.scale = obj.scale * scale 185 | # Apply scale to matrix_world. 186 | bpy.context.view_layer.update() 187 | bbox_min, bbox_max = scene_bbox() 188 | offset = -(bbox_min + bbox_max) / 2 189 | for obj in scene_root_objects(): 190 | obj.matrix_world.translation += offset 191 | bpy.ops.object.select_all(action="DESELECT") 192 | 193 | 194 | def setup_plane_and_background(plane_texture_path, hdri_path): 195 | # load plane 196 | plane_name = load_object(plane_texture_path) 197 | plane = bpy.data.objects.get(plane_name) 198 | scale(plane, 0.5) 199 | 200 | # load light map 201 | print(f"HDRI PATH: {hdri_path}") 202 | bpy.ops.image.open(filepath=hdri_path) 203 | if bpy.data.worlds.get("World") is None: 204 | bpy.data.worlds.new("World") 205 | 206 | bpy.context.scene.world = bpy.data.worlds["World"] 207 | 208 | bpy.context.scene.world.use_nodes = True 209 | tree = bpy.context.scene.world.node_tree 210 | tree.nodes.clear() 211 | 212 | tex_env = tree.nodes.new(type="ShaderNodeTexEnvironment") 213 | tex_env.image = bpy.data.images[hdri_path.split('/')[-1]] # Image name is typically the last part of the path 214 | background = tree.nodes.new(type="ShaderNodeBackground") 215 | output = tree.nodes.new(type="ShaderNodeOutputWorld") 216 | 217 | tree.links.new(tex_env.outputs[0], background.inputs[0]) 218 | tree.links.new(background.outputs[0], output.inputs[0]) 219 | 220 | return plane_texture_path + " " + hdri_path 221 | 222 | 223 | def setup_camera_and_lights( 224 | sun_x, 225 | sun_y, 226 | sun_energy, 227 | key_light_horizontal_angle, 228 | fill_light_horizontal_angle, 229 | key_light_vertical_angle, 230 | fill_light_vertical_angle 231 | ): 232 | # for seeting up the three point lighting, we mostly follow https://courses.cs.washington.edu/courses/cse458/05au/reading/3point_lighting.pdf 233 | # in order to keep lights and camera on the hemisphere pointing to origin, we use a hierarchy of empties 234 | 235 | # create the sun 236 | 237 | bpy.ops.object.light_add(type="SUN") 238 | sun = bpy.context.active_object 239 | sun.rotation_euler = Euler((sun_x, sun_y, 0), "XYZ") 240 | sun.data.energy = sun_energy 241 | 242 | # create global empty 243 | 244 | bpy.ops.object.empty_add(type='PLAIN_AXES', align='WORLD', location=(0, 0, 0), scale=(1, 1, 1)) 245 | x_rot, y_rot, z_rot = radians(90), radians(0), radians(-90) 246 | empty = bpy.context.scene.objects.get("Empty") 247 | 248 | # create camera 249 | 250 | # radius = random.uniform(1.8,2.2) 251 | radius = 2.5 252 | 253 | bpy.ops.object.camera_add(enter_editmode=False, align='VIEW', location=(-radius, 0, 0), rotation=Euler((x_rot, y_rot, z_rot), "XYZ"), scale=(1, 1, 1)) 254 | cam = bpy.context.scene.objects.get("Camera") 255 | cam.data.lens = 35 256 | cam.data.sensor_width = 32 257 | bpy.context.scene.camera = 
cam 258 | 259 | # create camera empty 260 | 261 | bpy.ops.object.empty_add(type='PLAIN_AXES', align='WORLD', location=(0, 0, 0), scale=(1, 1, 1)) 262 | x_rot, y_rot, z_rot = radians(90), radians(0), radians(-90) 263 | cam_empty = bpy.context.scene.objects.get("Empty.001") 264 | cam_empty.name = "camera_empty" 265 | 266 | # make camera empty parent of camera 267 | 268 | bpy.ops.object.select_all(action='DESELECT') 269 | cam.select_set(True) 270 | cam_empty.select_set(True) 271 | bpy.context.view_layer.objects.active = cam_empty 272 | bpy.ops.object.parent_set() 273 | bpy.ops.object.select_all(action='DESELECT') 274 | 275 | # make camera empty parent of global empty 276 | 277 | bpy.ops.object.select_all(action='DESELECT') 278 | cam_empty.select_set(True) 279 | empty.select_set(True) 280 | bpy.context.view_layer.objects.active = empty 281 | bpy.ops.object.parent_set() 282 | bpy.ops.object.select_all(action='DESELECT') 283 | 284 | light_names = ["key_light", "fill_light", "back_light"] 285 | light_energies = [1000., 300., 500.] 286 | 287 | for light_name, light_energy in zip(light_names, light_energies): 288 | # create light empty 289 | 290 | empty_name = light_name + "_empty" 291 | bpy.ops.object.empty_add(type='PLAIN_AXES', align='WORLD', location=(0, 0, 0), scale=(1, 1, 1)) 292 | x_rot, y_rot, z_rot = radians(90), radians(0), radians(-90) 293 | light_empty = bpy.context.scene.objects.get("Empty.001") 294 | light_empty.name = empty_name 295 | 296 | # parent light empty to main (camera) empty 297 | 298 | bpy.ops.object.select_all(action='DESELECT') 299 | light_empty.select_set(True) 300 | empty.select_set(True) 301 | bpy.context.view_layer.objects.active = empty 302 | bpy.ops.object.parent_set() 303 | bpy.ops.object.select_all(action='DESELECT') 304 | 305 | # create light 306 | 307 | x_loc, y_loc, z_loc = -radius, 0, 0 308 | bpy.ops.object.light_add(type='POINT', radius=1, align='WORLD', location=(x_loc, y_loc, z_loc), rotation=Euler((x_rot, y_rot, z_rot), "XYZ"), scale=(1, 1, 1)) 309 | bpy.data.objects["Point"].name = light_name 310 | light = bpy.data.objects[light_name] 311 | light.data.energy = light_energy 312 | # light.data.size = 0.5 313 | 314 | # parent light empty to light 315 | 316 | bpy.ops.object.select_all(action='DESELECT') 317 | light.select_set(True) 318 | light_empty.select_set(True) 319 | bpy.context.view_layer.objects.active = light_empty 320 | bpy.ops.object.parent_set() 321 | bpy.ops.object.select_all(action='DESELECT') 322 | 323 | # rotate camera and lights around the z-axis 324 | 325 | z_random_rot = radians(90) # radians(random.uniform(0,360)) 326 | empty.rotation_euler = Euler((0, 0, z_random_rot)) 327 | 328 | # # raise the camera while having it point to origin 329 | 330 | # cam_y_random_rot = radians(random.uniform(10,50)) 331 | # cam_empty.rotation_euler = Euler((0,cam_y_random_rot,0),"XYZ") 332 | 333 | bpy.context.view_layer.update() 334 | 335 | back_light_horizontal_angle = radians(180) 336 | light_horizontal_angles = [key_light_horizontal_angle, fill_light_horizontal_angle, back_light_horizontal_angle] 337 | for light_angle, light_name in zip(light_horizontal_angles, light_names): 338 | light_empty = bpy.data.objects[light_name + "_empty"] 339 | global_z = (light_empty.matrix_world.inverted() @ Vector((0.0, 0.0, 1.0, 0.0)))[:3] 340 | quat = Quaternion(global_z, light_angle) 341 | light_empty.rotation_euler = quat.to_euler() 342 | 343 | back_light_vertical_angle = 0 344 | light_vertical_angles = [key_light_vertical_angle, fill_light_vertical_angle, 
back_light_vertical_angle] 345 | # light_vertical_angles = [radians(-45)]*3 346 | 347 | for light_angle, light_name in zip(light_vertical_angles, light_names): 348 | light_empty = bpy.data.objects[light_name + "_empty"] 349 | global_x = (light_empty.matrix_world.inverted() @ Vector((1.0, 0.0, 0.0, 0.0)))[:3] 350 | quat = Quaternion(global_x, light_angle) 351 | euler_add = quat.to_euler() 352 | euler_current = light_empty.rotation_euler 353 | new_euler = Euler((euler_add[0] + euler_current[0], euler_add[1] + euler_current[1], euler_add[2] + euler_current[2])) 354 | light_empty.rotation_euler = new_euler 355 | 356 | # bpy.context.view_layer.update() 357 | 358 | return cam, empty 359 | 360 | 361 | def render(fp): 362 | # Render image 363 | bpy.context.view_layer.update() 364 | bpy.context.scene.render.filepath = fp 365 | bpy.ops.render.render(write_still=True) 366 | 367 | 368 | def setup_renderer(H, W, use_cpu=False): 369 | scene = bpy.context.scene 370 | render = bpy.context.scene.render 371 | 372 | render.engine = "CYCLES" 373 | render.image_settings.file_format = "PNG" 374 | render.image_settings.color_mode = "RGBA" 375 | render.resolution_x = W 376 | render.resolution_y = H 377 | render.resolution_percentage = 100 378 | 379 | scene.cycles.device = "CPU" if use_cpu else "GPU" 380 | scene.cycles.samples = 10 if use_cpu else 128 381 | scene.cycles.diffuse_bounces = 1 382 | scene.cycles.glossy_bounces = 1 383 | scene.cycles.transparent_max_bounces = 3 384 | scene.cycles.transmission_bounces = 3 385 | scene.cycles.filter_width = 0.01 386 | scene.cycles.use_denoising = True 387 | scene.render.film_transparent = False 388 | 389 | bpy.context.preferences.addons["cycles"].preferences.get_devices() 390 | # Set the device_type 391 | bpy.context.preferences.addons[ 392 | "cycles" 393 | ].preferences.compute_device_type = "METAL" if use_cpu else "CUDA" 394 | bpy.context.scene.view_settings.view_transform = 'Filmic' 395 | 396 | 397 | # def randomize_camera_view(axis): 398 | # euler_y = radians(random.uniform(-90, 90)) 399 | # euler_z = radians(random.uniform(0, 360)) 400 | # axis.rotation_euler = Euler((0, euler_y, euler_z)) 401 | 402 | 403 | def run_render(metadata, save_image_path, use_cpu): 404 | reset_scene() 405 | 406 | objs = [] 407 | for uid in metadata["objects"]: 408 | object_path = metadata["object_path"][uid] 409 | objs.append(bpy.data.objects.get(load_object(object_path))) 410 | 411 | grid_number = metadata["grid number"] 412 | 413 | if grid_number == 2: 414 | locations = { 415 | 0: [0.7, 0.5], 416 | 1: [0.7, -0.5], 417 | 2: [-0.6, 0.5], 418 | 3: [-0.6, -0.5] 419 | } 420 | scale_factor = 1 / 2 421 | elif grid_number == 3: 422 | locations = { 423 | 0: [0.9, 0.6], 424 | 1: [0.9, 0], 425 | 2: [0.9, -0.6], 426 | 3: [0.0, 0.6], 427 | 4: [0.0, 0.0], 428 | 5: [0.0, -0.6], 429 | 6: [-0.9, 0.6], 430 | 7: [-0.9, 0.0], 431 | 8: [-0.9, -0.6] 432 | } 433 | scale_factor = 1 / 3 434 | else: 435 | raise ValueError(f"Expected grid number to be 2 or 3 but got {grid_number}") 436 | 437 | # process rotate 438 | for idx, obj in enumerate(objs): 439 | rotate(obj, degree=metadata['object_angles'][idx]) 440 | 441 | # process scale 442 | if "sizes" in metadata: 443 | for idx, obj in enumerate(objs): 444 | normalize_object(obj, factor=metadata['sizes'][idx] * scale_factor) 445 | else: 446 | for obj in objs: 447 | normalize_object(obj, factor=scale_factor) 448 | 449 | for pos, obj in zip(metadata["grids"], objs): 450 | x, y = locations[pos] 451 | move_to_xy(obj, x, y) 452 | 453 | blender_config = 
metadata["blender_config"] 454 | 455 | setup_plane_and_background(blender_config["plane_texture_path"], blender_config["hdri_path"]) 456 | cam, axis = setup_camera_and_lights( 457 | blender_config["sun_x"], 458 | blender_config["sun_y"], 459 | blender_config["sun_energy"], 460 | blender_config["key_light_horizontal_angle"], 461 | blender_config["fill_light_horizontal_angle"], 462 | blender_config["key_light_vertical_angle"], 463 | blender_config["fill_light_vertical_angle"] 464 | ) 465 | axis.rotation_euler = Euler((0, radians(45), 0)) 466 | setup_renderer(H=metadata["H"], W=metadata["W"], use_cpu=use_cpu) 467 | render(save_image_path) 468 | 469 | 470 | if __name__ == "__main__": 471 | parser = argparse.ArgumentParser() 472 | parser.add_argument( 473 | "--save_local", 474 | type=str, 475 | default="" 476 | ) 477 | parser.add_argument( 478 | "--save_image_path", 479 | type=str, 480 | default="render.png" 481 | ) 482 | parser.add_argument( 483 | "--json_file", 484 | type=str, 485 | default="image_metadata.json" 486 | ) 487 | 488 | parser.add_argument( 489 | "--use_cpu", 490 | action="store_true", 491 | default=False 492 | ) 493 | 494 | argv = sys.argv[sys.argv.index("--") + 1:] 495 | args = parser.parse_args(argv) 496 | 497 | with open(args.json_file, "r") as f: 498 | metadata = json.load(f) 499 | 500 | run_render(metadata, args.save_image_path, args.use_cpu) 501 | -------------------------------------------------------------------------------- /tma/imageqa/scene_graph/single_image_task.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import json 3 | from itertools import combinations 4 | from typing import List, Tuple 5 | 6 | import numpy as np 7 | from PIL import Image 8 | from tqdm import tqdm 9 | 10 | from ..metadata import SceneGraphMetaData 11 | from ...base import TaskGenerator 12 | from ...task_store import TaskStore 13 | 14 | 15 | def scene_graph_adjacent_objects(scene_graph, node): 16 | adjacent_objects = {} 17 | for edge in scene_graph["objects"][node]['relations']: 18 | obj = edge['object'] 19 | if obj not in adjacent_objects: 20 | adjacent_objects[obj] = [] 21 | adjacent_objects[obj].append((edge['name'], 0)) 22 | 23 | for obj, edges in scene_graph["objects"].items(): 24 | for edge in edges["relations"]: 25 | if edge['object'] == node: 26 | if obj not in adjacent_objects: 27 | adjacent_objects[obj] = [] 28 | adjacent_objects[obj].append((edge['name'], 1)) 29 | return adjacent_objects 30 | 31 | 32 | def subgraph_to_json_str(subgraph, scene_graph): 33 | subgraph_json = { 34 | "attributes" : [], 35 | "adjacent_objects": [], 36 | } 37 | adjacent_object_info = {} 38 | for element in subgraph: 39 | if isinstance(element, str): 40 | subgraph_json["attributes"].append(element) 41 | else: 42 | if len(element) == 2: 43 | obj, attr = element 44 | if obj not in adjacent_object_info: 45 | adjacent_object_info[obj] = { 46 | "attributes": [attr], 47 | "relation" : None 48 | } 49 | else: 50 | adjacent_object_info[obj]["attributes"].append(attr) 51 | else: 52 | obj, rel, direction = element 53 | if obj not in adjacent_object_info: 54 | adjacent_object_info[obj] = { 55 | "attributes": [], 56 | "relation" : (rel, direction) 57 | } 58 | else: 59 | adjacent_object_info[obj]["relation"] = (rel, direction) 60 | 61 | for obj, info in adjacent_object_info.items(): 62 | subgraph_json["adjacent_objects"].append({ 63 | "object" : scene_graph["objects"][obj]["name"], 64 | "attributes": sorted(info["attributes"]), 65 | "relation" : info["relation"] 
66 | }) 67 | subgraph_json["attributes"] = sorted(subgraph_json["attributes"]) 68 | subgraph_json["adjacent_objects"] = sorted(subgraph_json["adjacent_objects"], key=lambda x: json.dumps(x)) 69 | return json.dumps(subgraph_json) 70 | 71 | 72 | def constrained_combinations(n, k, constraints): 73 | """ 74 | Generate all combinations of k elements from n elements that satisfy the constraints 75 | :param n: 76 | :param k: 77 | :param constraints: a list of tuples (i, j) that means that when i is not selected, i+1 ~ j should not be selected 78 | :return: a binary array of shape (x, n) where each row represents a valid combination 79 | """ 80 | combo = np.array(list(combinations(range(n), k))) 81 | selection = np.zeros((len(combo), n), dtype=bool) 82 | selection[np.arange(len(combo))[:, None], combo] = 1 83 | for start, end in constraints: 84 | selection = selection[~((selection[:, start] == 0) & (np.any(selection[:, start + 1:end], axis=1)))] 85 | return selection 86 | 87 | 88 | def compose_parallel_phrase(phrases): 89 | if len(phrases) == 0: 90 | return "" 91 | elif len(phrases) == 1: 92 | return phrases[0] 93 | elif len(phrases) == 2: 94 | return f"{phrases[0]} and {phrases[1]}" 95 | else: 96 | phrases[-1] = "and " + phrases[-1] 97 | return ", ".join(phrases) 98 | 99 | 100 | def compose_attributed_name(attributes, name): 101 | if len(attributes) > 0: 102 | attributes = compose_parallel_phrase(attributes) 103 | return f"{attributes} {name}" 104 | else: 105 | return name 106 | 107 | 108 | @functools.lru_cache(maxsize=100000) 109 | def compose_object_reference(subgraph: str): 110 | subgraph = json.loads(subgraph) 111 | 112 | # Helper function to create relation phrases 113 | def create_relation_phrase(attributed_name, relation_name, is_forward=True): 114 | return f"is {relation_name} the {attributed_name}" if is_forward else f"the {attributed_name} is {relation_name}" 115 | 116 | # Process relations 117 | forward_relations, backward_relations = [], [] 118 | 119 | for idx, node in enumerate(subgraph['adjacent_objects']): 120 | rel = node['relation'] 121 | attributed_name = compose_attributed_name(node.get("attributes", []), node['object']) 122 | if rel[1] == 0: 123 | forward_relations.append(create_relation_phrase(attributed_name, rel[0], True)) 124 | else: 125 | backward_relations.append(create_relation_phrase(attributed_name, rel[0], False)) 126 | 127 | # Combine relations into reference string 128 | reference = "" 129 | if forward_relations: 130 | reference += compose_parallel_phrase(forward_relations) 131 | if backward_relations: 132 | if forward_relations: 133 | reference += ", and also, " 134 | reference += compose_parallel_phrase(backward_relations) 135 | return reference 136 | 137 | 138 | def subgraph_contain_multiple_same_direction_relations(subgraph): 139 | out_rel = False 140 | in_rel = False 141 | for item in subgraph: 142 | if len(item) == 3: 143 | if item[2] == 0: 144 | if out_rel: 145 | return True 146 | out_rel = True 147 | else: 148 | if in_rel: 149 | return True 150 | in_rel = True 151 | return False 152 | 153 | 154 | def subgraph_contain_multiple_relations(subgraph): 155 | rel = False 156 | for item in subgraph: 157 | if isinstance(item, tuple) and len(item) == 3: 158 | if rel: 159 | return True 160 | rel = True 161 | return False 162 | 163 | 164 | class SceneGraphTaskGenerator(TaskGenerator): 165 | metadata: SceneGraphMetaData 166 | 167 | embed_schema = [ 168 | "task type", 169 | "object", 170 | "attribute value", 171 | "attribute type", 172 | "relation", 173 | "source 
object", 174 | "target object" 175 | ] 176 | 177 | def __init__(self, metadata: SceneGraphMetaData, subgraph_size=4, n_subgraph_per_answer=1, max_scene_graph_size=10000, seed=42): 178 | super().__init__(metadata, seed=seed) 179 | self.subgraph_size = subgraph_size 180 | self.n_subgraph_per_answer = n_subgraph_per_answer 181 | self.max_scene_graph_size = max_scene_graph_size 182 | 183 | def _enumerate_subgraphs_w_object( 184 | self, 185 | scene_graph, 186 | start_node, 187 | exclude_attribute_type=None, 188 | exclude_object=[] 189 | ): 190 | 191 | stamp = [] 192 | elements = [ 193 | attr for attr in scene_graph["objects"][start_node]['attributes'] 194 | if exclude_attribute_type is None or self.metadata.get_attribute_type(attr) != exclude_attribute_type 195 | ] 196 | adjacent_objects = scene_graph_adjacent_objects(scene_graph, start_node) 197 | for obj in adjacent_objects: 198 | if obj not in exclude_object: 199 | start = len(elements) 200 | elements.append(obj) 201 | elements += [(obj, attr) for attr in scene_graph["objects"][obj]['attributes']] 202 | stamp.append((start, len(elements))) 203 | if len(elements) < self.subgraph_size: 204 | return [] 205 | 206 | # sample all subgraphs that contain the start node with the given size 207 | selection = constrained_combinations(len(elements), self.subgraph_size, stamp) 208 | 209 | # distinguish subgraphs with and without the objects 210 | with_object_mask = np.any(selection[:, [start for start, _ in stamp]], axis=1) 211 | subgraphs_w_objects = [[elements[i] for i in np.where(indices)[0]] for indices in selection[with_object_mask]] 212 | subgraphs_wo_objects = [[elements[i] for i in np.where(indices)[0]] for indices in selection[~with_object_mask]] 213 | 214 | # for subgraph with object, add its all possible relations to the start node 215 | for obj, rels in adjacent_objects.items(): 216 | new_subgraphs = [] 217 | for subgraph in subgraphs_w_objects: 218 | if obj in subgraph: 219 | obj_id = subgraph.index(obj) 220 | for rel, direction in rels: 221 | subgraph_rel = subgraph.copy() 222 | subgraph_rel[obj_id] = (obj, rel, direction) 223 | # remove subgraphs with multiple same-direction relations 224 | if not subgraph_contain_multiple_relations(subgraph_rel): 225 | new_subgraphs.append(subgraph_rel) 226 | else: 227 | new_subgraphs.append(subgraph) 228 | subgraphs_w_objects = new_subgraphs 229 | 230 | subgraph_json_strs = [subgraph_to_json_str(subgraph, scene_graph) 231 | for subgraph in subgraphs_w_objects + subgraphs_wo_objects] 232 | 233 | return set(subgraph_json_strs) 234 | 235 | def _task_plan_to_str(self, task_plan) -> str: 236 | t = [] 237 | for k, v in task_plan.items(): 238 | if k in self.embed_schema: 239 | assert isinstance(v, str) 240 | t.append(f'{k}: {v}') 241 | return '\n'.join(t) 242 | 243 | def _generate_task(self, task_plan) -> Tuple[str, str, List[str], str]: 244 | "(Abstract method) generate task" 245 | 246 | def generate(self, task_plan, return_data=True, seed=None): 247 | if seed is not None: 248 | self.rng = np.random.default_rng(seed=seed) 249 | 250 | question, answer, options, scene_graph_id = self._generate_task(task_plan) 251 | 252 | task = { 253 | "question" : question.replace("_", " "), 254 | "answer" : answer.replace("_", " "), 255 | "options" : [o.replace("_", " ") for o in options], 256 | "task_plan" : self._task_plan_to_str(task_plan), 257 | "scene_graph_id": scene_graph_id, 258 | 'image' : Image.open(self.metadata.get_image_path(scene_graph_id)) if return_data else None 259 | } 260 | return task 261 | 262 | 263 | 
class WhatObjectSceneGraphTaskGenerator(SceneGraphTaskGenerator): 264 | schema = { 265 | "task type" : "str", 266 | "object" : "str", 267 | "subgraph" : "str", 268 | "scene graph id": "str", 269 | "answers" : "list", 270 | } 271 | 272 | def enumerate_object_subgraphs(self, scene_graph): 273 | subgraph_to_objects = {} 274 | for object, info in scene_graph["objects"].items(): 275 | obj_name = info['name'] 276 | if self.metadata.check_object_in_category(obj_name): 277 | subgraphs = self._enumerate_subgraphs_w_object(scene_graph, object) 278 | # subgraphs = self.rng.choice(list(subgraphs), min(self.n_subgraph_per_answer, len(subgraphs)), replace=False) 279 | for subgraph in subgraphs: 280 | if subgraph not in subgraph_to_objects: 281 | subgraph_to_objects[subgraph] = set() 282 | subgraph_to_objects[subgraph].add(obj_name) 283 | return subgraph_to_objects 284 | 285 | def enumerate_task_plans(self, task_store: TaskStore): 286 | 287 | for scene_graph_id, scene_graph in tqdm(self.metadata.scene_graphs.items(), desc="enumerating [what object] task"): 288 | 289 | if len(scene_graph["objects"]) < self.max_scene_graph_size: 290 | subgraph_to_nodes = self.enumerate_object_subgraphs(scene_graph) 291 | 292 | for subgraph_str, nodes in subgraph_to_nodes.items(): 293 | answers = sorted(list(nodes)) 294 | for node in nodes: 295 | task_plan = { 296 | "task type" : "what object", 297 | "scene graph id": scene_graph_id, 298 | "subgraph" : subgraph_str, 299 | "object" : node, 300 | "answers" : answers, 301 | } 302 | task_store.add(task_plan) 303 | 304 | def _generate_task(self, task_plan): 305 | obj_reference = compose_object_reference(task_plan["subgraph"]) 306 | subgraph = json.loads(task_plan["subgraph"]) 307 | object = task_plan["object"] 308 | scene_graph_id = task_plan["scene graph id"] 309 | 310 | attributed_name = compose_attributed_name(subgraph.get("attributes", []), "object") 311 | 312 | if obj_reference != "": 313 | obj_reference = f" that {obj_reference}" 314 | question = f"What is the {attributed_name}{obj_reference}?" 
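        # Illustrative example (hypothetical subgraph): with subgraph attributes ["red"] and a
        # composed reference "is on the wooden table", the question renders as
        # "What is the red object that is on the wooden table?".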
315 | 316 | answer = object 317 | exclude_categories = [self.metadata.sg_object_to_cateid[obj] for obj in task_plan["answers"]] 318 | negative_objects = [self.metadata.get_surfacename(cateid) for cateid in self.metadata.get_irrelevant_categories(exclude_categories)] 319 | options = self._compose_options(answer, negative_objects) 320 | 321 | return question, answer, options, scene_graph_id 322 | 323 | 324 | class WhatAttributeSceneGraphTaskGenerator(SceneGraphTaskGenerator): 325 | schema = { 326 | "task type" : "str", 327 | "attribute type" : "str", 328 | "attribute value": "str", 329 | "subgraph" : "str", 330 | "scene graph id" : "str", 331 | "answers" : "list", 332 | } 333 | 334 | def enumerate_attribute_subgraphs(self, scene_graph): 335 | subgraph_to_nodes = {} 336 | for node, info in scene_graph["objects"].items(): 337 | for attr in info['attributes']: 338 | attr_type = self.metadata.get_attribute_type(attr) 339 | subgraphs = self._enumerate_subgraphs_w_object(scene_graph, node, exclude_attribute_type=attr_type) 340 | # subgraphs = self.rng.choice(list(subgraphs), min(self.n_subgraph_per_answer, len(subgraphs)), replace=False) 341 | for subgraph in subgraphs: 342 | if subgraph not in subgraph_to_nodes: 343 | subgraph_to_nodes[subgraph] = {} 344 | if attr_type not in subgraph_to_nodes[subgraph]: 345 | subgraph_to_nodes[subgraph][attr_type] = set() 346 | subgraph_to_nodes[subgraph][attr_type].add(attr) 347 | return subgraph_to_nodes 348 | 349 | def enumerate_task_plans(self, task_store: TaskStore): 350 | for scene_graph_id, scene_graph in tqdm(self.metadata.scene_graphs.items(), desc="enumerating [what attribute] task"): 351 | if len(scene_graph["objects"]) < self.max_scene_graph_size: 352 | 353 | subgraphs_to_attrs = self.enumerate_attribute_subgraphs(scene_graph) 354 | for subgraph_str, attributes in subgraphs_to_attrs.items(): 355 | for attribute_type, attribute_set in attributes.items(): 356 | answers = sorted(list(attribute_set)) 357 | for attribute in attribute_set: 358 | task_plan = { 359 | "task type" : "what attribute", 360 | "scene graph id" : scene_graph_id, 361 | "subgraph" : subgraph_str, 362 | "attribute value": attribute, 363 | "answers" : answers, 364 | "attribute type" : attribute_type 365 | } 366 | task_store.add(task_plan) 367 | 368 | def _generate_task(self, task_plan): 369 | 370 | obj_reference = compose_object_reference(task_plan["subgraph"]) 371 | subgraph = json.loads(task_plan["subgraph"]) 372 | 373 | scene_graph_id = task_plan["scene graph id"] 374 | attribute = task_plan["attribute value"] 375 | attribute_type = task_plan["attribute type"] 376 | 377 | attributed_name = compose_attributed_name(subgraph.get("attributes", []), "object") 378 | 379 | if obj_reference != "": 380 | obj_reference = f" that {obj_reference}" 381 | 382 | attribute_type_word = lambda x: "attribute value" if x == "other" else x 383 | question = f"What is the {attribute_type_word(attribute_type)} of the {attributed_name}{obj_reference}?" 
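        # Illustrative example (hypothetical values): for attribute type "color" the question may
        # read "What is the color of the object that is on the wooden table?"; the catch-all type
        # "other" is rendered as "attribute value" instead, e.g.
        # "What is the attribute value of the object that is on the wooden table?".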
384 | answer = attribute 385 | negative_attributes = list(set(self.metadata.type_to_attribute[attribute_type]) - set(task_plan["answers"])) 386 | options = self._compose_options(answer, negative_attributes) 387 | 388 | return question, answer, options, scene_graph_id 389 | 390 | 391 | class WhatRelationSceneGraphTaskGenerator(SceneGraphTaskGenerator): 392 | schema = { 393 | "task type" : "str", 394 | "relation" : "str", 395 | "source object" : "str", 396 | "target object" : "str", 397 | "source subgraph": "str", 398 | "target subgraph": "str", 399 | "scene graph id" : "str", 400 | "answers" : "list" 401 | 402 | } 403 | 404 | def enumerate_relation_subgraphs(self, scene_graph): 405 | subgraph_to_nodes_cnt = {} 406 | for node, info in scene_graph["objects"].items(): 407 | subgraphs = self._enumerate_subgraphs_w_object(scene_graph, node) 408 | for subgraph in subgraphs: 409 | if subgraph not in subgraph_to_nodes_cnt: 410 | subgraph_to_nodes_cnt[subgraph] = 0 411 | subgraph_to_nodes_cnt[subgraph] += 1 412 | 413 | relations = {} 414 | for node, info in scene_graph["objects"].items(): 415 | for rel in info['relations']: 416 | obj2 = rel['object'] 417 | if (node, obj2) not in relations: 418 | relations[(node, obj2)] = set() 419 | relations[(node, obj2)].add(rel['name']) 420 | 421 | subgraph_to_relation = {} 422 | for (obj1, obj2), rels in relations.items(): 423 | 424 | subgraphs1 = self._enumerate_subgraphs_w_object(scene_graph, obj1, exclude_object=[obj2]) 425 | subgraphs1 = [subgraph for subgraph in subgraphs1 if subgraph_to_nodes_cnt[subgraph] == 1] 426 | subgraphs1 = self.rng.choice(list(subgraphs1), min(self.n_subgraph_per_answer, len(subgraphs1)), replace=False) 427 | 428 | subgraphs2 = self._enumerate_subgraphs_w_object(scene_graph, obj2, exclude_object=[obj1]) 429 | subgraphs2 = [subgraph for subgraph in subgraphs2 if subgraph_to_nodes_cnt[subgraph] == 1] 430 | subgraphs2 = self.rng.choice(list(subgraphs2), min(self.n_subgraph_per_answer, len(subgraphs2)), replace=False) 431 | 432 | obj1_name = scene_graph["objects"][obj1]["name"] 433 | obj2_name = scene_graph["objects"][obj2]["name"] 434 | for subgraph1 in subgraphs1: 435 | for subgraph2 in subgraphs2: 436 | subgraph_to_relation[(subgraph1, subgraph2)] = (rels, obj1_name, obj2_name) 437 | 438 | return subgraph_to_relation 439 | 440 | def enumerate_task_plans(self, task_store: TaskStore): 441 | for scene_graph_id, scene_graph in tqdm(self.metadata.scene_graphs.items(), desc="enumerating [what relation] task"): 442 | if len(scene_graph["objects"]) < self.max_scene_graph_size: 443 | subgraphs_to_rels = self.enumerate_relation_subgraphs(scene_graph) 444 | for subgraph, (rels, obj1, obj2) in subgraphs_to_rels.items(): 445 | answers = sorted(list(rels)) 446 | for rel in rels: 447 | task_plan = { 448 | "task type" : "what relation", 449 | "relation" : rel, 450 | "source object" : obj1, 451 | "target object" : obj2, 452 | "scene graph id" : scene_graph_id, 453 | "source subgraph": subgraph[0], 454 | "target subgraph": subgraph[1], 455 | "answers" : answers, 456 | } 457 | task_store.add(task_plan) 458 | 459 | def _generate_task(self, task_plan): 460 | source_obj_reference = compose_object_reference(task_plan["source subgraph"]) 461 | target_obj_reference = compose_object_reference(task_plan["target subgraph"]) 462 | 463 | source_subgraph = json.loads(task_plan["source subgraph"]) 464 | target_subgraph = json.loads(task_plan["target subgraph"]) 465 | relation = task_plan["relation"] 466 | scene_graph_id = task_plan["scene graph id"] 467 | 468 | 
source_attributed_name = compose_attributed_name(source_subgraph.get("attributes", []), "object") 469 | target_attributed_name = compose_attributed_name(target_subgraph.get("attributes", []), "object") 470 | 471 | if source_obj_reference != "": 472 | source_obj_reference = f", which {source_obj_reference}" 473 | if target_obj_reference != "": 474 | target_obj_reference = f", which {target_obj_reference}" 475 | 476 | question = f"What is the relation from the {source_attributed_name}{source_obj_reference}, to the {target_attributed_name}{target_obj_reference}?" 477 | answer = relation 478 | negatives = list(set(self.metadata.relations) - set(task_plan["answers"])) 479 | options = self._compose_options(answer, negatives) 480 | 481 | return question, answer, options, scene_graph_id 482 | -------------------------------------------------------------------------------- /tma/imageqa/tabletop_3d/size_single_image_task.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | 3 | from .single_image_task import _3DGridTaskGenerator 4 | from .utils import relative_positions, reverse_relative_positions 5 | from ..metadata import Objaverse3DMetaData 6 | from ...constant import NUM_OPTIONS 7 | from ...task_store import TaskStore 8 | 9 | largest_size = 1.5 10 | smallest_size = 0.5 11 | all_size_options = set([0.5, 1.0, 1.5]) 12 | grid_options = [2] 13 | 14 | 15 | class Size3DGridTaskGenerator(_3DGridTaskGenerator): 16 | def __init__(self, metadata: Objaverse3DMetaData, seed=42): 17 | super().__init__(metadata, seed=seed) 18 | self.grid_options = grid_options 19 | 20 | def _make_image_metadata(self, grid_size, sizes, size_options, grids, queries, remaining_query=...): 21 | objects = [self.metadata.sample(self.rng, 1, "object", q) for q in queries] 22 | 23 | remaining_grids = [g for g in range(grid_size ** 2) if g not in grids] 24 | for _ in remaining_grids: 25 | uid = self.metadata.sample(self.rng, 1, "object", remaining_query) 26 | objects.append(uid) 27 | remaining_sizes = list(self.rng.choice(size_options, replace=True, size=len(remaining_grids))) 28 | 29 | object_path = {k: self.metadata.get_object_path(k) for k in objects} 30 | angles = [self.metadata.sample_object_angle(self.rng, obj) for obj in objects] 31 | 32 | image_metadata = { 33 | 'grid number' : grid_size, 34 | 'objects' : objects, 35 | 'object_path' : object_path, 36 | 'object_angles' : angles, 37 | 'grids' : grids + remaining_grids, 38 | 'blender_config': self.metadata.sample_blender_configuration(self.rng), 39 | 'sizes' : sizes + remaining_sizes 40 | } 41 | return image_metadata 42 | 43 | 44 | class WhatSize3DGridTaskGenerator(Size3DGridTaskGenerator): 45 | schema = { 46 | 'task type' : 'str', 47 | 'size' : 'str', 48 | 'grid number' : 'int', 49 | 'target category' : 'str', 50 | 'absolute position': 'str', 51 | 'attribute type' : 'str', 52 | 'attribute value' : 'str', 53 | } 54 | 55 | def enumerate_task_plans(self, task_store: TaskStore): 56 | for target_category in tqdm(self.metadata.categories, desc="enumerating [what size] task"): 57 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 58 | for attribute_type, attribute_values in attribute_dict.items(): 59 | for attribute_value in attribute_values: 60 | for grid_size in self.grid_options: 61 | for absolute_pos in self.grid_mappings[grid_size]: 62 | task_plan = { 63 | 'task type' : 'what size', 64 | 'size' : 'largest', 65 | 'grid number' : grid_size, 66 | 'target category' : target_category, 67 | 'absolute 
position': absolute_pos, 68 | 'attribute type' : attribute_type, 69 | 'attribute value' : attribute_value, 70 | } 71 | task_store.add(task_plan) 72 | 73 | task_plan = { 74 | 'task type' : 'what size', 75 | 'size' : 'smallest', 76 | 'grid number' : grid_size, 77 | 'target category' : target_category, 78 | 'absolute position': absolute_pos, 79 | 'attribute type' : attribute_type, 80 | 'attribute value' : attribute_value, 81 | } 82 | task_store.add(task_plan) 83 | 84 | for grid_size in self.grid_options: 85 | for absolute_pos in self.grid_mappings[grid_size]: 86 | task_plan = { 87 | 'task type' : 'what size', 88 | 'size' : 'largest', 89 | 'grid number' : grid_size, 90 | 'target category' : target_category, 91 | 'absolute position': absolute_pos, 92 | } 93 | task_store.add(task_plan) 94 | 95 | task_plan = { 96 | 'task type' : 'what size', 97 | 'size' : 'smallest', 98 | 'grid number' : grid_size, 99 | 'target category' : target_category, 100 | 'absolute position': absolute_pos, 101 | } 102 | task_store.add(task_plan) 103 | 104 | def _generate_task(self, task_plan): 105 | grid_size = task_plan['grid number'] 106 | target_category = task_plan['target category'] 107 | absolute_pos = task_plan['absolute position'] 108 | grids = [self.grid_mappings[grid_size][absolute_pos]] 109 | 110 | if task_plan['size'] == 'largest': 111 | sizes = [largest_size] 112 | size_options = list(all_size_options - {largest_size}) 113 | question = f"What is the largest object in the image?" 114 | else: 115 | sizes = [smallest_size] 116 | size_options = list(all_size_options - {smallest_size}) 117 | question = f"What is the smallest object in the image?" 118 | 119 | queries = [self._get_target_object_query(task_plan)] 120 | 121 | remaining_query = self.metadata.and_query([("category", target_category, False)]) 122 | 123 | image_metadata = self._make_image_metadata( 124 | grid_size, 125 | sizes, 126 | size_options, 127 | grids, 128 | queries=queries, 129 | remaining_query=remaining_query, 130 | ) 131 | 132 | answer = self.metadata.get_surfacename(target_category) 133 | negatives = [self.metadata.get_surfacename(self.metadata.sample_category_for_object(self.rng, o, target_category)) 134 | for o in image_metadata['objects'][1:]] 135 | options = self._compose_options(answer, negatives) 136 | 137 | return question, answer, options, image_metadata 138 | 139 | 140 | class WhatAttributeSize3DGridTaskGenerator(Size3DGridTaskGenerator): 141 | schema = { 142 | 'task type' : 'str', 143 | 'size' : 'str', 144 | 'grid number' : 'int', 145 | 'target category' : 'str', 146 | 'absolute position': 'str', 147 | 'attribute type' : 'str', 148 | 'attribute value' : 'str', 149 | } 150 | 151 | def enumerate_task_plans(self, task_store: TaskStore): 152 | for target_category in tqdm(self.metadata.categories, desc="enumerating [what size attribute] task"): 153 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 154 | for attribute_type, attribute_values in attribute_dict.items(): 155 | for attribute_value in attribute_values: 156 | for grid_size in self.grid_options: 157 | for absolute_pos in self.grid_mappings[grid_size]: 158 | task_plan = { 159 | 'task type' : 'what attribute size', 160 | 'size' : 'largest', 161 | 'grid number' : grid_size, 162 | 'target category' : target_category, 163 | 'absolute position': absolute_pos, 164 | 'attribute type' : attribute_type, 165 | 'attribute value' : attribute_value, 166 | } 167 | task_store.add(task_plan) 168 | 169 | task_plan = { 170 | 'task type' : 'what attribute size', 171 | 
'size' : 'smallest', 172 | 'grid number' : grid_size, 173 | 'target category' : target_category, 174 | 'absolute position': absolute_pos, 175 | 'attribute type' : attribute_type, 176 | 'attribute value' : attribute_value, 177 | } 178 | task_store.add(task_plan) 179 | 180 | def _generate_task(self, task_plan): 181 | grid_size = task_plan['grid number'] 182 | 183 | attribute_type = task_plan['attribute type'] 184 | 185 | absolute_pos = task_plan['absolute position'] 186 | grids = [self.grid_mappings[grid_size][absolute_pos]] 187 | 188 | queries = [self._get_target_object_query(task_plan)] 189 | 190 | if task_plan['size'] == 'largest': 191 | sizes = [largest_size] 192 | size_options = list(all_size_options - {largest_size}) 193 | question = f"What is the {attribute_type} of the largest object in the image?" 194 | else: 195 | sizes = [smallest_size] 196 | size_options = list(all_size_options - {smallest_size}) 197 | question = f"What is the {attribute_type} of the smallest object in the image?" 198 | 199 | image_metadata = self._make_image_metadata( 200 | grid_size, 201 | sizes, 202 | size_options, 203 | grids, 204 | queries=queries, 205 | ) 206 | 207 | answer = task_plan['attribute value'] 208 | target_object = image_metadata['objects'][0] 209 | negative_query = self.metadata.and_query([ 210 | (attribute_type, a, False) for a in self.metadata.query_metadata(attribute_type, self.metadata.and_query([("object", target_object, True)])) 211 | ]) 212 | negatives = self.metadata.sample( 213 | self.rng, 214 | NUM_OPTIONS - 1, 215 | attribute_type, 216 | query=negative_query, 217 | ) 218 | options = [answer] + negatives 219 | 220 | return question, answer, options, image_metadata 221 | 222 | 223 | class WhereSize3DGridTaskGenerator(Size3DGridTaskGenerator): 224 | schema = { 225 | 'task type' : 'str', 226 | 'size' : 'str', 227 | 'grid number' : 'int', 228 | 'target category' : 'str', 229 | 'absolute position' : 'str', 230 | 'attribute type' : 'str', 231 | 'attribute value' : 'str', 232 | 'reference category' : 'str', 233 | 'reference position' : 'str', 234 | 'target-reference order': 'str' 235 | } 236 | 237 | def __init__(self, metadata: Objaverse3DMetaData, seed=42): 238 | super().__init__(metadata, seed=seed) 239 | self.relative_positions = relative_positions 240 | 241 | def enumerate_task_plans(self, task_store: TaskStore): 242 | for target_category in tqdm(self.metadata.categories, desc="enumerating [where size] task"): 243 | irrelevant_categories = self.metadata.get_irrelevant_categories(target_category) 244 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 245 | for attribute_type, attribute_values in attribute_dict.items(): 246 | for attribute_value in attribute_values: 247 | for grid_size in self.grid_options: 248 | for absolute_pos in self.grid_mappings[grid_size]: 249 | task_plan = { 250 | 'task type' : 'where size', 251 | 'size' : 'largest', 252 | 'grid number' : grid_size, 253 | 'target category' : target_category, 254 | 'absolute position': absolute_pos, 255 | 'attribute type' : attribute_type, 256 | 'attribute value' : attribute_value, 257 | } 258 | task_store.add(task_plan) 259 | 260 | task_plan = { 261 | 'task type' : 'where size', 262 | 'size' : 'smallest', 263 | 'grid number' : grid_size, 264 | 'target category' : target_category, 265 | 'absolute position': absolute_pos, 266 | 'attribute type' : attribute_type, 267 | 'attribute value' : attribute_value, 268 | } 269 | task_store.add(task_plan) 270 | 271 | grid = self.grid_mappings[grid_size][absolute_pos] 
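# For every irrelevant reference category and every relative position that stays on the
# board, the loop below emits four plans: {largest, smallest} x {target first, reference first}.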
272 | 273 | for reference_category in irrelevant_categories: 274 | for reference_pos in self.relative_positions: 275 | ref_grid = self._relative_grid(grid_size, grid, reference_pos) 276 | if ref_grid >= 0: 277 | task_plan = { 278 | 'task type' : 'where size', 279 | 'size' : 'largest', 280 | 'grid number' : grid_size, 281 | 'target category' : target_category, 282 | 'absolute position' : absolute_pos, 283 | 'reference category' : reference_category, 284 | 'reference position' : reference_pos, 285 | 'attribute type' : attribute_type, 286 | 'attribute value' : attribute_value, 287 | 'target-reference order': 'target first' 288 | } 289 | task_store.add(task_plan) 290 | 291 | task_plan = { 292 | 'task type' : 'where size', 293 | 'size' : 'largest', 294 | 'grid number' : grid_size, 295 | 'target category' : target_category, 296 | 'absolute position' : absolute_pos, 297 | 'reference category' : reference_category, 298 | 'reference position' : reference_pos, 299 | 'attribute type' : attribute_type, 300 | 'attribute value' : attribute_value, 301 | 'target-reference order': 'reference first' 302 | } 303 | task_store.add(task_plan) 304 | 305 | task_plan = { 306 | 'task type' : 'where size', 307 | 'size' : 'smallest', 308 | 'grid number' : grid_size, 309 | 'target category' : target_category, 310 | 'absolute position' : absolute_pos, 311 | 'reference category' : reference_category, 312 | 'reference position' : reference_pos, 313 | 'attribute type' : attribute_type, 314 | 'attribute value' : attribute_value, 315 | 'target-reference order': 'target first' 316 | } 317 | task_store.add(task_plan) 318 | 319 | task_plan = { 320 | 'task type' : 'where size', 321 | 'size' : 'smallest', 322 | 'grid number' : grid_size, 323 | 'target category' : target_category, 324 | 'absolute position' : absolute_pos, 325 | 'reference category' : reference_category, 326 | 'reference position' : reference_pos, 327 | 'attribute type' : attribute_type, 328 | 'attribute value' : attribute_value, 329 | 'target-reference order': 'reference first' 330 | } 331 | task_store.add(task_plan) 332 | 333 | for grid_size in self.grid_options: 334 | for absolute_pos in self.grid_mappings[grid_size]: 335 | task_plan = { 336 | 'task type' : 'where size', 337 | 'size' : 'largest', 338 | 'grid number' : grid_size, 339 | 'target category' : target_category, 340 | 'absolute position': absolute_pos, 341 | } 342 | task_store.add(task_plan) 343 | 344 | task_plan = { 345 | 'task type' : 'where size', 346 | 'size' : 'smallest', 347 | 'grid number' : grid_size, 348 | 'target category' : target_category, 349 | 'absolute position': absolute_pos, 350 | } 351 | task_store.add(task_plan) 352 | 353 | grid = self.grid_mappings[grid_size][absolute_pos] 354 | 355 | for reference_category in irrelevant_categories: 356 | for reference_pos in self.relative_positions: 357 | ref_grid = self._relative_grid(grid_size, grid, reference_pos) 358 | if ref_grid >= 0: 359 | task_plan = { 360 | 'task type' : 'where size', 361 | 'size' : 'largest', 362 | 'grid number' : grid_size, 363 | 'target category' : target_category, 364 | 'absolute position' : absolute_pos, 365 | 'reference category' : reference_category, 366 | 'reference position' : reference_pos, 367 | 'target-reference order': 'target first' 368 | } 369 | task_store.add(task_plan) 370 | 371 | task_plan = { 372 | 'task type' : 'where size', 373 | 'size' : 'largest', 374 | 'grid number' : grid_size, 375 | 'target category' : target_category, 376 | 'absolute position' : absolute_pos, 377 | 'reference category' : 
reference_category, 378 | 'reference position' : reference_pos, 379 | 'target-reference order': 'reference first' 380 | } 381 | task_store.add(task_plan) 382 | 383 | task_plan = { 384 | 'task type' : 'where size', 385 | 'size' : 'smallest', 386 | 'grid number' : grid_size, 387 | 'target category' : target_category, 388 | 'absolute position' : absolute_pos, 389 | 'reference category' : reference_category, 390 | 'reference position' : reference_pos, 391 | 'target-reference order': 'target first' 392 | } 393 | task_store.add(task_plan) 394 | 395 | task_plan = { 396 | 'task type' : 'where size', 397 | 'size' : 'smallest', 398 | 'grid number' : grid_size, 399 | 'target category' : target_category, 400 | 'absolute position' : absolute_pos, 401 | 'reference category' : reference_category, 402 | 'reference position' : reference_pos, 403 | 'target-reference order': 'reference first' 404 | } 405 | task_store.add(task_plan) 406 | 407 | def _generate_task(self, task_plan): 408 | grid_size = task_plan['grid number'] 409 | 410 | target_category = task_plan['target category'] 411 | categories = [target_category] 412 | queries = [self._get_target_object_query(task_plan)] 413 | absolute_pos = task_plan['absolute position'] 414 | grids = [self.grid_mappings[grid_size][absolute_pos]] 415 | 416 | if 'reference category' in task_plan: 417 | reference_pos = task_plan['reference position'] 418 | reference_category = task_plan['reference category'] 419 | categories.append(reference_category) 420 | queries.append(self.metadata.and_query([("category", reference_category, True)])) 421 | 422 | ref_grid = self._relative_grid(grid_size, grids[0], reference_pos) 423 | assert ref_grid >= 0, "reference grid {} not allowed".format(ref_grid) 424 | grids.append(ref_grid) 425 | 426 | if task_plan['target-reference order'] == 'target first': 427 | if task_plan['size'] == 'largest': 428 | question = f"Where is the largest object in the image with respect to the {self.metadata.get_surfacename(reference_category)}?" 429 | else: 430 | question = f"Where is the smallest object in the image with respect to the {self.metadata.get_surfacename(reference_category)}?" 431 | answer = reference_pos 432 | else: 433 | if task_plan['size'] == 'largest': 434 | question = f"Where is the {self.metadata.get_surfacename(reference_category)} with respect to the largest object in the image?" 435 | else: 436 | question = f"Where is the {self.metadata.get_surfacename(reference_category)} with respect to the smallest object in the image?" 437 | answer = reverse_relative_positions[reference_pos] 438 | negatives = [o for o in self.relative_positions if o != answer] 439 | else: 440 | if task_plan['size'] == 'largest': 441 | question = f"Where is the largest object in the image?" 442 | else: 443 | question = f"Where is the smallest object in the image?" 
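# Without a reference object, the expected answer is the target's absolute grid position,
# and the other position names for this grid size serve as negatives.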
444 | answer = absolute_pos 445 | negatives = [o for o in self.grid_mappings[grid_size].keys() if o != answer] 446 | 447 | if task_plan['size'] == 'largest': 448 | sizes = [largest_size] 449 | size_options = list(all_size_options - {largest_size}) 450 | else: 451 | sizes = [smallest_size] 452 | size_options = list(all_size_options - {smallest_size}) 453 | sizes += list(self.rng.choice(size_options, replace=True, size=1)) 454 | 455 | options = self._compose_options(answer, negatives) 456 | image_metadata = self._make_image_metadata( 457 | grid_size, 458 | sizes, 459 | size_options, 460 | grids, 461 | queries=queries, 462 | remaining_query=self.metadata.and_query([("category", c, False) for c in categories]) 463 | ) 464 | 465 | return question, answer, options, image_metadata 466 | -------------------------------------------------------------------------------- /tma/imageqa/tabletop_3d/distance_single_image_task.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | 3 | from .single_image_task import _3DGridTaskGenerator 4 | from .utils import relative_positions 5 | from ..metadata import Objaverse3DMetaData 6 | from ...constant import NUM_OPTIONS 7 | from ...task_store import TaskStore 8 | 9 | grid_options = [3] 10 | 11 | relative_distance = { 12 | '0': [[1, 3], [4], [2, 6], [5, 7], [8]], 13 | '1': [[0, 4, 2], [3, 5], [7], [6, 8]], 14 | '2': [[1, 5], [4], [0, 8], [3, 7], [6]], 15 | '3': [[0, 4, 6], [1, 7], [5], [2, 8]], 16 | '4': [[1, 3, 5, 7], [0, 2, 6, 8]], 17 | '5': [[2, 4, 8], [1, 7], [3], [0, 6]], 18 | '6': [[3, 7], [4], [0, 8], [1, 5], [2]], 19 | '7': [[4, 6, 8], [3, 5], [1], [2, 0]], 20 | '8': [[5, 7], [4], [2, 6], [1, 3], [0]], 21 | } 22 | 23 | 24 | def _get_relative_distance_level(ref, target): 25 | for idx, level in enumerate(relative_distance[str(ref)]): 26 | if target in level: 27 | return idx 28 | 29 | 30 | def _get_max_distance_level(ref): 31 | return len(relative_distance[str(ref)]) - 1 32 | 33 | 34 | def _get_farther_grids(ref, target): 35 | ref_level = _get_relative_distance_level(ref, target) 36 | farther_grids = [] 37 | for level in relative_distance[str(ref)][ref_level + 1:]: 38 | farther_grids.extend(level) 39 | return farther_grids 40 | 41 | 42 | def _get_closer_grids(ref, target): 43 | ref_level = _get_relative_distance_level(ref, target) 44 | closer_grids = [] 45 | for level in relative_distance[str(ref)][:ref_level]: 46 | closer_grids.extend(level) 47 | return closer_grids 48 | 49 | 50 | class Distance3DGridTaskGenerator(_3DGridTaskGenerator): 51 | 52 | def __init__(self, metadata: Objaverse3DMetaData, max_num_distracting_object=2, seed=42): 53 | super().__init__(metadata, seed=seed) 54 | self.grid_options = grid_options 55 | self.max_num_distracting_object = max_num_distracting_object 56 | 57 | def _make_image_metadata(self, grid_size, distance_type, grids, queries, remaining_query=...): 58 | target_grid = grids[0] 59 | ref_grid = grids[1] 60 | objects = [self.metadata.sample(self.rng, 1, "object", q) for q in queries] 61 | if distance_type == 'farthest': 62 | possible_closer_grids = _get_closer_grids(ref_grid, target_grid) 63 | remaining_grids = self.rng.choice(possible_closer_grids, replace=False, size=min(self.max_num_distracting_object, len(possible_closer_grids))) 64 | else: 65 | possible_farther_grids = _get_farther_grids(ref_grid, target_grid) 66 | remaining_grids = self.rng.choice(possible_farther_grids, replace=False, size=min(self.max_num_distracting_object, len(possible_farther_grids))) 67 | 68 
| remaining_grids = [int(grid) for grid in remaining_grids] # convert numpy.int64 to int to feed into json 69 | 70 | for _ in remaining_grids: 71 | uid = self.metadata.sample(self.rng, 1, "object", remaining_query) 72 | objects.append(uid) 73 | 74 | object_path = {k: self.metadata.get_object_path(k) for k in objects} 75 | angles = [self.metadata.sample_object_angle(self.rng, obj) for obj in objects] 76 | 77 | image_metadata = { 78 | 'grid number' : grid_size, 79 | 'objects' : objects, 80 | 'object_path' : object_path, 81 | 'object_angles' : angles, 82 | 'grids' : grids + remaining_grids, 83 | 'blender_config': self.metadata.sample_blender_configuration(self.rng), 84 | } 85 | return image_metadata 86 | 87 | 88 | class WhatDistance3DGridTaskGenerator(Distance3DGridTaskGenerator): 89 | schema = { 90 | 'task type' : 'str', 91 | 'distance type' : 'str', 92 | 'grid number' : 'int', 93 | 'target category' : 'str', 94 | 'absolute position' : 'str', 95 | 'attribute type' : 'str', 96 | 'attribute value' : 'str', 97 | 'reference category': 'str', 98 | 'reference position': 'str', 99 | } 100 | 101 | def __init__(self, metadata: Objaverse3DMetaData, seed=42): 102 | super().__init__(metadata, seed=seed) 103 | self.relative_positions = relative_positions 104 | 105 | def enumerate_task_plans(self, task_store: TaskStore): 106 | for target_category in tqdm(self.metadata.categories, desc="enumerating [what distance] task"): 107 | irrelevant_categories = self.metadata.get_irrelevant_categories(target_category) 108 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 109 | for attribute_type, attribute_values in attribute_dict.items(): 110 | for attribute_value in attribute_values: 111 | for grid_size in self.grid_options: 112 | for absolute_pos in self.grid_mappings[grid_size]: 113 | grid = self.grid_mappings[grid_size][absolute_pos] 114 | for reference_category in irrelevant_categories: 115 | for reference_pos in self.relative_positions: 116 | ref_grid = self._relative_grid(grid_size, grid, reference_pos) 117 | if ref_grid >= 0: 118 | if (_get_relative_distance_level(ref_grid, grid) > 0): 119 | task_plan = { 120 | 'task type' : 'what distance', 121 | 'distance type' : 'farthest', 122 | 'grid number' : grid_size, 123 | 'target category' : target_category, 124 | 'absolute position' : absolute_pos, 125 | 'reference category': reference_category, 126 | 'reference position': reference_pos, 127 | 'attribute type' : attribute_type, 128 | 'attribute value' : attribute_value, 129 | } 130 | task_store.add(task_plan) 131 | if (_get_relative_distance_level(ref_grid, grid) < _get_max_distance_level(ref_grid)): 132 | task_plan = { 133 | 'task type' : 'what distance', 134 | 'distance type' : 'closest', 135 | 'grid number' : grid_size, 136 | 'target category' : target_category, 137 | 'absolute position' : absolute_pos, 138 | 'reference category': reference_category, 139 | 'reference position': reference_pos, 140 | 'attribute type' : attribute_type, 141 | 'attribute value' : attribute_value, 142 | } 143 | task_store.add(task_plan) 144 | 145 | for grid_size in self.grid_options: 146 | for absolute_pos in self.grid_mappings[grid_size]: 147 | grid = self.grid_mappings[grid_size][absolute_pos] 148 | for reference_category in irrelevant_categories: 149 | for reference_pos in self.relative_positions: 150 | ref_grid = self._relative_grid(grid_size, grid, reference_pos) 151 | if ref_grid >= 0: 152 | if (_get_relative_distance_level(ref_grid, grid) > 0): 153 | task_plan = { 154 | 'task type' : 'what 
distance', 155 | 'distance type' : 'farthest', 156 | 'grid number' : grid_size, 157 | 'target category' : target_category, 158 | 'absolute position' : absolute_pos, 159 | 'reference category': reference_category, 160 | 'reference position': reference_pos, 161 | } 162 | task_store.add(task_plan) 163 | if (_get_relative_distance_level(ref_grid, grid) < _get_max_distance_level(ref_grid)): 164 | task_plan = { 165 | 'task type' : 'what distance', 166 | 'distance type' : 'closest', 167 | 'grid number' : grid_size, 168 | 'target category' : target_category, 169 | 'absolute position' : absolute_pos, 170 | 'reference category': reference_category, 171 | 'reference position': reference_pos, 172 | } 173 | task_store.add(task_plan) 174 | 175 | def _generate_task(self, task_plan): 176 | grid_size = task_plan['grid number'] 177 | 178 | target_category = task_plan['target category'] 179 | absolute_pos = task_plan['absolute position'] 180 | grids = [self.grid_mappings[grid_size][absolute_pos]] 181 | queries = [self._get_target_object_query(task_plan)] 182 | 183 | remaining_query = [("category", target_category, False)] 184 | 185 | reference_pos = task_plan['reference position'] 186 | reference_category = task_plan['reference category'] 187 | 188 | queries.append(self.metadata.and_query([("category", reference_category, True)])) 189 | remaining_query += [("category", reference_category, False)] 190 | 191 | ref_grid = self._relative_grid(grid_size, grids[0], reference_pos) 192 | assert ref_grid >= 0, "reference grid {} not allowed".format(ref_grid) 193 | grids.append(ref_grid) 194 | 195 | if task_plan['distance type'] == 'farthest': 196 | question = f"What is the object that is farthest from the {self.metadata.get_surfacename(reference_category)}?" 197 | else: 198 | question = f"What is the object that is closest to the {self.metadata.get_surfacename(reference_category)}?" 
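# _make_image_metadata places distractors strictly closer (for 'farthest') or strictly
# farther (for 'closest') than the target, so the correct answer remains unambiguous.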
199 | 200 | image_metadata = self._make_image_metadata( 201 | grid_size, 202 | distance_type=task_plan['distance type'], 203 | grids=grids, 204 | queries=queries, 205 | remaining_query=self.metadata.and_query(remaining_query) 206 | ) 207 | 208 | answer = self.metadata.get_surfacename(target_category) 209 | negatives = [self.metadata.get_surfacename(self.metadata.sample_category_for_object(self.rng, o, target_category)) 210 | for o in image_metadata['objects'][1:]] 211 | options = self._compose_options(answer, negatives) 212 | 213 | return question, answer, options, image_metadata 214 | 215 | 216 | class WhatAttributeDistance3DGridTaskGenerator(Distance3DGridTaskGenerator): 217 | schema = { 218 | 'task type' : 'str', 219 | 'distance type' : 'str', 220 | 'grid number' : 'int', 221 | 'target category' : 'str', 222 | 'absolute position' : 'str', 223 | 'attribute type' : 'str', 224 | 'attribute value' : 'str', 225 | 'reference category': 'str', 226 | 'reference position': 'str', 227 | } 228 | 229 | def __init__(self, metadata: Objaverse3DMetaData, seed=42): 230 | super().__init__(metadata, seed=seed) 231 | self.relative_positions = relative_positions 232 | 233 | def enumerate_task_plans(self, task_store: TaskStore): 234 | for target_category in tqdm(self.metadata.categories, desc="enumerating [what attribute distance] task"): 235 | irrelevant_categories = self.metadata.get_irrelevant_categories(target_category) 236 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 237 | for attribute_type, attribute_values in attribute_dict.items(): 238 | for attribute_value in attribute_values: 239 | for grid_size in self.grid_options: 240 | for absolute_pos in self.grid_mappings[grid_size]: 241 | grid = self.grid_mappings[grid_size][absolute_pos] 242 | for reference_category in irrelevant_categories: 243 | for reference_pos in self.relative_positions: 244 | ref_grid = self._relative_grid(grid_size, grid, reference_pos) 245 | if ref_grid >= 0: 246 | if (_get_relative_distance_level(ref_grid, grid) > 0): 247 | task_plan = { 248 | 'task type' : 'what attribute distance', 249 | 'distance type' : 'farthest', 250 | 'grid number' : grid_size, 251 | 'target category' : target_category, 252 | 'absolute position' : absolute_pos, 253 | 'reference category': reference_category, 254 | 'reference position': reference_pos, 255 | 'attribute type' : attribute_type, 256 | 'attribute value' : attribute_value, 257 | } 258 | task_store.add(task_plan) 259 | if (_get_relative_distance_level(ref_grid, grid) < _get_max_distance_level(ref_grid)): 260 | task_plan = { 261 | 'task type' : 'what attribute distance', 262 | 'distance type' : 'closest', 263 | 'grid number' : grid_size, 264 | 'target category' : target_category, 265 | 'absolute position' : absolute_pos, 266 | 'reference category': reference_category, 267 | 'reference position': reference_pos, 268 | 'attribute type' : attribute_type, 269 | 'attribute value' : attribute_value, 270 | } 271 | task_store.add(task_plan) 272 | 273 | def _generate_task(self, task_plan): 274 | grid_size = task_plan['grid number'] 275 | 276 | attribute_type = task_plan['attribute type'] 277 | 278 | absolute_pos = task_plan['absolute position'] 279 | grids = [self.grid_mappings[grid_size][absolute_pos]] 280 | 281 | queries = [self._get_target_object_query(task_plan)] 282 | 283 | reference_pos = task_plan['reference position'] 284 | reference_category = task_plan['reference category'] 285 | queries.append(self.metadata.and_query([("category", reference_category, True)])) 286 | 
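# Resolve the reference grid from the target grid and the relative position; plans are
# only enumerated when this grid exists, so the assert below is a pure sanity check.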
287 | ref_grid = self._relative_grid(grid_size, grids[0], reference_pos) 288 | assert ref_grid >= 0, "reference grid {} not allowed".format(ref_grid) 289 | 290 | grids.append(ref_grid) 291 | if task_plan['distance type'] == 'farthest': 292 | question = f"What is the {attribute_type} of the object that is farthest from the {self.metadata.get_surfacename(reference_category)}?" 293 | else: 294 | question = f"What is the {attribute_type} of the object that is closest to the {self.metadata.get_surfacename(reference_category)}?" 295 | 296 | image_metadata = self._make_image_metadata( 297 | grid_size, 298 | distance_type=task_plan['distance type'], 299 | grids=grids, 300 | queries=queries, 301 | ) 302 | 303 | answer = task_plan['attribute value'] 304 | target_object = image_metadata['objects'][0] 305 | negative_query = self.metadata.and_query([ 306 | (attribute_type, a, False) for a in self.metadata.query_metadata(attribute_type, self.metadata.and_query([("object", target_object, True)])) 307 | ]) 308 | negatives = self.metadata.sample( 309 | self.rng, 310 | NUM_OPTIONS - 1, 311 | attribute_type, 312 | query=negative_query, 313 | ) 314 | options = [answer] + negatives 315 | 316 | return question, answer, options, image_metadata 317 | 318 | 319 | class WhereDistance3DGridTaskGenerator(Distance3DGridTaskGenerator): 320 | schema = { 321 | 'task type' : 'str', 322 | 'distance type' : 'str', 323 | 'grid number' : 'int', 324 | 'target category' : 'str', 325 | 'absolute position' : 'str', 326 | 'attribute type' : 'str', 327 | 'attribute value' : 'str', 328 | 'reference category': 'str', 329 | 'reference position': 'str', 330 | } 331 | 332 | def __init__(self, metadata: Objaverse3DMetaData, seed=42): 333 | super().__init__(metadata, seed=seed) 334 | self.relative_positions = relative_positions 335 | 336 | def enumerate_task_plans(self, task_store: TaskStore): 337 | for target_category in tqdm(self.metadata.categories, desc="enumerating [where distance] task"): 338 | irrelevant_categories = self.metadata.get_irrelevant_categories(target_category) 339 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 340 | for attribute_type, attribute_values in attribute_dict.items(): 341 | for attribute_value in attribute_values: 342 | for grid_size in self.grid_options: 343 | for absolute_pos in self.grid_mappings[grid_size]: 344 | grid = self.grid_mappings[grid_size][absolute_pos] 345 | for reference_category in irrelevant_categories: 346 | for reference_pos in self.relative_positions: 347 | ref_grid = self._relative_grid(grid_size, grid, reference_pos) 348 | if ref_grid >= 0: 349 | if (_get_relative_distance_level(ref_grid, grid) > 0): 350 | task_plan = { 351 | 'task type' : 'where distance', 352 | 'distance type' : 'farthest', 353 | 'grid number' : grid_size, 354 | 'target category' : target_category, 355 | 'absolute position' : absolute_pos, 356 | 'reference category': reference_category, 357 | 'reference position': reference_pos, 358 | 'attribute type' : attribute_type, 359 | 'attribute value' : attribute_value, 360 | } 361 | task_store.add(task_plan) 362 | if (_get_relative_distance_level(ref_grid, grid) < _get_max_distance_level(ref_grid)): 363 | task_plan = { 364 | 'task type' : 'where distance', 365 | 'distance type' : 'closest', 366 | 'grid number' : grid_size, 367 | 'target category' : target_category, 368 | 'absolute position' : absolute_pos, 369 | 'reference category': reference_category, 370 | 'reference position': reference_pos, 371 | 'attribute type' : attribute_type, 372 | 
'attribute value' : attribute_value, 373 | } 374 | task_store.add(task_plan) 375 | 376 | for grid_size in self.grid_options: 377 | for absolute_pos in self.grid_mappings[grid_size]: 378 | grid = self.grid_mappings[grid_size][absolute_pos] 379 | for reference_category in irrelevant_categories: 380 | for reference_pos in self.relative_positions: 381 | ref_grid = self._relative_grid(grid_size, grid, reference_pos) 382 | if ref_grid >= 0: 383 | if (_get_relative_distance_level(ref_grid, grid) > 0): 384 | task_plan = { 385 | 'task type' : 'where distance', 386 | 'distance type' : 'farthest', 387 | 'grid number' : grid_size, 388 | 'target category' : target_category, 389 | 'absolute position' : absolute_pos, 390 | 'reference category': reference_category, 391 | 'reference position': reference_pos, 392 | } 393 | task_store.add(task_plan) 394 | if (_get_relative_distance_level(ref_grid, grid) < _get_max_distance_level(ref_grid)): 395 | task_plan = { 396 | 'task type' : 'where distance', 397 | 'distance type' : 'closest', 398 | 'grid number' : grid_size, 399 | 'target category' : target_category, 400 | 'absolute position' : absolute_pos, 401 | 'reference category': reference_category, 402 | 'reference position': reference_pos, 403 | } 404 | task_store.add(task_plan) 405 | 406 | def _generate_task(self, task_plan): 407 | grid_size = task_plan['grid number'] 408 | 409 | target_category = task_plan['target category'] 410 | categories = [target_category] 411 | queries = [self._get_target_object_query(task_plan)] 412 | absolute_pos = task_plan['absolute position'] 413 | grids = [self.grid_mappings[grid_size][absolute_pos]] 414 | 415 | reference_pos = task_plan['reference position'] 416 | reference_category = task_plan['reference category'] 417 | categories.append(reference_category) 418 | queries.append(self.metadata.and_query([("category", reference_category, True)])) 419 | 420 | ref_grid = self._relative_grid(grid_size, grids[0], reference_pos) 421 | assert ref_grid >= 0, "reference grid {} not allowed".format(ref_grid) 422 | grids.append(ref_grid) 423 | 424 | if task_plan['distance type'] == 'farthest': 425 | question = f"Where is the object that is farthest from the {self.metadata.get_surfacename(reference_category)} in the image?" 426 | else: 427 | question = f"Where is the object that is closest to the {self.metadata.get_surfacename(reference_category)} in the image?" 
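# As in the size variant, the answer is the target's absolute position; the remaining
# image objects are later sampled from categories other than the target and reference ones.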
428 | answer = absolute_pos 429 | negatives = [o for o in self.grid_mappings[grid_size].keys() if o != answer] 430 | 431 | options = self._compose_options(answer, negatives) 432 | image_metadata = self._make_image_metadata( 433 | grid_size, 434 | distance_type=task_plan['distance type'], 435 | grids=grids, 436 | queries=queries, 437 | remaining_query=self.metadata.and_query([("category", c, False) for c in categories]) 438 | ) 439 | 440 | return question, answer, options, image_metadata 441 | -------------------------------------------------------------------------------- /tma/videoqa/tabletop_3d/run_blender.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import math 4 | import os 5 | import sys 6 | import urllib.request 7 | from math import radians 8 | 9 | try: 10 | import bpy 11 | from mathutils import Vector, Matrix, Quaternion, Euler 12 | except ImportError: 13 | pass 14 | 15 | 16 | def get_exact_frame(current_keyframe_idx, total_num_frames): 17 | return (current_keyframe_idx) * (total_num_frames // 4) + 1 18 | 19 | 20 | def rotate(obj, degree): 21 | """Rotates around the z axis by theta""" 22 | degree = -degree 23 | bpy.ops.object.select_all(action='DESELECT') 24 | obj.select_set(True) 25 | bpy.context.view_layer.objects.active = obj 26 | radian = radians(degree) 27 | bpy.context.object.rotation_mode = 'XYZ' 28 | rot_x, rot_y, rot_z = obj.rotation_euler 29 | obj.rotation_euler = Euler((rot_x, rot_y, rot_z + radian)) 30 | freeze_transformation(obj) 31 | 32 | 33 | def rotate_and_keyframe(obj, degree, frame): 34 | degree = -degree 35 | bpy.ops.object.select_all(action='DESELECT') 36 | obj.select_set(True) 37 | bpy.context.scene.frame_set(frame) 38 | 39 | bpy.context.view_layer.objects.active = obj 40 | bpy.ops.object.origin_set(type='ORIGIN_GEOMETRY') 41 | bpy.context.object.rotation_mode = 'XYZ' 42 | radian = radians(degree) 43 | rot_x, rot_y, rot_z = obj.rotation_euler 44 | obj.rotation_euler = Euler((rot_x, rot_y, rot_z + radian)) 45 | obj.keyframe_insert(data_path="rotation_euler", frame=frame) 46 | bpy.ops.object.select_all(action='DESELECT') 47 | 48 | 49 | def reset_scene(): 50 | # delete everything that isn't part of a camera or a light 51 | bpy.ops.object.select_all(action="SELECT") 52 | for obj in bpy.data.objects: 53 | bpy.data.objects.remove(obj, do_unlink=True) 54 | bpy.ops.ptcache.free_bake_all() 55 | 56 | 57 | def select_hierarchy(obj): 58 | """Recursively select an object and all of its descendants.""" 59 | obj.select_set(True) 60 | for child in obj.children: 61 | select_hierarchy(child) 62 | 63 | 64 | def load_object(object_path: str) -> None: 65 | """Loads a glb model into the scene.""" 66 | bpy.ops.object.select_all(action='DESELECT') 67 | if object_path.endswith(".glb"): 68 | bpy.ops.import_scene.gltf(filepath=object_path, merge_vertices=True) 69 | elif object_path.endswith(".fbx"): 70 | bpy.ops.import_scene.fbx(filepath=object_path) 71 | else: 72 | raise ValueError(f"Unsupported file type: {object_path}") 73 | 74 | base_name = os.path.basename(object_path) 75 | object_name, _ = os.path.splitext(base_name) 76 | bpy.context.view_layer.objects.active.name = object_name 77 | bpy.ops.object.select_all(action='DESELECT') 78 | 79 | obj = bpy.data.objects.get(object_name) 80 | # bpy.context.view_layer.objects.active = obj 81 | select_hierarchy(obj) 82 | bpy.ops.object.transform_apply(location=True, rotation=True, scale=True) 83 | meshes = [obj for obj in bpy.context.selected_objects if obj.type == 
"MESH"] 84 | non_meshes = [obj for obj in bpy.context.selected_objects if obj.type != "MESH"] 85 | bpy.ops.object.select_all(action="DESELECT") 86 | 87 | # delete non-mesh and consolidate 88 | 89 | for obj in non_meshes: 90 | obj.select_set(True) 91 | bpy.ops.object.delete() 92 | bpy.ops.object.select_all(action="DESELECT") 93 | for obj in meshes: 94 | obj.select_set(True) 95 | bpy.context.view_layer.objects.active = meshes[0] 96 | bpy.ops.object.join() 97 | bpy.context.view_layer.objects.active.name = object_name 98 | bpy.ops.object.origin_set(type='GEOMETRY_ORIGIN', center='BOUNDS') 99 | 100 | bpy.ops.object.select_all(action="DESELECT") 101 | 102 | return object_name 103 | 104 | 105 | def scene_meshes(): 106 | for obj in bpy.context.scene.objects.values(): 107 | if isinstance(obj.data, (bpy.types.Mesh)): 108 | yield obj 109 | 110 | 111 | def download_uid(uid_path, save_dir): 112 | return download_object(uid_path, save_dir) 113 | 114 | 115 | def download_object(object_url, save_dir) -> str: 116 | """Download the object and return the path.""" 117 | # uid = uuid.uuid4() 118 | uid = object_url.split("/")[-1].split(".")[0] 119 | tmp_local_path = os.path.join(save_dir, f"{uid}.glb" + ".tmp") 120 | local_path = os.path.join(save_dir, f"{uid}.glb") 121 | # wget the file and put it in local_path 122 | os.makedirs(os.path.dirname(tmp_local_path), exist_ok=True) 123 | urllib.request.urlretrieve(object_url, tmp_local_path) 124 | os.rename(tmp_local_path, local_path) 125 | # get the absolute path 126 | local_path = os.path.abspath(local_path) 127 | return local_path 128 | 129 | 130 | def scene_bbox(single_obj=None, ignore_matrix=False): 131 | bbox_min = (math.inf,) * 3 132 | bbox_max = (-math.inf,) * 3 133 | found = False 134 | for obj in scene_meshes() if single_obj is None else [single_obj]: 135 | found = True 136 | for coord in obj.bound_box: 137 | coord = Vector(coord) 138 | if not ignore_matrix: 139 | coord = obj.matrix_world @ coord 140 | bbox_min = tuple(min(x, y) for x, y in zip(bbox_min, coord)) 141 | bbox_max = tuple(max(x, y) for x, y in zip(bbox_max, coord)) 142 | if not found: 143 | raise RuntimeError("no objects in scene to compute bounding box for") 144 | return Vector(bbox_min), Vector(bbox_max) 145 | 146 | 147 | def scene_root_objects(): 148 | for obj in bpy.context.scene.objects.values(): 149 | if not obj.parent: 150 | yield obj 151 | 152 | 153 | def freeze_transformation(obj): 154 | bpy.context.view_layer.objects.active = obj 155 | obj.select_set(True) 156 | bpy.ops.object.transform_apply(location=True, rotation=True, scale=True) 157 | bpy.ops.object.select_all(action='DESELECT') 158 | 159 | 160 | def scale(obj, scale_factor): 161 | bpy.ops.object.select_all(action='DESELECT') 162 | obj.select_set(True) 163 | bpy.ops.transform.resize(value=(scale_factor, scale_factor, scale_factor)) 164 | bpy.ops.object.select_all(action='DESELECT') 165 | freeze_transformation(obj) 166 | 167 | 168 | def get_3d_dimensions(obj): 169 | # pdb.set_trace() 170 | max_x, max_y, max_z = float("-inf"), float("-inf"), float("-inf") 171 | min_x, min_y, min_z = float("inf"), float("inf"), float("inf") 172 | 173 | for vertex in obj.data.vertices: 174 | v_world = obj.matrix_world @ vertex.co 175 | max_x, max_y, max_z = max(max_x, v_world.x), max(max_y, v_world.y), max(max_z, v_world.z) 176 | min_x, min_y, min_z = min(min_x, v_world.x), min(min_y, v_world.y), min(min_z, v_world.z) 177 | 178 | return (max_x - min_x, max_y - min_y, max_z - min_z) 179 | 180 | 181 | def normalize_object(obj, factor=1.0): 182 | 
max_dimension = max(get_3d_dimensions(obj)) 183 | scale_factor = factor * (1 / max_dimension) 184 | scale(obj, scale_factor) 185 | 186 | 187 | def move_to_xy(obj, x, y): 188 | min_z = float('inf') 189 | for vertex in obj.data.vertices: 190 | z = obj.matrix_world @ vertex.co 191 | min_z = min(min_z, z.z) 192 | obj.location -= Vector((0, 0, min_z)) 193 | freeze_transformation(obj) 194 | 195 | # move location x,y to sampled box center 196 | new_location = Vector((x, y, obj.location[2])) 197 | obj.location = new_location 198 | freeze_transformation(obj) 199 | 200 | 201 | def move_to_xy_at_frame(obj, movement, frame): 202 | # Set the scene to the specific frame 203 | x, y = movement[0], movement[1] 204 | bpy.context.scene.frame_set(frame) 205 | new_location = Vector((x, y, 0)) 206 | obj.location = obj.location + new_location 207 | obj.keyframe_insert(data_path="location", frame=frame) 208 | 209 | 210 | def normalize_scene(): 211 | bbox_min, bbox_max = scene_bbox() 212 | scale = 1 / max(bbox_max - bbox_min) 213 | for obj in scene_root_objects(): 214 | obj.scale = obj.scale * scale 215 | # Apply scale to matrix_world. 216 | bpy.context.view_layer.update() 217 | bbox_min, bbox_max = scene_bbox() 218 | offset = -(bbox_min + bbox_max) / 2 219 | for obj in scene_root_objects(): 220 | obj.matrix_world.translation += offset 221 | bpy.ops.object.select_all(action="DESELECT") 222 | 223 | 224 | def setup_plane_and_background(plane_texture_path, hdri_path): 225 | # load plane 226 | plane_name = load_object(plane_texture_path) 227 | plane = bpy.data.objects.get(plane_name) 228 | scale(plane, 0.5) 229 | 230 | # load light map 231 | print(f"HDRI PATH: {hdri_path}") 232 | bpy.ops.image.open(filepath=hdri_path) 233 | if bpy.data.worlds.get("World") is None: 234 | bpy.data.worlds.new("World") 235 | 236 | bpy.context.scene.world = bpy.data.worlds["World"] 237 | 238 | bpy.context.scene.world.use_nodes = True 239 | tree = bpy.context.scene.world.node_tree 240 | tree.nodes.clear() 241 | 242 | tex_env = tree.nodes.new(type="ShaderNodeTexEnvironment") 243 | tex_env.image = bpy.data.images[hdri_path.split('/')[-1]] # Image name is typically the last part of the path 244 | background = tree.nodes.new(type="ShaderNodeBackground") 245 | output = tree.nodes.new(type="ShaderNodeOutputWorld") 246 | 247 | tree.links.new(tex_env.outputs[0], background.inputs[0]) 248 | tree.links.new(background.outputs[0], output.inputs[0]) 249 | 250 | return plane_texture_path + " " + hdri_path 251 | 252 | 253 | def setup_camera_and_lights( 254 | sun_x, 255 | sun_y, 256 | sun_energy, 257 | key_light_horizontal_angle, 258 | fill_light_horizontal_angle, 259 | key_light_vertical_angle, 260 | fill_light_vertical_angle 261 | ): 262 | # for seeting up the three point lighting, we mostly follow https://courses.cs.washington.edu/courses/cse458/05au/reading/3point_lighting.pdf 263 | # in order to keep lights and camera on the hemisphere pointing to origin, we use a hierarchy of empties 264 | 265 | # create the sun 266 | 267 | bpy.ops.object.light_add(type="SUN") 268 | sun = bpy.context.active_object 269 | sun.rotation_euler = Euler((sun_x, sun_y, 0), "XYZ") 270 | sun.data.energy = sun_energy 271 | 272 | # create global empty 273 | 274 | bpy.ops.object.empty_add(type='PLAIN_AXES', align='WORLD', location=(0, 0, 0), scale=(1, 1, 1)) 275 | x_rot, y_rot, z_rot = radians(90), radians(0), radians(-90) 276 | empty = bpy.context.scene.objects.get("Empty") 277 | 278 | # create camera 279 | 280 | # radius = random.uniform(1.8,2.2) 281 | radius = 2.5 282 | 283 | 
bpy.ops.object.camera_add(enter_editmode=False, align='VIEW', location=(-radius, 0, 0), rotation=Euler((x_rot, y_rot, z_rot), "XYZ"), scale=(1, 1, 1)) 284 | cam = bpy.context.scene.objects.get("Camera") 285 | cam.data.lens = 35 286 | cam.data.sensor_width = 32 287 | bpy.context.scene.camera = cam 288 | 289 | # create camera empty 290 | 291 | bpy.ops.object.empty_add(type='PLAIN_AXES', align='WORLD', location=(0, 0, 0), scale=(1, 1, 1)) 292 | x_rot, y_rot, z_rot = radians(90), radians(0), radians(-90) 293 | cam_empty = bpy.context.scene.objects.get("Empty.001") 294 | cam_empty.name = "camera_empty" 295 | 296 | # make camera empty parent of camera 297 | 298 | bpy.ops.object.select_all(action='DESELECT') 299 | cam.select_set(True) 300 | cam_empty.select_set(True) 301 | bpy.context.view_layer.objects.active = cam_empty 302 | bpy.ops.object.parent_set() 303 | bpy.ops.object.select_all(action='DESELECT') 304 | 305 | # make camera empty parent of global empty 306 | 307 | bpy.ops.object.select_all(action='DESELECT') 308 | cam_empty.select_set(True) 309 | empty.select_set(True) 310 | bpy.context.view_layer.objects.active = empty 311 | bpy.ops.object.parent_set() 312 | bpy.ops.object.select_all(action='DESELECT') 313 | 314 | light_names = ["key_light", "fill_light", "back_light"] 315 | light_energies = [1000., 300., 500.] 316 | 317 | for light_name, light_energy in zip(light_names, light_energies): 318 | # create light empty 319 | 320 | empty_name = light_name + "_empty" 321 | bpy.ops.object.empty_add(type='PLAIN_AXES', align='WORLD', location=(0, 0, 0), scale=(1, 1, 1)) 322 | x_rot, y_rot, z_rot = radians(90), radians(0), radians(-90) 323 | light_empty = bpy.context.scene.objects.get("Empty.001") 324 | light_empty.name = empty_name 325 | 326 | # parent light empty to main (camera) empty 327 | 328 | bpy.ops.object.select_all(action='DESELECT') 329 | light_empty.select_set(True) 330 | empty.select_set(True) 331 | bpy.context.view_layer.objects.active = empty 332 | bpy.ops.object.parent_set() 333 | bpy.ops.object.select_all(action='DESELECT') 334 | 335 | # create light 336 | 337 | x_loc, y_loc, z_loc = -radius, 0, 0 338 | bpy.ops.object.light_add(type='POINT', radius=1, align='WORLD', location=(x_loc, y_loc, z_loc), rotation=Euler((x_rot, y_rot, z_rot), "XYZ"), scale=(1, 1, 1)) 339 | bpy.data.objects["Point"].name = light_name 340 | light = bpy.data.objects[light_name] 341 | light.data.energy = light_energy 342 | # light.data.size = 0.5 343 | 344 | # parent light empty to light 345 | 346 | bpy.ops.object.select_all(action='DESELECT') 347 | light.select_set(True) 348 | light_empty.select_set(True) 349 | bpy.context.view_layer.objects.active = light_empty 350 | bpy.ops.object.parent_set() 351 | bpy.ops.object.select_all(action='DESELECT') 352 | 353 | # rotate camera and lights around the z-axis 354 | 355 | z_random_rot = radians(90) # radians(random.uniform(0,360)) 356 | empty.rotation_euler = Euler((0, 0, z_random_rot)) 357 | 358 | # # raise the camera while having it point to origin 359 | 360 | # cam_y_random_rot = radians(random.uniform(10,50)) 361 | # cam_empty.rotation_euler = Euler((0,cam_y_random_rot,0),"XYZ") 362 | 363 | bpy.context.view_layer.update() 364 | 365 | back_light_horizontal_angle = radians(180) 366 | light_horizontal_angles = [key_light_horizontal_angle, fill_light_horizontal_angle, back_light_horizontal_angle] 367 | for light_angle, light_name in zip(light_horizontal_angles, light_names): 368 | light_empty = bpy.data.objects[light_name + "_empty"] 369 | global_z = 
(light_empty.matrix_world.inverted() @ Vector((0.0, 0.0, 1.0, 0.0)))[:3] 370 | quat = Quaternion(global_z, light_angle) 371 | light_empty.rotation_euler = quat.to_euler() 372 | 373 | back_light_vertical_angle = 0 374 | light_vertical_angles = [key_light_vertical_angle, fill_light_vertical_angle, back_light_vertical_angle] 375 | # light_vertical_angles = [radians(-45)]*3 376 | 377 | for light_angle, light_name in zip(light_vertical_angles, light_names): 378 | light_empty = bpy.data.objects[light_name + "_empty"] 379 | global_x = (light_empty.matrix_world.inverted() @ Vector((1.0, 0.0, 0.0, 0.0)))[:3] 380 | quat = Quaternion(global_x, light_angle) 381 | euler_add = quat.to_euler() 382 | euler_current = light_empty.rotation_euler 383 | new_euler = Euler((euler_add[0] + euler_current[0], euler_add[1] + euler_current[1], euler_add[2] + euler_current[2])) 384 | light_empty.rotation_euler = new_euler 385 | 386 | # bpy.context.view_layer.update() 387 | 388 | return cam, empty 389 | 390 | 391 | def render_animation(fp): 392 | bpy.context.scene.render.filepath = fp 393 | bpy.context.scene.render.image_settings.file_format = 'FFMPEG' 394 | bpy.context.scene.render.ffmpeg.format = 'MPEG4' 395 | bpy.context.scene.render.ffmpeg.codec = 'H264' 396 | bpy.context.scene.render.ffmpeg.constant_rate_factor = 'MEDIUM' 397 | bpy.ops.render.render(animation=True) 398 | 399 | 400 | def setup_renderer(H, W, use_cpu=False): 401 | scene = bpy.context.scene 402 | render = bpy.context.scene.render 403 | 404 | render.engine = "CYCLES" 405 | render.image_settings.file_format = "PNG" 406 | render.image_settings.color_mode = "RGBA" 407 | render.resolution_x = W 408 | render.resolution_y = H 409 | render.resolution_percentage = 100 410 | 411 | scene.cycles.device = "CPU" if use_cpu else "GPU" 412 | scene.cycles.samples = 10 if use_cpu else 128 413 | scene.cycles.diffuse_bounces = 1 414 | scene.cycles.glossy_bounces = 1 415 | scene.cycles.transparent_max_bounces = 3 416 | scene.cycles.transmission_bounces = 3 417 | scene.cycles.filter_width = 0.01 418 | scene.cycles.use_denoising = True 419 | scene.render.film_transparent = False 420 | 421 | bpy.context.preferences.addons["cycles"].preferences.get_devices() 422 | # Set the device_type 423 | bpy.context.preferences.addons[ 424 | "cycles" 425 | ].preferences.compute_device_type = "METAL" if use_cpu else "CUDA" 426 | bpy.context.scene.view_settings.view_transform = 'Filmic' 427 | 428 | 429 | # def randomize_camera_view(axis): 430 | # euler_y = radians(random.uniform(-90, 90)) 431 | # euler_z = radians(random.uniform(0, 360)) 432 | # axis.rotation_euler = Euler((0, euler_y, euler_z)) 433 | 434 | 435 | def run_render(metadata, save_image_path, use_cpu): 436 | reset_scene() 437 | 438 | bpy.context.scene.render.fps = metadata['fps'] 439 | bpy.context.scene.frame_start = 1 440 | bpy.context.scene.frame_end = metadata['total_num_frames'] 441 | 442 | objs = [] 443 | for uid in metadata["objects"]: 444 | object_path = metadata["object_path"][uid] 445 | objs.append(bpy.data.objects.get(load_object(object_path))) 446 | 447 | grid_number = metadata["grid number"] 448 | 449 | if grid_number == 2: 450 | locations = { 451 | 0: [0.7, 0.5], 452 | 1: [0.7, -0.5], 453 | 2: [-0.6, 0.5], 454 | 3: [-0.6, -0.5] 455 | } 456 | scale_factor = 1 / 2 457 | elif grid_number == 3: 458 | locations = { 459 | 0: [0.9, 0.6], 460 | 1: [0.9, 0], 461 | 2: [0.9, -0.6], 462 | 3: [0.0, 0.6], 463 | 4: [0.0, 0.0], 464 | 5: [0.0, -0.6], 465 | 6: [-0.9, 0.6], 466 | 7: [-0.9, 0.0], 467 | 8: [-0.9, -0.6] 468 | } 469 | 
scale_factor = 1 / 3 470 | else: 471 | raise ValueError(f"Expected grid number to be 2 or 3 but got {grid_number}") 472 | 473 | # process rotate 474 | for idx, obj in enumerate(objs): 475 | rotate(obj, degree=metadata['object_angles'][idx]) 476 | 477 | # process scale 478 | if "sizes" in metadata: 479 | for idx, obj in enumerate(objs): 480 | normalize_object(obj, factor=metadata['sizes'][idx] * scale_factor) 481 | else: 482 | for obj in objs: 483 | normalize_object(obj, factor=scale_factor) 484 | 485 | for pos, obj in zip(metadata["grids"], objs): 486 | x, y = locations[pos] 487 | move_to_xy(obj, x, y) 488 | 489 | # set the first keyframe of video 490 | for obj in objs: 491 | rotate_and_keyframe(obj, 0, 1) 492 | move_to_xy_at_frame(obj, (0, 0), 1) 493 | 494 | # set other keyframes based on the metadata 495 | for idx, obj in enumerate(objs): 496 | for keyframe_order, keyframe_info in enumerate(metadata["keyframes"][idx]): 497 | if "rotation" in keyframe_info: 498 | rotate_and_keyframe(obj, keyframe_info["rotation"], get_exact_frame(keyframe_order, metadata['total_num_frames'])) 499 | if "movement" in keyframe_info: 500 | move_to_xy_at_frame(obj, keyframe_info["movement"], get_exact_frame(keyframe_order, metadata['total_num_frames'])) 501 | 502 | blender_config = metadata["blender_config"] 503 | 504 | setup_plane_and_background(blender_config["plane_texture_path"], blender_config["hdri_path"]) 505 | cam, axis = setup_camera_and_lights( 506 | blender_config["sun_x"], 507 | blender_config["sun_y"], 508 | blender_config["sun_energy"], 509 | blender_config["key_light_horizontal_angle"], 510 | blender_config["fill_light_horizontal_angle"], 511 | blender_config["key_light_vertical_angle"], 512 | blender_config["fill_light_vertical_angle"] 513 | ) 514 | axis.rotation_euler = Euler((0, radians(45), 0)) 515 | setup_renderer(H=metadata["VIDEO_H"], W=metadata["VIDEO_W"], use_cpu=use_cpu) 516 | render_animation(save_image_path) 517 | 518 | 519 | if __name__ == "__main__": 520 | parser = argparse.ArgumentParser() 521 | parser.add_argument( 522 | "--save_local", 523 | type=str, 524 | default="" 525 | ) 526 | parser.add_argument( 527 | "--save_video_path", 528 | type=str, 529 | default="render.png" 530 | ) 531 | parser.add_argument( 532 | "--json_file", 533 | type=str, 534 | default="video_metadata.json" 535 | ) 536 | 537 | parser.add_argument( 538 | "--use_cpu", 539 | action="store_true", 540 | default=False 541 | ) 542 | 543 | argv = sys.argv[sys.argv.index("--") + 1:] 544 | args = parser.parse_args(argv) 545 | 546 | with open(args.json_file, "r") as f: 547 | metadata = json.load(f) 548 | 549 | run_render(metadata, args.save_video_path, args.use_cpu) 550 | --------------------------------------------------------------------------------
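The script above is meant to run inside Blender rather than as a plain Python program (note the guarded `import bpy`). Below is a minimal sketch of one way to drive it headlessly, assuming Blender is on PATH; the object uid, every file path, and the numeric lighting values are placeholders, and `blender_config` would normally come from the metadata sampler used by the task generators. The metadata keys mirror what run_render() reads, and only the arguments after "--" are parsed by the script itself.

import json
import subprocess

metadata = {
    "fps": 4,                      # example video settings; the videoqa pipeline
    "total_num_frames": 16,        # defines its own in tma/constant.py
    "VIDEO_H": 224,
    "VIDEO_W": 224,
    "grid number": 2,                                        # 2x2 layout; 3 is also supported
    "objects": ["obj_uid_0"],                                # placeholder uid
    "object_path": {"obj_uid_0": "/path/to/obj_uid_0.glb"},  # placeholder path
    "object_angles": [0],                                    # degrees, applied before keyframing
    "grids": [0],                                            # grid cell index per object
    # One keyframe list per object; each entry may carry "rotation" and/or "movement".
    "keyframes": [[{"rotation": 90}, {"movement": [0.0, -1.0]}]],
    "blender_config": {                                      # placeholder values
        "plane_texture_path": "/path/to/plane.glb",
        "hdri_path": "/path/to/environment.hdr",
        "sun_x": 0.0, "sun_y": 0.0, "sun_energy": 1.0,
        "key_light_horizontal_angle": 0.5,
        "fill_light_horizontal_angle": -0.5,
        "key_light_vertical_angle": -0.8,
        "fill_light_vertical_angle": -0.8,
    },
}

with open("video_metadata.json", "w") as f:
    json.dump(metadata, f)

# run_blender.py only parses arguments after "--", so Blender's own flags and the
# script's flags do not collide.
subprocess.run([
    "blender", "--background",
    "--python", "tma/videoqa/tabletop_3d/run_blender.py",
    "--",
    "--json_file", "video_metadata.json",
    "--save_video_path", "render.mp4",
    "--use_cpu",
], check=True)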