├── tma ├── __init__.py ├── imageqa │ ├── __init__.py │ ├── scene_graph │ │ ├── __init__.py │ │ └── single_image_task.py │ ├── sticker_2d │ │ ├── __init__.py │ │ └── utils.py │ ├── tabletop_3d │ │ ├── __init__.py │ │ ├── single_image_task.py │ │ ├── utils.py │ │ ├── run_blender.py │ │ ├── size_single_image_task.py │ │ └── distance_single_image_task.py │ └── metadata.py ├── videoqa │ ├── __init__.py │ ├── scene_graph │ │ ├── __init__.py │ │ └── single_video_task.py │ ├── tabletop_3d │ │ ├── __init__.py │ │ ├── single_video_task.py │ │ ├── utils.py │ │ ├── movement_single_video_task.py │ │ └── run_blender.py │ └── metadata.py ├── models │ ├── __init__.py │ └── qa_model │ │ ├── __init__.py │ │ ├── prompt.py │ │ ├── base_qa_model.py │ │ └── videoqa_model.py ├── constant.py ├── metadata.py ├── base.py └── task_store.py ├── teaser.png ├── assets ├── 2024-imageqa-result.png ├── 2024-videoqa-result.png ├── 2024vsrandom-imageqa.png ├── 2024vsrandom-videoqa.png ├── random-imageqa-result.png └── random-videoqa-result.png ├── requirements.txt ├── .gitignore ├── annotations ├── relation_to_type.json └── attribute_category.json ├── LICENSE └── README.md /tma/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tma/imageqa/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tma/videoqa/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tma/models/__init__.py: -------------------------------------------------------------------------------- 1 | class Model: 2 | model_name: str 3 | -------------------------------------------------------------------------------- /tma/imageqa/scene_graph/__init__.py: -------------------------------------------------------------------------------- 1 | from .single_image_task import * 2 | -------------------------------------------------------------------------------- /tma/imageqa/sticker_2d/__init__.py: -------------------------------------------------------------------------------- 1 | from .single_image_task import * 2 | -------------------------------------------------------------------------------- /tma/videoqa/scene_graph/__init__.py: -------------------------------------------------------------------------------- 1 | from .single_video_task import * 2 | -------------------------------------------------------------------------------- /teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JieyuZ2/TaskMeAnything/HEAD/teaser.png -------------------------------------------------------------------------------- /assets/2024-imageqa-result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JieyuZ2/TaskMeAnything/HEAD/assets/2024-imageqa-result.png -------------------------------------------------------------------------------- /assets/2024-videoqa-result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JieyuZ2/TaskMeAnything/HEAD/assets/2024-videoqa-result.png -------------------------------------------------------------------------------- /assets/2024vsrandom-imageqa.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/JieyuZ2/TaskMeAnything/HEAD/assets/2024vsrandom-imageqa.png -------------------------------------------------------------------------------- /assets/2024vsrandom-videoqa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JieyuZ2/TaskMeAnything/HEAD/assets/2024vsrandom-videoqa.png -------------------------------------------------------------------------------- /assets/random-imageqa-result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JieyuZ2/TaskMeAnything/HEAD/assets/random-imageqa-result.png -------------------------------------------------------------------------------- /assets/random-videoqa-result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JieyuZ2/TaskMeAnything/HEAD/assets/random-videoqa-result.png -------------------------------------------------------------------------------- /tma/videoqa/tabletop_3d/__init__.py: -------------------------------------------------------------------------------- 1 | from .movement_single_video_task import * 2 | from .rotation_single_video_task import * 3 | -------------------------------------------------------------------------------- /tma/imageqa/tabletop_3d/__init__.py: -------------------------------------------------------------------------------- 1 | from .distance_single_image_task import * 2 | from .single_image_task import * 3 | from .size_single_image_task import * 4 | -------------------------------------------------------------------------------- /tma/constant.py: -------------------------------------------------------------------------------- 1 | NUM_OPTIONS = 4 2 | 3 | # ImageQA 4 | 5 | IMAGE_H = 512 6 | IMAGE_W = 512 7 | 8 | # VideoQA 9 | 10 | VIDEO_H = 224 11 | VIDEO_W = 224 12 | VIDEO_FPS = 4 13 | VIDEO_NUM_FRAMES = 16 14 | -------------------------------------------------------------------------------- /tma/models/qa_model/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_qa_model import QAModel, QAModelInstance 2 | from .imageqa_model import ImageQAModel, list_imageqa_models, set_imageqa_model_key 3 | from .videoqa_model import ImageQAModel4Video, VideoQAModel, list_videoqa_models 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | sentence-transformers==2.5.1 2 | transformers==4.38.1 3 | accelerate==0.27.2 4 | diskcache 5 | networkx 6 | openai 7 | pyarrow 8 | scikit-learn 9 | pandas 10 | matplotlib 11 | tiktoken 12 | einops 13 | transformers_stream_generator 14 | prefixspan 15 | dashscope 16 | oss2 17 | google.generativeai 18 | replicate 19 | decord 20 | opencv-python -------------------------------------------------------------------------------- /tma/metadata.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import networkx as nx 4 | 5 | 6 | class MetaData: 7 | """ 8 | Abstract class for metadata 9 | """ 10 | 11 | 12 | class CategoryMetaData(MetaData): 13 | def __init__(self): 14 | super().__init__() 15 | 16 | self.taxonomy = None 17 | self.categories = None 18 | self.category_info = None 19 | 20 | def check_category_exists(self, cateid): 
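        # `self.categories` is left as None here and populated by subclasses
        # (e.g. ObjaverseMetaData fills it with the sorted taxonomy node ids),
        # so this is a plain membership test; task generators call it in
        # `_task_plan_to_str` to decide whether a task-plan value is a category
        # id that should be rendered via `get_surfacename`.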
21 | return cateid in self.categories 22 | 23 | def get_surfacename(self, node): 24 | return self.category_info[node]['surface_name'][0] 25 | 26 | def get_relevant_categories(self, cateid): 27 | return set(nx.descendants(self.taxonomy, cateid)) | set(nx.ancestors(self.taxonomy, cateid)) | {cateid} 28 | 29 | def get_irrelevant_categories(self, cateid): 30 | if isinstance(cateid, List): 31 | relevant_categories = set() 32 | for c in cateid: 33 | relevant_categories |= self.get_relevant_categories(c) 34 | else: 35 | relevant_categories = self.get_relevant_categories(cateid) 36 | return set(self.categories) - relevant_categories 37 | -------------------------------------------------------------------------------- /tma/videoqa/metadata.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pickle 4 | 5 | from ..imageqa.metadata import Objaverse3DMetaData 6 | from ..metadata import MetaData 7 | 8 | 9 | class ObjaverseVideoMetaData(Objaverse3DMetaData): 10 | pass 11 | 12 | 13 | def load_video_scene_graph(video_scene_graph_folder): 14 | video_folder = os.path.join(video_scene_graph_folder, "Charades_v1_480") 15 | scene_graphs = json.load(open(os.path.join(video_scene_graph_folder, "video_scene_graph/video_scene_graph.json"))) 16 | idx2name = pickle.load(open(os.path.join(video_scene_graph_folder, "video_scene_graph/idx2name.pkl"), "rb")) 17 | objects = pickle.load(open(os.path.join(video_scene_graph_folder, "video_scene_graph/objects.pkl"), "rb")) 18 | actions = pickle.load(open(os.path.join(video_scene_graph_folder, "video_scene_graph/actions.pkl"), "rb")) 19 | spatial_relations = pickle.load(open(os.path.join(video_scene_graph_folder, "video_scene_graph/spatial_relations.pkl"), "rb")) 20 | contact_relations = pickle.load(open(os.path.join(video_scene_graph_folder, "video_scene_graph/contact_relations.pkl"), "rb")) 21 | return video_folder, scene_graphs, idx2name, objects, actions, spatial_relations, contact_relations 22 | 23 | 24 | class VideoSceneGraphMetaData(MetaData): 25 | def __init__(self, path_to_metadata, video_scene_graph_folder): 26 | super().__init__() 27 | # video scene graph use idx to represent relations, objects, and actions, like r1, o1, idx_to_name is a dict to map idx to its name. 
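        # Illustrative example (not taken from the actual pickle files): idx2name could
        # map {"o1": "person", "o5": "table", "r3": "holding", "a2": "sitting down"}, so
        # the compact ids stored in each video scene graph can be translated back into
        # readable object, relation, and action names when questions are composed.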
28 | self.image_folder, self.video_scene_graphs, self.idx2name, self.objects, self.actions, self.spatial_relations, self.contact_relations = ( 29 | load_video_scene_graph(video_scene_graph_folder)) 30 | 31 | def get_video_path(self, video_scene_graph_id): 32 | return os.path.join(self.image_folder, video_scene_graph_id + ".mp4") 33 | -------------------------------------------------------------------------------- /tma/base.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import numpy as np 4 | 5 | from .constant import NUM_OPTIONS 6 | from .metadata import MetaData 7 | from .task_store import TaskStore 8 | 9 | 10 | class TaskGenerator: 11 | schema = {} 12 | 13 | def __init__(self, metadata: MetaData, seed=42): 14 | self.metadata = metadata 15 | self.rng = np.random.default_rng(seed=seed) 16 | 17 | def _compose_options(self, answer, negatives): 18 | if len(negatives) > NUM_OPTIONS - 1: 19 | negatives = self.rng.choice(negatives, NUM_OPTIONS - 1, replace=False).tolist() 20 | options = [answer] + negatives 21 | return options 22 | 23 | def _task_plan_to_str(self, task_plan) -> str: 24 | "(Abstract method) convert task plan to string for task embedding" 25 | 26 | def enumerate_task_plans(self, task_store: TaskStore): 27 | "(Abstract method) enumerate task plan" 28 | 29 | def generate(self, task_plan, return_data=True, seed=None): 30 | "(Abstract method) enumerate task" 31 | 32 | 33 | class JointTaskGenerator: 34 | def __init__(self, metadata: MetaData, generators: Dict, seed=42): 35 | self.generators = { 36 | k: v(metadata, seed=seed) for k, v in generators.items() 37 | } 38 | self.stats = {generator_type: 0 for generator_type in generators} 39 | self.schema = {} 40 | for generator_type, generator in self.generators.items(): 41 | self.schema.update(generator.schema) 42 | 43 | def enumerate_task_plans(self, task_store: TaskStore): 44 | for generator_type, generator in self.generators.items(): 45 | before = len(task_store) 46 | generator.enumerate_task_plans(task_store) 47 | self.stats[generator_type] = len(task_store) - before 48 | print(f"Generated [{self.stats[generator_type]}] {generator_type} tasks") 49 | task_store.dump() 50 | 51 | def generate(self, task_plan, return_data=True, seed=None): 52 | return self.generators[task_plan['task type']].generate(task_plan, return_data=return_data, seed=seed) 53 | -------------------------------------------------------------------------------- /tma/models/qa_model/prompt.py: -------------------------------------------------------------------------------- 1 | def succinct_prompt(question, choices=[]): 2 | if len(choices) == 0: 3 | prompt = question 4 | else: 5 | choices = '\n'.join(choices) 6 | prompt = (f"{question}\n" 7 | f"Select from the following choices.\n" 8 | f"{choices}") 9 | 10 | return prompt 11 | 12 | 13 | #################################################################################################### 14 | # videoqa 15 | #################################################################################################### 16 | 17 | 18 | def detailed_videoqa_prompt(question, choices=[]): 19 | if len(choices) == 0: 20 | prompt = f"Based on the video, answer the question. 
Question: {question} Answer:" 21 | else: 22 | prompt = (f"Based on the video, output the best option for the question.\n" 23 | f"You must only output the option.\n" 24 | f"Question: {question}\nOptions: {' '.join(choices)}\nBest option:(") 25 | return prompt 26 | 27 | 28 | def detailed_video2imageqa_prompt(question, choices=[]): 29 | if len(choices) == 0: 30 | prompt = f"This is a series of images sampled at equal intervals from the beginning to the end of a video, based on the series of images, answer the question. Question: {question} Answer:" 31 | else: 32 | prompt = (f"This is a series of images sampled at equal intervals from the beginning to the end of a video, based on the series of images, output the best option for the question.\n" 33 | f"You must only output the option.\n" 34 | f"Question: {question}\nOptions: {' '.join(choices)}\nBest option:(") 35 | return prompt 36 | 37 | 38 | #################################################################################################### 39 | # imageqa 40 | #################################################################################################### 41 | 42 | def detailed_imageqa_prompt(question, choices=[]): 43 | if len(choices) == 0: 44 | prompt = f"Based on the image, answer the question. Question: {question} Answer:" 45 | else: 46 | prompt = (f"Based on the image, output the best option for the question.\n" 47 | f"You must only output the option.\n" 48 | f"Question: {question}\nOptions: {' '.join(choices)}\nBest option:(") 49 | return prompt 50 | -------------------------------------------------------------------------------- /tma/task_store.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pyarrow as pa 3 | import pyarrow.parquet as pq 4 | 5 | pa_schema_map = { 6 | 'str' : pa.string(), 7 | 'int' : pa.int64(), 8 | 'list': pa.list_(pa.string()), 9 | } 10 | 11 | pd_schema_map = { 12 | 'str' : 'string', 13 | 'int' : 'Int64', 14 | 'list': 'object', 15 | } 16 | 17 | 18 | def get_pa_schema(schema): 19 | return pa.schema([(k, pa_schema_map[v]) for k, v in schema.items()]) 20 | 21 | 22 | def get_pd_schema(schema): 23 | return {k: pd_schema_map[v] for k, v in schema.items()} 24 | 25 | 26 | class TaskStore: 27 | 28 | def __init__(self, schema, output_file=None, buffer_size=1e8): 29 | self.columns = list(schema.keys()) 30 | self.dtypes = list(schema.values()) 31 | self.buffer = [] 32 | self.buffer_size = buffer_size 33 | self.output_file = output_file 34 | if output_file is None: 35 | self.schema = get_pd_schema(schema) 36 | self.task_plan_df = pd.DataFrame({k: pd.Series(dtype=v) for k, v in self.schema.items()}) 37 | else: 38 | print(f'Writing to {output_file}') 39 | self.counter = 0 40 | self.schema = get_pa_schema(schema) 41 | self.parquet_writer = pq.ParquetWriter(output_file, schema=self.schema) 42 | 43 | def _update_buffer(self): 44 | if len(self.buffer) > self.buffer_size: 45 | self.dump() 46 | 47 | def dump(self): 48 | if len(self.buffer) > 0: 49 | if self.output_file is None: 50 | self.task_plan_df = pd.concat( 51 | [self.task_plan_df, pd.DataFrame(self.buffer, columns=self.columns).astype(self.schema, errors='ignore')], 52 | ignore_index=True, 53 | sort=False 54 | ) 55 | else: 56 | self.parquet_writer.write_table(pa.Table.from_pylist(self.buffer, schema=self.schema)) 57 | self.counter += len(self.buffer) 58 | self.buffer = [] 59 | 60 | def add_many(self, xs): 61 | self.buffer.extend(xs) 62 | self._update_buffer() 63 | 64 | def add(self, x): 65 | 
self.buffer.append(x) 66 | self._update_buffer() 67 | 68 | def __len__(self): 69 | if self.output_file is None: 70 | return len(self.task_plan_df) + len(self.buffer) 71 | else: 72 | return self.counter + len(self.buffer) 73 | 74 | def return_df(self): 75 | self.dump() 76 | return self.task_plan_df 77 | 78 | def close(self): 79 | if self.output_file is not None: 80 | self.dump() 81 | self.parquet_writer.close() 82 | -------------------------------------------------------------------------------- /tma/imageqa/tabletop_3d/single_image_task.py: -------------------------------------------------------------------------------- 1 | from .utils import grid_mappings, grid_options, make_image, relative_grid, relative_position_phrase, relative_positions 2 | from ..metadata import Objaverse3DMetaData, ObjaverseMetaData 3 | from ..sticker_2d import GridTaskGenerator, HowManyGridTaskGenerator, WhatAttributeGridTaskGenerator, WhatGridTaskGenerator, WhereAttributeGridTaskGenerator, WhereGridTaskGenerator 4 | from ...constant import IMAGE_H, IMAGE_W 5 | 6 | 7 | class _3DGridTaskGenerator(GridTaskGenerator): 8 | metadata: Objaverse3DMetaData 9 | 10 | def __init__(self, metadata: ObjaverseMetaData, seed=42): 11 | super().__init__(metadata, seed=seed) 12 | self.grid_mappings = grid_mappings 13 | self.grid_options = grid_options 14 | self.relative_positions = relative_positions 15 | self.relative_position_phrase = relative_position_phrase 16 | 17 | def _make_image_metadata(self, grid_size, grids, queries, remaining_query=...): 18 | objects = [self.metadata.sample(self.rng, 1, "object", q) for q in queries] 19 | 20 | remaining_grids = [g for g in range(grid_size ** 2) if g not in grids] 21 | for _ in remaining_grids: 22 | uid = self.metadata.sample(self.rng, 1, "object", remaining_query) 23 | objects.append(uid) 24 | 25 | object_path = {k: self.metadata.get_object_path(k) for k in objects} 26 | angles = [self.metadata.sample_object_angle(self.rng, obj) for obj in objects] 27 | 28 | image_metadata = { 29 | 'grid number' : grid_size, 30 | 'objects' : objects, 31 | 'object_path' : object_path, 32 | 'object_angles' : angles, 33 | 'grids' : grids + remaining_grids, 34 | 'blender_config': self.metadata.sample_blender_configuration(self.rng) 35 | } 36 | 37 | return image_metadata 38 | 39 | def make_image(self, image_metadata): 40 | return make_image(image_metadata, self.metadata, IMAGE_H, IMAGE_W) 41 | 42 | def _relative_grid(self, grid_size, grid, reference_pos): 43 | return relative_grid(grid_size, grid, reference_pos) 44 | 45 | 46 | class What3DGridTaskGenerator(_3DGridTaskGenerator, WhatGridTaskGenerator): 47 | metadata: Objaverse3DMetaData 48 | 49 | 50 | class Where3DGridTaskGenerator(_3DGridTaskGenerator, WhereGridTaskGenerator): 51 | metadata: Objaverse3DMetaData 52 | 53 | 54 | class WhatAttribute3DGridTaskGenerator(_3DGridTaskGenerator, WhatAttributeGridTaskGenerator): 55 | metadata: Objaverse3DMetaData 56 | 57 | 58 | class WhereAttribute3DGridTaskGenerator(_3DGridTaskGenerator, WhereAttributeGridTaskGenerator): 59 | metadata: Objaverse3DMetaData 60 | 61 | 62 | class HowMany3DGridTaskGenerator(_3DGridTaskGenerator, HowManyGridTaskGenerator): 63 | metadata: Objaverse3DMetaData 64 | -------------------------------------------------------------------------------- /tma/videoqa/tabletop_3d/single_video_task.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from typing import Dict, List, Tuple 3 | 4 | import numpy as np 5 | 6 | from .utils import 
grid_mappings, grid_options, make_video, relative_grid 7 | from ..metadata import ObjaverseVideoMetaData 8 | from ...base import TaskGenerator 9 | from ...constant import VIDEO_H, VIDEO_W 10 | 11 | 12 | def check_video(video): 13 | from decord import VideoReader, cpu 14 | with tempfile.NamedTemporaryFile(delete=True, suffix=".mp4") as tmp: 15 | try: 16 | with open(tmp.name, 'wb') as file: 17 | file.write(video) 18 | with open(tmp.name, 'rb') as f: 19 | VideoReader(f, ctx=cpu(0)) 20 | except Exception as e: 21 | return False 22 | return True 23 | 24 | 25 | class GridVideoTaskGenerator(TaskGenerator): 26 | metadata: ObjaverseVideoMetaData 27 | 28 | def __init__(self, metadata: ObjaverseVideoMetaData, seed=42): 29 | super().__init__(metadata, seed=seed) 30 | self.grid_options = grid_options 31 | self.grid_mappings = grid_mappings 32 | 33 | def _relative_grid(self, grid_size, grid, reference_pos): 34 | return relative_grid(grid_size, grid, reference_pos) 35 | 36 | def _get_target_object_query(self, task_plan): 37 | if 'attribute type' in task_plan: 38 | return self.metadata.and_query([("category", task_plan['target category'], True), (task_plan['attribute type'], task_plan['attribute value'], True)]) 39 | else: 40 | return self.metadata.and_query([("category", task_plan['target category'], True)]) 41 | 42 | def _task_plan_to_str(self, task_plan): 43 | t = [] 44 | for k, v in task_plan.items(): 45 | if self.metadata.check_category_exists(v): 46 | t.append(f'{k}: {self.metadata.get_surfacename(v)}') 47 | else: 48 | t.append(f'{k}: {v}') 49 | return '\n'.join(t) 50 | 51 | def make_video(self, video_metadata): 52 | return make_video(video_metadata, self.metadata, VIDEO_H, VIDEO_W) 53 | 54 | def _generate_task(self, task_plan) -> Tuple[str, str, List[str], Dict]: 55 | "(Abstract method) generate task" 56 | 57 | def generate(self, task_plan, return_data=True, seed=None): 58 | if seed is not None: 59 | self.rng = np.random.default_rng(seed=seed) 60 | 61 | retry = 0 62 | while True: 63 | question, answer, options, video_metadata = self._generate_task(task_plan) 64 | task = { 65 | 'question' : question.replace('_', ' '), 66 | 'answer' : answer.replace('_', ' '), 67 | 'options' : [o.replace('_', ' ') for o in options], 68 | 'task_plan' : self._task_plan_to_str(task_plan), 69 | 'video_metadata': video_metadata, 70 | 'video' : self.make_video(video_metadata) if return_data else None 71 | } 72 | if return_data: 73 | if check_video(task['video']): 74 | break 75 | else: 76 | retry -= 1 77 | if retry <= 0: 78 | raise Exception("Failed to generate video") 79 | else: 80 | break 81 | 82 | return task 83 | -------------------------------------------------------------------------------- /tma/videoqa/tabletop_3d/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import subprocess 4 | 5 | from ..metadata import ObjaverseVideoMetaData 6 | 7 | grid_options = [2, 3] 8 | 9 | grid_mappings = { 10 | 2: 11 | { 12 | 'back left' : 0, 13 | 'back right' : 1, 14 | 'front left' : 2, 15 | 'front right': 3 16 | }, 17 | 3: 18 | { 19 | 'back left' : 0, 20 | 'back middle' : 1, 21 | 'back right' : 2, 22 | 'middle left' : 3, 23 | 'middle' : 4, 24 | 'middle right': 5, 25 | 'front left' : 6, 26 | 'front middle': 7, 27 | 'front right' : 8 28 | } 29 | } 30 | 31 | relative_positions = ['left', 'right', 'back', 'front', 'back left', 'back right', 'front left', 'front right'] 32 | relative_position_phrase = { 33 | 'left' : 'to the left of', 34 | 'right' : 'to the right 
of', 35 | 'back' : 'behind', 36 | 'front' : 'in front of', 37 | 'back left' : 'behind and to the left of', 38 | 'back right' : 'behind and to the right of', 39 | 'front left' : 'in front and to the left of', 40 | 'front right': 'in front and to the right of' 41 | } 42 | reverse_relative_positions = { 43 | 'left' : 'right', 44 | 'right' : 'left', 45 | 'back' : 'front', 46 | 'front' : 'back', 47 | 'front left' : 'back right', 48 | 'front right': 'back left', 49 | 'back left' : 'front right', 50 | 'back right' : 'front left' 51 | } 52 | 53 | 54 | def relative_grid(grid_size, grid, reference_pos): 55 | if 'right' in reference_pos: 56 | if grid % grid_size == 0: return -1 57 | grid = grid - 1 58 | if 'left' in reference_pos: 59 | if grid % grid_size == grid_size - 1: return -1 60 | grid = grid + 1 61 | if 'back' in reference_pos: 62 | if grid + grid_size >= grid_size * grid_size: return -1 63 | grid = grid + grid_size 64 | if 'front' in reference_pos: 65 | if grid - grid_size < 0: return -1 66 | grid = grid - grid_size 67 | return grid 68 | 69 | 70 | import tempfile 71 | import diskcache 72 | 73 | run_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "run_blender.py") 74 | 75 | 76 | def make_video(scene_json, metadata: ObjaverseVideoMetaData, VIDEO_H, VIDEO_W): 77 | device = metadata.render_device 78 | blender_cache = metadata.blender_cache 79 | assert len(scene_json["objects"]) <= (scene_json["grid number"] ** 2) 80 | scene_json["VIDEO_H"] = VIDEO_H 81 | scene_json["VIDEO_W"] = VIDEO_W 82 | 83 | with diskcache.Cache(blender_cache, size_limit=100 * (2 ** 30)) as cache: 84 | key = json.dumps(scene_json, sort_keys=True) 85 | video = cache.get(key, None) 86 | if video is None: 87 | with (tempfile.NamedTemporaryFile(delete=True, suffix=".mp4") as tmp_video, 88 | tempfile.NamedTemporaryFile(delete=True, suffix=".json") as tmp_json): 89 | json.dump(scene_json, open(tmp_json.name, 'w')) 90 | 91 | env = dict(os.environ, CUDA_VISIBLE_DEVICES=str(device)) 92 | command = ( 93 | f"{metadata.blender_path} -b -noaudio --python {run_script_path} -- " 94 | f"--save_video_path {tmp_video.name} " 95 | f"--json_file {tmp_json.name}" 96 | ) 97 | 98 | subprocess.run(command, shell=True, env=env, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT) 99 | 100 | with open(tmp_video.name, 'rb') as video_file: 101 | video = video_file.read() # save video to a binary files 102 | cache.set(key, video) 103 | 104 | return video 105 | -------------------------------------------------------------------------------- /tma/imageqa/sticker_2d/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | from PIL import Image 5 | 6 | grid_options = [2, 3] 7 | grid_mappings = { 8 | 2: 9 | { 10 | 'top left' : 0, 11 | 'top right' : 1, 12 | 'bottom left' : 2, 13 | 'bottom right': 3 14 | }, 15 | 3: 16 | { 17 | 'top left' : 0, 18 | 'top middle' : 1, 19 | 'top right' : 2, 20 | 'middle left' : 3, 21 | 'middle' : 4, 22 | 'middle right' : 5, 23 | 'bottom left' : 6, 24 | 'bottom middle': 7, 25 | 'bottom right' : 8 26 | } 27 | } 28 | 29 | relative_positions = ['left', 'right', 'top', 'bottom', 'top left', 'top right', 'bottom left', 'bottom right'] 30 | relative_position_phrase = { 31 | 'left' : 'to the left of', 32 | 'right' : 'to the right of', 33 | 'top' : 'above', 34 | 'bottom' : 'below', 35 | 'top left' : 'above and to the left of', 36 | 'top right' : 'above and to the right of', 37 | 'bottom left' : 'below and to the left of', 38 | 'bottom 
right': 'below and to the right of' 39 | } 40 | 41 | 42 | def relative_grid(grid_size, grid, reference_pos): 43 | if 'right' in reference_pos: 44 | if grid % grid_size == 0: return -1 45 | grid = grid - 1 46 | if 'left' in reference_pos: 47 | if grid % grid_size == grid_size - 1: return -1 48 | grid = grid + 1 49 | if 'top' in reference_pos: 50 | if grid + grid_size >= grid_size * grid_size: return -1 51 | grid = grid + grid_size 52 | if 'bottom' in reference_pos: 53 | if grid - grid_size < 0: return -1 54 | grid = grid - grid_size 55 | return grid 56 | 57 | 58 | def does_overlap(box1, box2): 59 | # Returns True if box1 and box2 overlap, False otherwise 60 | x1, y1, x2, y2 = box1 61 | x3, y3, x4, y4 = box2 62 | return not (x2 < x3 or x4 < x1 or y2 < y3 or y4 < y1) 63 | 64 | 65 | def sample_bounding_boxes(num_objects, H, W, size_range=(0.3, 0.45)): 66 | while True: 67 | frac = random.uniform(*size_range) 68 | boxes = [] 69 | count = 0 70 | num_chances = 5 71 | while len(boxes) < num_objects and count < num_chances: 72 | box_w = int(frac * W) 73 | box_h = int(frac * H) 74 | box_x = random.randint(0, W - box_w) 75 | box_y = random.randint(0, H - box_h) 76 | new_box = (box_x, box_y, box_x + box_w, box_y + box_h) 77 | if not any(does_overlap(new_box, box) for box in boxes): 78 | boxes.append(new_box) 79 | count += 1 80 | if count >= num_chances: 81 | continue 82 | return boxes 83 | 84 | 85 | def grid_to_box(H, W, grid_size, grid_index, grid_H, grid_W): 86 | grid_height = H // grid_size 87 | grid_width = W // grid_size 88 | 89 | # grid_x, grid_y = np.unravel_index(grid_index, (grid_size, grid_size)) 90 | grid_y, grid_x = np.unravel_index(grid_index, (grid_size, grid_size)) 91 | 92 | box_x = grid_x * grid_width 93 | box_y = grid_y * grid_height 94 | box_w = grid_W * grid_width 95 | box_h = grid_H * grid_height 96 | return (box_x, box_y, box_x + box_w, box_y + box_h) 97 | 98 | 99 | def paste_image(background, obj, box): 100 | obj = obj.resize((box[2] - box[0], box[3] - box[1])) 101 | background.paste(obj, box=box, mask=obj) 102 | 103 | 104 | def make_image(metadata, H=512, W=512): 105 | # sample bounding boxes 106 | grid_size = metadata["grid number"] 107 | object_paths = metadata["object paths"] 108 | assert len(metadata["objects"]) <= (grid_size ** 2) 109 | boxes = [grid_to_box(H, W, grid_size, x, 1, 1) for x in metadata["grids"]] 110 | 111 | im_target = Image.new("RGBA", (W, H), 'WHITE') # you can load this as a background image if you want 112 | 113 | for view, box in zip(object_paths, boxes): 114 | obj = Image.open(view) 115 | paste_image(im_target, obj, box) 116 | 117 | return im_target.convert('RGB') 118 | -------------------------------------------------------------------------------- /tma/imageqa/tabletop_3d/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import subprocess 3 | 4 | from ..metadata import Objaverse3DMetaData 5 | 6 | grid_options = [2, 3] 7 | 8 | grid_mappings = { 9 | 2: 10 | { 11 | 'back left' : 0, 12 | 'back right' : 1, 13 | 'front left' : 2, 14 | 'front right': 3 15 | }, 16 | 3: 17 | { 18 | 'back left' : 0, 19 | 'back middle' : 1, 20 | 'back right' : 2, 21 | 'middle left' : 3, 22 | 'middle' : 4, 23 | 'middle right': 5, 24 | 'front left' : 6, 25 | 'front middle': 7, 26 | 'front right' : 8 27 | } 28 | } 29 | 30 | relative_positions = ['left', 'right', 'back', 'front', 'back left', 'back right', 'front left', 'front right'] 31 | relative_position_phrase = { 32 | 'left' : 'to the left of', 33 | 'right' : 'to the 
right of', 34 | 'back' : 'behind', 35 | 'front' : 'in front of', 36 | 'back left' : 'behind and to the left of', 37 | 'back right' : 'behind and to the right of', 38 | 'front left' : 'in front and to the left of', 39 | 'front right': 'in front and to the right of' 40 | } 41 | reverse_relative_positions = { 42 | 'left' : 'right', 43 | 'right' : 'left', 44 | 'back' : 'front', 45 | 'front' : 'back', 46 | 'front left' : 'back right', 47 | 'front right': 'back left', 48 | 'back left' : 'front right', 49 | 'back right' : 'front left' 50 | } 51 | 52 | 53 | def relative_grid(grid_size, grid, reference_pos): 54 | if 'right' in reference_pos: 55 | if grid % grid_size == 0: return -1 56 | grid = grid - 1 57 | if 'left' in reference_pos: 58 | if grid % grid_size == grid_size - 1: return -1 59 | grid = grid + 1 60 | if 'back' in reference_pos: 61 | if grid + grid_size >= grid_size * grid_size: return -1 62 | grid = grid + grid_size 63 | if 'front' in reference_pos: 64 | if grid - grid_size < 0: return -1 65 | grid = grid - grid_size 66 | return grid 67 | 68 | 69 | import os 70 | import tempfile 71 | import io, base64 72 | from PIL import Image 73 | import diskcache 74 | 75 | run_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "run_blender.py") 76 | 77 | 78 | def image_to_base64(pil_image): 79 | import io 80 | import base64 81 | img_byte_arr = io.BytesIO() 82 | pil_image.save(img_byte_arr, format='PNG') 83 | img_byte_arr = img_byte_arr.getvalue() 84 | base64_str = base64.b64encode(img_byte_arr).decode('utf-8') 85 | return base64_str 86 | 87 | 88 | def make_image(scene_json, metadata: Objaverse3DMetaData, H=512, W=512): 89 | device = metadata.render_device 90 | blender_cache = metadata.blender_cache 91 | assert len(scene_json["objects"]) <= (scene_json["grid number"] ** 2) 92 | scene_json["H"] = H 93 | scene_json["W"] = W 94 | 95 | with diskcache.Cache(blender_cache, size_limit=100 * (2 ** 30)) as cache: 96 | key = json.dumps(scene_json, sort_keys=True) 97 | base64_str = cache.get(key, None) 98 | if base64_str is None: 99 | with (tempfile.NamedTemporaryFile(delete=True, suffix=".png") as tmp_image, 100 | tempfile.NamedTemporaryFile(delete=True, suffix=".json") as tmp_json): 101 | json.dump(scene_json, open(tmp_json.name, 'w')) 102 | 103 | env = dict(os.environ, CUDA_VISIBLE_DEVICES=str(device)) 104 | command = ( 105 | f"{metadata.blender_path} -b -noaudio --python {run_script_path} -- " 106 | f"--save_image_path {tmp_image.name} " 107 | f"--json_file {tmp_json.name}" 108 | ) 109 | 110 | subprocess.run(command, shell=True, env=env, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT) 111 | 112 | img = Image.open(tmp_image.name).convert("RGB") 113 | cache.set(key, image_to_base64(img)) 114 | else: 115 | img = Image.open(io.BytesIO(base64.decodebytes(bytes(base64_str, "utf-8")))) 116 | 117 | return img 118 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # 
before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | 162 | **/.DS_STORE 163 | 164 | 165 | 166 | output/ -------------------------------------------------------------------------------- /tma/models/qa_model/base_qa_model.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | from typing import Callable 4 | 5 | import diskcache 6 | import numpy as np 7 | import sentence_transformers 8 | import torch 9 | 10 | from .. import Model 11 | 12 | 13 | def make_options(choices, format='letter'): 14 | assert format in ['numeric', 'letter'] 15 | if format == 'numeric': 16 | prefix1 = [str(i + 1) for i in range(len(choices))] 17 | else: 18 | prefix1 = [chr(ord("a") + i).upper() for i in range(len(choices))] 19 | prefix2 = [f"({p})" for p in prefix1] 20 | return prefix1, prefix2, [f'{p} {c}' for p, c in zip(prefix2, choices)] 21 | 22 | 23 | def check_contain(answer, options): 24 | contains = [option in answer for option in options] 25 | if sum(contains) == 1: 26 | return contains.index(True) 27 | else: 28 | return -1 29 | 30 | 31 | class QAModelInstance: 32 | def qa(self, data, prompt): 33 | "(Abstract method) abstract QA method" 34 | 35 | 36 | class QAModel(Model): 37 | def __init__( 38 | self, 39 | model_name: str, 40 | prompt_name: str, 41 | prompt_func: Callable, 42 | choice_format='letter', 43 | enable_choice_search: bool = False, 44 | cache_path: str = None, 45 | ): 46 | self.model = None 47 | self.model_name = f'{model_name} ({prompt_name})' 48 | self.prompt_func = prompt_func 49 | self.format = choice_format 50 | self.cache_path = cache_path 51 | 52 | if self.cache_path is None: 53 | print("[IMPORTANT] model cache is disabled") 54 | else: 55 | print(f"[IMPORTANT] model cache is enabled, cache path: {cache_path}") 56 | 57 | self.enable_choice_search = enable_choice_search 58 | if enable_choice_search: 59 | # use SBERT to find the closest choice 60 | self.sentence_transformer = sentence_transformers.SentenceTransformer("all-mpnet-base-v2", device='cpu') 61 | 62 | @torch.no_grad() 63 | def choice_search(self, free_form_answer, choices): 64 | query_embedding = self.sentence_transformer.encode([free_form_answer], normalize_embeddings=True) 65 | choices_embedding = self.sentence_transformer.encode(choices, normalize_embeddings=True) 66 | top_choice_index = np.argmax(np.dot(choices_embedding, query_embedding.T)) 67 | return choices[top_choice_index] 68 | 69 | def _data_to_str(self, data): 70 | """ abstract method """ 71 | 72 | @torch.no_grad() 73 | def _qa(self, data, prompt): 74 | if self.cache_path is None: 75 | return self.model.qa(data, prompt) 76 | else: 77 | with diskcache.Cache(self.cache_path, size_limit=10 * (2 ** 30)) as cache: 78 | key = json.dumps([self.model_name, self._data_to_str(data), prompt]) 79 | response = cache.get(key, None) 80 | if response is None: 81 | response = self.model.qa(data, prompt) 82 | cache.set(key, response) 83 | return response 84 | 85 | @torch.no_grad() 86 | def qa(self, data, question): 87 | prompt = self.prompt_func(question) 88 | return self._qa(data, prompt) 89 | 90 | @torch.no_grad() 91 | def multiple_choice_qa(self, data, question, choices, answer=None): 92 | # Get VQA model's answer 93 | prefix1, prefix2, options = make_options(choices, self.format) 94 | prompt = self.prompt_func(question, options) 95 | free_form_answer = self._qa(data, prompt) 96 | free_form_answer = free_form_answer.strip() 97 | 98 | # Limit the answer to the choices 99 | if free_form_answer in choices: 100 | multiple_choice_answer = free_form_answer 101 
| elif free_form_answer in options: 102 | multiple_choice_answer = choices[options.index(free_form_answer)] 103 | elif free_form_answer in prefix1: 104 | multiple_choice_answer = choices[prefix1.index(free_form_answer)] 105 | elif free_form_answer in prefix2: 106 | multiple_choice_answer = choices[prefix2.index(free_form_answer)] 107 | elif self.enable_choice_search: 108 | multiple_choice_answer = self.choice_search(free_form_answer, choices) 109 | else: 110 | multiple_choice_answer = "" 111 | for to_check in [choices, options, prefix1, prefix2]: 112 | idx = check_contain(free_form_answer, to_check) 113 | if idx != -1: 114 | multiple_choice_answer = choices[idx] 115 | break 116 | 117 | result = { 118 | "free_form_answer" : free_form_answer, 119 | "multiple_choice_answer": multiple_choice_answer, 120 | "choices" : choices.copy(), 121 | } 122 | if answer is not None: 123 | result["accuracy"] = int(answer == multiple_choice_answer) 124 | return result 125 | 126 | @torch.no_grad() 127 | def multiple_choice_qa_random_ordering(self, data, question, choices, answer=None, n_trials=3): 128 | results = {} 129 | accuracy = 0 130 | for i in range(n_trials): 131 | choices_i = choices.copy() 132 | random.shuffle(choices_i) 133 | results[i] = self.multiple_choice_qa(data, question, choices_i, answer) 134 | accuracy += results[i]["accuracy"] 135 | results["accuracy"] = accuracy / n_trials 136 | return results 137 | -------------------------------------------------------------------------------- /tma/models/qa_model/videoqa_model.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from typing import Callable, Union 3 | 4 | import numpy as np 5 | import torch 6 | from PIL import Image, ImageDraw, ImageFont 7 | 8 | from .base_qa_model import QAModel, QAModelInstance 9 | from .imageqa_model import ImageQAModel 10 | 11 | videoqa_models = { 12 | 13 | } 14 | 15 | 16 | def list_videoqa_models(): 17 | return list(videoqa_models.keys()) 18 | 19 | 20 | class VideoQAModel(QAModel): 21 | def __init__( 22 | self, 23 | model_name, 24 | prompt_name: str, 25 | prompt_func: Callable, 26 | model: QAModelInstance = None, 27 | torch_device: Union[int, str] = -1, 28 | precision=torch.bfloat16, 29 | choice_format='letter', 30 | enable_choice_search: bool = False, 31 | ): 32 | super().__init__(model_name, prompt_name, prompt_func, choice_format, enable_choice_search) 33 | 34 | if isinstance(torch_device, str): 35 | torch_device = torch.device(torch_device) 36 | else: 37 | if torch_device == -1: 38 | torch_device = torch.device("cuda") if torch.cuda.is_available() else "cpu" 39 | else: 40 | torch_device = torch.device(f"cuda:{torch_device}") 41 | 42 | if model is None: 43 | print(f"Loading {model_name}...") 44 | class_name, ckpt = videoqa_models[model_name] 45 | self.model_precision = precision 46 | self.model = eval(class_name)(ckpt, torch_device, self.model_precision) 47 | print(f"Finish loading {model_name}") 48 | else: 49 | print(f"Using provided self.model...") 50 | self.model = model 51 | 52 | @torch.no_grad() 53 | def _qa(self, data, prompt): 54 | if isinstance(data, str): 55 | return self.model.qa(data, prompt) 56 | else: 57 | with tempfile.NamedTemporaryFile(delete=True, suffix=".mp4") as tmp: 58 | with open(tmp.name, 'wb') as file: 59 | file.write(data) 60 | video_path = tmp.name 61 | answer = self.model.qa(video_path, prompt) 62 | return answer 63 | 64 | 65 | def sample_frames(video_path, n): 66 | import cv2 67 | # Open the video file 68 | cap = 
cv2.VideoCapture(video_path) 69 | if not cap.isOpened(): 70 | print("Error: Could not open video.") 71 | return [] 72 | 73 | # Calculate total number of frames and video FPS 74 | total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) 75 | 76 | # Calculate interval in terms of frames 77 | interval = max(1, total_frames // n) 78 | 79 | # Sample frames 80 | sampled_frames = [] 81 | for i in range(0, total_frames, interval): 82 | # Set the current frame position 83 | cap.set(cv2.CAP_PROP_POS_FRAMES, i) 84 | 85 | # Read the frame 86 | ret, frame = cap.read() 87 | if not ret: 88 | print(f"Error: Could not read frame {i}.") 89 | break 90 | 91 | # Convert the frame to PIL Image 92 | frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 93 | pil_img = Image.fromarray(frame_rgb) 94 | sampled_frames.append(pil_img) 95 | 96 | # Stop if we have collected n frames 97 | if len(sampled_frames) >= n: 98 | break 99 | 100 | # Release the video capture object 101 | cap.release() 102 | 103 | return sampled_frames 104 | 105 | 106 | def get_contrasting_color(image, x, y, width, height): 107 | """ 108 | Determine a contrasting color (black or white) based on the average color of a specified area in the image. 109 | """ 110 | # Crop the relevant part of the image 111 | cropped_image = image.crop((x, y, x + width, y + height)) 112 | # Convert to numpy array for analysis 113 | np_image = np.array(cropped_image) 114 | # Calculate the average color 115 | average_color = np.mean(np_image, axis=(0, 1)) 116 | # Brightness calculation based on perceived luminance 117 | brightness = np.sqrt(0.299 * average_color[0] ** 2 + 0.587 * average_color[1] ** 2 + 0.114 * average_color[2] ** 2) 118 | # Return white for dark backgrounds and black for light backgrounds 119 | return 'white' if brightness < 128 else 'black' 120 | 121 | 122 | def concatenate_image(images, rows, columns, separator_width=10): 123 | # Ensure we have the exact number of images needed 124 | if len(images) != rows * columns: 125 | raise ValueError(f"Expected {rows * columns} images, but got {len(images)}.") 126 | 127 | # Calculate the max width and height of images to standardize sizes 128 | max_width = max(img.width for img in images) 129 | max_height = max(img.height for img in images) 130 | 131 | # Resize images to the max width and height 132 | resized_images = [img.resize((max_width, max_height), Image.Resampling.LANCZOS) for img in images] 133 | 134 | # Calculate the total width and height for the combined image 135 | total_width = max_width * columns + separator_width * (columns - 1) 136 | total_height = max_height * rows + separator_width * (rows - 1) 137 | combined_image = Image.new('RGB', (total_width, total_height), color='white') 138 | 139 | # Place images in the specified grid 140 | x_offset = 0 141 | y_offset = 0 142 | for i, img in enumerate(resized_images): 143 | combined_image.paste(img, (x_offset, y_offset)) 144 | if (i + 1) % columns == 0: # Move to the next row after the last column 145 | x_offset = 0 146 | y_offset += img.height + separator_width 147 | else: # Move to the next column 148 | x_offset += img.width + separator_width 149 | 150 | # Add numbers to each image for identification 151 | draw = ImageDraw.Draw(combined_image) 152 | try: 153 | font_size = (max_width + max_height) // 2 // 12 154 | font = ImageFont.load_default(size=font_size) 155 | except IOError: 156 | font = ImageFont.truetype("arial", 20) 157 | 158 | x_offset = 0 159 | y_offset = 0 160 | for i, img in enumerate(resized_images): 161 | text = str(i + 1) 162 | text_x = 
x_offset + 10 163 | text_y = y_offset + 10 164 | text_width, text_height = font_size, font_size 165 | font_color = get_contrasting_color(combined_image, text_x, text_y, text_width, text_height) 166 | draw.text((text_x, text_y), text, fill=font_color, font=font) 167 | if (i + 1) % columns == 0: 168 | x_offset = 0 169 | y_offset += img.height + separator_width 170 | else: 171 | x_offset += img.width + separator_width 172 | 173 | return combined_image 174 | 175 | 176 | def video_to_concat_image(video_path, num_rows, num_columns): 177 | return concatenate_image(sample_frames(video_path, num_rows * num_columns), num_rows, num_columns) 178 | 179 | 180 | class ImageQAModel4Video(VideoQAModel): 181 | def __init__( 182 | self, 183 | model: ImageQAModel, 184 | prompt_name: str, 185 | prompt_func: Callable, 186 | num_rows: int = 2, 187 | num_columns: int = 2, 188 | choice_format='letter', 189 | enable_choice_search: bool = False, 190 | ): 191 | super(VideoQAModel, self).__init__(model.model_name, prompt_name, prompt_func, choice_format, enable_choice_search) 192 | self.num_rows = num_rows 193 | self.num_columns = num_columns 194 | self.num_frames = self.num_rows * self.num_columns 195 | self.model = model 196 | 197 | @torch.no_grad() 198 | def _qa(self, data, prompt): 199 | if isinstance(data, Image.Image): 200 | return self.model._qa(data, prompt) 201 | elif isinstance(data, str): 202 | return self.model._qa(video_to_concat_image(data, self.num_rows, self.num_columns), prompt) 203 | else: 204 | with tempfile.NamedTemporaryFile(delete=True, suffix=".mp4") as tmp: 205 | with open(tmp.name, 'wb') as file: 206 | file.write(data) 207 | video_path = tmp.name 208 | answer = self.model._qa(video_to_concat_image(video_path, self.num_rows, self.num_columns), prompt) 209 | return answer 210 | -------------------------------------------------------------------------------- /annotations/relation_to_type.json: -------------------------------------------------------------------------------- 1 | { 2 | "standing behind": "spatial", 3 | "displayed in": "interactional", 4 | "jumping on": "interactional", 5 | "sitting next to": "interactional", 6 | "moving": "interactional", 7 | "exiting": "interactional", 8 | "sitting with": "social", 9 | "drinking from": "interactional", 10 | "herding": "interactional", 11 | "larger than": "spatial", 12 | "tied around": "spatial", 13 | "covered with": "spatial", 14 | "lying inside": "interactional", 15 | "growing behind": "interactional", 16 | "reflecting in": "functional", 17 | "on": "spatial", 18 | "sitting atop": "spatial", 19 | "topped with": "interactional", 20 | "brushing": "interactional", 21 | "sitting in": "spatial", 22 | "pushed by": "interactional", 23 | "walking up": "spatial", 24 | "tossing": "interactional", 25 | "sitting under": "spatial", 26 | "entering": "interactional", 27 | "by": "spatial", 28 | "sitting in front of": "spatial", 29 | "standing against": "spatial", 30 | "about to hit": "interactional", 31 | "buying": "interactional", 32 | "tying": "interactional", 33 | "reflected in": "spatial", 34 | "lying next to": "interactional", 35 | "cutting": "interactional", 36 | "surrounding": "spatial", 37 | "pushing": "interactional", 38 | "skiing on": "interactional", 39 | "walking in": "spatial", 40 | "with": "spatial", 41 | "looking toward": "spatial", 42 | "lying on": "spatial", 43 | "grazing in": "interactional", 44 | "drawn on": "spatial", 45 | "connected to": "spatial", 46 | "taller than": "spatial", 47 | "longer than": "spatial", 48 | "pouring": "interactional", 
49 | "sitting by": "spatial", 50 | "smaller than": "spatial", 51 | "on the side of": "spatial", 52 | "jumping off": "interactional", 53 | "sitting beside": "spatial", 54 | "throwing": "interactional", 55 | "pulling": "interactional", 56 | "waiting for": "interactional", 57 | "running through": "spatial", 58 | "contain": "interactional", 59 | "hitting": "interactional", 60 | "at": "spatial", 61 | "smoking": "interactional", 62 | "growing by": "spatial", 63 | "drinking": "interactional", 64 | "hanging from": "spatial", 65 | "hugging": "interactional", 66 | "sleeping in": "interactional", 67 | "towing": "interactional", 68 | "walking across": "spatial", 69 | "parked in front of": "spatial", 70 | "growing along": "interactional", 71 | "resting on": "interactional", 72 | "looking over": "interactional", 73 | "parked along": "spatial", 74 | "beside": "spatial", 75 | "driving": "interactional", 76 | "sewn on": "interactional", 77 | "looking into": "interactional", 78 | "eating in": "spatial", 79 | "traveling down": "spatial", 80 | "close to": "spatial", 81 | "slicing": "interactional", 82 | "bigger than": "spatial", 83 | "underneath": "spatial", 84 | "leading": "interactional", 85 | "talking to": "interactional", 86 | "getting on": "spatial", 87 | "growing from": "interactional", 88 | "swimming in": "interactional", 89 | "talking on": "interactional", 90 | "hung on": "interactional", 91 | "catching": "interactional", 92 | "sprinkled on": "interactional", 93 | "opening": "interactional", 94 | "mounted to": "spatial", 95 | "standing in front of": "spatial", 96 | "seen through": "spatial", 97 | "going into": "spatial", 98 | "growing in": "spatial", 99 | "licking": "interactional", 100 | "full of": "interactional", 101 | "hanging out of": "spatial", 102 | "next to": "spatial", 103 | "hanging above": "spatial", 104 | "standing on top of": "spatial", 105 | "cooking": "interactional", 106 | "looking through": "interactional", 107 | "between": "spatial", 108 | "riding": "interactional", 109 | "playing with": "interactional", 110 | "eating from": "interactional", 111 | "going through": "spatial", 112 | "leaning against": "spatial", 113 | "scattered on": "spatial", 114 | "parked behind": "spatial", 115 | "flying in": "spatial", 116 | "worn on": "interactional", 117 | "surrounded by": "spatial", 118 | "feeding": "interactional", 119 | "standing under": "spatial", 120 | "floating on": "spatial", 121 | "walking down": "spatial", 122 | "skating on": "interactional", 123 | "under": "spatial", 124 | "playing in": "interactional", 125 | "lying on top of": "spatial", 126 | "on the bottom of": "spatial", 127 | "inside": "spatial", 128 | "kissing": "interactional", 129 | "playing at": "interactional", 130 | "standing at": "spatial", 131 | "helping": "interactional", 132 | "riding in": "interactional", 133 | "chained to": "spatial", 134 | "parked in": "spatial", 135 | "on top of": "spatial", 136 | "kept in": "spatial", 137 | "covering": "spatial", 138 | "grazing on": "interactional", 139 | "approaching": "interactional", 140 | "climbing": "interactional", 141 | "covered in": "spatial", 142 | "growing next to": "spatial", 143 | "in between": "spatial", 144 | "behind": "spatial", 145 | "growing near": "spatial", 146 | "painted on": "spatial", 147 | "driving down": "spatial", 148 | "parked next to": "spatial", 149 | "touching": "interactional", 150 | "parked by": "interactional", 151 | "walking to": "spatial", 152 | "posing with": "social", 153 | "standing beside": "spatial", 154 | "standing on": "spatial", 155 | 
"using": "interactional", 156 | "mounted on": "spatial", 157 | "walking by": "spatial", 158 | "playing on": "interactional", 159 | "blowing out": "interactional", 160 | "sitting near": "interactional", 161 | "crossing": "spatial", 162 | "to the left of": "spatial", 163 | "cooked in": "functional", 164 | "eating at": "interactional", 165 | "walking towards": "interactional", 166 | "floating in": "spatial", 167 | "hang from": "spatial", 168 | "photographing": "interactional", 169 | "sniffing": "interactional", 170 | "stuck on": "interactional", 171 | "walking toward": "interactional", 172 | "looking down at": "interactional", 173 | "traveling on": "spatial", 174 | "typing on": "interactional", 175 | "guiding": "interactional", 176 | "shining through": "spatial", 177 | "jumping over": "interactional", 178 | "following": "interactional", 179 | "dragging": "interactional", 180 | "on the front of": "spatial", 181 | "standing next to": "interactional", 182 | "reflected on": "spatial", 183 | "on the other side of": "spatial", 184 | "lying in": "spatial", 185 | "boarding": "interactional", 186 | "pointing at": "interactional", 187 | "draped over": "spatial", 188 | "observing": "interactional", 189 | "working in": "interactional", 190 | "followed by": "interactional", 191 | "chasing": "interactional", 192 | "wrapped in": "spatial", 193 | "leaning on": "spatial", 194 | "sitting at": "spatial", 195 | "parked on": "spatial", 196 | "piled on": "spatial", 197 | "walking with": "interactional", 198 | "carrying": "interactional", 199 | "beneath": "spatial", 200 | "served on": "functional", 201 | "wading in": "interactional", 202 | "walking into": "spatial", 203 | "sitting inside": "spatial", 204 | "holding": "interactional", 205 | "enclosing": "spatial", 206 | "looking out": "interactional", 207 | "standing near": "interactional", 208 | "of": "spatial", 209 | "to the right of": "spatial", 210 | "walking next to": "interactional", 211 | "petting": "interactional", 212 | "driving on": "spatial", 213 | "standing in": "spatial", 214 | "hidden by": "spatial", 215 | "flying through": "spatial", 216 | "hanging over": "spatial", 217 | "playing": "interactional", 218 | "covered by": "spatial", 219 | "stuck in": "spatial", 220 | "attached to": "spatial", 221 | "facing": "interactional", 222 | "stacked on": "interactional", 223 | "walking near": "spatial", 224 | "wrapped around": "spatial", 225 | "higher than": "spatial", 226 | "chewing": "interactional", 227 | "parked near": "spatial", 228 | "preparing": "interactional", 229 | "skiing in": "interactional", 230 | "jumping in": "interactional", 231 | "flying": "interactional", 232 | "leaning over": "interactional", 233 | "picking up": "interactional", 234 | "walking through": "interactional", 235 | "in front of": "spatial", 236 | "decorated by": "functional", 237 | "growing on": "interactional", 238 | "standing around": "spatial", 239 | "standing by": "spatial", 240 | "going down": "spatial", 241 | "grabbing": "interactional", 242 | "eating": "interactional", 243 | "walking behind": "interactional", 244 | "in": "spatial", 245 | "mixed with": "interactional", 246 | "coming down": "spatial", 247 | "cleaning": "interactional", 248 | "adjusting": "interactional", 249 | "perched on": "interactional", 250 | "riding on": "interactional", 251 | "sitting on": "spatial", 252 | "parked alongside": "spatial", 253 | "working on": "interactional", 254 | "hanging on": "spatial", 255 | "pulled by": "interactional", 256 | "splashing": "interactional", 257 | "hanging in": "spatial", 
258 | "tied to": "spatial", 259 | "plugged into": "interactional", 260 | "printed on": "spatial", 261 | "decorated with": "interactional", 262 | "on the back of": "spatial", 263 | "on the edge of": "spatial", 264 | "below": "spatial", 265 | "sleeping on": "interactional", 266 | "walking along": "spatial", 267 | "hanging off": "spatial", 268 | "walking on": "spatial", 269 | "around": "spatial", 270 | "looking in": "interactional", 271 | "looking at": "interactional", 272 | "near": "spatial", 273 | "parked at": "spatial", 274 | "staring at": "interactional", 275 | "reading": "interactional", 276 | "swinging": "interactional", 277 | "wearing": "interactional", 278 | "falling off": "interactional", 279 | "selling": "interactional", 280 | "above": "spatial", 281 | "holding onto": "interactional", 282 | "biting": "interactional", 283 | "running on": "spatial", 284 | "decorating": "interactional", 285 | "leaving": "spatial", 286 | "making": "interactional", 287 | "balancing on": "interactional", 288 | "running in": "spatial", 289 | "flying above": "spatial", 290 | "sitting around": "spatial", 291 | "coming out of": "spatial", 292 | "washing": "interactional", 293 | "worn around": "interactional", 294 | "sitting on top of": "spatial", 295 | "skiing down": "interactional", 296 | "kicking": "interactional", 297 | "running across": "spatial", 298 | "parked beside": "spatial", 299 | "walking past": "interactional", 300 | "reaching for": "interactional", 301 | "displayed on": "interactional", 302 | "serving": "interactional", 303 | "smiling at": "emotional", 304 | "trying to catch": "interactional", 305 | "flying over": "spatial", 306 | "watching": "interactional", 307 | "shorter than": "spatial", 308 | "smelling": "interactional", 309 | "coming from": "spatial", 310 | "sitting behind": "spatial", 311 | "filled with": "interactional", 312 | "writing on": "interactional", 313 | "wiping": "interactional", 314 | "having it on the back": "spatial", 315 | "twisting": "interactional" 316 | } -------------------------------------------------------------------------------- /tma/imageqa/metadata.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from itertools import product 4 | from math import radians 5 | from typing import List, Tuple 6 | 7 | import networkx as nx 8 | import pandas as pd 9 | 10 | from ..metadata import CategoryMetaData 11 | 12 | ambiguous_colors = [ 13 | ["red", "pink", "purple"], 14 | ["yellow", "orange", "brown", "gold", "beige"], 15 | ] 16 | 17 | 18 | def get_confusing_colors(color): 19 | for colors in ambiguous_colors: 20 | if color in colors: 21 | return colors 22 | return [color] 23 | 24 | 25 | def remove_skip_edge(edges): 26 | G = nx.DiGraph() 27 | G.add_edges_from(edges) 28 | new_edges = [] 29 | for source, target in edges: 30 | G.remove_edge(source, target) 31 | if not nx.has_path(G, source, target): 32 | G.add_edge(source, target) 33 | new_edges.append((source, target)) 34 | return new_edges 35 | 36 | 37 | def remove_nodes(G, nodes): 38 | for node in nodes: 39 | successors = list(G.successors(node)) 40 | predecessors = list(G.predecessors(node)) 41 | G.remove_node(node) 42 | for s in successors: 43 | for p in predecessors: 44 | G.add_edge(p, s) 45 | return G 46 | 47 | 48 | def build_taxonomy(path_to_metadata, mode): 49 | assert mode in ['objaverse', 'scene_graph'] 50 | 51 | cateid_to_concept = json.load(open(os.path.join(path_to_metadata, 'cateid_to_concept.json'))) 52 | taxonomy = 
json.load(open(os.path.join(path_to_metadata, 'taxonomy.json'))) 53 | edges, nodes = taxonomy['edges'], taxonomy['nodes'] 54 | G = nx.DiGraph() 55 | G.add_edges_from(remove_skip_edge(edges)) 56 | 57 | nodes_to_remove = [] 58 | categories_with_object = set([k for k, v in cateid_to_concept.items() if len(v[mode]) > 0]) 59 | for node in G.nodes(): 60 | if node not in categories_with_object and len(nx.descendants(G, node) & categories_with_object) == 0: 61 | nodes_to_remove.append(node) 62 | G = remove_nodes(G, nodes_to_remove) 63 | G.add_nodes_from(categories_with_object) 64 | 65 | categories, category_info = [], {} 66 | for node in G.nodes(): 67 | categories.append(node) 68 | if node in cateid_to_concept: 69 | category_info[node] = cateid_to_concept[node] 70 | else: 71 | category_info[node] = nodes[node] 72 | categories = sorted(categories) 73 | 74 | return G, categories, category_info, categories_with_object 75 | 76 | 77 | class ObjaverseMetaData(CategoryMetaData): 78 | def __init__(self, path_to_metadata): 79 | super().__init__() 80 | 81 | self.taxonomy, self.categories, self.category_info, categories_with_object = \ 82 | build_taxonomy(path_to_metadata, 'objaverse') 83 | 84 | cateid_to_objects = json.load(open(os.path.join(path_to_metadata, 'cateid_to_objects.json'))) 85 | 86 | def get_category_objects(category): 87 | if category in cateid_to_objects: 88 | return list(cateid_to_objects[category].keys()) 89 | else: 90 | return [] 91 | 92 | cateid_to_objid = {} 93 | for cateid in self.categories: 94 | objs = get_category_objects(cateid) 95 | for c in nx.descendants(self.taxonomy, cateid) & categories_with_object: 96 | objs.extend(get_category_objects(c)) 97 | cateid_to_objid[cateid] = objs 98 | assert len(objs) > 0 99 | assert len(objs) == len(set(objs)) 100 | 101 | self.attribute_vocab, objid_to_attribute = {}, {} 102 | for cateid in cateid_to_objects: 103 | for objid in cateid_to_objects[cateid]: 104 | objid_to_attribute[objid] = cateid_to_objects[cateid][objid]["attributes"] 105 | for attr, values in cateid_to_objects[cateid][objid]["attributes"].items(): 106 | if attr not in self.attribute_vocab: 107 | self.attribute_vocab[attr] = set() 108 | self.attribute_vocab[attr].update(values) 109 | 110 | data = [] 111 | for cateid, objs in cateid_to_objid.items(): 112 | for objid in objs: 113 | attribute_data = [] 114 | for attr in self.attribute_vocab: 115 | values = objid_to_attribute[objid].get(attr, []) 116 | if len(values) == 0: 117 | values = [None] 118 | attribute_data.append(values) 119 | 120 | for attribute_combination in product(*attribute_data): 121 | data.append([objid, cateid] + list(attribute_combination)) 122 | 123 | self.df = pd.DataFrame(data, columns=['object', 'category'] + list(self.attribute_vocab.keys())) 124 | 125 | def check_object_attribute(self, objid, attributes): 126 | for attr, values in attributes.items(): 127 | for value in values: 128 | if value not in self.df[self.df['object'] == objid][attr].unique(): 129 | return False 130 | return True 131 | 132 | def and_query(self, conditions: List[Tuple]) -> str: 133 | q = set() 134 | for k, v, i in conditions: 135 | # k: column name; v: value; i: is equal 136 | if v is None: 137 | if i: 138 | q.add(f'{k} in [None]') 139 | else: 140 | q.add(f'{k} not in [None]') 141 | else: 142 | if i: 143 | q.add(f'{k} == {repr(v)}') 144 | else: 145 | if k == 'category': 146 | # exclude all relevant categories 147 | for c in self.get_relevant_categories(v): 148 | q.add(f'{k} != {repr(c)}') 149 | elif k == 'color': 150 | # exclude all 
confusing colors 151 | for c in get_confusing_colors(v): 152 | q.add(f'{k} != {repr(c)}') 153 | else: 154 | q.add(f'{k} != {repr(v)}') 155 | return ' and '.join(q) 156 | 157 | def or_query(self, conditions: List[str]) -> str: 158 | conditions = [f'({c})' for c in conditions if len(c) > 0] 159 | return ' or '.join(conditions) 160 | 161 | def query_metadata(self, target, query: str): 162 | if len(query) == 0: 163 | return sorted(self.df[target].dropna().unique()) 164 | else: 165 | return sorted(self.df.query(query)[target].dropna().unique().tolist()) 166 | 167 | def sample(self, rng, n, target, query: str): 168 | if n == 1: 169 | return rng.choice(self.query_metadata(target, query)) 170 | else: 171 | candidates = self.query_metadata(target, query) 172 | return rng.choice(candidates, n, replace=len(candidates) < n).tolist() 173 | 174 | def sample_category_for_object(self, rng, objid, exclude_category=None): 175 | candidates = self.query_metadata("category", self.and_query([("object", objid, True)])) 176 | if exclude_category is not None: 177 | exclude_category = self.get_relevant_categories(exclude_category) 178 | candidates = [c for c in candidates if c not in exclude_category] 179 | return rng.choice(candidates) 180 | 181 | def get_category_attribute_dict(self, cateid): 182 | attribute_dict = {} 183 | for attr in self.attribute_vocab: 184 | attribute_dict[attr] = self.query_metadata(attr, self.and_query([("category", cateid, True)])) 185 | return attribute_dict 186 | 187 | 188 | class Objaverse2DMetaData(ObjaverseMetaData): 189 | def __init__(self, path_to_metadata, image_folder): 190 | super().__init__(path_to_metadata) 191 | 192 | self.image_folder = image_folder 193 | cateid_to_objects = json.load(open(os.path.join(path_to_metadata, 'cateid_to_objects.json'))) 194 | 195 | self.objid_to_images = {} 196 | for cateid in cateid_to_objects: 197 | for objid in cateid_to_objects[cateid]: 198 | self.objid_to_images[objid] = [os.path.join(image_folder, cateid, objid, i) 199 | for i in cateid_to_objects[cateid][objid]["images"]] 200 | 201 | def sample_image(self, rng, objid): 202 | return rng.choice(self.objid_to_images[objid]) 203 | 204 | 205 | class Objaverse3DMetaData(ObjaverseMetaData): 206 | def __init__(self, path_to_metadata, blender_path, assets_path, render_device='cpu', blender_cache='./blender_cache'): 207 | super().__init__(path_to_metadata) 208 | self.assets_path = assets_path 209 | self.blender_path = blender_path 210 | self.blender_cache = blender_cache 211 | self.render_device = render_device 212 | plane_dir = os.path.join(assets_path, "plane_glbs") 213 | self.plane_texture_path = [os.path.join(plane_dir, f) for f in os.listdir(plane_dir) if f.endswith(".glb")] 214 | hdri_dir = os.path.join(assets_path, "hdri") 215 | self.hdri_path = [os.path.join(hdri_dir, f) for f in os.listdir(hdri_dir) if f.endswith(".exr")] 216 | 217 | cateid_to_objects = json.load(open(os.path.join(path_to_metadata, 'cateid_to_objects.json'))) 218 | self.object_to_angles = {objid: cateid_to_objects[cateid][objid]['angles'] 219 | for cateid in cateid_to_objects for objid in cateid_to_objects[cateid]} 220 | 221 | def get_object_path(self, objid): 222 | return os.path.join(self.assets_path, "objects", objid + ".glb") 223 | 224 | def sample_object_angle(self, rng, objid): 225 | angles = self.object_to_angles[objid] 226 | return angles[rng.choice(len(angles))] 227 | 228 | def sample_blender_configuration(self, rng): 229 | orientation = rng.choice([-1, 1]) 230 | key_light_horizontal_angle = orientation * 
radians(rng.uniform(15, 45)) 231 | fill_light_horizontal_angle = - orientation * radians(rng.uniform(15, 60)) 232 | key_light_vertical_angle = -radians(rng.uniform(15, 45)) 233 | fill_light_vertical_angle = -radians(rng.uniform(0, 30)) 234 | 235 | sun_x, sun_y = radians(rng.uniform(0, 45)), radians(rng.uniform(0, 45)) 236 | sun_energy = rng.uniform(1.0, 6.0) 237 | 238 | plane_texture_path = rng.choice(self.plane_texture_path) 239 | hdri_path = rng.choice(self.hdri_path) 240 | 241 | return { 242 | "key_light_horizontal_angle" : key_light_horizontal_angle, 243 | "fill_light_horizontal_angle": fill_light_horizontal_angle, 244 | "key_light_vertical_angle" : key_light_vertical_angle, 245 | "fill_light_vertical_angle" : fill_light_vertical_angle, 246 | "sun_x" : sun_x, 247 | "sun_y" : sun_y, 248 | "sun_energy" : sun_energy, 249 | "plane_texture_path" : plane_texture_path, 250 | "hdri_path" : hdri_path 251 | } 252 | 253 | 254 | def load_scene_graph(scene_graph_folder): 255 | image_folder = os.path.join(scene_graph_folder, "images/images") 256 | sg_json_folder = os.path.join(scene_graph_folder, "sceneGraphs") 257 | # train_scene_graphs = json.load(open(os.path.join(sg_json_folder, "train_sceneGraphs.json"))) 258 | val_scene_graphs = json.load(open(os.path.join(sg_json_folder, "val_sceneGraphs.json"))) 259 | scene_graphs = val_scene_graphs # TODO: first only use val_scene_graphs 260 | return image_folder, scene_graphs 261 | 262 | 263 | class SceneGraphMetaData(CategoryMetaData): 264 | def __init__(self, path_to_metadata, scene_graph_folder): 265 | super().__init__() 266 | self.taxonomy, self.categories, self.category_info, self.categories_with_object = \ 267 | build_taxonomy(path_to_metadata, 'scene_graph') 268 | 269 | self.type_to_attribute = json.load(open(os.path.join(path_to_metadata, 'attribute_category.json'))) 270 | self.attribute_to_type = {attr: k for k, vs in self.type_to_attribute.items() for attr in vs} 271 | 272 | self.image_folder, self.scene_graphs = load_scene_graph(scene_graph_folder) 273 | self.scene_graphs_list = list(self.scene_graphs.keys()) 274 | self.sg_object_to_cateid = {} 275 | for k, v in self.category_info.items(): 276 | if k in self.categories_with_object: 277 | for sg_object in v['scene_graph']: 278 | self.sg_object_to_cateid[sg_object] = k 279 | 280 | relations = set() 281 | for sg in self.scene_graphs.values(): 282 | for obj in sg['objects'].values(): 283 | for rel in obj['relations']: 284 | relations.add(rel['name']) 285 | self.relations = list(relations) 286 | 287 | def check_object_in_category(self, object_name): 288 | return object_name in self.sg_object_to_cateid 289 | 290 | def object_name_to_cateid(self, object_name): 291 | return self.sg_object_to_cateid[object_name] 292 | 293 | def get_attribute_type(self, attribute): 294 | return self.attribute_to_type.get(attribute, "other") 295 | 296 | def get_image_path(self, scene_graph_id): 297 | return os.path.join(self.image_folder, scene_graph_id + ".jpg") 298 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /annotations/attribute_category.json: -------------------------------------------------------------------------------- 1 | { 2 | "color": [ 3 | "white", 4 | "yellow", 5 | "maroon", 6 | "navy", 7 | "purple", 8 | "light brown", 9 | "green", 10 | "pink", 11 | "blue", 12 | "light blue", 13 | "red", 14 | "dark", 15 | "black and white", 16 | "dark colored", 17 | "brunette", 18 | "dark brown", 19 | "transparent", 20 | "bronze", 21 | "gold", 22 | "beige", 23 | "gray", 24 | "brown", 25 | "opaque", 26 | "copper", 27 | "dark blue", 28 | "orange", 29 | "neon", 30 | "cream colored", 31 | "rainbow colored", 32 | "silver", 33 | "blond", 34 | "khaki", 35 | "black", 36 | "tan", 37 | "translucent" 38 | ], 39 | "other": [ 40 | "sweet", 41 | "clear", 42 | "wii", 43 | "tinted", 44 | "analog", 45 | "powerful", 46 | "made", 47 | "scarce", 48 | "power", 49 | "electric", 50 | "christmas", 51 | "public", 52 | "wine", 53 | "tennis", 54 | "urban", 55 | "roman", 56 | "abundant", 57 | "commercial", 58 | "deciduous", 59 | "bright", 60 | "toy", 61 | "cordless", 62 | "real", 63 | "tail", 64 | "computer", 65 | "mixed", 66 | "evergreen", 67 | "portable", 68 | "fluorescent", 69 | "strong", 70 | "regular", 71 | "kitchen", 72 | "digital", 73 | "exterior", 74 | "oriental", 75 | "abstract", 76 | "adidas", 77 | "telephone", 78 | "baseball", 79 | "support", 80 | "chinese", 81 | "soccer", 82 | "wireless", 83 | "asian", 84 | "tropical", 85 | "railroad", 86 | "wired", 87 | "rustic", 88 | "professional", 89 | "toilet", 90 | "military", 91 | "simple", 92 | "bathroom", 93 | "safety", 94 | "disposable", 95 | "license", 96 | "calico", 97 | "birthday", 98 | "directional", 99 | "fancy", 100 | "nike", 101 | "sharp", 102 | "industrial", 103 | "ski", 104 | "american", 105 | "office", 106 | "capital", 107 | "garbage", 108 | "assorted", 109 | "electronic", 110 | "tasty", 111 | "ocean", 112 | "artificial", 113 | "caucasian", 114 | "protective", 115 | "foreign", 116 | "double decker", 117 | "french", 118 | "fake", 119 | "formal", 120 | "designed", 121 | "tabby", 122 | "delicious", 123 | "polar", 124 | "typical", 125 | "trash", 126 | "wrist", 127 | "street", 128 | "park", 129 | "wild", 130 | "sparse", 131 | "wedding", 132 | "ugly", 133 | "winter", 134 | "polo", 135 | "sturdy", 136 | "traffic", 137 | "new", 138 | "burning", 139 | "lined", 140 | "intricate", 141 | "on", 142 | "dangling", 143 | "breaking", 144 | "paved", 145 | "loose", 146 | "high", 147 | "beautiful", 148 | "short", 149 | "long", 150 | "outdoor", 151 | "crouched", 152 | "mature", 153 | "checkered", 154 | "chain-link", 155 | "gloomy", 156 | "attached", 157 | "pastel", 158 | "wide", 159 | "slanted", 160 | "fine", 161 | "weathered", 162 | "healthy", 163 | "cracked", 164 | "heavy", 165 | "athletic", 166 | "used", 167 | "rocky", 168 | "floating", 169 | "plain", 170 | "lush", 171 | "halved", 172 | "pointing", 173 | "outstretched", 174 | "still", 175 | "old fashioned", 176 | "shallow", 177 | "cut", 178 | "chocolate", 179 | "off", 180 | "young", 181 | "eaten", 182 | 
"ivory", 183 | "discolored", 184 | "light", 185 | "decorative", 186 | "dense", 187 | "baby", 188 | "low", 189 | "pulled back", 190 | "teal", 191 | "alert", 192 | "spread", 193 | "perched", 194 | "immature", 195 | "textured", 196 | "outdoors", 197 | "collared", 198 | "shaped", 199 | "inflatable", 200 | "elevated", 201 | "strawberry", 202 | "narrow", 203 | "reflected", 204 | "thin", 205 | "vanilla", 206 | "parked", 207 | "indoors", 208 | "sheer", 209 | "rippling", 210 | "pale", 211 | "hard", 212 | "antique", 213 | "warm", 214 | "dull", 215 | "pretty", 216 | "comfortable", 217 | "wooded", 218 | "funny", 219 | "colorful", 220 | "handmade", 221 | "curly", 222 | "groomed", 223 | "displayed", 224 | "corded", 225 | "straight", 226 | "uneven", 227 | "tilted", 228 | "complete", 229 | "modern", 230 | "vibrant", 231 | "homemade", 232 | "vintage", 233 | "rippled", 234 | "balding", 235 | "adult", 236 | "forested", 237 | "deep", 238 | "tall", 239 | "tangled", 240 | "wavy", 241 | "elderly", 242 | "sandy", 243 | "thick", 244 | "manicured", 245 | "ornamental", 246 | "light colored", 247 | "old" 248 | ], 249 | "size": [ 250 | "huge", 251 | "miniature", 252 | "tiny", 253 | "giant", 254 | "little", 255 | "massive", 256 | "oversized", 257 | "large", 258 | "small", 259 | "skinny", 260 | "chubby", 261 | "vast", 262 | "fat" 263 | ], 264 | "activity": [ 265 | "walking", 266 | "sliding", 267 | "having meeting", 268 | "posing", 269 | "skiing", 270 | "sitting", 271 | "squatting", 272 | "hitting", 273 | "blowing", 274 | "drinking", 275 | "waving", 276 | "looking up", 277 | "blooming", 278 | "driving", 279 | "crashing", 280 | "staring", 281 | "laughing", 282 | "standing", 283 | "cooking", 284 | "riding", 285 | "skating", 286 | "performing trick", 287 | "snowboarding", 288 | "kneeling", 289 | "crouching", 290 | "talking", 291 | "batting", 292 | "smiling", 293 | "looking down", 294 | "bending", 295 | "hanging", 296 | "playing", 297 | "skateboarding", 298 | "running", 299 | "flying", 300 | "eating", 301 | "grazing", 302 | "waiting", 303 | "jumping", 304 | "splashing", 305 | "spinning", 306 | "resting", 307 | "swinging", 308 | "reading", 309 | "spraying", 310 | "surfing", 311 | "sleeping", 312 | "lying", 313 | "swimming" 314 | ], 315 | "state": [ 316 | "sliced", 317 | "lighted", 318 | "toasted", 319 | "ripe", 320 | "fried", 321 | "shaved", 322 | "abandoned", 323 | "trimmed", 324 | "fenced", 325 | "painted", 326 | "dried", 327 | "juicy", 328 | "diced", 329 | "barefoot", 330 | "bunched", 331 | "drawn", 332 | "suspended", 333 | "seasoned", 334 | "shirtless", 335 | "rolled", 336 | "potted", 337 | "uncomfortable", 338 | "overcast", 339 | "grated", 340 | "stained", 341 | "chopped", 342 | "messy", 343 | "crowded", 344 | "raised", 345 | "vacant", 346 | "crossed", 347 | "cushioned", 348 | "faded", 349 | "decorated", 350 | "shadowed", 351 | "piled", 352 | "powdered", 353 | "padded", 354 | "shredded", 355 | "wrapped", 356 | "sealed", 357 | "mowed", 358 | "barren", 359 | "clean", 360 | "turned", 361 | "overgrown", 362 | "framed", 363 | "breakable", 364 | "chipped", 365 | "damaged", 366 | "crumbled", 367 | "hazy", 368 | "edged", 369 | "sunny", 370 | "partly cloudy", 371 | "cloudy", 372 | "gloved", 373 | "clumped", 374 | "patched", 375 | "dirty", 376 | "full", 377 | "inflated", 378 | "snowy", 379 | "short sleeved", 380 | "packed", 381 | "sunlit", 382 | "uncooked", 383 | "roasted", 384 | "rotten", 385 | "glazed", 386 | "scattered", 387 | "bald", 388 | "grouped", 389 | "torn", 390 | "glowing", 391 | "unoccupied", 392 | "hollow", 393 | 
"scrambled", 394 | "illuminated", 395 | "rimmed", 396 | "tied", 397 | "leafless", 398 | "peeled", 399 | "sculpted", 400 | "fallen", 401 | "upholstered", 402 | "fresh", 403 | "unpeeled", 404 | "half full", 405 | "packaged", 406 | "open", 407 | "melting", 408 | "closed", 409 | "unripe", 410 | "covered", 411 | "mounted", 412 | "worn", 413 | "sprinkled", 414 | "foggy", 415 | "sleeveless", 416 | "unlit", 417 | "cluttered", 418 | "carved", 419 | "grilled", 420 | "frozen", 421 | "baked", 422 | "iced", 423 | "incomplete", 424 | "steamed", 425 | "blurry", 426 | "boiled", 427 | "stormy", 428 | "lit", 429 | "shut", 430 | "written", 431 | "unhealthy", 432 | "blank", 433 | "neat", 434 | "bare", 435 | "connected", 436 | "folding", 437 | "wet", 438 | "shaded", 439 | "peeling", 440 | "folded", 441 | "muscular", 442 | "filled", 443 | "stuffed", 444 | "tight", 445 | "empty", 446 | "shining", 447 | "long sleeved", 448 | "stacked", 449 | "browned", 450 | "cloudless", 451 | "printed", 452 | "busy", 453 | "misty", 454 | "rainy", 455 | "murky", 456 | "raw", 457 | "burnt", 458 | "recessed", 459 | "choppy", 460 | "melted", 461 | "cooked", 462 | "broken", 463 | "docked" 464 | ], 465 | "material": [ 466 | "water", 467 | "rock", 468 | "bamboo", 469 | "soap", 470 | "paper", 471 | "wood", 472 | "metal", 473 | "hardwood", 474 | "cardboard", 475 | "cheese", 476 | "tomato", 477 | "apple", 478 | "gas", 479 | "tin", 480 | "aluminum", 481 | "cotton", 482 | "asphalt", 483 | "mesh", 484 | "styrofoam", 485 | "silk", 486 | "banana", 487 | "granite", 488 | "wicker", 489 | "steel", 490 | "crystal", 491 | "vinyl", 492 | "concrete", 493 | "leather", 494 | "porcelain", 495 | "beer", 496 | "plastic", 497 | "diamond", 498 | "straw", 499 | "rubber", 500 | "fire", 501 | "iron", 502 | "pine", 503 | "glass", 504 | "palm", 505 | "wire", 506 | "cobblestone", 507 | "wool", 508 | "jeans", 509 | "gravel", 510 | "soda", 511 | "cloth", 512 | "stainless steel", 513 | "denim", 514 | "brick", 515 | "pepper", 516 | "coffee", 517 | "lace", 518 | "brass", 519 | "ceramic", 520 | "clay", 521 | "chrome", 522 | "marble", 523 | "chalk", 524 | "pizza", 525 | "snow", 526 | "stone" 527 | ], 528 | "texture": [ 529 | "plaid", 530 | "frosted", 531 | "crumpled", 532 | "braided", 533 | "quilted", 534 | "wrinkled", 535 | "paneled", 536 | "knotted", 537 | "crispy", 538 | "crusty", 539 | "beaded", 540 | "muddy", 541 | "barbed", 542 | "foamy", 543 | "reflective", 544 | "unpaved", 545 | "bushy", 546 | "creamy", 547 | "ruffled", 548 | "furry", 549 | "carpeted", 550 | "flowered", 551 | "polished", 552 | "jagged", 553 | "coarse", 554 | "fuzzy", 555 | "dusty", 556 | "soft", 557 | "puffy", 558 | "dry", 559 | "wrinkly", 560 | "glossy", 561 | "wispy", 562 | "tiled", 563 | "shaggy", 564 | "greasy", 565 | "patchy", 566 | "hairy", 567 | "fluffy", 568 | "plush", 569 | "woven", 570 | "floral", 571 | "shiny", 572 | "shingled", 573 | "rugged", 574 | "ridged", 575 | "rusty", 576 | "dotted", 577 | "spiky", 578 | "patterned", 579 | "speckled", 580 | "grassy", 581 | "feathered", 582 | "smooth", 583 | "crisp", 584 | "floppy", 585 | "ornate", 586 | "knit", 587 | "leafy", 588 | "rough", 589 | "striped" 590 | ], 591 | "mood": [ 592 | "sad", 593 | "angry", 594 | "sleepy", 595 | "happy", 596 | "unhappy", 597 | "curious", 598 | "calm" 599 | ], 600 | "shape": [ 601 | "crooked", 602 | "triangular", 603 | "pointy", 604 | "elongated", 605 | "oblong", 606 | "octagonal", 607 | "sloped", 608 | "curved", 609 | "round", 610 | "domed", 611 | "rounded", 612 | "bent", 613 | "curled", 614 | "winding", 615 | 
"angled", 616 | "spiral", 617 | "rectangular", 618 | "twisted", 619 | "irregular", 620 | "steep", 621 | "square", 622 | "flat", 623 | "cylindrical", 624 | "arched", 625 | "curvy" 626 | ], 627 | "orientation": [ 628 | "horizontal", 629 | "down", 630 | "upside down", 631 | "overhead", 632 | "upper", 633 | "vertical", 634 | "up", 635 | "lower" 636 | ], 637 | "gender": [ 638 | "female", 639 | "male" 640 | ] 641 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TaskMeAnything 2 | 3 | 4 | 5 | 6 | 7 |
* VideoQA in Random (result figure)
* ImageQA in 2024 (result figure)
* VideoQA in 2024 (result figure)
* TaskMeAnything-v1-2024 v.s TaskMeAnything-v1-Random (comparison figures)
185 | 186 | * we can see that the performance drops are more significant in the 2024 version, which indicates that the 2024 version is more challenging for the models. 187 | 188 | 189 | For more details, please check out the [paper](https://arxiv.org/abs/2406.11775). 190 | 191 | ## TaskMeAnything-DB 192 | **TaskMeAnything-DB** are stored in [HuggingFace](https://huggingface.co/datasets/jieyuz2/TaskMeAnything-v1-db) 193 | 194 | ## TaskMeAnything-UI 195 | **TaskMeAnything-UI** are hosted in [HuggingFace](todo), check out our interactive interface to explore the performance of models on TaskMeAnything-v1 in your own way! 196 | 197 | ## Disclaimers 198 | **TaskMeAnything** and its associated resources are provided for research and educational purposes only. 199 | The authors and contributors make no warranties regarding the accuracy or reliability of the data and software. 200 | Users are responsible for ensuring their use complies with applicable laws and regulations. 201 | The project is not liable for any damages or losses resulting from the use of these resources. 202 | 203 | 204 | ## Contact 205 | 206 | - Jieyu Zhang: jieyuz2@cs.washington.edu 207 | 208 | ## Citation 209 | 210 | **BibTeX:** 211 | 212 | ```bibtex 213 | @inproceedings{zhang2024task, 214 | title={Task Me Anything}, 215 | author={Zhang, Jieyu and Huang, Weikai and Ma, Zixian and Michel, Oscar and He, Dong and Gupta, Tanmay and Ma, Wei-Chiu and Farhadi, Ali and Kembhavi, Aniruddha and Krishna, Ranjay}, 216 | booktitle={Thirty-Eighth Annual Conference on Neural Information Processing Systems Datasets and Benchmarks Track}, 217 | year={2024} 218 | } 219 | ``` 220 | 221 | -------------------------------------------------------------------------------- /tma/videoqa/scene_graph/single_video_task.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import numpy as np 4 | from tqdm import tqdm 5 | 6 | from ..metadata import VideoSceneGraphMetaData 7 | from ...base import TaskGenerator 8 | from ...task_store import TaskStore 9 | 10 | 11 | def load_mp4_video(video_path): 12 | with open(video_path, "rb") as file: 13 | mp4_data = file.read() 14 | return mp4_data 15 | 16 | 17 | def enumerate_target_relation_to_possible_reference_actions(video_scene_graph, relation_type, temporal_reference_type): 18 | relation_to_actions = {} 19 | video_scene_graph_keyframes = list(video_scene_graph.keys()) 20 | 21 | if temporal_reference_type == "before": 22 | for idx, keyframe_name in enumerate(video_scene_graph_keyframes[:-1]): 23 | next_keyframe_name = video_scene_graph_keyframes[idx + 1] 24 | for relation, obj in video_scene_graph[keyframe_name][relation_type].items(): 25 | if relation not in video_scene_graph[next_keyframe_name][relation_type]: 26 | if relation not in relation_to_actions: 27 | relation_to_actions[(relation, obj)] = set() 28 | for after_keyframe in video_scene_graph_keyframes[idx + 1:]: 29 | for action in video_scene_graph[after_keyframe]['actions']: 30 | if action not in video_scene_graph[keyframe_name]['actions']: 31 | relation_to_actions[(relation, obj)].add(action) 32 | 33 | elif temporal_reference_type == "after": 34 | for idx, keyframe_name in enumerate(video_scene_graph_keyframes[1:], start=1): 35 | previous_keyframe_name = video_scene_graph_keyframes[idx - 1] 36 | for relation, obj in video_scene_graph[keyframe_name][relation_type].items(): 37 | if relation not in video_scene_graph[previous_keyframe_name][relation_type]: 38 | if relation not in 
relation_to_actions: 39 | relation_to_actions[(relation, obj)] = set() 40 | for before_keyframe in video_scene_graph_keyframes[:idx]: 41 | for action in video_scene_graph[before_keyframe]['actions']: 42 | if action not in video_scene_graph[keyframe_name]['actions']: 43 | relation_to_actions[(relation, obj)].add(action) 44 | 45 | elif temporal_reference_type == "while": 46 | for idx, keyframe_name in enumerate(video_scene_graph_keyframes): 47 | for relation, obj in video_scene_graph[keyframe_name][relation_type].items(): 48 | if relation not in relation_to_actions: 49 | relation_to_actions[(relation, obj)] = set() 50 | for action in video_scene_graph[keyframe_name]['actions']: 51 | relation_to_actions[(relation, obj)].add(action) 52 | 53 | # Convert sets to lists for the output 54 | relation_to_actions = {k: list(v) for k, v in relation_to_actions.items()} 55 | return relation_to_actions 56 | 57 | 58 | def enumerate_target_action_to_possible_reference_actions(video_scene_graph, temporal_reference_type): 59 | action_to_actions = {} 60 | video_scene_graph_keyframes = list(video_scene_graph.keys()) 61 | 62 | if temporal_reference_type == "before": 63 | for idx, keyframe_name in enumerate(video_scene_graph_keyframes[:-1]): 64 | next_keyframe_name = video_scene_graph_keyframes[idx + 1] 65 | for action in video_scene_graph[keyframe_name]['actions']: 66 | if action not in video_scene_graph[next_keyframe_name]['actions']: 67 | if action not in action_to_actions: 68 | action_to_actions[action] = set() 69 | for after_keyframe in video_scene_graph_keyframes[idx + 1:]: 70 | for reference_action in video_scene_graph[after_keyframe]['actions']: 71 | if reference_action not in video_scene_graph[keyframe_name]['actions'] and reference_action != action: 72 | action_to_actions[action].add(reference_action) 73 | 74 | elif temporal_reference_type == "after": 75 | for idx, keyframe_name in enumerate(video_scene_graph_keyframes[1:], start=1): 76 | previous_keyframe_name = video_scene_graph_keyframes[idx - 1] 77 | for action in video_scene_graph[keyframe_name]['actions']: 78 | if action not in video_scene_graph[previous_keyframe_name]['actions']: 79 | if action not in action_to_actions: 80 | action_to_actions[action] = set() 81 | for before_keyframe in video_scene_graph_keyframes[:idx]: 82 | for reference_action in video_scene_graph[before_keyframe]['actions']: 83 | if reference_action not in video_scene_graph[keyframe_name]['actions'] and reference_action != action: 84 | action_to_actions[action].add(reference_action) 85 | 86 | elif temporal_reference_type == "while": 87 | for idx, keyframe_name in enumerate(video_scene_graph_keyframes): 88 | for action in video_scene_graph[keyframe_name]['actions']: 89 | if action not in action_to_actions: 90 | action_to_actions[action] = set() 91 | for reference_action in video_scene_graph[keyframe_name]['actions']: 92 | if reference_action != action: 93 | action_to_actions[action].add(reference_action) 94 | 95 | # Convert sets to lists for the output 96 | action_to_actions = {k: list(v) for k, v in action_to_actions.items()} 97 | return action_to_actions 98 | 99 | 100 | def get_all_spatial_relations(video_scene_graph): 101 | relations = set() 102 | for keyframe_name, keyframe in video_scene_graph.items(): 103 | relations.update(keyframe['spatial']) 104 | return relations 105 | 106 | 107 | def get_all_contact_relations(video_scene_graph): 108 | relations = set() 109 | for keyframe_name, keyframe in video_scene_graph.items(): 110 | relations.update(keyframe['contact']) 111 | 
return relations 112 | 113 | 114 | def get_all_objects(video_scene_graph): 115 | objects = set() 116 | for keyframe_name, keyframe in video_scene_graph.items(): 117 | for relation in keyframe['spatial']: 118 | objects.add(keyframe['spatial'][relation]) 119 | for relation in keyframe['contact']: 120 | objects.add(keyframe['contact'][relation]) 121 | return objects 122 | 123 | 124 | def get_all_actions(video_scene_graph): 125 | actions = set() 126 | for keyframe_name, keyframe in video_scene_graph.items(): 127 | actions.update(keyframe['actions']) 128 | return actions 129 | 130 | 131 | class VideoSceneGraphTaskGenerator(TaskGenerator): 132 | metadata: VideoSceneGraphMetaData 133 | 134 | embed_schema = [ 135 | "task type", 136 | "object", 137 | "relation", 138 | "action", 139 | "reference action", 140 | "relation type", 141 | "temporal reference type", 142 | ] 143 | 144 | def __init__(self, metadata: VideoSceneGraphMetaData, seed=42): 145 | super().__init__(metadata, seed=seed) 146 | 147 | def _generate_task(self, task_plan) -> Tuple[str, str, List[str], str]: 148 | "(Abstract method) generate task" 149 | 150 | def _task_plan_to_str(self, task_plan) -> str: 151 | t = [] 152 | for k, v in task_plan.items(): 153 | if k in self.embed_schema: 154 | assert isinstance(v, str) 155 | t.append(f'{k}: {v}') 156 | return '\n'.join(t) 157 | 158 | def generate(self, task_plan, return_data=True, seed=None): 159 | if seed is not None: 160 | self.rng = np.random.default_rng(seed=seed) 161 | 162 | question, answer, options, video_scene_graph_id = self._generate_task(task_plan) 163 | 164 | task = { 165 | "question" : question, 166 | "answer" : answer, 167 | "options" : options, 168 | "task_plan" : self._task_plan_to_str(task_plan), 169 | "video_scene_graph_id": video_scene_graph_id, 170 | 'video' : load_mp4_video(self.metadata.get_video_path(video_scene_graph_id)) if return_data else None 171 | } 172 | return task 173 | 174 | 175 | class WhatObjectVideoSceneGraphTaskGenerator(VideoSceneGraphTaskGenerator): 176 | schema = { 177 | "task type" : "str", 178 | "object" : "str", 179 | "relation" : "str", 180 | "reference action" : "str", 181 | "relation type" : "str", 182 | "temporal reference type": "str", 183 | "video scene graph id" : "str", 184 | } 185 | 186 | def enumerate_task_plans(self, task_store: TaskStore): 187 | for video_scene_graph_id, video_scene_graph in tqdm(self.metadata.video_scene_graphs.items(), desc="enumerating [what object video] task"): 188 | for relation_type in ["spatial", "contact"]: 189 | for temporal_reference_type in ["before", "after", "while"]: 190 | target_relation_to_possible_reference_actions = enumerate_target_relation_to_possible_reference_actions(video_scene_graph, relation_type, temporal_reference_type) 191 | for (target_relation, target_object), possible_reference_actions in target_relation_to_possible_reference_actions.items(): 192 | for reference_action in possible_reference_actions: 193 | task_plan = { 194 | "task type" : "what object video", 195 | "video scene graph id" : video_scene_graph_id, 196 | "object" : self.metadata.idx2name[target_object], 197 | "relation" : self.metadata.idx2name[target_relation], 198 | 'relation type' : relation_type, 199 | "reference action" : self.metadata.idx2name[reference_action], 200 | "temporal reference type": temporal_reference_type, 201 | } 202 | task_store.add(task_plan) 203 | 204 | def _generate_task(self, task_plan): 205 | question = f"What is the object that the person is {task_plan['relation']} {task_plan['temporal reference 
type']} the person {task_plan['reference action']}?" 206 | 207 | answer = task_plan["object"] 208 | negatives = list(set(self.metadata.objects) - get_all_objects(self.metadata.video_scene_graphs[task_plan["video scene graph id"]])) 209 | negatives = [self.metadata.idx2name[neg] for neg in negatives] 210 | 211 | options = self._compose_options(answer, negatives) 212 | return question, answer, options, task_plan["video scene graph id"] 213 | 214 | 215 | class WhatRelationVideoSceneGraphTaskGenerator(VideoSceneGraphTaskGenerator): 216 | schema = { 217 | "task type" : "str", 218 | "object" : "str", 219 | "relation" : "str", 220 | "reference action" : "str", 221 | "relation type" : "str", 222 | "temporal reference type": "str", 223 | "video scene graph id" : "str", 224 | } 225 | 226 | def enumerate_task_plans(self, task_store: TaskStore): 227 | for video_scene_graph_id, video_scene_graph in tqdm(self.metadata.video_scene_graphs.items(), desc="enumerating [what relation video] task"): 228 | for relation_type in ["spatial", "contact"]: 229 | for temporal_reference_type in ["before", "after", "while"]: 230 | target_relation_to_possible_reference_actions = enumerate_target_relation_to_possible_reference_actions(video_scene_graph, relation_type, temporal_reference_type) 231 | for (target_relation, target_object), possible_reference_actions in target_relation_to_possible_reference_actions.items(): 232 | for reference_action in possible_reference_actions: 233 | task_plan = { 234 | "task type" : "what relation video", 235 | "video scene graph id" : video_scene_graph_id, 236 | "object" : self.metadata.idx2name[target_object], 237 | "relation" : self.metadata.idx2name[target_relation], 238 | 'relation type' : relation_type, 239 | "reference action" : self.metadata.idx2name[reference_action], 240 | "temporal reference type": temporal_reference_type, 241 | } 242 | task_store.add(task_plan) 243 | 244 | def _generate_task(self, task_plan): 245 | if task_plan["relation type"] == "spatial": 246 | question = f"What is the spatial relation of the person to the {task_plan['object']} {task_plan['temporal reference type']} the person {task_plan['reference action']}?" 247 | negatives = list(set(self.metadata.spatial_relations) - get_all_spatial_relations(self.metadata.video_scene_graphs[task_plan["video scene graph id"]])) 248 | elif task_plan["relation type"] == "contact": 249 | question = f"What is the person doing to the {task_plan['object']} {task_plan['temporal reference type']} the person {task_plan['reference action']}?" 
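# Note: distractor options here are contact relations that never appear anywhere in this video's scene graph, so the ground-truth relation stays the only valid choice.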
250 | negatives = list(set(self.metadata.contact_relations) - get_all_contact_relations(self.metadata.video_scene_graphs[task_plan["video scene graph id"]])) 251 | else: 252 | raise ValueError(f"Unknown relation type: {task_plan['relation type']}") 253 | 254 | answer = task_plan['relation'] 255 | negatives = [self.metadata.idx2name[neg] for neg in negatives] 256 | 257 | options = self._compose_options(answer, negatives) 258 | return question, answer, options, task_plan["video scene graph id"] 259 | 260 | 261 | class WhatActionVideoSceneGraphTaskGenerator(VideoSceneGraphTaskGenerator): 262 | schema = { 263 | "task type" : "str", 264 | "action" : "str", 265 | "reference action" : "str", 266 | "relation type" : "str", 267 | "temporal reference type": "str", 268 | "video scene graph id" : "str", 269 | } 270 | 271 | def enumerate_task_plans(self, task_store: TaskStore): 272 | for video_scene_graph_id, video_scene_graph in tqdm(self.metadata.video_scene_graphs.items(), desc="enumerating [what action video] task"): 273 | for temporal_reference_type in ["before", "after", "while"]: 274 | target_action_to_possible_reference_actions = enumerate_target_action_to_possible_reference_actions(video_scene_graph, temporal_reference_type) 275 | for target_action, possible_reference_actions in target_action_to_possible_reference_actions.items(): 276 | for reference_action in possible_reference_actions: 277 | task_plan = { 278 | "task type" : "what action video", 279 | "video scene graph id" : video_scene_graph_id, 280 | "action" : self.metadata.idx2name[target_action], 281 | "reference action" : self.metadata.idx2name[reference_action], 282 | "temporal reference type": temporal_reference_type, 283 | } 284 | task_store.add(task_plan) 285 | 286 | def _generate_task(self, task_plan): 287 | question = f"What action is the person doing {task_plan['temporal reference type']} the person {task_plan['reference action']}?" 
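# Note: the answer is the target action from the task plan; distractor options are actions that never occur in this video's scene graph.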
288 | 289 | answer = task_plan["action"] 290 | negatives = list(set(self.metadata.actions) - get_all_actions(self.metadata.video_scene_graphs[task_plan["video scene graph id"]])) 291 | negatives = [self.metadata.idx2name[neg] for neg in negatives] 292 | 293 | options = self._compose_options(answer, negatives) 294 | return question, answer, options, task_plan["video scene graph id"] 295 | -------------------------------------------------------------------------------- /tma/videoqa/tabletop_3d/movement_single_video_task.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | 3 | from .single_video_task import GridVideoTaskGenerator 4 | from .utils import relative_positions 5 | from ..metadata import ObjaverseVideoMetaData as MetaData 6 | from ...constant import NUM_OPTIONS, VIDEO_FPS, VIDEO_NUM_FRAMES 7 | from ...task_store import TaskStore 8 | 9 | grid_options = [2] 10 | DEFAULT_OBJECT_SIZE_MULTIPLIER = 1.3 11 | 12 | moving_options = {'left', 'right', 'up', 'down'} 13 | 14 | 15 | def direction_to_keyframes(direction): 16 | if direction == 'left': 17 | return [{}, {}, {}, {}, {'movement': (0, 0.35)}] 18 | elif direction == 'right': 19 | return [{}, {}, {}, {}, {'movement': (0, -0.35)}] 20 | elif direction == 'up': 21 | return [{}, {}, {}, {}, {'movement': (0.45, 0)}] 22 | elif direction == 'down': 23 | return [{}, {}, {}, {}, {'movement': (-0.45, 0)}] 24 | 25 | 26 | class MovementVideoGridTaskGenerator(GridVideoTaskGenerator): 27 | def _make_video_metadata(self, grid_size, grids, queries, remaining_query=..., target_object_moving_direction='left', are_other_objects_moving="No", object_size_multiplier=DEFAULT_OBJECT_SIZE_MULTIPLIER): 28 | objects = [self.metadata.sample(self.rng, 1, "object", q) for q in queries] 29 | remaining_grids = [g for g in range(grid_size ** 2) if g not in grids] 30 | for _ in remaining_grids: 31 | uid = self.metadata.sample(self.rng, 1, "object", remaining_query) 32 | objects.append(uid) 33 | 34 | remaining_moving_direction = list(moving_options - {target_object_moving_direction}) 35 | keyframes = [direction_to_keyframes(target_object_moving_direction)] 36 | if are_other_objects_moving == "Yes": 37 | remaining_keyframes = [direction_to_keyframes(self.rng.choice(remaining_moving_direction, size=1)) for _ in range(len(remaining_grids))] 38 | else: 39 | remaining_keyframes = [[{}, {}, {}, {}, {}] for _ in range(len(remaining_grids))] 40 | 41 | object_path = {k: self.metadata.get_object_path(k) for k in objects} 42 | angles = [self.metadata.sample_object_angle(self.rng, obj) for obj in objects] 43 | 44 | video_metadata = { 45 | 'grid number' : grid_size, 46 | 'objects' : objects, 47 | 'object_path' : object_path, 48 | 'object_angles' : angles, 49 | 'grids' : grids + remaining_grids, 50 | 'blender_config' : self.metadata.sample_blender_configuration(self.rng), 51 | 'fps' : VIDEO_FPS, 52 | 'total_num_frames': VIDEO_NUM_FRAMES, 53 | 'sizes' : [object_size_multiplier for _ in objects], 54 | 'keyframes' : keyframes + remaining_keyframes, 55 | } 56 | return video_metadata 57 | 58 | 59 | class WhatMovementVideoGridTaskGenerator(MovementVideoGridTaskGenerator): 60 | schema = { 61 | 'task type' : 'str', 62 | 'grid number' : 'int', 63 | 'target category' : 'str', 64 | 'absolute position' : 'str', 65 | 'attribute type' : 'str', 66 | 'attribute value' : 'str', 67 | 'moving direction' : 'str', 68 | 'are other objects moving': 'str' 69 | } 70 | 71 | def enumerate_task_plans(self, task_store: TaskStore): 72 | for 
target_category in tqdm(self.metadata.categories, desc="enumerating [what move video] task"): 73 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 74 | for attribute_type, attribute_values in attribute_dict.items(): 75 | for attribute_value in attribute_values: 76 | for grid_size in grid_options: 77 | for absolute_pos in self.grid_mappings[grid_size]: 78 | for target_object_moving_direction in moving_options: 79 | task_plan = { 80 | 'task type' : 'what move video', 81 | 'grid number' : grid_size, 82 | 'target category' : target_category, 83 | 'absolute position' : absolute_pos, 84 | 'attribute type' : attribute_type, 85 | 'attribute value' : attribute_value, 86 | 'moving direction' : target_object_moving_direction, 87 | 'are other objects moving': "Yes" 88 | } 89 | task_store.add(task_plan) 90 | 91 | task_plan = { 92 | 'task type' : 'what move video', 93 | 'grid number' : grid_size, 94 | 'target category' : target_category, 95 | 'absolute position' : absolute_pos, 96 | 'attribute type' : attribute_type, 97 | 'attribute value' : attribute_value, 98 | 'moving direction' : target_object_moving_direction, 99 | 'are other objects moving': "No" 100 | } 101 | task_store.add(task_plan) 102 | 103 | for grid_size in grid_options: 104 | for absolute_pos in self.grid_mappings[grid_size]: 105 | for target_object_moving_direction in moving_options: 106 | task_plan = { 107 | 'task type' : 'what move video', 108 | 'grid number' : grid_size, 109 | 'target category' : target_category, 110 | 'absolute position' : absolute_pos, 111 | 'moving direction' : target_object_moving_direction, 112 | 'are other objects moving': "Yes" 113 | } 114 | task_store.add(task_plan) 115 | 116 | task_plan = { 117 | 'task type' : 'what move video', 118 | 'grid number' : grid_size, 119 | 'target category' : target_category, 120 | 'absolute position' : absolute_pos, 121 | 'moving direction' : target_object_moving_direction, 122 | 'are other objects moving': "No" 123 | } 124 | task_store.add(task_plan) 125 | 126 | def _generate_task(self, task_plan): 127 | grid_size = task_plan['grid number'] 128 | target_category = task_plan['target category'] 129 | absolute_pos = task_plan['absolute position'] 130 | grids = [self.grid_mappings[grid_size][absolute_pos]] 131 | target_object_moving_direction = task_plan['moving direction'] 132 | 133 | if task_plan['are other objects moving'] == "Yes": 134 | question = f"What is the object that is moving {target_object_moving_direction} in the video?" 135 | else: 136 | question = f"What is the moving object in the video?" 
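# Note: the first query selects the target (moving) object; the remaining grid cells are filled via remaining_query, which excludes the target category so only one object matches the question.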
137 | 138 | queries = [self._get_target_object_query(task_plan)] 139 | 140 | remaining_query = self.metadata.and_query([("category", target_category, False)]) 141 | 142 | video_metadata = self._make_video_metadata( 143 | grid_size, 144 | grids, 145 | queries=queries, 146 | remaining_query=remaining_query, 147 | target_object_moving_direction=target_object_moving_direction, 148 | are_other_objects_moving=task_plan['are other objects moving'], 149 | object_size_multiplier=DEFAULT_OBJECT_SIZE_MULTIPLIER 150 | ) 151 | 152 | answer = self.metadata.get_surfacename(target_category) 153 | negatives = [self.metadata.get_surfacename(self.metadata.sample_category_for_object(self.rng, o, target_category)) 154 | for o in video_metadata['objects'][1:]] 155 | options = self._compose_options(answer, negatives) 156 | 157 | return question, answer, options, video_metadata 158 | 159 | 160 | class WhatAttributeMovementVideoGridTaskGenerator(MovementVideoGridTaskGenerator): 161 | schema = { 162 | 'task type' : 'str', 163 | 'grid number' : 'int', 164 | 'target category' : 'str', 165 | 'absolute position' : 'str', 166 | 'attribute type' : 'str', 167 | 'attribute value' : 'str', 168 | 'moving direction' : 'str', 169 | 'are other objects moving': 'str' 170 | } 171 | 172 | def enumerate_task_plans(self, task_store: TaskStore): 173 | for target_category in tqdm(self.metadata.categories, desc="enumerating [what attribute move video] task"): 174 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 175 | for attribute_type, attribute_values in attribute_dict.items(): 176 | for attribute_value in attribute_values: 177 | for grid_size in grid_options: 178 | for absolute_pos in self.grid_mappings[grid_size]: 179 | for target_object_moving_direction in moving_options: 180 | task_plan = { 181 | 'task type' : 'what attribute move video', 182 | 'grid number' : grid_size, 183 | 'target category' : target_category, 184 | 'absolute position' : absolute_pos, 185 | 'attribute type' : attribute_type, 186 | 'attribute value' : attribute_value, 187 | 'moving direction' : target_object_moving_direction, 188 | 'are other objects moving': "Yes" 189 | } 190 | task_store.add(task_plan) 191 | 192 | task_plan = { 193 | 'task type' : 'what attribute move video', 194 | 'grid number' : grid_size, 195 | 'target category' : target_category, 196 | 'absolute position' : absolute_pos, 197 | 'attribute type' : attribute_type, 198 | 'attribute value' : attribute_value, 199 | 'moving direction' : target_object_moving_direction, 200 | 'are other objects moving': "No" 201 | } 202 | task_store.add(task_plan) 203 | 204 | def _generate_task(self, task_plan): 205 | grid_size = task_plan['grid number'] 206 | 207 | attribute_type = task_plan['attribute type'] 208 | absolute_pos = task_plan['absolute position'] 209 | target_object_moving_direction = task_plan['moving direction'] 210 | grids = [self.grid_mappings[grid_size][absolute_pos]] 211 | 212 | queries = [self._get_target_object_query(task_plan)] 213 | if task_plan['are other objects moving'] == "Yes": 214 | question = f"What is the {attribute_type} of the object that is moving {target_object_moving_direction} in the video?" 215 | else: 216 | question = f"What is the {attribute_type} of the moving object in the video?" 
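        # Note (descriptive comment, not in the original source): the negative options sampled
        # further below use a query that excludes every value the target object actually has for
        # this attribute type, so the sampled distractors should never coincide with a correct answer.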
217 | 218 | video_metadata = self._make_video_metadata( 219 | grid_size, 220 | grids, 221 | queries=queries, 222 | target_object_moving_direction=target_object_moving_direction, 223 | are_other_objects_moving=task_plan['are other objects moving'], 224 | object_size_multiplier=DEFAULT_OBJECT_SIZE_MULTIPLIER 225 | ) 226 | 227 | answer = task_plan['attribute value'] 228 | target_object = video_metadata['objects'][0] 229 | negative_query = self.metadata.and_query([ 230 | (attribute_type, a, False) for a in self.metadata.query_metadata(attribute_type, self.metadata.and_query([("object", target_object, True)])) 231 | ]) 232 | negatives = self.metadata.sample( 233 | self.rng, 234 | NUM_OPTIONS - 1, 235 | attribute_type, 236 | query=negative_query, 237 | ) 238 | options = [answer] + negatives 239 | return question, answer, options, video_metadata 240 | 241 | 242 | class WhereMovementVideoGridTaskGenerator(MovementVideoGridTaskGenerator): 243 | schema = { 244 | 'task type' : 'str', 245 | 'grid number' : 'int', 246 | 'target category' : 'str', 247 | 'absolute position' : 'str', 248 | 'attribute type' : 'str', 249 | 'attribute value' : 'str', 250 | 'moving direction' : 'str', 251 | 'are other objects moving': 'str' 252 | } 253 | 254 | def __init__(self, metadata: MetaData, seed=42): 255 | super().__init__(metadata, seed=seed) 256 | self.relative_positions = relative_positions 257 | 258 | def enumerate_task_plans(self, task_store: TaskStore): 259 | for target_category in tqdm(self.metadata.categories, desc="enumerating [where move video] task"): 260 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 261 | for attribute_type, attribute_values in attribute_dict.items(): 262 | for attribute_value in attribute_values: 263 | for grid_size in grid_options: 264 | for absolute_pos in self.grid_mappings[grid_size]: 265 | for target_object_moving_direction in moving_options: 266 | task_plan = { 267 | 'task type' : 'where move video', 268 | 'grid number' : grid_size, 269 | 'target category' : target_category, 270 | 'absolute position' : absolute_pos, 271 | 'attribute type' : attribute_type, 272 | 'attribute value' : attribute_value, 273 | 'moving direction' : target_object_moving_direction, 274 | 'are other objects moving': "Yes" 275 | } 276 | task_store.add(task_plan) 277 | 278 | task_plan = { 279 | 'task type' : 'where move video', 280 | 'grid number' : grid_size, 281 | 'target category' : target_category, 282 | 'absolute position' : absolute_pos, 283 | 'attribute type' : attribute_type, 284 | 'attribute value' : attribute_value, 285 | 'moving direction' : target_object_moving_direction, 286 | 'are other objects moving': "No" 287 | } 288 | task_store.add(task_plan) 289 | 290 | for grid_size in grid_options: 291 | for absolute_pos in self.grid_mappings[grid_size]: 292 | for target_object_moving_direction in moving_options: 293 | task_plan = { 294 | 'task type' : 'where move video', 295 | 'grid number' : grid_size, 296 | 'target category' : target_category, 297 | 'absolute position' : absolute_pos, 298 | 'moving direction' : target_object_moving_direction, 299 | 'are other objects moving': "Yes" 300 | } 301 | task_store.add(task_plan) 302 | 303 | task_plan = { 304 | 'task type' : 'where move video', 305 | 'grid number' : grid_size, 306 | 'target category' : target_category, 307 | 'absolute position' : absolute_pos, 308 | 'moving direction' : target_object_moving_direction, 309 | 'are other objects moving': "No" 310 | } 311 | task_store.add(task_plan) 312 | 313 | def 
_generate_task(self, task_plan): 314 | grid_size = task_plan['grid number'] 315 | 316 | target_category = task_plan['target category'] 317 | categories = [target_category] 318 | queries = [self._get_target_object_query(task_plan)] 319 | absolute_pos = task_plan['absolute position'] 320 | grids = [self.grid_mappings[grid_size][absolute_pos]] 321 | target_object_moving_direction = task_plan['moving direction'] 322 | 323 | if task_plan['are other objects moving'] == "Yes": 324 | question = f"Where is the object that is moving {target_object_moving_direction} located in the video?" 325 | else: 326 | question = f"Where is the moving object located in the video?" 327 | answer = absolute_pos 328 | negatives = [o for o in self.grid_mappings[grid_size].keys() if o != answer] 329 | 330 | options = self._compose_options(answer, negatives) 331 | video_metadata = self._make_video_metadata( 332 | grid_size, 333 | grids, 334 | queries=queries, 335 | remaining_query=self.metadata.and_query([("category", c, False) for c in categories]), 336 | target_object_moving_direction=target_object_moving_direction, 337 | are_other_objects_moving=task_plan['are other objects moving'], 338 | object_size_multiplier=DEFAULT_OBJECT_SIZE_MULTIPLIER 339 | ) 340 | 341 | return question, answer, options, video_metadata 342 | -------------------------------------------------------------------------------- /tma/imageqa/tabletop_3d/run_blender.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import math 4 | import os 5 | import sys 6 | import urllib.request 7 | from math import radians 8 | 9 | try: 10 | import bpy 11 | from mathutils import Vector, Matrix, Quaternion, Euler 12 | except ImportError: 13 | pass 14 | 15 | 16 | def rotate(obj, degree): 17 | """Rotates around the z axis by theta""" 18 | degree = -degree 19 | bpy.ops.object.select_all(action='DESELECT') 20 | obj.select_set(True) 21 | bpy.context.view_layer.objects.active = obj 22 | radian = radians(degree) 23 | bpy.context.object.rotation_mode = 'XYZ' 24 | rot_x, rot_y, rot_z = obj.rotation_euler 25 | obj.rotation_euler = Euler((rot_x, rot_y, rot_z + radian)) 26 | freeze_transformation(obj) 27 | 28 | 29 | def reset_scene(): 30 | # delete everything that isn't part of a camera or a light 31 | bpy.ops.object.select_all(action="SELECT") 32 | for obj in bpy.data.objects: 33 | bpy.data.objects.remove(obj, do_unlink=True) 34 | bpy.ops.ptcache.free_bake_all() 35 | 36 | 37 | def select_hierarchy(obj): 38 | """Recursively select an object and all of its descendants.""" 39 | obj.select_set(True) 40 | for child in obj.children: 41 | select_hierarchy(child) 42 | 43 | 44 | def load_object(object_path: str) -> None: 45 | """Loads a glb model into the scene.""" 46 | bpy.ops.object.select_all(action='DESELECT') 47 | if object_path.endswith(".glb"): 48 | bpy.ops.import_scene.gltf(filepath=object_path, merge_vertices=True) 49 | elif object_path.endswith(".fbx"): 50 | bpy.ops.import_scene.fbx(filepath=object_path) 51 | else: 52 | raise ValueError(f"Unsupported file type: {object_path}") 53 | 54 | base_name = os.path.basename(object_path) 55 | object_name, _ = os.path.splitext(base_name) 56 | bpy.context.view_layer.objects.active.name = object_name 57 | bpy.ops.object.select_all(action='DESELECT') 58 | 59 | obj = bpy.data.objects.get(object_name) 60 | # bpy.context.view_layer.objects.active = obj 61 | select_hierarchy(obj) 62 | bpy.ops.object.transform_apply(location=True, rotation=True, scale=True) 63 | meshes = 
[obj for obj in bpy.context.selected_objects if obj.type == "MESH"] 64 | non_meshes = [obj for obj in bpy.context.selected_objects if obj.type != "MESH"] 65 | bpy.ops.object.select_all(action="DESELECT") 66 | 67 | # delete non-mesh and consolidate 68 | 69 | for obj in non_meshes: 70 | obj.select_set(True) 71 | bpy.ops.object.delete() 72 | bpy.ops.object.select_all(action="DESELECT") 73 | for obj in meshes: 74 | obj.select_set(True) 75 | bpy.context.view_layer.objects.active = meshes[0] 76 | bpy.ops.object.join() 77 | bpy.context.view_layer.objects.active.name = object_name 78 | bpy.ops.object.origin_set(type='GEOMETRY_ORIGIN', center='BOUNDS') 79 | bpy.ops.object.select_all(action="DESELECT") 80 | 81 | return object_name 82 | 83 | 84 | def scene_meshes(): 85 | for obj in bpy.context.scene.objects.values(): 86 | if isinstance(obj.data, (bpy.types.Mesh)): 87 | yield obj 88 | 89 | 90 | def download_uid(uid_path, save_dir): 91 | return download_object(uid_path, save_dir) 92 | 93 | 94 | def download_object(object_url, save_dir) -> str: 95 | """Download the object and return the path.""" 96 | # uid = uuid.uuid4() 97 | uid = object_url.split("/")[-1].split(".")[0] 98 | tmp_local_path = os.path.join(save_dir, f"{uid}.glb" + ".tmp") 99 | local_path = os.path.join(save_dir, f"{uid}.glb") 100 | # wget the file and put it in local_path 101 | os.makedirs(os.path.dirname(tmp_local_path), exist_ok=True) 102 | urllib.request.urlretrieve(object_url, tmp_local_path) 103 | os.rename(tmp_local_path, local_path) 104 | # get the absolute path 105 | local_path = os.path.abspath(local_path) 106 | return local_path 107 | 108 | 109 | def scene_bbox(single_obj=None, ignore_matrix=False): 110 | bbox_min = (math.inf,) * 3 111 | bbox_max = (-math.inf,) * 3 112 | found = False 113 | for obj in scene_meshes() if single_obj is None else [single_obj]: 114 | found = True 115 | for coord in obj.bound_box: 116 | coord = Vector(coord) 117 | if not ignore_matrix: 118 | coord = obj.matrix_world @ coord 119 | bbox_min = tuple(min(x, y) for x, y in zip(bbox_min, coord)) 120 | bbox_max = tuple(max(x, y) for x, y in zip(bbox_max, coord)) 121 | if not found: 122 | raise RuntimeError("no objects in scene to compute bounding box for") 123 | return Vector(bbox_min), Vector(bbox_max) 124 | 125 | 126 | def scene_root_objects(): 127 | for obj in bpy.context.scene.objects.values(): 128 | if not obj.parent: 129 | yield obj 130 | 131 | 132 | def freeze_transformation(obj): 133 | bpy.context.view_layer.objects.active = obj 134 | obj.select_set(True) 135 | bpy.ops.object.transform_apply(location=True, rotation=True, scale=True) 136 | bpy.ops.object.select_all(action='DESELECT') 137 | 138 | 139 | def scale(obj, scale_factor): 140 | bpy.ops.object.select_all(action='DESELECT') 141 | obj.select_set(True) 142 | bpy.ops.transform.resize(value=(scale_factor, scale_factor, scale_factor)) 143 | bpy.ops.object.select_all(action='DESELECT') 144 | freeze_transformation(obj) 145 | 146 | 147 | def get_3d_dimensions(obj): 148 | # pdb.set_trace() 149 | max_x, max_y, max_z = float("-inf"), float("-inf"), float("-inf") 150 | min_x, min_y, min_z = float("inf"), float("inf"), float("inf") 151 | 152 | for vertex in obj.data.vertices: 153 | v_world = obj.matrix_world @ vertex.co 154 | max_x, max_y, max_z = max(max_x, v_world.x), max(max_y, v_world.y), max(max_z, v_world.z) 155 | min_x, min_y, min_z = min(min_x, v_world.x), min(min_y, v_world.y), min(min_z, v_world.z) 156 | 157 | return (max_x - min_x, max_y - min_y, max_z - min_z) 158 | 159 | 160 | def 
normalize_object(obj, factor=1.0): 161 | max_dimension = max(get_3d_dimensions(obj)) 162 | scale_factor = factor * (1 / max_dimension) 163 | scale(obj, scale_factor) 164 | 165 | 166 | def move_to_xy(obj, x, y): 167 | min_z = float('inf') 168 | for vertex in obj.data.vertices: 169 | z = obj.matrix_world @ vertex.co 170 | min_z = min(min_z, z.z) 171 | obj.location -= Vector((0, 0, min_z)) 172 | freeze_transformation(obj) 173 | 174 | # move location x,y to sampled box center 175 | new_location = Vector((x, y, obj.location[2])) 176 | obj.location = new_location 177 | freeze_transformation(obj) 178 | 179 | 180 | def normalize_scene(): 181 | bbox_min, bbox_max = scene_bbox() 182 | scale = 1 / max(bbox_max - bbox_min) 183 | for obj in scene_root_objects(): 184 | obj.scale = obj.scale * scale 185 | # Apply scale to matrix_world. 186 | bpy.context.view_layer.update() 187 | bbox_min, bbox_max = scene_bbox() 188 | offset = -(bbox_min + bbox_max) / 2 189 | for obj in scene_root_objects(): 190 | obj.matrix_world.translation += offset 191 | bpy.ops.object.select_all(action="DESELECT") 192 | 193 | 194 | def setup_plane_and_background(plane_texture_path, hdri_path): 195 | # load plane 196 | plane_name = load_object(plane_texture_path) 197 | plane = bpy.data.objects.get(plane_name) 198 | scale(plane, 0.5) 199 | 200 | # load light map 201 | print(f"HDRI PATH: {hdri_path}") 202 | bpy.ops.image.open(filepath=hdri_path) 203 | if bpy.data.worlds.get("World") is None: 204 | bpy.data.worlds.new("World") 205 | 206 | bpy.context.scene.world = bpy.data.worlds["World"] 207 | 208 | bpy.context.scene.world.use_nodes = True 209 | tree = bpy.context.scene.world.node_tree 210 | tree.nodes.clear() 211 | 212 | tex_env = tree.nodes.new(type="ShaderNodeTexEnvironment") 213 | tex_env.image = bpy.data.images[hdri_path.split('/')[-1]] # Image name is typically the last part of the path 214 | background = tree.nodes.new(type="ShaderNodeBackground") 215 | output = tree.nodes.new(type="ShaderNodeOutputWorld") 216 | 217 | tree.links.new(tex_env.outputs[0], background.inputs[0]) 218 | tree.links.new(background.outputs[0], output.inputs[0]) 219 | 220 | return plane_texture_path + " " + hdri_path 221 | 222 | 223 | def setup_camera_and_lights( 224 | sun_x, 225 | sun_y, 226 | sun_energy, 227 | key_light_horizontal_angle, 228 | fill_light_horizontal_angle, 229 | key_light_vertical_angle, 230 | fill_light_vertical_angle 231 | ): 232 | # for seeting up the three point lighting, we mostly follow https://courses.cs.washington.edu/courses/cse458/05au/reading/3point_lighting.pdf 233 | # in order to keep lights and camera on the hemisphere pointing to origin, we use a hierarchy of empties 234 | 235 | # create the sun 236 | 237 | bpy.ops.object.light_add(type="SUN") 238 | sun = bpy.context.active_object 239 | sun.rotation_euler = Euler((sun_x, sun_y, 0), "XYZ") 240 | sun.data.energy = sun_energy 241 | 242 | # create global empty 243 | 244 | bpy.ops.object.empty_add(type='PLAIN_AXES', align='WORLD', location=(0, 0, 0), scale=(1, 1, 1)) 245 | x_rot, y_rot, z_rot = radians(90), radians(0), radians(-90) 246 | empty = bpy.context.scene.objects.get("Empty") 247 | 248 | # create camera 249 | 250 | # radius = random.uniform(1.8,2.2) 251 | radius = 2.5 252 | 253 | bpy.ops.object.camera_add(enter_editmode=False, align='VIEW', location=(-radius, 0, 0), rotation=Euler((x_rot, y_rot, z_rot), "XYZ"), scale=(1, 1, 1)) 254 | cam = bpy.context.scene.objects.get("Camera") 255 | cam.data.lens = 35 256 | cam.data.sensor_width = 32 257 | bpy.context.scene.camera = 
cam 258 | 259 | # create camera empty 260 | 261 | bpy.ops.object.empty_add(type='PLAIN_AXES', align='WORLD', location=(0, 0, 0), scale=(1, 1, 1)) 262 | x_rot, y_rot, z_rot = radians(90), radians(0), radians(-90) 263 | cam_empty = bpy.context.scene.objects.get("Empty.001") 264 | cam_empty.name = "camera_empty" 265 | 266 | # make camera empty parent of camera 267 | 268 | bpy.ops.object.select_all(action='DESELECT') 269 | cam.select_set(True) 270 | cam_empty.select_set(True) 271 | bpy.context.view_layer.objects.active = cam_empty 272 | bpy.ops.object.parent_set() 273 | bpy.ops.object.select_all(action='DESELECT') 274 | 275 | # make camera empty parent of global empty 276 | 277 | bpy.ops.object.select_all(action='DESELECT') 278 | cam_empty.select_set(True) 279 | empty.select_set(True) 280 | bpy.context.view_layer.objects.active = empty 281 | bpy.ops.object.parent_set() 282 | bpy.ops.object.select_all(action='DESELECT') 283 | 284 | light_names = ["key_light", "fill_light", "back_light"] 285 | light_energies = [1000., 300., 500.] 286 | 287 | for light_name, light_energy in zip(light_names, light_energies): 288 | # create light empty 289 | 290 | empty_name = light_name + "_empty" 291 | bpy.ops.object.empty_add(type='PLAIN_AXES', align='WORLD', location=(0, 0, 0), scale=(1, 1, 1)) 292 | x_rot, y_rot, z_rot = radians(90), radians(0), radians(-90) 293 | light_empty = bpy.context.scene.objects.get("Empty.001") 294 | light_empty.name = empty_name 295 | 296 | # parent light empty to main (camera) empty 297 | 298 | bpy.ops.object.select_all(action='DESELECT') 299 | light_empty.select_set(True) 300 | empty.select_set(True) 301 | bpy.context.view_layer.objects.active = empty 302 | bpy.ops.object.parent_set() 303 | bpy.ops.object.select_all(action='DESELECT') 304 | 305 | # create light 306 | 307 | x_loc, y_loc, z_loc = -radius, 0, 0 308 | bpy.ops.object.light_add(type='POINT', radius=1, align='WORLD', location=(x_loc, y_loc, z_loc), rotation=Euler((x_rot, y_rot, z_rot), "XYZ"), scale=(1, 1, 1)) 309 | bpy.data.objects["Point"].name = light_name 310 | light = bpy.data.objects[light_name] 311 | light.data.energy = light_energy 312 | # light.data.size = 0.5 313 | 314 | # parent light empty to light 315 | 316 | bpy.ops.object.select_all(action='DESELECT') 317 | light.select_set(True) 318 | light_empty.select_set(True) 319 | bpy.context.view_layer.objects.active = light_empty 320 | bpy.ops.object.parent_set() 321 | bpy.ops.object.select_all(action='DESELECT') 322 | 323 | # rotate camera and lights around the z-axis 324 | 325 | z_random_rot = radians(90) # radians(random.uniform(0,360)) 326 | empty.rotation_euler = Euler((0, 0, z_random_rot)) 327 | 328 | # # raise the camera while having it point to origin 329 | 330 | # cam_y_random_rot = radians(random.uniform(10,50)) 331 | # cam_empty.rotation_euler = Euler((0,cam_y_random_rot,0),"XYZ") 332 | 333 | bpy.context.view_layer.update() 334 | 335 | back_light_horizontal_angle = radians(180) 336 | light_horizontal_angles = [key_light_horizontal_angle, fill_light_horizontal_angle, back_light_horizontal_angle] 337 | for light_angle, light_name in zip(light_horizontal_angles, light_names): 338 | light_empty = bpy.data.objects[light_name + "_empty"] 339 | global_z = (light_empty.matrix_world.inverted() @ Vector((0.0, 0.0, 1.0, 0.0)))[:3] 340 | quat = Quaternion(global_z, light_angle) 341 | light_empty.rotation_euler = quat.to_euler() 342 | 343 | back_light_vertical_angle = 0 344 | light_vertical_angles = [key_light_vertical_angle, fill_light_vertical_angle, 
back_light_vertical_angle] 345 | # light_vertical_angles = [radians(-45)]*3 346 | 347 | for light_angle, light_name in zip(light_vertical_angles, light_names): 348 | light_empty = bpy.data.objects[light_name + "_empty"] 349 | global_x = (light_empty.matrix_world.inverted() @ Vector((1.0, 0.0, 0.0, 0.0)))[:3] 350 | quat = Quaternion(global_x, light_angle) 351 | euler_add = quat.to_euler() 352 | euler_current = light_empty.rotation_euler 353 | new_euler = Euler((euler_add[0] + euler_current[0], euler_add[1] + euler_current[1], euler_add[2] + euler_current[2])) 354 | light_empty.rotation_euler = new_euler 355 | 356 | # bpy.context.view_layer.update() 357 | 358 | return cam, empty 359 | 360 | 361 | def render(fp): 362 | # Render image 363 | bpy.context.view_layer.update() 364 | bpy.context.scene.render.filepath = fp 365 | bpy.ops.render.render(write_still=True) 366 | 367 | 368 | def setup_renderer(H, W, use_cpu=False): 369 | scene = bpy.context.scene 370 | render = bpy.context.scene.render 371 | 372 | render.engine = "CYCLES" 373 | render.image_settings.file_format = "PNG" 374 | render.image_settings.color_mode = "RGBA" 375 | render.resolution_x = W 376 | render.resolution_y = H 377 | render.resolution_percentage = 100 378 | 379 | scene.cycles.device = "CPU" if use_cpu else "GPU" 380 | scene.cycles.samples = 10 if use_cpu else 128 381 | scene.cycles.diffuse_bounces = 1 382 | scene.cycles.glossy_bounces = 1 383 | scene.cycles.transparent_max_bounces = 3 384 | scene.cycles.transmission_bounces = 3 385 | scene.cycles.filter_width = 0.01 386 | scene.cycles.use_denoising = True 387 | scene.render.film_transparent = False 388 | 389 | bpy.context.preferences.addons["cycles"].preferences.get_devices() 390 | # Set the device_type 391 | bpy.context.preferences.addons[ 392 | "cycles" 393 | ].preferences.compute_device_type = "METAL" if use_cpu else "CUDA" 394 | bpy.context.scene.view_settings.view_transform = 'Filmic' 395 | 396 | 397 | # def randomize_camera_view(axis): 398 | # euler_y = radians(random.uniform(-90, 90)) 399 | # euler_z = radians(random.uniform(0, 360)) 400 | # axis.rotation_euler = Euler((0, euler_y, euler_z)) 401 | 402 | 403 | def run_render(metadata, save_image_path, use_cpu): 404 | reset_scene() 405 | 406 | objs = [] 407 | for uid in metadata["objects"]: 408 | object_path = metadata["object_path"][uid] 409 | objs.append(bpy.data.objects.get(load_object(object_path))) 410 | 411 | grid_number = metadata["grid number"] 412 | 413 | if grid_number == 2: 414 | locations = { 415 | 0: [0.7, 0.5], 416 | 1: [0.7, -0.5], 417 | 2: [-0.6, 0.5], 418 | 3: [-0.6, -0.5] 419 | } 420 | scale_factor = 1 / 2 421 | elif grid_number == 3: 422 | locations = { 423 | 0: [0.9, 0.6], 424 | 1: [0.9, 0], 425 | 2: [0.9, -0.6], 426 | 3: [0.0, 0.6], 427 | 4: [0.0, 0.0], 428 | 5: [0.0, -0.6], 429 | 6: [-0.9, 0.6], 430 | 7: [-0.9, 0.0], 431 | 8: [-0.9, -0.6] 432 | } 433 | scale_factor = 1 / 3 434 | else: 435 | raise ValueError(f"Expected grid number to be 2 or 3 but got {grid_number}") 436 | 437 | # process rotate 438 | for idx, obj in enumerate(objs): 439 | rotate(obj, degree=metadata['object_angles'][idx]) 440 | 441 | # process scale 442 | if "sizes" in metadata: 443 | for idx, obj in enumerate(objs): 444 | normalize_object(obj, factor=metadata['sizes'][idx] * scale_factor) 445 | else: 446 | for obj in objs: 447 | normalize_object(obj, factor=scale_factor) 448 | 449 | for pos, obj in zip(metadata["grids"], objs): 450 | x, y = locations[pos] 451 | move_to_xy(obj, x, y) 452 | 453 | blender_config = 
metadata["blender_config"] 454 | 455 | setup_plane_and_background(blender_config["plane_texture_path"], blender_config["hdri_path"]) 456 | cam, axis = setup_camera_and_lights( 457 | blender_config["sun_x"], 458 | blender_config["sun_y"], 459 | blender_config["sun_energy"], 460 | blender_config["key_light_horizontal_angle"], 461 | blender_config["fill_light_horizontal_angle"], 462 | blender_config["key_light_vertical_angle"], 463 | blender_config["fill_light_vertical_angle"] 464 | ) 465 | axis.rotation_euler = Euler((0, radians(45), 0)) 466 | setup_renderer(H=metadata["H"], W=metadata["W"], use_cpu=use_cpu) 467 | render(save_image_path) 468 | 469 | 470 | if __name__ == "__main__": 471 | parser = argparse.ArgumentParser() 472 | parser.add_argument( 473 | "--save_local", 474 | type=str, 475 | default="" 476 | ) 477 | parser.add_argument( 478 | "--save_image_path", 479 | type=str, 480 | default="render.png" 481 | ) 482 | parser.add_argument( 483 | "--json_file", 484 | type=str, 485 | default="image_metadata.json" 486 | ) 487 | 488 | parser.add_argument( 489 | "--use_cpu", 490 | action="store_true", 491 | default=False 492 | ) 493 | 494 | argv = sys.argv[sys.argv.index("--") + 1:] 495 | args = parser.parse_args(argv) 496 | 497 | with open(args.json_file, "r") as f: 498 | metadata = json.load(f) 499 | 500 | run_render(metadata, args.save_image_path, args.use_cpu) 501 | -------------------------------------------------------------------------------- /tma/imageqa/scene_graph/single_image_task.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import json 3 | from itertools import combinations 4 | from typing import List, Tuple 5 | 6 | import numpy as np 7 | from PIL import Image 8 | from tqdm import tqdm 9 | 10 | from ..metadata import SceneGraphMetaData 11 | from ...base import TaskGenerator 12 | from ...task_store import TaskStore 13 | 14 | 15 | def scene_graph_adjacent_objects(scene_graph, node): 16 | adjacent_objects = {} 17 | for edge in scene_graph["objects"][node]['relations']: 18 | obj = edge['object'] 19 | if obj not in adjacent_objects: 20 | adjacent_objects[obj] = [] 21 | adjacent_objects[obj].append((edge['name'], 0)) 22 | 23 | for obj, edges in scene_graph["objects"].items(): 24 | for edge in edges["relations"]: 25 | if edge['object'] == node: 26 | if obj not in adjacent_objects: 27 | adjacent_objects[obj] = [] 28 | adjacent_objects[obj].append((edge['name'], 1)) 29 | return adjacent_objects 30 | 31 | 32 | def subgraph_to_json_str(subgraph, scene_graph): 33 | subgraph_json = { 34 | "attributes" : [], 35 | "adjacent_objects": [], 36 | } 37 | adjacent_object_info = {} 38 | for element in subgraph: 39 | if isinstance(element, str): 40 | subgraph_json["attributes"].append(element) 41 | else: 42 | if len(element) == 2: 43 | obj, attr = element 44 | if obj not in adjacent_object_info: 45 | adjacent_object_info[obj] = { 46 | "attributes": [attr], 47 | "relation" : None 48 | } 49 | else: 50 | adjacent_object_info[obj]["attributes"].append(attr) 51 | else: 52 | obj, rel, direction = element 53 | if obj not in adjacent_object_info: 54 | adjacent_object_info[obj] = { 55 | "attributes": [], 56 | "relation" : (rel, direction) 57 | } 58 | else: 59 | adjacent_object_info[obj]["relation"] = (rel, direction) 60 | 61 | for obj, info in adjacent_object_info.items(): 62 | subgraph_json["adjacent_objects"].append({ 63 | "object" : scene_graph["objects"][obj]["name"], 64 | "attributes": sorted(info["attributes"]), 65 | "relation" : info["relation"] 
66 | }) 67 | subgraph_json["attributes"] = sorted(subgraph_json["attributes"]) 68 | subgraph_json["adjacent_objects"] = sorted(subgraph_json["adjacent_objects"], key=lambda x: json.dumps(x)) 69 | return json.dumps(subgraph_json) 70 | 71 | 72 | def constrained_combinations(n, k, constraints): 73 | """ 74 | Generate all combinations of k elements from n elements that satisfy the constraints 75 | :param n: 76 | :param k: 77 | :param constraints: a list of tuples (i, j) that means that when i is not selected, i+1 ~ j should not be selected 78 | :return: a binary array of shape (x, n) where each row represents a valid combination 79 | """ 80 | combo = np.array(list(combinations(range(n), k))) 81 | selection = np.zeros((len(combo), n), dtype=bool) 82 | selection[np.arange(len(combo))[:, None], combo] = 1 83 | for start, end in constraints: 84 | selection = selection[~((selection[:, start] == 0) & (np.any(selection[:, start + 1:end], axis=1)))] 85 | return selection 86 | 87 | 88 | def compose_parallel_phrase(phrases): 89 | if len(phrases) == 0: 90 | return "" 91 | elif len(phrases) == 1: 92 | return phrases[0] 93 | elif len(phrases) == 2: 94 | return f"{phrases[0]} and {phrases[1]}" 95 | else: 96 | phrases[-1] = "and " + phrases[-1] 97 | return ", ".join(phrases) 98 | 99 | 100 | def compose_attributed_name(attributes, name): 101 | if len(attributes) > 0: 102 | attributes = compose_parallel_phrase(attributes) 103 | return f"{attributes} {name}" 104 | else: 105 | return name 106 | 107 | 108 | @functools.lru_cache(maxsize=100000) 109 | def compose_object_reference(subgraph: str): 110 | subgraph = json.loads(subgraph) 111 | 112 | # Helper function to create relation phrases 113 | def create_relation_phrase(attributed_name, relation_name, is_forward=True): 114 | return f"is {relation_name} the {attributed_name}" if is_forward else f"the {attributed_name} is {relation_name}" 115 | 116 | # Process relations 117 | forward_relations, backward_relations = [], [] 118 | 119 | for idx, node in enumerate(subgraph['adjacent_objects']): 120 | rel = node['relation'] 121 | attributed_name = compose_attributed_name(node.get("attributes", []), node['object']) 122 | if rel[1] == 0: 123 | forward_relations.append(create_relation_phrase(attributed_name, rel[0], True)) 124 | else: 125 | backward_relations.append(create_relation_phrase(attributed_name, rel[0], False)) 126 | 127 | # Combine relations into reference string 128 | reference = "" 129 | if forward_relations: 130 | reference += compose_parallel_phrase(forward_relations) 131 | if backward_relations: 132 | if forward_relations: 133 | reference += ", and also, " 134 | reference += compose_parallel_phrase(backward_relations) 135 | return reference 136 | 137 | 138 | def subgraph_contain_multiple_same_direction_relations(subgraph): 139 | out_rel = False 140 | in_rel = False 141 | for item in subgraph: 142 | if len(item) == 3: 143 | if item[2] == 0: 144 | if out_rel: 145 | return True 146 | out_rel = True 147 | else: 148 | if in_rel: 149 | return True 150 | in_rel = True 151 | return False 152 | 153 | 154 | def subgraph_contain_multiple_relations(subgraph): 155 | rel = False 156 | for item in subgraph: 157 | if isinstance(item, tuple) and len(item) == 3: 158 | if rel: 159 | return True 160 | rel = True 161 | return False 162 | 163 | 164 | class SceneGraphTaskGenerator(TaskGenerator): 165 | metadata: SceneGraphMetaData 166 | 167 | embed_schema = [ 168 | "task type", 169 | "object", 170 | "attribute value", 171 | "attribute type", 172 | "relation", 173 | "source 
object", 174 | "target object" 175 | ] 176 | 177 | def __init__(self, metadata: SceneGraphMetaData, subgraph_size=4, n_subgraph_per_answer=1, max_scene_graph_size=10000, seed=42): 178 | super().__init__(metadata, seed=seed) 179 | self.subgraph_size = subgraph_size 180 | self.n_subgraph_per_answer = n_subgraph_per_answer 181 | self.max_scene_graph_size = max_scene_graph_size 182 | 183 | def _enumerate_subgraphs_w_object( 184 | self, 185 | scene_graph, 186 | start_node, 187 | exclude_attribute_type=None, 188 | exclude_object=[] 189 | ): 190 | 191 | stamp = [] 192 | elements = [ 193 | attr for attr in scene_graph["objects"][start_node]['attributes'] 194 | if exclude_attribute_type is None or self.metadata.get_attribute_type(attr) != exclude_attribute_type 195 | ] 196 | adjacent_objects = scene_graph_adjacent_objects(scene_graph, start_node) 197 | for obj in adjacent_objects: 198 | if obj not in exclude_object: 199 | start = len(elements) 200 | elements.append(obj) 201 | elements += [(obj, attr) for attr in scene_graph["objects"][obj]['attributes']] 202 | stamp.append((start, len(elements))) 203 | if len(elements) < self.subgraph_size: 204 | return [] 205 | 206 | # sample all subgraphs that contain the start node with the given size 207 | selection = constrained_combinations(len(elements), self.subgraph_size, stamp) 208 | 209 | # distinguish subgraphs with and without the objects 210 | with_object_mask = np.any(selection[:, [start for start, _ in stamp]], axis=1) 211 | subgraphs_w_objects = [[elements[i] for i in np.where(indices)[0]] for indices in selection[with_object_mask]] 212 | subgraphs_wo_objects = [[elements[i] for i in np.where(indices)[0]] for indices in selection[~with_object_mask]] 213 | 214 | # for subgraph with object, add its all possible relations to the start node 215 | for obj, rels in adjacent_objects.items(): 216 | new_subgraphs = [] 217 | for subgraph in subgraphs_w_objects: 218 | if obj in subgraph: 219 | obj_id = subgraph.index(obj) 220 | for rel, direction in rels: 221 | subgraph_rel = subgraph.copy() 222 | subgraph_rel[obj_id] = (obj, rel, direction) 223 | # remove subgraphs with multiple same-direction relations 224 | if not subgraph_contain_multiple_relations(subgraph_rel): 225 | new_subgraphs.append(subgraph_rel) 226 | else: 227 | new_subgraphs.append(subgraph) 228 | subgraphs_w_objects = new_subgraphs 229 | 230 | subgraph_json_strs = [subgraph_to_json_str(subgraph, scene_graph) 231 | for subgraph in subgraphs_w_objects + subgraphs_wo_objects] 232 | 233 | return set(subgraph_json_strs) 234 | 235 | def _task_plan_to_str(self, task_plan) -> str: 236 | t = [] 237 | for k, v in task_plan.items(): 238 | if k in self.embed_schema: 239 | assert isinstance(v, str) 240 | t.append(f'{k}: {v}') 241 | return '\n'.join(t) 242 | 243 | def _generate_task(self, task_plan) -> Tuple[str, str, List[str], str]: 244 | "(Abstract method) generate task" 245 | 246 | def generate(self, task_plan, return_data=True, seed=None): 247 | if seed is not None: 248 | self.rng = np.random.default_rng(seed=seed) 249 | 250 | question, answer, options, scene_graph_id = self._generate_task(task_plan) 251 | 252 | task = { 253 | "question" : question.replace("_", " "), 254 | "answer" : answer.replace("_", " "), 255 | "options" : [o.replace("_", " ") for o in options], 256 | "task_plan" : self._task_plan_to_str(task_plan), 257 | "scene_graph_id": scene_graph_id, 258 | 'image' : Image.open(self.metadata.get_image_path(scene_graph_id)) if return_data else None 259 | } 260 | return task 261 | 262 | 263 | 
class WhatObjectSceneGraphTaskGenerator(SceneGraphTaskGenerator): 264 | schema = { 265 | "task type" : "str", 266 | "object" : "str", 267 | "subgraph" : "str", 268 | "scene graph id": "str", 269 | "answers" : "list", 270 | } 271 | 272 | def enumerate_object_subgraphs(self, scene_graph): 273 | subgraph_to_objects = {} 274 | for object, info in scene_graph["objects"].items(): 275 | obj_name = info['name'] 276 | if self.metadata.check_object_in_category(obj_name): 277 | subgraphs = self._enumerate_subgraphs_w_object(scene_graph, object) 278 | # subgraphs = self.rng.choice(list(subgraphs), min(self.n_subgraph_per_answer, len(subgraphs)), replace=False) 279 | for subgraph in subgraphs: 280 | if subgraph not in subgraph_to_objects: 281 | subgraph_to_objects[subgraph] = set() 282 | subgraph_to_objects[subgraph].add(obj_name) 283 | return subgraph_to_objects 284 | 285 | def enumerate_task_plans(self, task_store: TaskStore): 286 | 287 | for scene_graph_id, scene_graph in tqdm(self.metadata.scene_graphs.items(), desc="enumerating [what object] task"): 288 | 289 | if len(scene_graph["objects"]) < self.max_scene_graph_size: 290 | subgraph_to_nodes = self.enumerate_object_subgraphs(scene_graph) 291 | 292 | for subgraph_str, nodes in subgraph_to_nodes.items(): 293 | answers = sorted(list(nodes)) 294 | for node in nodes: 295 | task_plan = { 296 | "task type" : "what object", 297 | "scene graph id": scene_graph_id, 298 | "subgraph" : subgraph_str, 299 | "object" : node, 300 | "answers" : answers, 301 | } 302 | task_store.add(task_plan) 303 | 304 | def _generate_task(self, task_plan): 305 | obj_reference = compose_object_reference(task_plan["subgraph"]) 306 | subgraph = json.loads(task_plan["subgraph"]) 307 | object = task_plan["object"] 308 | scene_graph_id = task_plan["scene graph id"] 309 | 310 | attributed_name = compose_attributed_name(subgraph.get("attributes", []), "object") 311 | 312 | if obj_reference != "": 313 | obj_reference = f" that {obj_reference}" 314 | question = f"What is the {attributed_name}{obj_reference}?" 
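        # Note (descriptive comment, not in the original source): every object name that satisfies
        # this subgraph is recorded in task_plan["answers"], so the negatives below are drawn only
        # from categories irrelevant to all of them, keeping the distractor options unambiguously wrong.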
315 | 316 | answer = object 317 | exclude_categories = [self.metadata.sg_object_to_cateid[obj] for obj in task_plan["answers"]] 318 | negative_objects = [self.metadata.get_surfacename(cateid) for cateid in self.metadata.get_irrelevant_categories(exclude_categories)] 319 | options = self._compose_options(answer, negative_objects) 320 | 321 | return question, answer, options, scene_graph_id 322 | 323 | 324 | class WhatAttributeSceneGraphTaskGenerator(SceneGraphTaskGenerator): 325 | schema = { 326 | "task type" : "str", 327 | "attribute type" : "str", 328 | "attribute value": "str", 329 | "subgraph" : "str", 330 | "scene graph id" : "str", 331 | "answers" : "list", 332 | } 333 | 334 | def enumerate_attribute_subgraphs(self, scene_graph): 335 | subgraph_to_nodes = {} 336 | for node, info in scene_graph["objects"].items(): 337 | for attr in info['attributes']: 338 | attr_type = self.metadata.get_attribute_type(attr) 339 | subgraphs = self._enumerate_subgraphs_w_object(scene_graph, node, exclude_attribute_type=attr_type) 340 | # subgraphs = self.rng.choice(list(subgraphs), min(self.n_subgraph_per_answer, len(subgraphs)), replace=False) 341 | for subgraph in subgraphs: 342 | if subgraph not in subgraph_to_nodes: 343 | subgraph_to_nodes[subgraph] = {} 344 | if attr_type not in subgraph_to_nodes[subgraph]: 345 | subgraph_to_nodes[subgraph][attr_type] = set() 346 | subgraph_to_nodes[subgraph][attr_type].add(attr) 347 | return subgraph_to_nodes 348 | 349 | def enumerate_task_plans(self, task_store: TaskStore): 350 | for scene_graph_id, scene_graph in tqdm(self.metadata.scene_graphs.items(), desc="enumerating [what attribute] task"): 351 | if len(scene_graph["objects"]) < self.max_scene_graph_size: 352 | 353 | subgraphs_to_attrs = self.enumerate_attribute_subgraphs(scene_graph) 354 | for subgraph_str, attributes in subgraphs_to_attrs.items(): 355 | for attribute_type, attribute_set in attributes.items(): 356 | answers = sorted(list(attribute_set)) 357 | for attribute in attribute_set: 358 | task_plan = { 359 | "task type" : "what attribute", 360 | "scene graph id" : scene_graph_id, 361 | "subgraph" : subgraph_str, 362 | "attribute value": attribute, 363 | "answers" : answers, 364 | "attribute type" : attribute_type 365 | } 366 | task_store.add(task_plan) 367 | 368 | def _generate_task(self, task_plan): 369 | 370 | obj_reference = compose_object_reference(task_plan["subgraph"]) 371 | subgraph = json.loads(task_plan["subgraph"]) 372 | 373 | scene_graph_id = task_plan["scene graph id"] 374 | attribute = task_plan["attribute value"] 375 | attribute_type = task_plan["attribute type"] 376 | 377 | attributed_name = compose_attributed_name(subgraph.get("attributes", []), "object") 378 | 379 | if obj_reference != "": 380 | obj_reference = f" that {obj_reference}" 381 | 382 | attribute_type_word = lambda x: "attribute value" if x == "other" else x 383 | question = f"What is the {attribute_type_word(attribute_type)} of the {attributed_name}{obj_reference}?" 
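        # Note (descriptive comment, not in the original source): distractors are other attribute
        # values of the same type, minus every value listed in task_plan["answers"], so an
        # alternative correct value can never appear as a negative option.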
384 | answer = attribute 385 | negative_attributes = list(set(self.metadata.type_to_attribute[attribute_type]) - set(task_plan["answers"])) 386 | options = self._compose_options(answer, negative_attributes) 387 | 388 | return question, answer, options, scene_graph_id 389 | 390 | 391 | class WhatRelationSceneGraphTaskGenerator(SceneGraphTaskGenerator): 392 | schema = { 393 | "task type" : "str", 394 | "relation" : "str", 395 | "source object" : "str", 396 | "target object" : "str", 397 | "source subgraph": "str", 398 | "target subgraph": "str", 399 | "scene graph id" : "str", 400 | "answers" : "list" 401 | 402 | } 403 | 404 | def enumerate_relation_subgraphs(self, scene_graph): 405 | subgraph_to_nodes_cnt = {} 406 | for node, info in scene_graph["objects"].items(): 407 | subgraphs = self._enumerate_subgraphs_w_object(scene_graph, node) 408 | for subgraph in subgraphs: 409 | if subgraph not in subgraph_to_nodes_cnt: 410 | subgraph_to_nodes_cnt[subgraph] = 0 411 | subgraph_to_nodes_cnt[subgraph] += 1 412 | 413 | relations = {} 414 | for node, info in scene_graph["objects"].items(): 415 | for rel in info['relations']: 416 | obj2 = rel['object'] 417 | if (node, obj2) not in relations: 418 | relations[(node, obj2)] = set() 419 | relations[(node, obj2)].add(rel['name']) 420 | 421 | subgraph_to_relation = {} 422 | for (obj1, obj2), rels in relations.items(): 423 | 424 | subgraphs1 = self._enumerate_subgraphs_w_object(scene_graph, obj1, exclude_object=[obj2]) 425 | subgraphs1 = [subgraph for subgraph in subgraphs1 if subgraph_to_nodes_cnt[subgraph] == 1] 426 | subgraphs1 = self.rng.choice(list(subgraphs1), min(self.n_subgraph_per_answer, len(subgraphs1)), replace=False) 427 | 428 | subgraphs2 = self._enumerate_subgraphs_w_object(scene_graph, obj2, exclude_object=[obj1]) 429 | subgraphs2 = [subgraph for subgraph in subgraphs2 if subgraph_to_nodes_cnt[subgraph] == 1] 430 | subgraphs2 = self.rng.choice(list(subgraphs2), min(self.n_subgraph_per_answer, len(subgraphs2)), replace=False) 431 | 432 | obj1_name = scene_graph["objects"][obj1]["name"] 433 | obj2_name = scene_graph["objects"][obj2]["name"] 434 | for subgraph1 in subgraphs1: 435 | for subgraph2 in subgraphs2: 436 | subgraph_to_relation[(subgraph1, subgraph2)] = (rels, obj1_name, obj2_name) 437 | 438 | return subgraph_to_relation 439 | 440 | def enumerate_task_plans(self, task_store: TaskStore): 441 | for scene_graph_id, scene_graph in tqdm(self.metadata.scene_graphs.items(), desc="enumerating [what relation] task"): 442 | if len(scene_graph["objects"]) < self.max_scene_graph_size: 443 | subgraphs_to_rels = self.enumerate_relation_subgraphs(scene_graph) 444 | for subgraph, (rels, obj1, obj2) in subgraphs_to_rels.items(): 445 | answers = sorted(list(rels)) 446 | for rel in rels: 447 | task_plan = { 448 | "task type" : "what relation", 449 | "relation" : rel, 450 | "source object" : obj1, 451 | "target object" : obj2, 452 | "scene graph id" : scene_graph_id, 453 | "source subgraph": subgraph[0], 454 | "target subgraph": subgraph[1], 455 | "answers" : answers, 456 | } 457 | task_store.add(task_plan) 458 | 459 | def _generate_task(self, task_plan): 460 | source_obj_reference = compose_object_reference(task_plan["source subgraph"]) 461 | target_obj_reference = compose_object_reference(task_plan["target subgraph"]) 462 | 463 | source_subgraph = json.loads(task_plan["source subgraph"]) 464 | target_subgraph = json.loads(task_plan["target subgraph"]) 465 | relation = task_plan["relation"] 466 | scene_graph_id = task_plan["scene graph id"] 467 | 468 | 
source_attributed_name = compose_attributed_name(source_subgraph.get("attributes", []), "object") 469 | target_attributed_name = compose_attributed_name(target_subgraph.get("attributes", []), "object") 470 | 471 | if source_obj_reference != "": 472 | source_obj_reference = f", which {source_obj_reference}" 473 | if target_obj_reference != "": 474 | target_obj_reference = f", which {target_obj_reference}" 475 | 476 | question = f"What is the relation from the {source_attributed_name}{source_obj_reference}, to the {target_attributed_name}{target_obj_reference}?" 477 | answer = relation 478 | negatives = list(set(self.metadata.relations) - set(task_plan["answers"])) 479 | options = self._compose_options(answer, negatives) 480 | 481 | return question, answer, options, scene_graph_id 482 | -------------------------------------------------------------------------------- /tma/imageqa/tabletop_3d/size_single_image_task.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | 3 | from .single_image_task import _3DGridTaskGenerator 4 | from .utils import relative_positions, reverse_relative_positions 5 | from ..metadata import Objaverse3DMetaData 6 | from ...constant import NUM_OPTIONS 7 | from ...task_store import TaskStore 8 | 9 | largest_size = 1.5 10 | smallest_size = 0.5 11 | all_size_options = set([0.5, 1.0, 1.5]) 12 | grid_options = [2] 13 | 14 | 15 | class Size3DGridTaskGenerator(_3DGridTaskGenerator): 16 | def __init__(self, metadata: Objaverse3DMetaData, seed=42): 17 | super().__init__(metadata, seed=seed) 18 | self.grid_options = grid_options 19 | 20 | def _make_image_metadata(self, grid_size, sizes, size_options, grids, queries, remaining_query=...): 21 | objects = [self.metadata.sample(self.rng, 1, "object", q) for q in queries] 22 | 23 | remaining_grids = [g for g in range(grid_size ** 2) if g not in grids] 24 | for _ in remaining_grids: 25 | uid = self.metadata.sample(self.rng, 1, "object", remaining_query) 26 | objects.append(uid) 27 | remaining_sizes = list(self.rng.choice(size_options, replace=True, size=len(remaining_grids))) 28 | 29 | object_path = {k: self.metadata.get_object_path(k) for k in objects} 30 | angles = [self.metadata.sample_object_angle(self.rng, obj) for obj in objects] 31 | 32 | image_metadata = { 33 | 'grid number' : grid_size, 34 | 'objects' : objects, 35 | 'object_path' : object_path, 36 | 'object_angles' : angles, 37 | 'grids' : grids + remaining_grids, 38 | 'blender_config': self.metadata.sample_blender_configuration(self.rng), 39 | 'sizes' : sizes + remaining_sizes 40 | } 41 | return image_metadata 42 | 43 | 44 | class WhatSize3DGridTaskGenerator(Size3DGridTaskGenerator): 45 | schema = { 46 | 'task type' : 'str', 47 | 'size' : 'str', 48 | 'grid number' : 'int', 49 | 'target category' : 'str', 50 | 'absolute position': 'str', 51 | 'attribute type' : 'str', 52 | 'attribute value' : 'str', 53 | } 54 | 55 | def enumerate_task_plans(self, task_store: TaskStore): 56 | for target_category in tqdm(self.metadata.categories, desc="enumerating [what size] task"): 57 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 58 | for attribute_type, attribute_values in attribute_dict.items(): 59 | for attribute_value in attribute_values: 60 | for grid_size in self.grid_options: 61 | for absolute_pos in self.grid_mappings[grid_size]: 62 | task_plan = { 63 | 'task type' : 'what size', 64 | 'size' : 'largest', 65 | 'grid number' : grid_size, 66 | 'target category' : target_category, 67 | 'absolute 
position': absolute_pos, 68 | 'attribute type' : attribute_type, 69 | 'attribute value' : attribute_value, 70 | } 71 | task_store.add(task_plan) 72 | 73 | task_plan = { 74 | 'task type' : 'what size', 75 | 'size' : 'smallest', 76 | 'grid number' : grid_size, 77 | 'target category' : target_category, 78 | 'absolute position': absolute_pos, 79 | 'attribute type' : attribute_type, 80 | 'attribute value' : attribute_value, 81 | } 82 | task_store.add(task_plan) 83 | 84 | for grid_size in self.grid_options: 85 | for absolute_pos in self.grid_mappings[grid_size]: 86 | task_plan = { 87 | 'task type' : 'what size', 88 | 'size' : 'largest', 89 | 'grid number' : grid_size, 90 | 'target category' : target_category, 91 | 'absolute position': absolute_pos, 92 | } 93 | task_store.add(task_plan) 94 | 95 | task_plan = { 96 | 'task type' : 'what size', 97 | 'size' : 'smallest', 98 | 'grid number' : grid_size, 99 | 'target category' : target_category, 100 | 'absolute position': absolute_pos, 101 | } 102 | task_store.add(task_plan) 103 | 104 | def _generate_task(self, task_plan): 105 | grid_size = task_plan['grid number'] 106 | target_category = task_plan['target category'] 107 | absolute_pos = task_plan['absolute position'] 108 | grids = [self.grid_mappings[grid_size][absolute_pos]] 109 | 110 | if task_plan['size'] == 'largest': 111 | sizes = [largest_size] 112 | size_options = list(all_size_options - {largest_size}) 113 | question = f"What is the largest object in the image?" 114 | else: 115 | sizes = [smallest_size] 116 | size_options = list(all_size_options - {smallest_size}) 117 | question = f"What is the smallest object in the image?" 118 | 119 | queries = [self._get_target_object_query(task_plan)] 120 | 121 | remaining_query = self.metadata.and_query([("category", target_category, False)]) 122 | 123 | image_metadata = self._make_image_metadata( 124 | grid_size, 125 | sizes, 126 | size_options, 127 | grids, 128 | queries=queries, 129 | remaining_query=remaining_query, 130 | ) 131 | 132 | answer = self.metadata.get_surfacename(target_category) 133 | negatives = [self.metadata.get_surfacename(self.metadata.sample_category_for_object(self.rng, o, target_category)) 134 | for o in image_metadata['objects'][1:]] 135 | options = self._compose_options(answer, negatives) 136 | 137 | return question, answer, options, image_metadata 138 | 139 | 140 | class WhatAttributeSize3DGridTaskGenerator(Size3DGridTaskGenerator): 141 | schema = { 142 | 'task type' : 'str', 143 | 'size' : 'str', 144 | 'grid number' : 'int', 145 | 'target category' : 'str', 146 | 'absolute position': 'str', 147 | 'attribute type' : 'str', 148 | 'attribute value' : 'str', 149 | } 150 | 151 | def enumerate_task_plans(self, task_store: TaskStore): 152 | for target_category in tqdm(self.metadata.categories, desc="enumerating [what size attribute] task"): 153 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 154 | for attribute_type, attribute_values in attribute_dict.items(): 155 | for attribute_value in attribute_values: 156 | for grid_size in self.grid_options: 157 | for absolute_pos in self.grid_mappings[grid_size]: 158 | task_plan = { 159 | 'task type' : 'what attribute size', 160 | 'size' : 'largest', 161 | 'grid number' : grid_size, 162 | 'target category' : target_category, 163 | 'absolute position': absolute_pos, 164 | 'attribute type' : attribute_type, 165 | 'attribute value' : attribute_value, 166 | } 167 | task_store.add(task_plan) 168 | 169 | task_plan = { 170 | 'task type' : 'what attribute size', 171 | 
'size' : 'smallest', 172 | 'grid number' : grid_size, 173 | 'target category' : target_category, 174 | 'absolute position': absolute_pos, 175 | 'attribute type' : attribute_type, 176 | 'attribute value' : attribute_value, 177 | } 178 | task_store.add(task_plan) 179 | 180 | def _generate_task(self, task_plan): 181 | grid_size = task_plan['grid number'] 182 | 183 | attribute_type = task_plan['attribute type'] 184 | 185 | absolute_pos = task_plan['absolute position'] 186 | grids = [self.grid_mappings[grid_size][absolute_pos]] 187 | 188 | queries = [self._get_target_object_query(task_plan)] 189 | 190 | if task_plan['size'] == 'largest': 191 | sizes = [largest_size] 192 | size_options = list(all_size_options - {largest_size}) 193 | question = f"What is the {attribute_type} of the largest object in the image?" 194 | else: 195 | sizes = [smallest_size] 196 | size_options = list(all_size_options - {smallest_size}) 197 | question = f"What is the {attribute_type} of the smallest object in the image?" 198 | 199 | image_metadata = self._make_image_metadata( 200 | grid_size, 201 | sizes, 202 | size_options, 203 | grids, 204 | queries=queries, 205 | ) 206 | 207 | answer = task_plan['attribute value'] 208 | target_object = image_metadata['objects'][0] 209 | negative_query = self.metadata.and_query([ 210 | (attribute_type, a, False) for a in self.metadata.query_metadata(attribute_type, self.metadata.and_query([("object", target_object, True)])) 211 | ]) 212 | negatives = self.metadata.sample( 213 | self.rng, 214 | NUM_OPTIONS - 1, 215 | attribute_type, 216 | query=negative_query, 217 | ) 218 | options = [answer] + negatives 219 | 220 | return question, answer, options, image_metadata 221 | 222 | 223 | class WhereSize3DGridTaskGenerator(Size3DGridTaskGenerator): 224 | schema = { 225 | 'task type' : 'str', 226 | 'size' : 'str', 227 | 'grid number' : 'int', 228 | 'target category' : 'str', 229 | 'absolute position' : 'str', 230 | 'attribute type' : 'str', 231 | 'attribute value' : 'str', 232 | 'reference category' : 'str', 233 | 'reference position' : 'str', 234 | 'target-reference order': 'str' 235 | } 236 | 237 | def __init__(self, metadata: Objaverse3DMetaData, seed=42): 238 | super().__init__(metadata, seed=seed) 239 | self.relative_positions = relative_positions 240 | 241 | def enumerate_task_plans(self, task_store: TaskStore): 242 | for target_category in tqdm(self.metadata.categories, desc="enumerating [where size] task"): 243 | irrelevant_categories = self.metadata.get_irrelevant_categories(target_category) 244 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 245 | for attribute_type, attribute_values in attribute_dict.items(): 246 | for attribute_value in attribute_values: 247 | for grid_size in self.grid_options: 248 | for absolute_pos in self.grid_mappings[grid_size]: 249 | task_plan = { 250 | 'task type' : 'where size', 251 | 'size' : 'largest', 252 | 'grid number' : grid_size, 253 | 'target category' : target_category, 254 | 'absolute position': absolute_pos, 255 | 'attribute type' : attribute_type, 256 | 'attribute value' : attribute_value, 257 | } 258 | task_store.add(task_plan) 259 | 260 | task_plan = { 261 | 'task type' : 'where size', 262 | 'size' : 'smallest', 263 | 'grid number' : grid_size, 264 | 'target category' : target_category, 265 | 'absolute position': absolute_pos, 266 | 'attribute type' : attribute_type, 267 | 'attribute value' : attribute_value, 268 | } 269 | task_store.add(task_plan) 270 | 271 | grid = self.grid_mappings[grid_size][absolute_pos] 
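                            # Note (descriptive comment, not in the original source): for each
                            # irrelevant reference category and each relative position whose
                            # neighbouring cell exists (_relative_grid presumably returns a negative
                            # index when the cell falls outside the grid), both "target first" and
                            # "reference first" plan variants are added for the largest and smallest settings.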
272 | 273 | for reference_category in irrelevant_categories: 274 | for reference_pos in self.relative_positions: 275 | ref_grid = self._relative_grid(grid_size, grid, reference_pos) 276 | if ref_grid >= 0: 277 | task_plan = { 278 | 'task type' : 'where size', 279 | 'size' : 'largest', 280 | 'grid number' : grid_size, 281 | 'target category' : target_category, 282 | 'absolute position' : absolute_pos, 283 | 'reference category' : reference_category, 284 | 'reference position' : reference_pos, 285 | 'attribute type' : attribute_type, 286 | 'attribute value' : attribute_value, 287 | 'target-reference order': 'target first' 288 | } 289 | task_store.add(task_plan) 290 | 291 | task_plan = { 292 | 'task type' : 'where size', 293 | 'size' : 'largest', 294 | 'grid number' : grid_size, 295 | 'target category' : target_category, 296 | 'absolute position' : absolute_pos, 297 | 'reference category' : reference_category, 298 | 'reference position' : reference_pos, 299 | 'attribute type' : attribute_type, 300 | 'attribute value' : attribute_value, 301 | 'target-reference order': 'reference first' 302 | } 303 | task_store.add(task_plan) 304 | 305 | task_plan = { 306 | 'task type' : 'where size', 307 | 'size' : 'smallest', 308 | 'grid number' : grid_size, 309 | 'target category' : target_category, 310 | 'absolute position' : absolute_pos, 311 | 'reference category' : reference_category, 312 | 'reference position' : reference_pos, 313 | 'attribute type' : attribute_type, 314 | 'attribute value' : attribute_value, 315 | 'target-reference order': 'target first' 316 | } 317 | task_store.add(task_plan) 318 | 319 | task_plan = { 320 | 'task type' : 'where size', 321 | 'size' : 'smallest', 322 | 'grid number' : grid_size, 323 | 'target category' : target_category, 324 | 'absolute position' : absolute_pos, 325 | 'reference category' : reference_category, 326 | 'reference position' : reference_pos, 327 | 'attribute type' : attribute_type, 328 | 'attribute value' : attribute_value, 329 | 'target-reference order': 'reference first' 330 | } 331 | task_store.add(task_plan) 332 | 333 | for grid_size in self.grid_options: 334 | for absolute_pos in self.grid_mappings[grid_size]: 335 | task_plan = { 336 | 'task type' : 'where size', 337 | 'size' : 'largest', 338 | 'grid number' : grid_size, 339 | 'target category' : target_category, 340 | 'absolute position': absolute_pos, 341 | } 342 | task_store.add(task_plan) 343 | 344 | task_plan = { 345 | 'task type' : 'where size', 346 | 'size' : 'smallest', 347 | 'grid number' : grid_size, 348 | 'target category' : target_category, 349 | 'absolute position': absolute_pos, 350 | } 351 | task_store.add(task_plan) 352 | 353 | grid = self.grid_mappings[grid_size][absolute_pos] 354 | 355 | for reference_category in irrelevant_categories: 356 | for reference_pos in self.relative_positions: 357 | ref_grid = self._relative_grid(grid_size, grid, reference_pos) 358 | if ref_grid >= 0: 359 | task_plan = { 360 | 'task type' : 'where size', 361 | 'size' : 'largest', 362 | 'grid number' : grid_size, 363 | 'target category' : target_category, 364 | 'absolute position' : absolute_pos, 365 | 'reference category' : reference_category, 366 | 'reference position' : reference_pos, 367 | 'target-reference order': 'target first' 368 | } 369 | task_store.add(task_plan) 370 | 371 | task_plan = { 372 | 'task type' : 'where size', 373 | 'size' : 'largest', 374 | 'grid number' : grid_size, 375 | 'target category' : target_category, 376 | 'absolute position' : absolute_pos, 377 | 'reference category' : 
reference_category, 378 | 'reference position' : reference_pos, 379 | 'target-reference order': 'reference first' 380 | } 381 | task_store.add(task_plan) 382 | 383 | task_plan = { 384 | 'task type' : 'where size', 385 | 'size' : 'smallest', 386 | 'grid number' : grid_size, 387 | 'target category' : target_category, 388 | 'absolute position' : absolute_pos, 389 | 'reference category' : reference_category, 390 | 'reference position' : reference_pos, 391 | 'target-reference order': 'target first' 392 | } 393 | task_store.add(task_plan) 394 | 395 | task_plan = { 396 | 'task type' : 'where size', 397 | 'size' : 'smallest', 398 | 'grid number' : grid_size, 399 | 'target category' : target_category, 400 | 'absolute position' : absolute_pos, 401 | 'reference category' : reference_category, 402 | 'reference position' : reference_pos, 403 | 'target-reference order': 'reference first' 404 | } 405 | task_store.add(task_plan) 406 | 407 | def _generate_task(self, task_plan): 408 | grid_size = task_plan['grid number'] 409 | 410 | target_category = task_plan['target category'] 411 | categories = [target_category] 412 | queries = [self._get_target_object_query(task_plan)] 413 | absolute_pos = task_plan['absolute position'] 414 | grids = [self.grid_mappings[grid_size][absolute_pos]] 415 | 416 | if 'reference category' in task_plan: 417 | reference_pos = task_plan['reference position'] 418 | reference_category = task_plan['reference category'] 419 | categories.append(reference_category) 420 | queries.append(self.metadata.and_query([("category", reference_category, True)])) 421 | 422 | ref_grid = self._relative_grid(grid_size, grids[0], reference_pos) 423 | assert ref_grid >= 0, "reference grid {} not allowed".format(ref_grid) 424 | grids.append(ref_grid) 425 | 426 | if task_plan['target-reference order'] == 'target first': 427 | if task_plan['size'] == 'largest': 428 | question = f"Where is the largest object in the image with respect to the {self.metadata.get_surfacename(reference_category)}?" 429 | else: 430 | question = f"Where is the smallest object in the image with respect to the {self.metadata.get_surfacename(reference_category)}?" 431 | answer = reference_pos 432 | else: 433 | if task_plan['size'] == 'largest': 434 | question = f"Where is the {self.metadata.get_surfacename(reference_category)} with respect to the largest object in the image?" 435 | else: 436 | question = f"Where is the {self.metadata.get_surfacename(reference_category)} with respect to the smallest object in the image?" 437 | answer = reverse_relative_positions[reference_pos] 438 | negatives = [o for o in self.relative_positions if o != answer] 439 | else: 440 | if task_plan['size'] == 'largest': 441 | question = f"Where is the largest object in the image?" 442 | else: 443 | question = f"Where is the smallest object in the image?" 
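# without a reference object, the answer is the target's absolute grid position and the remaining grid positions serve as distractors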
444 | answer = absolute_pos 445 | negatives = [o for o in self.grid_mappings[grid_size].keys() if o != answer] 446 | 447 | if task_plan['size'] == 'largest': 448 | sizes = [largest_size] 449 | size_options = list(all_size_options - {largest_size}) 450 | else: 451 | sizes = [smallest_size] 452 | size_options = list(all_size_options - {smallest_size}) 453 | sizes += list(self.rng.choice(size_options, replace=True, size=1)) 454 | 455 | options = self._compose_options(answer, negatives) 456 | image_metadata = self._make_image_metadata( 457 | grid_size, 458 | sizes, 459 | size_options, 460 | grids, 461 | queries=queries, 462 | remaining_query=self.metadata.and_query([("category", c, False) for c in categories]) 463 | ) 464 | 465 | return question, answer, options, image_metadata 466 | -------------------------------------------------------------------------------- /tma/imageqa/tabletop_3d/distance_single_image_task.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | 3 | from .single_image_task import _3DGridTaskGenerator 4 | from .utils import relative_positions 5 | from ..metadata import Objaverse3DMetaData 6 | from ...constant import NUM_OPTIONS 7 | from ...task_store import TaskStore 8 | 9 | grid_options = [3] 10 | 11 | relative_distance = { 12 | '0': [[1, 3], [4], [2, 6], [5, 7], [8]], 13 | '1': [[0, 4, 2], [3, 5], [7], [6, 8]], 14 | '2': [[1, 5], [4], [0, 8], [3, 7], [6]], 15 | '3': [[0, 4, 6], [1, 7], [5], [2, 8]], 16 | '4': [[1, 3, 5, 7], [0, 2, 6, 8]], 17 | '5': [[2, 4, 8], [1, 7], [3], [0, 6]], 18 | '6': [[3, 7], [4], [0, 8], [1, 5], [2]], 19 | '7': [[4, 6, 8], [3, 5], [1], [2, 0]], 20 | '8': [[5, 7], [4], [2, 6], [1, 3], [0]], 21 | } 22 | 23 | 24 | def _get_relative_distance_level(ref, target): 25 | for idx, level in enumerate(relative_distance[str(ref)]): 26 | if target in level: 27 | return idx 28 | 29 | 30 | def _get_max_distance_level(ref): 31 | return len(relative_distance[str(ref)]) - 1 32 | 33 | 34 | def _get_farther_grids(ref, target): 35 | ref_level = _get_relative_distance_level(ref, target) 36 | farther_grids = [] 37 | for level in relative_distance[str(ref)][ref_level + 1:]: 38 | farther_grids.extend(level) 39 | return farther_grids 40 | 41 | 42 | def _get_closer_grids(ref, target): 43 | ref_level = _get_relative_distance_level(ref, target) 44 | closer_grids = [] 45 | for level in relative_distance[str(ref)][:ref_level]: 46 | closer_grids.extend(level) 47 | return closer_grids 48 | 49 | 50 | class Distance3DGridTaskGenerator(_3DGridTaskGenerator): 51 | 52 | def __init__(self, metadata: Objaverse3DMetaData, max_num_distracting_object=2, seed=42): 53 | super().__init__(metadata, seed=seed) 54 | self.grid_options = grid_options 55 | self.max_num_distracting_object = max_num_distracting_object 56 | 57 | def _make_image_metadata(self, grid_size, distance_type, grids, queries, remaining_query=...): 58 | target_grid = grids[0] 59 | ref_grid = grids[1] 60 | objects = [self.metadata.sample(self.rng, 1, "object", q) for q in queries] 61 | if distance_type == 'farthest': 62 | possible_closer_grids = _get_closer_grids(ref_grid, target_grid) 63 | remaining_grids = self.rng.choice(possible_closer_grids, replace=False, size=min(self.max_num_distracting_object, len(possible_closer_grids))) 64 | else: 65 | possible_farther_grids = _get_farther_grids(ref_grid, target_grid) 66 | remaining_grids = self.rng.choice(possible_farther_grids, replace=False, size=min(self.max_num_distracting_object, len(possible_farther_grids))) 67 | 68 
| remaining_grids = [int(grid) for grid in remaining_grids] # convert numpy.int64 to int to feed into json 69 | 70 | for _ in remaining_grids: 71 | uid = self.metadata.sample(self.rng, 1, "object", remaining_query) 72 | objects.append(uid) 73 | 74 | object_path = {k: self.metadata.get_object_path(k) for k in objects} 75 | angles = [self.metadata.sample_object_angle(self.rng, obj) for obj in objects] 76 | 77 | image_metadata = { 78 | 'grid number' : grid_size, 79 | 'objects' : objects, 80 | 'object_path' : object_path, 81 | 'object_angles' : angles, 82 | 'grids' : grids + remaining_grids, 83 | 'blender_config': self.metadata.sample_blender_configuration(self.rng), 84 | } 85 | return image_metadata 86 | 87 | 88 | class WhatDistance3DGridTaskGenerator(Distance3DGridTaskGenerator): 89 | schema = { 90 | 'task type' : 'str', 91 | 'distance type' : 'str', 92 | 'grid number' : 'int', 93 | 'target category' : 'str', 94 | 'absolute position' : 'str', 95 | 'attribute type' : 'str', 96 | 'attribute value' : 'str', 97 | 'reference category': 'str', 98 | 'reference position': 'str', 99 | } 100 | 101 | def __init__(self, metadata: Objaverse3DMetaData, seed=42): 102 | super().__init__(metadata, seed=seed) 103 | self.relative_positions = relative_positions 104 | 105 | def enumerate_task_plans(self, task_store: TaskStore): 106 | for target_category in tqdm(self.metadata.categories, desc="enumerating [what distance] task"): 107 | irrelevant_categories = self.metadata.get_irrelevant_categories(target_category) 108 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 109 | for attribute_type, attribute_values in attribute_dict.items(): 110 | for attribute_value in attribute_values: 111 | for grid_size in self.grid_options: 112 | for absolute_pos in self.grid_mappings[grid_size]: 113 | grid = self.grid_mappings[grid_size][absolute_pos] 114 | for reference_category in irrelevant_categories: 115 | for reference_pos in self.relative_positions: 116 | ref_grid = self._relative_grid(grid_size, grid, reference_pos) 117 | if ref_grid >= 0: 118 | if (_get_relative_distance_level(ref_grid, grid) > 0): 119 | task_plan = { 120 | 'task type' : 'what distance', 121 | 'distance type' : 'farthest', 122 | 'grid number' : grid_size, 123 | 'target category' : target_category, 124 | 'absolute position' : absolute_pos, 125 | 'reference category': reference_category, 126 | 'reference position': reference_pos, 127 | 'attribute type' : attribute_type, 128 | 'attribute value' : attribute_value, 129 | } 130 | task_store.add(task_plan) 131 | if (_get_relative_distance_level(ref_grid, grid) < _get_max_distance_level(ref_grid)): 132 | task_plan = { 133 | 'task type' : 'what distance', 134 | 'distance type' : 'closest', 135 | 'grid number' : grid_size, 136 | 'target category' : target_category, 137 | 'absolute position' : absolute_pos, 138 | 'reference category': reference_category, 139 | 'reference position': reference_pos, 140 | 'attribute type' : attribute_type, 141 | 'attribute value' : attribute_value, 142 | } 143 | task_store.add(task_plan) 144 | 145 | for grid_size in self.grid_options: 146 | for absolute_pos in self.grid_mappings[grid_size]: 147 | grid = self.grid_mappings[grid_size][absolute_pos] 148 | for reference_category in irrelevant_categories: 149 | for reference_pos in self.relative_positions: 150 | ref_grid = self._relative_grid(grid_size, grid, reference_pos) 151 | if ref_grid >= 0: 152 | if (_get_relative_distance_level(ref_grid, grid) > 0): 153 | task_plan = { 154 | 'task type' : 'what 
distance', 155 | 'distance type' : 'farthest', 156 | 'grid number' : grid_size, 157 | 'target category' : target_category, 158 | 'absolute position' : absolute_pos, 159 | 'reference category': reference_category, 160 | 'reference position': reference_pos, 161 | } 162 | task_store.add(task_plan) 163 | if (_get_relative_distance_level(ref_grid, grid) < _get_max_distance_level(ref_grid)): 164 | task_plan = { 165 | 'task type' : 'what distance', 166 | 'distance type' : 'closest', 167 | 'grid number' : grid_size, 168 | 'target category' : target_category, 169 | 'absolute position' : absolute_pos, 170 | 'reference category': reference_category, 171 | 'reference position': reference_pos, 172 | } 173 | task_store.add(task_plan) 174 | 175 | def _generate_task(self, task_plan): 176 | grid_size = task_plan['grid number'] 177 | 178 | target_category = task_plan['target category'] 179 | absolute_pos = task_plan['absolute position'] 180 | grids = [self.grid_mappings[grid_size][absolute_pos]] 181 | queries = [self._get_target_object_query(task_plan)] 182 | 183 | remaining_query = [("category", target_category, False)] 184 | 185 | reference_pos = task_plan['reference position'] 186 | reference_category = task_plan['reference category'] 187 | 188 | queries.append(self.metadata.and_query([("category", reference_category, True)])) 189 | remaining_query += [("category", reference_category, False)] 190 | 191 | ref_grid = self._relative_grid(grid_size, grids[0], reference_pos) 192 | assert ref_grid >= 0, "reference grid {} not allowed".format(ref_grid) 193 | grids.append(ref_grid) 194 | 195 | if task_plan['distance type'] == 'farthest': 196 | question = f"What is the object that is farthest from the {self.metadata.get_surfacename(reference_category)}?" 197 | else: 198 | question = f"What is the object that is closest to the {self.metadata.get_surfacename(reference_category)}?" 
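# distractor objects are sampled via remaining_query and placed so the target stays the farthest (or closest) object relative to the reference (see _make_image_metadata)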
199 | 200 | image_metadata = self._make_image_metadata( 201 | grid_size, 202 | distance_type=task_plan['distance type'], 203 | grids=grids, 204 | queries=queries, 205 | remaining_query=self.metadata.and_query(remaining_query) 206 | ) 207 | 208 | answer = self.metadata.get_surfacename(target_category) 209 | negatives = [self.metadata.get_surfacename(self.metadata.sample_category_for_object(self.rng, o, target_category)) 210 | for o in image_metadata['objects'][1:]] 211 | options = self._compose_options(answer, negatives) 212 | 213 | return question, answer, options, image_metadata 214 | 215 | 216 | class WhatAttributeDistance3DGridTaskGenerator(Distance3DGridTaskGenerator): 217 | schema = { 218 | 'task type' : 'str', 219 | 'distance type' : 'str', 220 | 'grid number' : 'int', 221 | 'target category' : 'str', 222 | 'absolute position' : 'str', 223 | 'attribute type' : 'str', 224 | 'attribute value' : 'str', 225 | 'reference category': 'str', 226 | 'reference position': 'str', 227 | } 228 | 229 | def __init__(self, metadata: Objaverse3DMetaData, seed=42): 230 | super().__init__(metadata, seed=seed) 231 | self.relative_positions = relative_positions 232 | 233 | def enumerate_task_plans(self, task_store: TaskStore): 234 | for target_category in tqdm(self.metadata.categories, desc="enumerating [what attribute distance] task"): 235 | irrelevant_categories = self.metadata.get_irrelevant_categories(target_category) 236 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 237 | for attribute_type, attribute_values in attribute_dict.items(): 238 | for attribute_value in attribute_values: 239 | for grid_size in self.grid_options: 240 | for absolute_pos in self.grid_mappings[grid_size]: 241 | grid = self.grid_mappings[grid_size][absolute_pos] 242 | for reference_category in irrelevant_categories: 243 | for reference_pos in self.relative_positions: 244 | ref_grid = self._relative_grid(grid_size, grid, reference_pos) 245 | if ref_grid >= 0: 246 | if (_get_relative_distance_level(ref_grid, grid) > 0): 247 | task_plan = { 248 | 'task type' : 'what attribute distance', 249 | 'distance type' : 'farthest', 250 | 'grid number' : grid_size, 251 | 'target category' : target_category, 252 | 'absolute position' : absolute_pos, 253 | 'reference category': reference_category, 254 | 'reference position': reference_pos, 255 | 'attribute type' : attribute_type, 256 | 'attribute value' : attribute_value, 257 | } 258 | task_store.add(task_plan) 259 | if (_get_relative_distance_level(ref_grid, grid) < _get_max_distance_level(ref_grid)): 260 | task_plan = { 261 | 'task type' : 'what attribute distance', 262 | 'distance type' : 'closest', 263 | 'grid number' : grid_size, 264 | 'target category' : target_category, 265 | 'absolute position' : absolute_pos, 266 | 'reference category': reference_category, 267 | 'reference position': reference_pos, 268 | 'attribute type' : attribute_type, 269 | 'attribute value' : attribute_value, 270 | } 271 | task_store.add(task_plan) 272 | 273 | def _generate_task(self, task_plan): 274 | grid_size = task_plan['grid number'] 275 | 276 | attribute_type = task_plan['attribute type'] 277 | 278 | absolute_pos = task_plan['absolute position'] 279 | grids = [self.grid_mappings[grid_size][absolute_pos]] 280 | 281 | queries = [self._get_target_object_query(task_plan)] 282 | 283 | reference_pos = task_plan['reference position'] 284 | reference_category = task_plan['reference category'] 285 | queries.append(self.metadata.and_query([("category", reference_category, True)])) 286 | 
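# resolve the reference object's grid from the target grid and the relative position; enumerate_task_plans only emits plans for which this grid exists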
287 | ref_grid = self._relative_grid(grid_size, grids[0], reference_pos) 288 | assert ref_grid >= 0, "reference grid {} not allowed".format(ref_grid) 289 | 290 | grids.append(ref_grid) 291 | if task_plan['distance type'] == 'farthest': 292 | question = f"What is the {attribute_type} of the object that is farthest from the {self.metadata.get_surfacename(reference_category)}?" 293 | else: 294 | question = f"What is the {attribute_type} of the object that is closest to the {self.metadata.get_surfacename(reference_category)}?" 295 | 296 | image_metadata = self._make_image_metadata( 297 | grid_size, 298 | distance_type=task_plan['distance type'], 299 | grids=grids, 300 | queries=queries, 301 | ) 302 | 303 | answer = task_plan['attribute value'] 304 | target_object = image_metadata['objects'][0] 305 | negative_query = self.metadata.and_query([ 306 | (attribute_type, a, False) for a in self.metadata.query_metadata(attribute_type, self.metadata.and_query([("object", target_object, True)])) 307 | ]) 308 | negatives = self.metadata.sample( 309 | self.rng, 310 | NUM_OPTIONS - 1, 311 | attribute_type, 312 | query=negative_query, 313 | ) 314 | options = [answer] + negatives 315 | 316 | return question, answer, options, image_metadata 317 | 318 | 319 | class WhereDistance3DGridTaskGenerator(Distance3DGridTaskGenerator): 320 | schema = { 321 | 'task type' : 'str', 322 | 'distance type' : 'str', 323 | 'grid number' : 'int', 324 | 'target category' : 'str', 325 | 'absolute position' : 'str', 326 | 'attribute type' : 'str', 327 | 'attribute value' : 'str', 328 | 'reference category': 'str', 329 | 'reference position': 'str', 330 | } 331 | 332 | def __init__(self, metadata: Objaverse3DMetaData, seed=42): 333 | super().__init__(metadata, seed=seed) 334 | self.relative_positions = relative_positions 335 | 336 | def enumerate_task_plans(self, task_store: TaskStore): 337 | for target_category in tqdm(self.metadata.categories, desc="enumerating [where distance] task"): 338 | irrelevant_categories = self.metadata.get_irrelevant_categories(target_category) 339 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 340 | for attribute_type, attribute_values in attribute_dict.items(): 341 | for attribute_value in attribute_values: 342 | for grid_size in self.grid_options: 343 | for absolute_pos in self.grid_mappings[grid_size]: 344 | grid = self.grid_mappings[grid_size][absolute_pos] 345 | for reference_category in irrelevant_categories: 346 | for reference_pos in self.relative_positions: 347 | ref_grid = self._relative_grid(grid_size, grid, reference_pos) 348 | if ref_grid >= 0: 349 | if (_get_relative_distance_level(ref_grid, grid) > 0): 350 | task_plan = { 351 | 'task type' : 'where distance', 352 | 'distance type' : 'farthest', 353 | 'grid number' : grid_size, 354 | 'target category' : target_category, 355 | 'absolute position' : absolute_pos, 356 | 'reference category': reference_category, 357 | 'reference position': reference_pos, 358 | 'attribute type' : attribute_type, 359 | 'attribute value' : attribute_value, 360 | } 361 | task_store.add(task_plan) 362 | if (_get_relative_distance_level(ref_grid, grid) < _get_max_distance_level(ref_grid)): 363 | task_plan = { 364 | 'task type' : 'where distance', 365 | 'distance type' : 'closest', 366 | 'grid number' : grid_size, 367 | 'target category' : target_category, 368 | 'absolute position' : absolute_pos, 369 | 'reference category': reference_category, 370 | 'reference position': reference_pos, 371 | 'attribute type' : attribute_type, 372 | 
'attribute value' : attribute_value, 373 | } 374 | task_store.add(task_plan) 375 | 376 | for grid_size in self.grid_options: 377 | for absolute_pos in self.grid_mappings[grid_size]: 378 | grid = self.grid_mappings[grid_size][absolute_pos] 379 | for reference_category in irrelevant_categories: 380 | for reference_pos in self.relative_positions: 381 | ref_grid = self._relative_grid(grid_size, grid, reference_pos) 382 | if ref_grid >= 0: 383 | if (_get_relative_distance_level(ref_grid, grid) > 0): 384 | task_plan = { 385 | 'task type' : 'where distance', 386 | 'distance type' : 'farthest', 387 | 'grid number' : grid_size, 388 | 'target category' : target_category, 389 | 'absolute position' : absolute_pos, 390 | 'reference category': reference_category, 391 | 'reference position': reference_pos, 392 | } 393 | task_store.add(task_plan) 394 | if (_get_relative_distance_level(ref_grid, grid) < _get_max_distance_level(ref_grid)): 395 | task_plan = { 396 | 'task type' : 'where distance', 397 | 'distance type' : 'closest', 398 | 'grid number' : grid_size, 399 | 'target category' : target_category, 400 | 'absolute position' : absolute_pos, 401 | 'reference category': reference_category, 402 | 'reference position': reference_pos, 403 | } 404 | task_store.add(task_plan) 405 | 406 | def _generate_task(self, task_plan): 407 | grid_size = task_plan['grid number'] 408 | 409 | target_category = task_plan['target category'] 410 | categories = [target_category] 411 | queries = [self._get_target_object_query(task_plan)] 412 | absolute_pos = task_plan['absolute position'] 413 | grids = [self.grid_mappings[grid_size][absolute_pos]] 414 | 415 | reference_pos = task_plan['reference position'] 416 | reference_category = task_plan['reference category'] 417 | categories.append(reference_category) 418 | queries.append(self.metadata.and_query([("category", reference_category, True)])) 419 | 420 | ref_grid = self._relative_grid(grid_size, grids[0], reference_pos) 421 | assert ref_grid >= 0, "reference grid {} not allowed".format(ref_grid) 422 | grids.append(ref_grid) 423 | 424 | if task_plan['distance type'] == 'farthest': 425 | question = f"Where is the object that is farthest from the {self.metadata.get_surfacename(reference_category)} in the image?" 426 | else: 427 | question = f"Where is the object that is closest to the {self.metadata.get_surfacename(reference_category)} in the image?" 
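# the correct option is the target's absolute grid position; the other positions in the grid are used as distractors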
428 | answer = absolute_pos 429 | negatives = [o for o in self.grid_mappings[grid_size].keys() if o != answer] 430 | 431 | options = self._compose_options(answer, negatives) 432 | image_metadata = self._make_image_metadata( 433 | grid_size, 434 | distance_type=task_plan['distance type'], 435 | grids=grids, 436 | queries=queries, 437 | remaining_query=self.metadata.and_query([("category", c, False) for c in categories]) 438 | ) 439 | 440 | return question, answer, options, image_metadata 441 | -------------------------------------------------------------------------------- /tma/videoqa/tabletop_3d/run_blender.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import math 4 | import os 5 | import sys 6 | import urllib.request 7 | from math import radians 8 | 9 | try: 10 | import bpy 11 | from mathutils import Vector, Matrix, Quaternion, Euler 12 | except ImportError: 13 | pass 14 | 15 | 16 | def get_exact_frame(current_keyframe_idx, total_num_frames): 17 | return (current_keyframe_idx) * (total_num_frames // 4) + 1 18 | 19 | 20 | def rotate(obj, degree): 21 | """Rotates around the z axis by theta""" 22 | degree = -degree 23 | bpy.ops.object.select_all(action='DESELECT') 24 | obj.select_set(True) 25 | bpy.context.view_layer.objects.active = obj 26 | radian = radians(degree) 27 | bpy.context.object.rotation_mode = 'XYZ' 28 | rot_x, rot_y, rot_z = obj.rotation_euler 29 | obj.rotation_euler = Euler((rot_x, rot_y, rot_z + radian)) 30 | freeze_transformation(obj) 31 | 32 | 33 | def rotate_and_keyframe(obj, degree, frame): 34 | degree = -degree 35 | bpy.ops.object.select_all(action='DESELECT') 36 | obj.select_set(True) 37 | bpy.context.scene.frame_set(frame) 38 | 39 | bpy.context.view_layer.objects.active = obj 40 | bpy.ops.object.origin_set(type='ORIGIN_GEOMETRY') 41 | bpy.context.object.rotation_mode = 'XYZ' 42 | radian = radians(degree) 43 | rot_x, rot_y, rot_z = obj.rotation_euler 44 | obj.rotation_euler = Euler((rot_x, rot_y, rot_z + radian)) 45 | obj.keyframe_insert(data_path="rotation_euler", frame=frame) 46 | bpy.ops.object.select_all(action='DESELECT') 47 | 48 | 49 | def reset_scene(): 50 | # delete everything that isn't part of a camera or a light 51 | bpy.ops.object.select_all(action="SELECT") 52 | for obj in bpy.data.objects: 53 | bpy.data.objects.remove(obj, do_unlink=True) 54 | bpy.ops.ptcache.free_bake_all() 55 | 56 | 57 | def select_hierarchy(obj): 58 | """Recursively select an object and all of its descendants.""" 59 | obj.select_set(True) 60 | for child in obj.children: 61 | select_hierarchy(child) 62 | 63 | 64 | def load_object(object_path: str) -> None: 65 | """Loads a glb model into the scene.""" 66 | bpy.ops.object.select_all(action='DESELECT') 67 | if object_path.endswith(".glb"): 68 | bpy.ops.import_scene.gltf(filepath=object_path, merge_vertices=True) 69 | elif object_path.endswith(".fbx"): 70 | bpy.ops.import_scene.fbx(filepath=object_path) 71 | else: 72 | raise ValueError(f"Unsupported file type: {object_path}") 73 | 74 | base_name = os.path.basename(object_path) 75 | object_name, _ = os.path.splitext(base_name) 76 | bpy.context.view_layer.objects.active.name = object_name 77 | bpy.ops.object.select_all(action='DESELECT') 78 | 79 | obj = bpy.data.objects.get(object_name) 80 | # bpy.context.view_layer.objects.active = obj 81 | select_hierarchy(obj) 82 | bpy.ops.object.transform_apply(location=True, rotation=True, scale=True) 83 | meshes = [obj for obj in bpy.context.selected_objects if obj.type == 
"MESH"] 84 | non_meshes = [obj for obj in bpy.context.selected_objects if obj.type != "MESH"] 85 | bpy.ops.object.select_all(action="DESELECT") 86 | 87 | # delete non-mesh and consolidate 88 | 89 | for obj in non_meshes: 90 | obj.select_set(True) 91 | bpy.ops.object.delete() 92 | bpy.ops.object.select_all(action="DESELECT") 93 | for obj in meshes: 94 | obj.select_set(True) 95 | bpy.context.view_layer.objects.active = meshes[0] 96 | bpy.ops.object.join() 97 | bpy.context.view_layer.objects.active.name = object_name 98 | bpy.ops.object.origin_set(type='GEOMETRY_ORIGIN', center='BOUNDS') 99 | 100 | bpy.ops.object.select_all(action="DESELECT") 101 | 102 | return object_name 103 | 104 | 105 | def scene_meshes(): 106 | for obj in bpy.context.scene.objects.values(): 107 | if isinstance(obj.data, (bpy.types.Mesh)): 108 | yield obj 109 | 110 | 111 | def download_uid(uid_path, save_dir): 112 | return download_object(uid_path, save_dir) 113 | 114 | 115 | def download_object(object_url, save_dir) -> str: 116 | """Download the object and return the path.""" 117 | # uid = uuid.uuid4() 118 | uid = object_url.split("/")[-1].split(".")[0] 119 | tmp_local_path = os.path.join(save_dir, f"{uid}.glb" + ".tmp") 120 | local_path = os.path.join(save_dir, f"{uid}.glb") 121 | # wget the file and put it in local_path 122 | os.makedirs(os.path.dirname(tmp_local_path), exist_ok=True) 123 | urllib.request.urlretrieve(object_url, tmp_local_path) 124 | os.rename(tmp_local_path, local_path) 125 | # get the absolute path 126 | local_path = os.path.abspath(local_path) 127 | return local_path 128 | 129 | 130 | def scene_bbox(single_obj=None, ignore_matrix=False): 131 | bbox_min = (math.inf,) * 3 132 | bbox_max = (-math.inf,) * 3 133 | found = False 134 | for obj in scene_meshes() if single_obj is None else [single_obj]: 135 | found = True 136 | for coord in obj.bound_box: 137 | coord = Vector(coord) 138 | if not ignore_matrix: 139 | coord = obj.matrix_world @ coord 140 | bbox_min = tuple(min(x, y) for x, y in zip(bbox_min, coord)) 141 | bbox_max = tuple(max(x, y) for x, y in zip(bbox_max, coord)) 142 | if not found: 143 | raise RuntimeError("no objects in scene to compute bounding box for") 144 | return Vector(bbox_min), Vector(bbox_max) 145 | 146 | 147 | def scene_root_objects(): 148 | for obj in bpy.context.scene.objects.values(): 149 | if not obj.parent: 150 | yield obj 151 | 152 | 153 | def freeze_transformation(obj): 154 | bpy.context.view_layer.objects.active = obj 155 | obj.select_set(True) 156 | bpy.ops.object.transform_apply(location=True, rotation=True, scale=True) 157 | bpy.ops.object.select_all(action='DESELECT') 158 | 159 | 160 | def scale(obj, scale_factor): 161 | bpy.ops.object.select_all(action='DESELECT') 162 | obj.select_set(True) 163 | bpy.ops.transform.resize(value=(scale_factor, scale_factor, scale_factor)) 164 | bpy.ops.object.select_all(action='DESELECT') 165 | freeze_transformation(obj) 166 | 167 | 168 | def get_3d_dimensions(obj): 169 | # pdb.set_trace() 170 | max_x, max_y, max_z = float("-inf"), float("-inf"), float("-inf") 171 | min_x, min_y, min_z = float("inf"), float("inf"), float("inf") 172 | 173 | for vertex in obj.data.vertices: 174 | v_world = obj.matrix_world @ vertex.co 175 | max_x, max_y, max_z = max(max_x, v_world.x), max(max_y, v_world.y), max(max_z, v_world.z) 176 | min_x, min_y, min_z = min(min_x, v_world.x), min(min_y, v_world.y), min(min_z, v_world.z) 177 | 178 | return (max_x - min_x, max_y - min_y, max_z - min_z) 179 | 180 | 181 | def normalize_object(obj, factor=1.0): 182 | 
max_dimension = max(get_3d_dimensions(obj)) 183 | scale_factor = factor * (1 / max_dimension) 184 | scale(obj, scale_factor) 185 | 186 | 187 | def move_to_xy(obj, x, y): 188 | min_z = float('inf') 189 | for vertex in obj.data.vertices: 190 | z = obj.matrix_world @ vertex.co 191 | min_z = min(min_z, z.z) 192 | obj.location -= Vector((0, 0, min_z)) 193 | freeze_transformation(obj) 194 | 195 | # move location x,y to sampled box center 196 | new_location = Vector((x, y, obj.location[2])) 197 | obj.location = new_location 198 | freeze_transformation(obj) 199 | 200 | 201 | def move_to_xy_at_frame(obj, movement, frame): 202 | # Set the scene to the specific frame 203 | x, y = movement[0], movement[1] 204 | bpy.context.scene.frame_set(frame) 205 | new_location = Vector((x, y, 0)) 206 | obj.location = obj.location + new_location 207 | obj.keyframe_insert(data_path="location", frame=frame) 208 | 209 | 210 | def normalize_scene(): 211 | bbox_min, bbox_max = scene_bbox() 212 | scale = 1 / max(bbox_max - bbox_min) 213 | for obj in scene_root_objects(): 214 | obj.scale = obj.scale * scale 215 | # Apply scale to matrix_world. 216 | bpy.context.view_layer.update() 217 | bbox_min, bbox_max = scene_bbox() 218 | offset = -(bbox_min + bbox_max) / 2 219 | for obj in scene_root_objects(): 220 | obj.matrix_world.translation += offset 221 | bpy.ops.object.select_all(action="DESELECT") 222 | 223 | 224 | def setup_plane_and_background(plane_texture_path, hdri_path): 225 | # load plane 226 | plane_name = load_object(plane_texture_path) 227 | plane = bpy.data.objects.get(plane_name) 228 | scale(plane, 0.5) 229 | 230 | # load light map 231 | print(f"HDRI PATH: {hdri_path}") 232 | bpy.ops.image.open(filepath=hdri_path) 233 | if bpy.data.worlds.get("World") is None: 234 | bpy.data.worlds.new("World") 235 | 236 | bpy.context.scene.world = bpy.data.worlds["World"] 237 | 238 | bpy.context.scene.world.use_nodes = True 239 | tree = bpy.context.scene.world.node_tree 240 | tree.nodes.clear() 241 | 242 | tex_env = tree.nodes.new(type="ShaderNodeTexEnvironment") 243 | tex_env.image = bpy.data.images[hdri_path.split('/')[-1]] # Image name is typically the last part of the path 244 | background = tree.nodes.new(type="ShaderNodeBackground") 245 | output = tree.nodes.new(type="ShaderNodeOutputWorld") 246 | 247 | tree.links.new(tex_env.outputs[0], background.inputs[0]) 248 | tree.links.new(background.outputs[0], output.inputs[0]) 249 | 250 | return plane_texture_path + " " + hdri_path 251 | 252 | 253 | def setup_camera_and_lights( 254 | sun_x, 255 | sun_y, 256 | sun_energy, 257 | key_light_horizontal_angle, 258 | fill_light_horizontal_angle, 259 | key_light_vertical_angle, 260 | fill_light_vertical_angle 261 | ): 262 | # for seeting up the three point lighting, we mostly follow https://courses.cs.washington.edu/courses/cse458/05au/reading/3point_lighting.pdf 263 | # in order to keep lights and camera on the hemisphere pointing to origin, we use a hierarchy of empties 264 | 265 | # create the sun 266 | 267 | bpy.ops.object.light_add(type="SUN") 268 | sun = bpy.context.active_object 269 | sun.rotation_euler = Euler((sun_x, sun_y, 0), "XYZ") 270 | sun.data.energy = sun_energy 271 | 272 | # create global empty 273 | 274 | bpy.ops.object.empty_add(type='PLAIN_AXES', align='WORLD', location=(0, 0, 0), scale=(1, 1, 1)) 275 | x_rot, y_rot, z_rot = radians(90), radians(0), radians(-90) 276 | empty = bpy.context.scene.objects.get("Empty") 277 | 278 | # create camera 279 | 280 | # radius = random.uniform(1.8,2.2) 281 | radius = 2.5 282 | 283 | 
bpy.ops.object.camera_add(enter_editmode=False, align='VIEW', location=(-radius, 0, 0), rotation=Euler((x_rot, y_rot, z_rot), "XYZ"), scale=(1, 1, 1)) 284 | cam = bpy.context.scene.objects.get("Camera") 285 | cam.data.lens = 35 286 | cam.data.sensor_width = 32 287 | bpy.context.scene.camera = cam 288 | 289 | # create camera empty 290 | 291 | bpy.ops.object.empty_add(type='PLAIN_AXES', align='WORLD', location=(0, 0, 0), scale=(1, 1, 1)) 292 | x_rot, y_rot, z_rot = radians(90), radians(0), radians(-90) 293 | cam_empty = bpy.context.scene.objects.get("Empty.001") 294 | cam_empty.name = "camera_empty" 295 | 296 | # make camera empty parent of camera 297 | 298 | bpy.ops.object.select_all(action='DESELECT') 299 | cam.select_set(True) 300 | cam_empty.select_set(True) 301 | bpy.context.view_layer.objects.active = cam_empty 302 | bpy.ops.object.parent_set() 303 | bpy.ops.object.select_all(action='DESELECT') 304 | 305 | # make camera empty parent of global empty 306 | 307 | bpy.ops.object.select_all(action='DESELECT') 308 | cam_empty.select_set(True) 309 | empty.select_set(True) 310 | bpy.context.view_layer.objects.active = empty 311 | bpy.ops.object.parent_set() 312 | bpy.ops.object.select_all(action='DESELECT') 313 | 314 | light_names = ["key_light", "fill_light", "back_light"] 315 | light_energies = [1000., 300., 500.] 316 | 317 | for light_name, light_energy in zip(light_names, light_energies): 318 | # create light empty 319 | 320 | empty_name = light_name + "_empty" 321 | bpy.ops.object.empty_add(type='PLAIN_AXES', align='WORLD', location=(0, 0, 0), scale=(1, 1, 1)) 322 | x_rot, y_rot, z_rot = radians(90), radians(0), radians(-90) 323 | light_empty = bpy.context.scene.objects.get("Empty.001") 324 | light_empty.name = empty_name 325 | 326 | # parent light empty to main (camera) empty 327 | 328 | bpy.ops.object.select_all(action='DESELECT') 329 | light_empty.select_set(True) 330 | empty.select_set(True) 331 | bpy.context.view_layer.objects.active = empty 332 | bpy.ops.object.parent_set() 333 | bpy.ops.object.select_all(action='DESELECT') 334 | 335 | # create light 336 | 337 | x_loc, y_loc, z_loc = -radius, 0, 0 338 | bpy.ops.object.light_add(type='POINT', radius=1, align='WORLD', location=(x_loc, y_loc, z_loc), rotation=Euler((x_rot, y_rot, z_rot), "XYZ"), scale=(1, 1, 1)) 339 | bpy.data.objects["Point"].name = light_name 340 | light = bpy.data.objects[light_name] 341 | light.data.energy = light_energy 342 | # light.data.size = 0.5 343 | 344 | # parent light empty to light 345 | 346 | bpy.ops.object.select_all(action='DESELECT') 347 | light.select_set(True) 348 | light_empty.select_set(True) 349 | bpy.context.view_layer.objects.active = light_empty 350 | bpy.ops.object.parent_set() 351 | bpy.ops.object.select_all(action='DESELECT') 352 | 353 | # rotate camera and lights around the z-axis 354 | 355 | z_random_rot = radians(90) # radians(random.uniform(0,360)) 356 | empty.rotation_euler = Euler((0, 0, z_random_rot)) 357 | 358 | # # raise the camera while having it point to origin 359 | 360 | # cam_y_random_rot = radians(random.uniform(10,50)) 361 | # cam_empty.rotation_euler = Euler((0,cam_y_random_rot,0),"XYZ") 362 | 363 | bpy.context.view_layer.update() 364 | 365 | back_light_horizontal_angle = radians(180) 366 | light_horizontal_angles = [key_light_horizontal_angle, fill_light_horizontal_angle, back_light_horizontal_angle] 367 | for light_angle, light_name in zip(light_horizontal_angles, light_names): 368 | light_empty = bpy.data.objects[light_name + "_empty"] 369 | global_z = 
(light_empty.matrix_world.inverted() @ Vector((0.0, 0.0, 1.0, 0.0)))[:3] 370 | quat = Quaternion(global_z, light_angle) 371 | light_empty.rotation_euler = quat.to_euler() 372 | 373 | back_light_vertical_angle = 0 374 | light_vertical_angles = [key_light_vertical_angle, fill_light_vertical_angle, back_light_vertical_angle] 375 | # light_vertical_angles = [radians(-45)]*3 376 | 377 | for light_angle, light_name in zip(light_vertical_angles, light_names): 378 | light_empty = bpy.data.objects[light_name + "_empty"] 379 | global_x = (light_empty.matrix_world.inverted() @ Vector((1.0, 0.0, 0.0, 0.0)))[:3] 380 | quat = Quaternion(global_x, light_angle) 381 | euler_add = quat.to_euler() 382 | euler_current = light_empty.rotation_euler 383 | new_euler = Euler((euler_add[0] + euler_current[0], euler_add[1] + euler_current[1], euler_add[2] + euler_current[2])) 384 | light_empty.rotation_euler = new_euler 385 | 386 | # bpy.context.view_layer.update() 387 | 388 | return cam, empty 389 | 390 | 391 | def render_animation(fp): 392 | bpy.context.scene.render.filepath = fp 393 | bpy.context.scene.render.image_settings.file_format = 'FFMPEG' 394 | bpy.context.scene.render.ffmpeg.format = 'MPEG4' 395 | bpy.context.scene.render.ffmpeg.codec = 'H264' 396 | bpy.context.scene.render.ffmpeg.constant_rate_factor = 'MEDIUM' 397 | bpy.ops.render.render(animation=True) 398 | 399 | 400 | def setup_renderer(H, W, use_cpu=False): 401 | scene = bpy.context.scene 402 | render = bpy.context.scene.render 403 | 404 | render.engine = "CYCLES" 405 | render.image_settings.file_format = "PNG" 406 | render.image_settings.color_mode = "RGBA" 407 | render.resolution_x = W 408 | render.resolution_y = H 409 | render.resolution_percentage = 100 410 | 411 | scene.cycles.device = "CPU" if use_cpu else "GPU" 412 | scene.cycles.samples = 10 if use_cpu else 128 413 | scene.cycles.diffuse_bounces = 1 414 | scene.cycles.glossy_bounces = 1 415 | scene.cycles.transparent_max_bounces = 3 416 | scene.cycles.transmission_bounces = 3 417 | scene.cycles.filter_width = 0.01 418 | scene.cycles.use_denoising = True 419 | scene.render.film_transparent = False 420 | 421 | bpy.context.preferences.addons["cycles"].preferences.get_devices() 422 | # Set the device_type 423 | bpy.context.preferences.addons[ 424 | "cycles" 425 | ].preferences.compute_device_type = "METAL" if use_cpu else "CUDA" 426 | bpy.context.scene.view_settings.view_transform = 'Filmic' 427 | 428 | 429 | # def randomize_camera_view(axis): 430 | # euler_y = radians(random.uniform(-90, 90)) 431 | # euler_z = radians(random.uniform(0, 360)) 432 | # axis.rotation_euler = Euler((0, euler_y, euler_z)) 433 | 434 | 435 | def run_render(metadata, save_image_path, use_cpu): 436 | reset_scene() 437 | 438 | bpy.context.scene.render.fps = metadata['fps'] 439 | bpy.context.scene.frame_start = 1 440 | bpy.context.scene.frame_end = metadata['total_num_frames'] 441 | 442 | objs = [] 443 | for uid in metadata["objects"]: 444 | object_path = metadata["object_path"][uid] 445 | objs.append(bpy.data.objects.get(load_object(object_path))) 446 | 447 | grid_number = metadata["grid number"] 448 | 449 | if grid_number == 2: 450 | locations = { 451 | 0: [0.7, 0.5], 452 | 1: [0.7, -0.5], 453 | 2: [-0.6, 0.5], 454 | 3: [-0.6, -0.5] 455 | } 456 | scale_factor = 1 / 2 457 | elif grid_number == 3: 458 | locations = { 459 | 0: [0.9, 0.6], 460 | 1: [0.9, 0], 461 | 2: [0.9, -0.6], 462 | 3: [0.0, 0.6], 463 | 4: [0.0, 0.0], 464 | 5: [0.0, -0.6], 465 | 6: [-0.9, 0.6], 466 | 7: [-0.9, 0.0], 467 | 8: [-0.9, -0.6] 468 | } 469 | 
scale_factor = 1 / 3 470 | else: 471 | raise ValueError(f"Expected grid number to be 2 or 3 but got {grid_number}") 472 | 473 | # process rotate 474 | for idx, obj in enumerate(objs): 475 | rotate(obj, degree=metadata['object_angles'][idx]) 476 | 477 | # process scale 478 | if "sizes" in metadata: 479 | for idx, obj in enumerate(objs): 480 | normalize_object(obj, factor=metadata['sizes'][idx] * scale_factor) 481 | else: 482 | for obj in objs: 483 | normalize_object(obj, factor=scale_factor) 484 | 485 | for pos, obj in zip(metadata["grids"], objs): 486 | x, y = locations[pos] 487 | move_to_xy(obj, x, y) 488 | 489 | # set the first keyframe of video 490 | for obj in objs: 491 | rotate_and_keyframe(obj, 0, 1) 492 | move_to_xy_at_frame(obj, (0, 0), 1) 493 | 494 | # set other keyframes based on the metadata 495 | for idx, obj in enumerate(objs): 496 | for keyframe_order, keyframe_info in enumerate(metadata["keyframes"][idx]): 497 | if "rotation" in keyframe_info: 498 | rotate_and_keyframe(obj, keyframe_info["rotation"], get_exact_frame(keyframe_order, metadata['total_num_frames'])) 499 | if "movement" in keyframe_info: 500 | move_to_xy_at_frame(obj, keyframe_info["movement"], get_exact_frame(keyframe_order, metadata['total_num_frames'])) 501 | 502 | blender_config = metadata["blender_config"] 503 | 504 | setup_plane_and_background(blender_config["plane_texture_path"], blender_config["hdri_path"]) 505 | cam, axis = setup_camera_and_lights( 506 | blender_config["sun_x"], 507 | blender_config["sun_y"], 508 | blender_config["sun_energy"], 509 | blender_config["key_light_horizontal_angle"], 510 | blender_config["fill_light_horizontal_angle"], 511 | blender_config["key_light_vertical_angle"], 512 | blender_config["fill_light_vertical_angle"] 513 | ) 514 | axis.rotation_euler = Euler((0, radians(45), 0)) 515 | setup_renderer(H=metadata["VIDEO_H"], W=metadata["VIDEO_W"], use_cpu=use_cpu) 516 | render_animation(save_image_path) 517 | 518 | 519 | if __name__ == "__main__": 520 | parser = argparse.ArgumentParser() 521 | parser.add_argument( 522 | "--save_local", 523 | type=str, 524 | default="" 525 | ) 526 | parser.add_argument( 527 | "--save_video_path", 528 | type=str, 529 | default="render.png" 530 | ) 531 | parser.add_argument( 532 | "--json_file", 533 | type=str, 534 | default="video_metadata.json" 535 | ) 536 | 537 | parser.add_argument( 538 | "--use_cpu", 539 | action="store_true", 540 | default=False 541 | ) 542 | 543 | argv = sys.argv[sys.argv.index("--") + 1:] 544 | args = parser.parse_args(argv) 545 | 546 | with open(args.json_file, "r") as f: 547 | metadata = json.load(f) 548 | 549 | run_render(metadata, args.save_video_path, args.use_cpu) 550 | --------------------------------------------------------------------------------
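run_blender.py above is meant to be driven by Blender's own Python interpreter. Below is a minimal sketch of a headless invocation, not part of the repository: it assumes "blender" is available on PATH, that a video_metadata.json has been produced by one of the video task generators, and it uses a placeholder output path. Arguments after "--" are forwarded to the script's argparse, matching how it slices sys.argv.

import subprocess

# Hypothetical driver: render a single video with Blender in background mode.
# The metadata and output paths below are illustrative placeholders, not repo defaults.
subprocess.run(
    [
        "blender", "--background",
        "--python", "tma/videoqa/tabletop_3d/run_blender.py",
        "--",                                  # everything after this is parsed by run_blender.py
        "--json_file", "video_metadata.json",  # keyframed scene description (objects, grids, blender_config)
        "--save_video_path", "render.mp4",     # FFMPEG/MPEG4 output, see render_animation()
        "--use_cpu",                           # omit to render on GPU (CUDA) with a higher sample count
    ],
    check=True,
)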