├── tma ├── __init__.py ├── imageqa │ ├── __init__.py │ ├── scene_graph │ │ ├── __init__.py │ │ └── single_image_task.py │ ├── sticker_2d │ │ ├── __init__.py │ │ └── utils.py │ ├── tabletop_3d │ │ ├── __init__.py │ │ ├── single_image_task.py │ │ ├── utils.py │ │ ├── run_blender.py │ │ ├── size_single_image_task.py │ │ └── distance_single_image_task.py │ └── metadata.py ├── videoqa │ ├── __init__.py │ ├── scene_graph │ │ ├── __init__.py │ │ └── single_video_task.py │ ├── tabletop_3d │ │ ├── __init__.py │ │ ├── single_video_task.py │ │ ├── utils.py │ │ ├── movement_single_video_task.py │ │ └── run_blender.py │ └── metadata.py ├── models │ ├── __init__.py │ └── qa_model │ │ ├── __init__.py │ │ ├── prompt.py │ │ ├── base_qa_model.py │ │ └── videoqa_model.py ├── constant.py ├── metadata.py ├── base.py └── task_store.py ├── teaser.png ├── assets ├── 2024-imageqa-result.png ├── 2024-videoqa-result.png ├── 2024vsrandom-imageqa.png ├── 2024vsrandom-videoqa.png ├── random-imageqa-result.png └── random-videoqa-result.png ├── requirements.txt ├── .gitignore ├── annotations ├── relation_to_type.json └── attribute_category.json ├── LICENSE └── README.md /tma/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tma/imageqa/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tma/videoqa/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tma/models/__init__.py: -------------------------------------------------------------------------------- 1 | class Model: 2 | model_name: str 3 | -------------------------------------------------------------------------------- /tma/imageqa/scene_graph/__init__.py: -------------------------------------------------------------------------------- 1 | from .single_image_task import * 2 | -------------------------------------------------------------------------------- /tma/imageqa/sticker_2d/__init__.py: -------------------------------------------------------------------------------- 1 | from .single_image_task import * 2 | -------------------------------------------------------------------------------- /tma/videoqa/scene_graph/__init__.py: -------------------------------------------------------------------------------- 1 | from .single_video_task import * 2 | -------------------------------------------------------------------------------- /teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JieyuZ2/TaskMeAnything/HEAD/teaser.png -------------------------------------------------------------------------------- /assets/2024-imageqa-result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JieyuZ2/TaskMeAnything/HEAD/assets/2024-imageqa-result.png -------------------------------------------------------------------------------- /assets/2024-videoqa-result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JieyuZ2/TaskMeAnything/HEAD/assets/2024-videoqa-result.png -------------------------------------------------------------------------------- /assets/2024vsrandom-imageqa.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/JieyuZ2/TaskMeAnything/HEAD/assets/2024vsrandom-imageqa.png -------------------------------------------------------------------------------- /assets/2024vsrandom-videoqa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JieyuZ2/TaskMeAnything/HEAD/assets/2024vsrandom-videoqa.png -------------------------------------------------------------------------------- /assets/random-imageqa-result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JieyuZ2/TaskMeAnything/HEAD/assets/random-imageqa-result.png -------------------------------------------------------------------------------- /assets/random-videoqa-result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JieyuZ2/TaskMeAnything/HEAD/assets/random-videoqa-result.png -------------------------------------------------------------------------------- /tma/videoqa/tabletop_3d/__init__.py: -------------------------------------------------------------------------------- 1 | from .movement_single_video_task import * 2 | from .rotation_single_video_task import * 3 | -------------------------------------------------------------------------------- /tma/imageqa/tabletop_3d/__init__.py: -------------------------------------------------------------------------------- 1 | from .distance_single_image_task import * 2 | from .single_image_task import * 3 | from .size_single_image_task import * 4 | -------------------------------------------------------------------------------- /tma/constant.py: -------------------------------------------------------------------------------- 1 | NUM_OPTIONS = 4 2 | 3 | # ImageQA 4 | 5 | IMAGE_H = 512 6 | IMAGE_W = 512 7 | 8 | # VideoQA 9 | 10 | VIDEO_H = 224 11 | VIDEO_W = 224 12 | VIDEO_FPS = 4 13 | VIDEO_NUM_FRAMES = 16 14 | -------------------------------------------------------------------------------- /tma/models/qa_model/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_qa_model import QAModel, QAModelInstance 2 | from .imageqa_model import ImageQAModel, list_imageqa_models, set_imageqa_model_key 3 | from .videoqa_model import ImageQAModel4Video, VideoQAModel, list_videoqa_models 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | sentence-transformers==2.5.1 2 | transformers==4.38.1 3 | accelerate==0.27.2 4 | diskcache 5 | networkx 6 | openai 7 | pyarrow 8 | scikit-learn 9 | pandas 10 | matplotlib 11 | tiktoken 12 | einops 13 | transformers_stream_generator 14 | prefixspan 15 | dashscope 16 | oss2 17 | google.generativeai 18 | replicate 19 | decord 20 | opencv-python -------------------------------------------------------------------------------- /tma/metadata.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import networkx as nx 4 | 5 | 6 | class MetaData: 7 | """ 8 | Abstract class for metadata 9 | """ 10 | 11 | 12 | class CategoryMetaData(MetaData): 13 | def __init__(self): 14 | super().__init__() 15 | 16 | self.taxonomy = None 17 | self.categories = None 18 | self.category_info = None 19 | 20 | def check_category_exists(self, cateid): 
21 | return cateid in self.categories 22 | 23 | def get_surfacename(self, node): 24 | return self.category_info[node]['surface_name'][0] 25 | 26 | def get_relevant_categories(self, cateid): 27 | return set(nx.descendants(self.taxonomy, cateid)) | set(nx.ancestors(self.taxonomy, cateid)) | {cateid} 28 | 29 | def get_irrelevant_categories(self, cateid): 30 | if isinstance(cateid, List): 31 | relevant_categories = set() 32 | for c in cateid: 33 | relevant_categories |= self.get_relevant_categories(c) 34 | else: 35 | relevant_categories = self.get_relevant_categories(cateid) 36 | return set(self.categories) - relevant_categories 37 | -------------------------------------------------------------------------------- /tma/videoqa/metadata.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pickle 4 | 5 | from ..imageqa.metadata import Objaverse3DMetaData 6 | from ..metadata import MetaData 7 | 8 | 9 | class ObjaverseVideoMetaData(Objaverse3DMetaData): 10 | pass 11 | 12 | 13 | def load_video_scene_graph(video_scene_graph_folder): 14 | video_folder = os.path.join(video_scene_graph_folder, "Charades_v1_480") 15 | scene_graphs = json.load(open(os.path.join(video_scene_graph_folder, "video_scene_graph/video_scene_graph.json"))) 16 | idx2name = pickle.load(open(os.path.join(video_scene_graph_folder, "video_scene_graph/idx2name.pkl"), "rb")) 17 | objects = pickle.load(open(os.path.join(video_scene_graph_folder, "video_scene_graph/objects.pkl"), "rb")) 18 | actions = pickle.load(open(os.path.join(video_scene_graph_folder, "video_scene_graph/actions.pkl"), "rb")) 19 | spatial_relations = pickle.load(open(os.path.join(video_scene_graph_folder, "video_scene_graph/spatial_relations.pkl"), "rb")) 20 | contact_relations = pickle.load(open(os.path.join(video_scene_graph_folder, "video_scene_graph/contact_relations.pkl"), "rb")) 21 | return video_folder, scene_graphs, idx2name, objects, actions, spatial_relations, contact_relations 22 | 23 | 24 | class VideoSceneGraphMetaData(MetaData): 25 | def __init__(self, path_to_metadata, video_scene_graph_folder): 26 | super().__init__() 27 | # Video scene graphs use indices (e.g. r1, o1) to represent relations, objects, and actions; idx2name maps each index to its name. 
28 | self.image_folder, self.video_scene_graphs, self.idx2name, self.objects, self.actions, self.spatial_relations, self.contact_relations = ( 29 | load_video_scene_graph(video_scene_graph_folder)) 30 | 31 | def get_video_path(self, video_scene_graph_id): 32 | return os.path.join(self.image_folder, video_scene_graph_id + ".mp4") 33 | -------------------------------------------------------------------------------- /tma/base.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import numpy as np 4 | 5 | from .constant import NUM_OPTIONS 6 | from .metadata import MetaData 7 | from .task_store import TaskStore 8 | 9 | 10 | class TaskGenerator: 11 | schema = {} 12 | 13 | def __init__(self, metadata: MetaData, seed=42): 14 | self.metadata = metadata 15 | self.rng = np.random.default_rng(seed=seed) 16 | 17 | def _compose_options(self, answer, negatives): 18 | if len(negatives) > NUM_OPTIONS - 1: 19 | negatives = self.rng.choice(negatives, NUM_OPTIONS - 1, replace=False).tolist() 20 | options = [answer] + negatives 21 | return options 22 | 23 | def _task_plan_to_str(self, task_plan) -> str: 24 | "(Abstract method) convert task plan to string for task embedding" 25 | 26 | def enumerate_task_plans(self, task_store: TaskStore): 27 | "(Abstract method) enumerate task plan" 28 | 29 | def generate(self, task_plan, return_data=True, seed=None): 30 | "(Abstract method) enumerate task" 31 | 32 | 33 | class JointTaskGenerator: 34 | def __init__(self, metadata: MetaData, generators: Dict, seed=42): 35 | self.generators = { 36 | k: v(metadata, seed=seed) for k, v in generators.items() 37 | } 38 | self.stats = {generator_type: 0 for generator_type in generators} 39 | self.schema = {} 40 | for generator_type, generator in self.generators.items(): 41 | self.schema.update(generator.schema) 42 | 43 | def enumerate_task_plans(self, task_store: TaskStore): 44 | for generator_type, generator in self.generators.items(): 45 | before = len(task_store) 46 | generator.enumerate_task_plans(task_store) 47 | self.stats[generator_type] = len(task_store) - before 48 | print(f"Generated [{self.stats[generator_type]}] {generator_type} tasks") 49 | task_store.dump() 50 | 51 | def generate(self, task_plan, return_data=True, seed=None): 52 | return self.generators[task_plan['task type']].generate(task_plan, return_data=return_data, seed=seed) 53 | -------------------------------------------------------------------------------- /tma/models/qa_model/prompt.py: -------------------------------------------------------------------------------- 1 | def succinct_prompt(question, choices=[]): 2 | if len(choices) == 0: 3 | prompt = question 4 | else: 5 | choices = '\n'.join(choices) 6 | prompt = (f"{question}\n" 7 | f"Select from the following choices.\n" 8 | f"{choices}") 9 | 10 | return prompt 11 | 12 | 13 | #################################################################################################### 14 | # videoqa 15 | #################################################################################################### 16 | 17 | 18 | def detailed_videoqa_prompt(question, choices=[]): 19 | if len(choices) == 0: 20 | prompt = f"Based on the video, answer the question. 
Question: {question} Answer:" 21 | else: 22 | prompt = (f"Based on the video, output the best option for the question.\n" 23 | f"You must only output the option.\n" 24 | f"Question: {question}\nOptions: {' '.join(choices)}\nBest option:(") 25 | return prompt 26 | 27 | 28 | def detailed_video2imageqa_prompt(question, choices=[]): 29 | if len(choices) == 0: 30 | prompt = f"This is a series of images sampled at equal intervals from the beginning to the end of a video, based on the series of images, answer the question. Question: {question} Answer:" 31 | else: 32 | prompt = (f"This is a series of images sampled at equal intervals from the beginning to the end of a video, based on the series of images, output the best option for the question.\n" 33 | f"You must only output the option.\n" 34 | f"Question: {question}\nOptions: {' '.join(choices)}\nBest option:(") 35 | return prompt 36 | 37 | 38 | #################################################################################################### 39 | # imageqa 40 | #################################################################################################### 41 | 42 | def detailed_imageqa_prompt(question, choices=[]): 43 | if len(choices) == 0: 44 | prompt = f"Based on the image, answer the question. Question: {question} Answer:" 45 | else: 46 | prompt = (f"Based on the image, output the best option for the question.\n" 47 | f"You must only output the option.\n" 48 | f"Question: {question}\nOptions: {' '.join(choices)}\nBest option:(") 49 | return prompt 50 | -------------------------------------------------------------------------------- /tma/task_store.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pyarrow as pa 3 | import pyarrow.parquet as pq 4 | 5 | pa_schema_map = { 6 | 'str' : pa.string(), 7 | 'int' : pa.int64(), 8 | 'list': pa.list_(pa.string()), 9 | } 10 | 11 | pd_schema_map = { 12 | 'str' : 'string', 13 | 'int' : 'Int64', 14 | 'list': 'object', 15 | } 16 | 17 | 18 | def get_pa_schema(schema): 19 | return pa.schema([(k, pa_schema_map[v]) for k, v in schema.items()]) 20 | 21 | 22 | def get_pd_schema(schema): 23 | return {k: pd_schema_map[v] for k, v in schema.items()} 24 | 25 | 26 | class TaskStore: 27 | 28 | def __init__(self, schema, output_file=None, buffer_size=1e8): 29 | self.columns = list(schema.keys()) 30 | self.dtypes = list(schema.values()) 31 | self.buffer = [] 32 | self.buffer_size = buffer_size 33 | self.output_file = output_file 34 | if output_file is None: 35 | self.schema = get_pd_schema(schema) 36 | self.task_plan_df = pd.DataFrame({k: pd.Series(dtype=v) for k, v in self.schema.items()}) 37 | else: 38 | print(f'Writing to {output_file}') 39 | self.counter = 0 40 | self.schema = get_pa_schema(schema) 41 | self.parquet_writer = pq.ParquetWriter(output_file, schema=self.schema) 42 | 43 | def _update_buffer(self): 44 | if len(self.buffer) > self.buffer_size: 45 | self.dump() 46 | 47 | def dump(self): 48 | if len(self.buffer) > 0: 49 | if self.output_file is None: 50 | self.task_plan_df = pd.concat( 51 | [self.task_plan_df, pd.DataFrame(self.buffer, columns=self.columns).astype(self.schema, errors='ignore')], 52 | ignore_index=True, 53 | sort=False 54 | ) 55 | else: 56 | self.parquet_writer.write_table(pa.Table.from_pylist(self.buffer, schema=self.schema)) 57 | self.counter += len(self.buffer) 58 | self.buffer = [] 59 | 60 | def add_many(self, xs): 61 | self.buffer.extend(xs) 62 | self._update_buffer() 63 | 64 | def add(self, x): 65 | 
self.buffer.append(x) 66 | self._update_buffer() 67 | 68 | def __len__(self): 69 | if self.output_file is None: 70 | return len(self.task_plan_df) + len(self.buffer) 71 | else: 72 | return self.counter + len(self.buffer) 73 | 74 | def return_df(self): 75 | self.dump() 76 | return self.task_plan_df 77 | 78 | def close(self): 79 | if self.output_file is not None: 80 | self.dump() 81 | self.parquet_writer.close() 82 | -------------------------------------------------------------------------------- /tma/imageqa/tabletop_3d/single_image_task.py: -------------------------------------------------------------------------------- 1 | from .utils import grid_mappings, grid_options, make_image, relative_grid, relative_position_phrase, relative_positions 2 | from ..metadata import Objaverse3DMetaData, ObjaverseMetaData 3 | from ..sticker_2d import GridTaskGenerator, HowManyGridTaskGenerator, WhatAttributeGridTaskGenerator, WhatGridTaskGenerator, WhereAttributeGridTaskGenerator, WhereGridTaskGenerator 4 | from ...constant import IMAGE_H, IMAGE_W 5 | 6 | 7 | class _3DGridTaskGenerator(GridTaskGenerator): 8 | metadata: Objaverse3DMetaData 9 | 10 | def __init__(self, metadata: ObjaverseMetaData, seed=42): 11 | super().__init__(metadata, seed=seed) 12 | self.grid_mappings = grid_mappings 13 | self.grid_options = grid_options 14 | self.relative_positions = relative_positions 15 | self.relative_position_phrase = relative_position_phrase 16 | 17 | def _make_image_metadata(self, grid_size, grids, queries, remaining_query=...): 18 | objects = [self.metadata.sample(self.rng, 1, "object", q) for q in queries] 19 | 20 | remaining_grids = [g for g in range(grid_size ** 2) if g not in grids] 21 | for _ in remaining_grids: 22 | uid = self.metadata.sample(self.rng, 1, "object", remaining_query) 23 | objects.append(uid) 24 | 25 | object_path = {k: self.metadata.get_object_path(k) for k in objects} 26 | angles = [self.metadata.sample_object_angle(self.rng, obj) for obj in objects] 27 | 28 | image_metadata = { 29 | 'grid number' : grid_size, 30 | 'objects' : objects, 31 | 'object_path' : object_path, 32 | 'object_angles' : angles, 33 | 'grids' : grids + remaining_grids, 34 | 'blender_config': self.metadata.sample_blender_configuration(self.rng) 35 | } 36 | 37 | return image_metadata 38 | 39 | def make_image(self, image_metadata): 40 | return make_image(image_metadata, self.metadata, IMAGE_H, IMAGE_W) 41 | 42 | def _relative_grid(self, grid_size, grid, reference_pos): 43 | return relative_grid(grid_size, grid, reference_pos) 44 | 45 | 46 | class What3DGridTaskGenerator(_3DGridTaskGenerator, WhatGridTaskGenerator): 47 | metadata: Objaverse3DMetaData 48 | 49 | 50 | class Where3DGridTaskGenerator(_3DGridTaskGenerator, WhereGridTaskGenerator): 51 | metadata: Objaverse3DMetaData 52 | 53 | 54 | class WhatAttribute3DGridTaskGenerator(_3DGridTaskGenerator, WhatAttributeGridTaskGenerator): 55 | metadata: Objaverse3DMetaData 56 | 57 | 58 | class WhereAttribute3DGridTaskGenerator(_3DGridTaskGenerator, WhereAttributeGridTaskGenerator): 59 | metadata: Objaverse3DMetaData 60 | 61 | 62 | class HowMany3DGridTaskGenerator(_3DGridTaskGenerator, HowManyGridTaskGenerator): 63 | metadata: Objaverse3DMetaData 64 | -------------------------------------------------------------------------------- /tma/videoqa/tabletop_3d/single_video_task.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from typing import Dict, List, Tuple 3 | 4 | import numpy as np 5 | 6 | from .utils import 
grid_mappings, grid_options, make_video, relative_grid 7 | from ..metadata import ObjaverseVideoMetaData 8 | from ...base import TaskGenerator 9 | from ...constant import VIDEO_H, VIDEO_W 10 | 11 | 12 | def check_video(video): 13 | from decord import VideoReader, cpu 14 | with tempfile.NamedTemporaryFile(delete=True, suffix=".mp4") as tmp: 15 | try: 16 | with open(tmp.name, 'wb') as file: 17 | file.write(video) 18 | with open(tmp.name, 'rb') as f: 19 | VideoReader(f, ctx=cpu(0)) 20 | except Exception as e: 21 | return False 22 | return True 23 | 24 | 25 | class GridVideoTaskGenerator(TaskGenerator): 26 | metadata: ObjaverseVideoMetaData 27 | 28 | def __init__(self, metadata: ObjaverseVideoMetaData, seed=42): 29 | super().__init__(metadata, seed=seed) 30 | self.grid_options = grid_options 31 | self.grid_mappings = grid_mappings 32 | 33 | def _relative_grid(self, grid_size, grid, reference_pos): 34 | return relative_grid(grid_size, grid, reference_pos) 35 | 36 | def _get_target_object_query(self, task_plan): 37 | if 'attribute type' in task_plan: 38 | return self.metadata.and_query([("category", task_plan['target category'], True), (task_plan['attribute type'], task_plan['attribute value'], True)]) 39 | else: 40 | return self.metadata.and_query([("category", task_plan['target category'], True)]) 41 | 42 | def _task_plan_to_str(self, task_plan): 43 | t = [] 44 | for k, v in task_plan.items(): 45 | if self.metadata.check_category_exists(v): 46 | t.append(f'{k}: {self.metadata.get_surfacename(v)}') 47 | else: 48 | t.append(f'{k}: {v}') 49 | return '\n'.join(t) 50 | 51 | def make_video(self, video_metadata): 52 | return make_video(video_metadata, self.metadata, VIDEO_H, VIDEO_W) 53 | 54 | def _generate_task(self, task_plan) -> Tuple[str, str, List[str], Dict]: 55 | "(Abstract method) generate task" 56 | 57 | def generate(self, task_plan, return_data=True, seed=None): 58 | if seed is not None: 59 | self.rng = np.random.default_rng(seed=seed) 60 | 61 | retry = 0 62 | while True: 63 | question, answer, options, video_metadata = self._generate_task(task_plan) 64 | task = { 65 | 'question' : question.replace('_', ' '), 66 | 'answer' : answer.replace('_', ' '), 67 | 'options' : [o.replace('_', ' ') for o in options], 68 | 'task_plan' : self._task_plan_to_str(task_plan), 69 | 'video_metadata': video_metadata, 70 | 'video' : self.make_video(video_metadata) if return_data else None 71 | } 72 | if return_data: 73 | if check_video(task['video']): 74 | break 75 | else: 76 | retry -= 1 77 | if retry <= 0: 78 | raise Exception("Failed to generate video") 79 | else: 80 | break 81 | 82 | return task 83 | -------------------------------------------------------------------------------- /tma/videoqa/tabletop_3d/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import subprocess 4 | 5 | from ..metadata import ObjaverseVideoMetaData 6 | 7 | grid_options = [2, 3] 8 | 9 | grid_mappings = { 10 | 2: 11 | { 12 | 'back left' : 0, 13 | 'back right' : 1, 14 | 'front left' : 2, 15 | 'front right': 3 16 | }, 17 | 3: 18 | { 19 | 'back left' : 0, 20 | 'back middle' : 1, 21 | 'back right' : 2, 22 | 'middle left' : 3, 23 | 'middle' : 4, 24 | 'middle right': 5, 25 | 'front left' : 6, 26 | 'front middle': 7, 27 | 'front right' : 8 28 | } 29 | } 30 | 31 | relative_positions = ['left', 'right', 'back', 'front', 'back left', 'back right', 'front left', 'front right'] 32 | relative_position_phrase = { 33 | 'left' : 'to the left of', 34 | 'right' : 'to the right 
of', 35 | 'back' : 'behind', 36 | 'front' : 'in front of', 37 | 'back left' : 'behind and to the left of', 38 | 'back right' : 'behind and to the right of', 39 | 'front left' : 'in front and to the left of', 40 | 'front right': 'in front and to the right of' 41 | } 42 | reverse_relative_positions = { 43 | 'left' : 'right', 44 | 'right' : 'left', 45 | 'back' : 'front', 46 | 'front' : 'back', 47 | 'front left' : 'back right', 48 | 'front right': 'back left', 49 | 'back left' : 'front right', 50 | 'back right' : 'front left' 51 | } 52 | 53 | 54 | def relative_grid(grid_size, grid, reference_pos): 55 | if 'right' in reference_pos: 56 | if grid % grid_size == 0: return -1 57 | grid = grid - 1 58 | if 'left' in reference_pos: 59 | if grid % grid_size == grid_size - 1: return -1 60 | grid = grid + 1 61 | if 'back' in reference_pos: 62 | if grid + grid_size >= grid_size * grid_size: return -1 63 | grid = grid + grid_size 64 | if 'front' in reference_pos: 65 | if grid - grid_size < 0: return -1 66 | grid = grid - grid_size 67 | return grid 68 | 69 | 70 | import tempfile 71 | import diskcache 72 | 73 | run_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "run_blender.py") 74 | 75 | 76 | def make_video(scene_json, metadata: ObjaverseVideoMetaData, VIDEO_H, VIDEO_W): 77 | device = metadata.render_device 78 | blender_cache = metadata.blender_cache 79 | assert len(scene_json["objects"]) <= (scene_json["grid number"] ** 2) 80 | scene_json["VIDEO_H"] = VIDEO_H 81 | scene_json["VIDEO_W"] = VIDEO_W 82 | 83 | with diskcache.Cache(blender_cache, size_limit=100 * (2 ** 30)) as cache: 84 | key = json.dumps(scene_json, sort_keys=True) 85 | video = cache.get(key, None) 86 | if video is None: 87 | with (tempfile.NamedTemporaryFile(delete=True, suffix=".mp4") as tmp_video, 88 | tempfile.NamedTemporaryFile(delete=True, suffix=".json") as tmp_json): 89 | json.dump(scene_json, open(tmp_json.name, 'w')) 90 | 91 | env = dict(os.environ, CUDA_VISIBLE_DEVICES=str(device)) 92 | command = ( 93 | f"{metadata.blender_path} -b -noaudio --python {run_script_path} -- " 94 | f"--save_video_path {tmp_video.name} " 95 | f"--json_file {tmp_json.name}" 96 | ) 97 | 98 | subprocess.run(command, shell=True, env=env, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT) 99 | 100 | with open(tmp_video.name, 'rb') as video_file: 101 | video = video_file.read() # save video to a binary files 102 | cache.set(key, video) 103 | 104 | return video 105 | -------------------------------------------------------------------------------- /tma/imageqa/sticker_2d/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | from PIL import Image 5 | 6 | grid_options = [2, 3] 7 | grid_mappings = { 8 | 2: 9 | { 10 | 'top left' : 0, 11 | 'top right' : 1, 12 | 'bottom left' : 2, 13 | 'bottom right': 3 14 | }, 15 | 3: 16 | { 17 | 'top left' : 0, 18 | 'top middle' : 1, 19 | 'top right' : 2, 20 | 'middle left' : 3, 21 | 'middle' : 4, 22 | 'middle right' : 5, 23 | 'bottom left' : 6, 24 | 'bottom middle': 7, 25 | 'bottom right' : 8 26 | } 27 | } 28 | 29 | relative_positions = ['left', 'right', 'top', 'bottom', 'top left', 'top right', 'bottom left', 'bottom right'] 30 | relative_position_phrase = { 31 | 'left' : 'to the left of', 32 | 'right' : 'to the right of', 33 | 'top' : 'above', 34 | 'bottom' : 'below', 35 | 'top left' : 'above and to the left of', 36 | 'top right' : 'above and to the right of', 37 | 'bottom left' : 'below and to the left of', 38 | 'bottom 
right': 'below and to the right of' 39 | } 40 | 41 | 42 | def relative_grid(grid_size, grid, reference_pos): 43 | if 'right' in reference_pos: 44 | if grid % grid_size == 0: return -1 45 | grid = grid - 1 46 | if 'left' in reference_pos: 47 | if grid % grid_size == grid_size - 1: return -1 48 | grid = grid + 1 49 | if 'top' in reference_pos: 50 | if grid + grid_size >= grid_size * grid_size: return -1 51 | grid = grid + grid_size 52 | if 'bottom' in reference_pos: 53 | if grid - grid_size < 0: return -1 54 | grid = grid - grid_size 55 | return grid 56 | 57 | 58 | def does_overlap(box1, box2): 59 | # Returns True if box1 and box2 overlap, False otherwise 60 | x1, y1, x2, y2 = box1 61 | x3, y3, x4, y4 = box2 62 | return not (x2 < x3 or x4 < x1 or y2 < y3 or y4 < y1) 63 | 64 | 65 | def sample_bounding_boxes(num_objects, H, W, size_range=(0.3, 0.45)): 66 | while True: 67 | frac = random.uniform(*size_range) 68 | boxes = [] 69 | count = 0 70 | num_chances = 5 71 | while len(boxes) < num_objects and count < num_chances: 72 | box_w = int(frac * W) 73 | box_h = int(frac * H) 74 | box_x = random.randint(0, W - box_w) 75 | box_y = random.randint(0, H - box_h) 76 | new_box = (box_x, box_y, box_x + box_w, box_y + box_h) 77 | if not any(does_overlap(new_box, box) for box in boxes): 78 | boxes.append(new_box) 79 | count += 1 80 | if count >= num_chances: 81 | continue 82 | return boxes 83 | 84 | 85 | def grid_to_box(H, W, grid_size, grid_index, grid_H, grid_W): 86 | grid_height = H // grid_size 87 | grid_width = W // grid_size 88 | 89 | # grid_x, grid_y = np.unravel_index(grid_index, (grid_size, grid_size)) 90 | grid_y, grid_x = np.unravel_index(grid_index, (grid_size, grid_size)) 91 | 92 | box_x = grid_x * grid_width 93 | box_y = grid_y * grid_height 94 | box_w = grid_W * grid_width 95 | box_h = grid_H * grid_height 96 | return (box_x, box_y, box_x + box_w, box_y + box_h) 97 | 98 | 99 | def paste_image(background, obj, box): 100 | obj = obj.resize((box[2] - box[0], box[3] - box[1])) 101 | background.paste(obj, box=box, mask=obj) 102 | 103 | 104 | def make_image(metadata, H=512, W=512): 105 | # sample bounding boxes 106 | grid_size = metadata["grid number"] 107 | object_paths = metadata["object paths"] 108 | assert len(metadata["objects"]) <= (grid_size ** 2) 109 | boxes = [grid_to_box(H, W, grid_size, x, 1, 1) for x in metadata["grids"]] 110 | 111 | im_target = Image.new("RGBA", (W, H), 'WHITE') # you can load this as a background image if you want 112 | 113 | for view, box in zip(object_paths, boxes): 114 | obj = Image.open(view) 115 | paste_image(im_target, obj, box) 116 | 117 | return im_target.convert('RGB') 118 | -------------------------------------------------------------------------------- /tma/imageqa/tabletop_3d/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import subprocess 3 | 4 | from ..metadata import Objaverse3DMetaData 5 | 6 | grid_options = [2, 3] 7 | 8 | grid_mappings = { 9 | 2: 10 | { 11 | 'back left' : 0, 12 | 'back right' : 1, 13 | 'front left' : 2, 14 | 'front right': 3 15 | }, 16 | 3: 17 | { 18 | 'back left' : 0, 19 | 'back middle' : 1, 20 | 'back right' : 2, 21 | 'middle left' : 3, 22 | 'middle' : 4, 23 | 'middle right': 5, 24 | 'front left' : 6, 25 | 'front middle': 7, 26 | 'front right' : 8 27 | } 28 | } 29 | 30 | relative_positions = ['left', 'right', 'back', 'front', 'back left', 'back right', 'front left', 'front right'] 31 | relative_position_phrase = { 32 | 'left' : 'to the left of', 33 | 'right' : 'to the 
right of', 34 | 'back' : 'behind', 35 | 'front' : 'in front of', 36 | 'back left' : 'behind and to the left of', 37 | 'back right' : 'behind and to the right of', 38 | 'front left' : 'in front and to the left of', 39 | 'front right': 'in front and to the right of' 40 | } 41 | reverse_relative_positions = { 42 | 'left' : 'right', 43 | 'right' : 'left', 44 | 'back' : 'front', 45 | 'front' : 'back', 46 | 'front left' : 'back right', 47 | 'front right': 'back left', 48 | 'back left' : 'front right', 49 | 'back right' : 'front left' 50 | } 51 | 52 | 53 | def relative_grid(grid_size, grid, reference_pos): 54 | if 'right' in reference_pos: 55 | if grid % grid_size == 0: return -1 56 | grid = grid - 1 57 | if 'left' in reference_pos: 58 | if grid % grid_size == grid_size - 1: return -1 59 | grid = grid + 1 60 | if 'back' in reference_pos: 61 | if grid + grid_size >= grid_size * grid_size: return -1 62 | grid = grid + grid_size 63 | if 'front' in reference_pos: 64 | if grid - grid_size < 0: return -1 65 | grid = grid - grid_size 66 | return grid 67 | 68 | 69 | import os 70 | import tempfile 71 | import io, base64 72 | from PIL import Image 73 | import diskcache 74 | 75 | run_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "run_blender.py") 76 | 77 | 78 | def image_to_base64(pil_image): 79 | import io 80 | import base64 81 | img_byte_arr = io.BytesIO() 82 | pil_image.save(img_byte_arr, format='PNG') 83 | img_byte_arr = img_byte_arr.getvalue() 84 | base64_str = base64.b64encode(img_byte_arr).decode('utf-8') 85 | return base64_str 86 | 87 | 88 | def make_image(scene_json, metadata: Objaverse3DMetaData, H=512, W=512): 89 | device = metadata.render_device 90 | blender_cache = metadata.blender_cache 91 | assert len(scene_json["objects"]) <= (scene_json["grid number"] ** 2) 92 | scene_json["H"] = H 93 | scene_json["W"] = W 94 | 95 | with diskcache.Cache(blender_cache, size_limit=100 * (2 ** 30)) as cache: 96 | key = json.dumps(scene_json, sort_keys=True) 97 | base64_str = cache.get(key, None) 98 | if base64_str is None: 99 | with (tempfile.NamedTemporaryFile(delete=True, suffix=".png") as tmp_image, 100 | tempfile.NamedTemporaryFile(delete=True, suffix=".json") as tmp_json): 101 | json.dump(scene_json, open(tmp_json.name, 'w')) 102 | 103 | env = dict(os.environ, CUDA_VISIBLE_DEVICES=str(device)) 104 | command = ( 105 | f"{metadata.blender_path} -b -noaudio --python {run_script_path} -- " 106 | f"--save_image_path {tmp_image.name} " 107 | f"--json_file {tmp_json.name}" 108 | ) 109 | 110 | subprocess.run(command, shell=True, env=env, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT) 111 | 112 | img = Image.open(tmp_image.name).convert("RGB") 113 | cache.set(key, image_to_base64(img)) 114 | else: 115 | img = Image.open(io.BytesIO(base64.decodebytes(bytes(base64_str, "utf-8")))) 116 | 117 | return img 118 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # 
before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | 162 | **/.DS_STORE 163 | 164 | 165 | 166 | output/ -------------------------------------------------------------------------------- /tma/models/qa_model/base_qa_model.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | from typing import Callable 4 | 5 | import diskcache 6 | import numpy as np 7 | import sentence_transformers 8 | import torch 9 | 10 | from .. import Model 11 | 12 | 13 | def make_options(choices, format='letter'): 14 | assert format in ['numeric', 'letter'] 15 | if format == 'numeric': 16 | prefix1 = [str(i + 1) for i in range(len(choices))] 17 | else: 18 | prefix1 = [chr(ord("a") + i).upper() for i in range(len(choices))] 19 | prefix2 = [f"({p})" for p in prefix1] 20 | return prefix1, prefix2, [f'{p} {c}' for p, c in zip(prefix2, choices)] 21 | 22 | 23 | def check_contain(answer, options): 24 | contains = [option in answer for option in options] 25 | if sum(contains) == 1: 26 | return contains.index(True) 27 | else: 28 | return -1 29 | 30 | 31 | class QAModelInstance: 32 | def qa(self, data, prompt): 33 | "(Abstract method) abstract QA method" 34 | 35 | 36 | class QAModel(Model): 37 | def __init__( 38 | self, 39 | model_name: str, 40 | prompt_name: str, 41 | prompt_func: Callable, 42 | choice_format='letter', 43 | enable_choice_search: bool = False, 44 | cache_path: str = None, 45 | ): 46 | self.model = None 47 | self.model_name = f'{model_name} ({prompt_name})' 48 | self.prompt_func = prompt_func 49 | self.format = choice_format 50 | self.cache_path = cache_path 51 | 52 | if self.cache_path is None: 53 | print("[IMPORTANT] model cache is disabled") 54 | else: 55 | print(f"[IMPORTANT] model cache is enabled, cache path: {cache_path}") 56 | 57 | self.enable_choice_search = enable_choice_search 58 | if enable_choice_search: 59 | # use SBERT to find the closest choice 60 | self.sentence_transformer = sentence_transformers.SentenceTransformer("all-mpnet-base-v2", device='cpu') 61 | 62 | @torch.no_grad() 63 | def choice_search(self, free_form_answer, choices): 64 | query_embedding = self.sentence_transformer.encode([free_form_answer], normalize_embeddings=True) 65 | choices_embedding = self.sentence_transformer.encode(choices, normalize_embeddings=True) 66 | top_choice_index = np.argmax(np.dot(choices_embedding, query_embedding.T)) 67 | return choices[top_choice_index] 68 | 69 | def _data_to_str(self, data): 70 | """ abstract method """ 71 | 72 | @torch.no_grad() 73 | def _qa(self, data, prompt): 74 | if self.cache_path is None: 75 | return self.model.qa(data, prompt) 76 | else: 77 | with diskcache.Cache(self.cache_path, size_limit=10 * (2 ** 30)) as cache: 78 | key = json.dumps([self.model_name, self._data_to_str(data), prompt]) 79 | response = cache.get(key, None) 80 | if response is None: 81 | response = self.model.qa(data, prompt) 82 | cache.set(key, response) 83 | return response 84 | 85 | @torch.no_grad() 86 | def qa(self, data, question): 87 | prompt = self.prompt_func(question) 88 | return self._qa(data, prompt) 89 | 90 | @torch.no_grad() 91 | def multiple_choice_qa(self, data, question, choices, answer=None): 92 | # Get VQA model's answer 93 | prefix1, prefix2, options = make_options(choices, self.format) 94 | prompt = self.prompt_func(question, options) 95 | free_form_answer = self._qa(data, prompt) 96 | free_form_answer = free_form_answer.strip() 97 | 98 | # Limit the answer to the choices 99 | if free_form_answer in choices: 100 | multiple_choice_answer = free_form_answer 101 
| elif free_form_answer in options: 102 | multiple_choice_answer = choices[options.index(free_form_answer)] 103 | elif free_form_answer in prefix1: 104 | multiple_choice_answer = choices[prefix1.index(free_form_answer)] 105 | elif free_form_answer in prefix2: 106 | multiple_choice_answer = choices[prefix2.index(free_form_answer)] 107 | elif self.enable_choice_search: 108 | multiple_choice_answer = self.choice_search(free_form_answer, choices) 109 | else: 110 | multiple_choice_answer = "" 111 | for to_check in [choices, options, prefix1, prefix2]: 112 | idx = check_contain(free_form_answer, to_check) 113 | if idx != -1: 114 | multiple_choice_answer = choices[idx] 115 | break 116 | 117 | result = { 118 | "free_form_answer" : free_form_answer, 119 | "multiple_choice_answer": multiple_choice_answer, 120 | "choices" : choices.copy(), 121 | } 122 | if answer is not None: 123 | result["accuracy"] = int(answer == multiple_choice_answer) 124 | return result 125 | 126 | @torch.no_grad() 127 | def multiple_choice_qa_random_ordering(self, data, question, choices, answer=None, n_trials=3): 128 | results = {} 129 | accuracy = 0 130 | for i in range(n_trials): 131 | choices_i = choices.copy() 132 | random.shuffle(choices_i) 133 | results[i] = self.multiple_choice_qa(data, question, choices_i, answer) 134 | accuracy += results[i]["accuracy"] 135 | results["accuracy"] = accuracy / n_trials 136 | return results 137 | -------------------------------------------------------------------------------- /tma/models/qa_model/videoqa_model.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from typing import Callable, Union 3 | 4 | import numpy as np 5 | import torch 6 | from PIL import Image, ImageDraw, ImageFont 7 | 8 | from .base_qa_model import QAModel, QAModelInstance 9 | from .imageqa_model import ImageQAModel 10 | 11 | videoqa_models = { 12 | 13 | } 14 | 15 | 16 | def list_videoqa_models(): 17 | return list(videoqa_models.keys()) 18 | 19 | 20 | class VideoQAModel(QAModel): 21 | def __init__( 22 | self, 23 | model_name, 24 | prompt_name: str, 25 | prompt_func: Callable, 26 | model: QAModelInstance = None, 27 | torch_device: Union[int, str] = -1, 28 | precision=torch.bfloat16, 29 | choice_format='letter', 30 | enable_choice_search: bool = False, 31 | ): 32 | super().__init__(model_name, prompt_name, prompt_func, choice_format, enable_choice_search) 33 | 34 | if isinstance(torch_device, str): 35 | torch_device = torch.device(torch_device) 36 | else: 37 | if torch_device == -1: 38 | torch_device = torch.device("cuda") if torch.cuda.is_available() else "cpu" 39 | else: 40 | torch_device = torch.device(f"cuda:{torch_device}") 41 | 42 | if model is None: 43 | print(f"Loading {model_name}...") 44 | class_name, ckpt = videoqa_models[model_name] 45 | self.model_precision = precision 46 | self.model = eval(class_name)(ckpt, torch_device, self.model_precision) 47 | print(f"Finish loading {model_name}") 48 | else: 49 | print(f"Using provided self.model...") 50 | self.model = model 51 | 52 | @torch.no_grad() 53 | def _qa(self, data, prompt): 54 | if isinstance(data, str): 55 | return self.model.qa(data, prompt) 56 | else: 57 | with tempfile.NamedTemporaryFile(delete=True, suffix=".mp4") as tmp: 58 | with open(tmp.name, 'wb') as file: 59 | file.write(data) 60 | video_path = tmp.name 61 | answer = self.model.qa(video_path, prompt) 62 | return answer 63 | 64 | 65 | def sample_frames(video_path, n): 66 | import cv2 67 | # Open the video file 68 | cap = 
cv2.VideoCapture(video_path) 69 | if not cap.isOpened(): 70 | print("Error: Could not open video.") 71 | return [] 72 | 73 | # Calculate total number of frames and video FPS 74 | total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) 75 | 76 | # Calculate interval in terms of frames 77 | interval = max(1, total_frames // n) 78 | 79 | # Sample frames 80 | sampled_frames = [] 81 | for i in range(0, total_frames, interval): 82 | # Set the current frame position 83 | cap.set(cv2.CAP_PROP_POS_FRAMES, i) 84 | 85 | # Read the frame 86 | ret, frame = cap.read() 87 | if not ret: 88 | print(f"Error: Could not read frame {i}.") 89 | break 90 | 91 | # Convert the frame to PIL Image 92 | frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 93 | pil_img = Image.fromarray(frame_rgb) 94 | sampled_frames.append(pil_img) 95 | 96 | # Stop if we have collected n frames 97 | if len(sampled_frames) >= n: 98 | break 99 | 100 | # Release the video capture object 101 | cap.release() 102 | 103 | return sampled_frames 104 | 105 | 106 | def get_contrasting_color(image, x, y, width, height): 107 | """ 108 | Determine a contrasting color (black or white) based on the average color of a specified area in the image. 109 | """ 110 | # Crop the relevant part of the image 111 | cropped_image = image.crop((x, y, x + width, y + height)) 112 | # Convert to numpy array for analysis 113 | np_image = np.array(cropped_image) 114 | # Calculate the average color 115 | average_color = np.mean(np_image, axis=(0, 1)) 116 | # Brightness calculation based on perceived luminance 117 | brightness = np.sqrt(0.299 * average_color[0] ** 2 + 0.587 * average_color[1] ** 2 + 0.114 * average_color[2] ** 2) 118 | # Return white for dark backgrounds and black for light backgrounds 119 | return 'white' if brightness < 128 else 'black' 120 | 121 | 122 | def concatenate_image(images, rows, columns, separator_width=10): 123 | # Ensure we have the exact number of images needed 124 | if len(images) != rows * columns: 125 | raise ValueError(f"Expected {rows * columns} images, but got {len(images)}.") 126 | 127 | # Calculate the max width and height of images to standardize sizes 128 | max_width = max(img.width for img in images) 129 | max_height = max(img.height for img in images) 130 | 131 | # Resize images to the max width and height 132 | resized_images = [img.resize((max_width, max_height), Image.Resampling.LANCZOS) for img in images] 133 | 134 | # Calculate the total width and height for the combined image 135 | total_width = max_width * columns + separator_width * (columns - 1) 136 | total_height = max_height * rows + separator_width * (rows - 1) 137 | combined_image = Image.new('RGB', (total_width, total_height), color='white') 138 | 139 | # Place images in the specified grid 140 | x_offset = 0 141 | y_offset = 0 142 | for i, img in enumerate(resized_images): 143 | combined_image.paste(img, (x_offset, y_offset)) 144 | if (i + 1) % columns == 0: # Move to the next row after the last column 145 | x_offset = 0 146 | y_offset += img.height + separator_width 147 | else: # Move to the next column 148 | x_offset += img.width + separator_width 149 | 150 | # Add numbers to each image for identification 151 | draw = ImageDraw.Draw(combined_image) 152 | try: 153 | font_size = (max_width + max_height) // 2 // 12 154 | font = ImageFont.load_default(size=font_size) 155 | except IOError: 156 | font = ImageFont.truetype("arial", 20) 157 | 158 | x_offset = 0 159 | y_offset = 0 160 | for i, img in enumerate(resized_images): 161 | text = str(i + 1) 162 | text_x = 
x_offset + 10 163 | text_y = y_offset + 10 164 | text_width, text_height = font_size, font_size 165 | font_color = get_contrasting_color(combined_image, text_x, text_y, text_width, text_height) 166 | draw.text((text_x, text_y), text, fill=font_color, font=font) 167 | if (i + 1) % columns == 0: 168 | x_offset = 0 169 | y_offset += img.height + separator_width 170 | else: 171 | x_offset += img.width + separator_width 172 | 173 | return combined_image 174 | 175 | 176 | def video_to_concat_image(video_path, num_rows, num_columns): 177 | return concatenate_image(sample_frames(video_path, num_rows * num_columns), num_rows, num_columns) 178 | 179 | 180 | class ImageQAModel4Video(VideoQAModel): 181 | def __init__( 182 | self, 183 | model: ImageQAModel, 184 | prompt_name: str, 185 | prompt_func: Callable, 186 | num_rows: int = 2, 187 | num_columns: int = 2, 188 | choice_format='letter', 189 | enable_choice_search: bool = False, 190 | ): 191 | super(VideoQAModel, self).__init__(model.model_name, prompt_name, prompt_func, choice_format, enable_choice_search) 192 | self.num_rows = num_rows 193 | self.num_columns = num_columns 194 | self.num_frames = self.num_rows * self.num_columns 195 | self.model = model 196 | 197 | @torch.no_grad() 198 | def _qa(self, data, prompt): 199 | if isinstance(data, Image.Image): 200 | return self.model._qa(data, prompt) 201 | elif isinstance(data, str): 202 | return self.model._qa(video_to_concat_image(data, self.num_rows, self.num_columns), prompt) 203 | else: 204 | with tempfile.NamedTemporaryFile(delete=True, suffix=".mp4") as tmp: 205 | with open(tmp.name, 'wb') as file: 206 | file.write(data) 207 | video_path = tmp.name 208 | answer = self.model._qa(video_to_concat_image(video_path, self.num_rows, self.num_columns), prompt) 209 | return answer 210 | -------------------------------------------------------------------------------- /annotations/relation_to_type.json: -------------------------------------------------------------------------------- 1 | { 2 | "standing behind": "spatial", 3 | "displayed in": "interactional", 4 | "jumping on": "interactional", 5 | "sitting next to": "interactional", 6 | "moving": "interactional", 7 | "exiting": "interactional", 8 | "sitting with": "social", 9 | "drinking from": "interactional", 10 | "herding": "interactional", 11 | "larger than": "spatial", 12 | "tied around": "spatial", 13 | "covered with": "spatial", 14 | "lying inside": "interactional", 15 | "growing behind": "interactional", 16 | "reflecting in": "functional", 17 | "on": "spatial", 18 | "sitting atop": "spatial", 19 | "topped with": "interactional", 20 | "brushing": "interactional", 21 | "sitting in": "spatial", 22 | "pushed by": "interactional", 23 | "walking up": "spatial", 24 | "tossing": "interactional", 25 | "sitting under": "spatial", 26 | "entering": "interactional", 27 | "by": "spatial", 28 | "sitting in front of": "spatial", 29 | "standing against": "spatial", 30 | "about to hit": "interactional", 31 | "buying": "interactional", 32 | "tying": "interactional", 33 | "reflected in": "spatial", 34 | "lying next to": "interactional", 35 | "cutting": "interactional", 36 | "surrounding": "spatial", 37 | "pushing": "interactional", 38 | "skiing on": "interactional", 39 | "walking in": "spatial", 40 | "with": "spatial", 41 | "looking toward": "spatial", 42 | "lying on": "spatial", 43 | "grazing in": "interactional", 44 | "drawn on": "spatial", 45 | "connected to": "spatial", 46 | "taller than": "spatial", 47 | "longer than": "spatial", 48 | "pouring": "interactional", 
49 | "sitting by": "spatial", 50 | "smaller than": "spatial", 51 | "on the side of": "spatial", 52 | "jumping off": "interactional", 53 | "sitting beside": "spatial", 54 | "throwing": "interactional", 55 | "pulling": "interactional", 56 | "waiting for": "interactional", 57 | "running through": "spatial", 58 | "contain": "interactional", 59 | "hitting": "interactional", 60 | "at": "spatial", 61 | "smoking": "interactional", 62 | "growing by": "spatial", 63 | "drinking": "interactional", 64 | "hanging from": "spatial", 65 | "hugging": "interactional", 66 | "sleeping in": "interactional", 67 | "towing": "interactional", 68 | "walking across": "spatial", 69 | "parked in front of": "spatial", 70 | "growing along": "interactional", 71 | "resting on": "interactional", 72 | "looking over": "interactional", 73 | "parked along": "spatial", 74 | "beside": "spatial", 75 | "driving": "interactional", 76 | "sewn on": "interactional", 77 | "looking into": "interactional", 78 | "eating in": "spatial", 79 | "traveling down": "spatial", 80 | "close to": "spatial", 81 | "slicing": "interactional", 82 | "bigger than": "spatial", 83 | "underneath": "spatial", 84 | "leading": "interactional", 85 | "talking to": "interactional", 86 | "getting on": "spatial", 87 | "growing from": "interactional", 88 | "swimming in": "interactional", 89 | "talking on": "interactional", 90 | "hung on": "interactional", 91 | "catching": "interactional", 92 | "sprinkled on": "interactional", 93 | "opening": "interactional", 94 | "mounted to": "spatial", 95 | "standing in front of": "spatial", 96 | "seen through": "spatial", 97 | "going into": "spatial", 98 | "growing in": "spatial", 99 | "licking": "interactional", 100 | "full of": "interactional", 101 | "hanging out of": "spatial", 102 | "next to": "spatial", 103 | "hanging above": "spatial", 104 | "standing on top of": "spatial", 105 | "cooking": "interactional", 106 | "looking through": "interactional", 107 | "between": "spatial", 108 | "riding": "interactional", 109 | "playing with": "interactional", 110 | "eating from": "interactional", 111 | "going through": "spatial", 112 | "leaning against": "spatial", 113 | "scattered on": "spatial", 114 | "parked behind": "spatial", 115 | "flying in": "spatial", 116 | "worn on": "interactional", 117 | "surrounded by": "spatial", 118 | "feeding": "interactional", 119 | "standing under": "spatial", 120 | "floating on": "spatial", 121 | "walking down": "spatial", 122 | "skating on": "interactional", 123 | "under": "spatial", 124 | "playing in": "interactional", 125 | "lying on top of": "spatial", 126 | "on the bottom of": "spatial", 127 | "inside": "spatial", 128 | "kissing": "interactional", 129 | "playing at": "interactional", 130 | "standing at": "spatial", 131 | "helping": "interactional", 132 | "riding in": "interactional", 133 | "chained to": "spatial", 134 | "parked in": "spatial", 135 | "on top of": "spatial", 136 | "kept in": "spatial", 137 | "covering": "spatial", 138 | "grazing on": "interactional", 139 | "approaching": "interactional", 140 | "climbing": "interactional", 141 | "covered in": "spatial", 142 | "growing next to": "spatial", 143 | "in between": "spatial", 144 | "behind": "spatial", 145 | "growing near": "spatial", 146 | "painted on": "spatial", 147 | "driving down": "spatial", 148 | "parked next to": "spatial", 149 | "touching": "interactional", 150 | "parked by": "interactional", 151 | "walking to": "spatial", 152 | "posing with": "social", 153 | "standing beside": "spatial", 154 | "standing on": "spatial", 155 | 
"using": "interactional", 156 | "mounted on": "spatial", 157 | "walking by": "spatial", 158 | "playing on": "interactional", 159 | "blowing out": "interactional", 160 | "sitting near": "interactional", 161 | "crossing": "spatial", 162 | "to the left of": "spatial", 163 | "cooked in": "functional", 164 | "eating at": "interactional", 165 | "walking towards": "interactional", 166 | "floating in": "spatial", 167 | "hang from": "spatial", 168 | "photographing": "interactional", 169 | "sniffing": "interactional", 170 | "stuck on": "interactional", 171 | "walking toward": "interactional", 172 | "looking down at": "interactional", 173 | "traveling on": "spatial", 174 | "typing on": "interactional", 175 | "guiding": "interactional", 176 | "shining through": "spatial", 177 | "jumping over": "interactional", 178 | "following": "interactional", 179 | "dragging": "interactional", 180 | "on the front of": "spatial", 181 | "standing next to": "interactional", 182 | "reflected on": "spatial", 183 | "on the other side of": "spatial", 184 | "lying in": "spatial", 185 | "boarding": "interactional", 186 | "pointing at": "interactional", 187 | "draped over": "spatial", 188 | "observing": "interactional", 189 | "working in": "interactional", 190 | "followed by": "interactional", 191 | "chasing": "interactional", 192 | "wrapped in": "spatial", 193 | "leaning on": "spatial", 194 | "sitting at": "spatial", 195 | "parked on": "spatial", 196 | "piled on": "spatial", 197 | "walking with": "interactional", 198 | "carrying": "interactional", 199 | "beneath": "spatial", 200 | "served on": "functional", 201 | "wading in": "interactional", 202 | "walking into": "spatial", 203 | "sitting inside": "spatial", 204 | "holding": "interactional", 205 | "enclosing": "spatial", 206 | "looking out": "interactional", 207 | "standing near": "interactional", 208 | "of": "spatial", 209 | "to the right of": "spatial", 210 | "walking next to": "interactional", 211 | "petting": "interactional", 212 | "driving on": "spatial", 213 | "standing in": "spatial", 214 | "hidden by": "spatial", 215 | "flying through": "spatial", 216 | "hanging over": "spatial", 217 | "playing": "interactional", 218 | "covered by": "spatial", 219 | "stuck in": "spatial", 220 | "attached to": "spatial", 221 | "facing": "interactional", 222 | "stacked on": "interactional", 223 | "walking near": "spatial", 224 | "wrapped around": "spatial", 225 | "higher than": "spatial", 226 | "chewing": "interactional", 227 | "parked near": "spatial", 228 | "preparing": "interactional", 229 | "skiing in": "interactional", 230 | "jumping in": "interactional", 231 | "flying": "interactional", 232 | "leaning over": "interactional", 233 | "picking up": "interactional", 234 | "walking through": "interactional", 235 | "in front of": "spatial", 236 | "decorated by": "functional", 237 | "growing on": "interactional", 238 | "standing around": "spatial", 239 | "standing by": "spatial", 240 | "going down": "spatial", 241 | "grabbing": "interactional", 242 | "eating": "interactional", 243 | "walking behind": "interactional", 244 | "in": "spatial", 245 | "mixed with": "interactional", 246 | "coming down": "spatial", 247 | "cleaning": "interactional", 248 | "adjusting": "interactional", 249 | "perched on": "interactional", 250 | "riding on": "interactional", 251 | "sitting on": "spatial", 252 | "parked alongside": "spatial", 253 | "working on": "interactional", 254 | "hanging on": "spatial", 255 | "pulled by": "interactional", 256 | "splashing": "interactional", 257 | "hanging in": "spatial", 
258 | "tied to": "spatial", 259 | "plugged into": "interactional", 260 | "printed on": "spatial", 261 | "decorated with": "interactional", 262 | "on the back of": "spatial", 263 | "on the edge of": "spatial", 264 | "below": "spatial", 265 | "sleeping on": "interactional", 266 | "walking along": "spatial", 267 | "hanging off": "spatial", 268 | "walking on": "spatial", 269 | "around": "spatial", 270 | "looking in": "interactional", 271 | "looking at": "interactional", 272 | "near": "spatial", 273 | "parked at": "spatial", 274 | "staring at": "interactional", 275 | "reading": "interactional", 276 | "swinging": "interactional", 277 | "wearing": "interactional", 278 | "falling off": "interactional", 279 | "selling": "interactional", 280 | "above": "spatial", 281 | "holding onto": "interactional", 282 | "biting": "interactional", 283 | "running on": "spatial", 284 | "decorating": "interactional", 285 | "leaving": "spatial", 286 | "making": "interactional", 287 | "balancing on": "interactional", 288 | "running in": "spatial", 289 | "flying above": "spatial", 290 | "sitting around": "spatial", 291 | "coming out of": "spatial", 292 | "washing": "interactional", 293 | "worn around": "interactional", 294 | "sitting on top of": "spatial", 295 | "skiing down": "interactional", 296 | "kicking": "interactional", 297 | "running across": "spatial", 298 | "parked beside": "spatial", 299 | "walking past": "interactional", 300 | "reaching for": "interactional", 301 | "displayed on": "interactional", 302 | "serving": "interactional", 303 | "smiling at": "emotional", 304 | "trying to catch": "interactional", 305 | "flying over": "spatial", 306 | "watching": "interactional", 307 | "shorter than": "spatial", 308 | "smelling": "interactional", 309 | "coming from": "spatial", 310 | "sitting behind": "spatial", 311 | "filled with": "interactional", 312 | "writing on": "interactional", 313 | "wiping": "interactional", 314 | "having it on the back": "spatial", 315 | "twisting": "interactional" 316 | } -------------------------------------------------------------------------------- /tma/imageqa/metadata.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from itertools import product 4 | from math import radians 5 | from typing import List, Tuple 6 | 7 | import networkx as nx 8 | import pandas as pd 9 | 10 | from ..metadata import CategoryMetaData 11 | 12 | ambiguous_colors = [ 13 | ["red", "pink", "purple"], 14 | ["yellow", "orange", "brown", "gold", "beige"], 15 | ] 16 | 17 | 18 | def get_confusing_colors(color): 19 | for colors in ambiguous_colors: 20 | if color in colors: 21 | return colors 22 | return [color] 23 | 24 | 25 | def remove_skip_edge(edges): 26 | G = nx.DiGraph() 27 | G.add_edges_from(edges) 28 | new_edges = [] 29 | for source, target in edges: 30 | G.remove_edge(source, target) 31 | if not nx.has_path(G, source, target): 32 | G.add_edge(source, target) 33 | new_edges.append((source, target)) 34 | return new_edges 35 | 36 | 37 | def remove_nodes(G, nodes): 38 | for node in nodes: 39 | successors = list(G.successors(node)) 40 | predecessors = list(G.predecessors(node)) 41 | G.remove_node(node) 42 | for s in successors: 43 | for p in predecessors: 44 | G.add_edge(p, s) 45 | return G 46 | 47 | 48 | def build_taxonomy(path_to_metadata, mode): 49 | assert mode in ['objaverse', 'scene_graph'] 50 | 51 | cateid_to_concept = json.load(open(os.path.join(path_to_metadata, 'cateid_to_concept.json'))) 52 | taxonomy = 
json.load(open(os.path.join(path_to_metadata, 'taxonomy.json'))) 53 | edges, nodes = taxonomy['edges'], taxonomy['nodes'] 54 | G = nx.DiGraph() 55 | G.add_edges_from(remove_skip_edge(edges)) 56 | 57 | nodes_to_remove = [] 58 | categories_with_object = set([k for k, v in cateid_to_concept.items() if len(v[mode]) > 0]) 59 | for node in G.nodes(): 60 | if node not in categories_with_object and len(nx.descendants(G, node) & categories_with_object) == 0: 61 | nodes_to_remove.append(node) 62 | G = remove_nodes(G, nodes_to_remove) 63 | G.add_nodes_from(categories_with_object) 64 | 65 | categories, category_info = [], {} 66 | for node in G.nodes(): 67 | categories.append(node) 68 | if node in cateid_to_concept: 69 | category_info[node] = cateid_to_concept[node] 70 | else: 71 | category_info[node] = nodes[node] 72 | categories = sorted(categories) 73 | 74 | return G, categories, category_info, categories_with_object 75 | 76 | 77 | class ObjaverseMetaData(CategoryMetaData): 78 | def __init__(self, path_to_metadata): 79 | super().__init__() 80 | 81 | self.taxonomy, self.categories, self.category_info, categories_with_object = \ 82 | build_taxonomy(path_to_metadata, 'objaverse') 83 | 84 | cateid_to_objects = json.load(open(os.path.join(path_to_metadata, 'cateid_to_objects.json'))) 85 | 86 | def get_category_objects(category): 87 | if category in cateid_to_objects: 88 | return list(cateid_to_objects[category].keys()) 89 | else: 90 | return [] 91 | 92 | cateid_to_objid = {} 93 | for cateid in self.categories: 94 | objs = get_category_objects(cateid) 95 | for c in nx.descendants(self.taxonomy, cateid) & categories_with_object: 96 | objs.extend(get_category_objects(c)) 97 | cateid_to_objid[cateid] = objs 98 | assert len(objs) > 0 99 | assert len(objs) == len(set(objs)) 100 | 101 | self.attribute_vocab, objid_to_attribute = {}, {} 102 | for cateid in cateid_to_objects: 103 | for objid in cateid_to_objects[cateid]: 104 | objid_to_attribute[objid] = cateid_to_objects[cateid][objid]["attributes"] 105 | for attr, values in cateid_to_objects[cateid][objid]["attributes"].items(): 106 | if attr not in self.attribute_vocab: 107 | self.attribute_vocab[attr] = set() 108 | self.attribute_vocab[attr].update(values) 109 | 110 | data = [] 111 | for cateid, objs in cateid_to_objid.items(): 112 | for objid in objs: 113 | attribute_data = [] 114 | for attr in self.attribute_vocab: 115 | values = objid_to_attribute[objid].get(attr, []) 116 | if len(values) == 0: 117 | values = [None] 118 | attribute_data.append(values) 119 | 120 | for attribute_combination in product(*attribute_data): 121 | data.append([objid, cateid] + list(attribute_combination)) 122 | 123 | self.df = pd.DataFrame(data, columns=['object', 'category'] + list(self.attribute_vocab.keys())) 124 | 125 | def check_object_attribute(self, objid, attributes): 126 | for attr, values in attributes.items(): 127 | for value in values: 128 | if value not in self.df[self.df['object'] == objid][attr].unique(): 129 | return False 130 | return True 131 | 132 | def and_query(self, conditions: List[Tuple]) -> str: 133 | q = set() 134 | for k, v, i in conditions: 135 | # k: column name; v: value; i: is equal 136 | if v is None: 137 | if i: 138 | q.add(f'{k} in [None]') 139 | else: 140 | q.add(f'{k} not in [None]') 141 | else: 142 | if i: 143 | q.add(f'{k} == {repr(v)}') 144 | else: 145 | if k == 'category': 146 | # exclude all relevant categories 147 | for c in self.get_relevant_categories(v): 148 | q.add(f'{k} != {repr(c)}') 149 | elif k == 'color': 150 | # exclude all 
confusing colors 151 | for c in get_confusing_colors(v): 152 | q.add(f'{k} != {repr(c)}') 153 | else: 154 | q.add(f'{k} != {repr(v)}') 155 | return ' and '.join(q) 156 | 157 | def or_query(self, conditions: List[str]) -> str: 158 | conditions = [f'({c})' for c in conditions if len(c) > 0] 159 | return ' or '.join(conditions) 160 | 161 | def query_metadata(self, target, query: str): 162 | if len(query) == 0: 163 | return sorted(self.df[target].dropna().unique()) 164 | else: 165 | return sorted(self.df.query(query)[target].dropna().unique().tolist()) 166 | 167 | def sample(self, rng, n, target, query: str): 168 | if n == 1: 169 | return rng.choice(self.query_metadata(target, query)) 170 | else: 171 | candidates = self.query_metadata(target, query) 172 | return rng.choice(candidates, n, replace=len(candidates) < n).tolist() 173 | 174 | def sample_category_for_object(self, rng, objid, exclude_category=None): 175 | candidates = self.query_metadata("category", self.and_query([("object", objid, True)])) 176 | if exclude_category is not None: 177 | exclude_category = self.get_relevant_categories(exclude_category) 178 | candidates = [c for c in candidates if c not in exclude_category] 179 | return rng.choice(candidates) 180 | 181 | def get_category_attribute_dict(self, cateid): 182 | attribute_dict = {} 183 | for attr in self.attribute_vocab: 184 | attribute_dict[attr] = self.query_metadata(attr, self.and_query([("category", cateid, True)])) 185 | return attribute_dict 186 | 187 | 188 | class Objaverse2DMetaData(ObjaverseMetaData): 189 | def __init__(self, path_to_metadata, image_folder): 190 | super().__init__(path_to_metadata) 191 | 192 | self.image_folder = image_folder 193 | cateid_to_objects = json.load(open(os.path.join(path_to_metadata, 'cateid_to_objects.json'))) 194 | 195 | self.objid_to_images = {} 196 | for cateid in cateid_to_objects: 197 | for objid in cateid_to_objects[cateid]: 198 | self.objid_to_images[objid] = [os.path.join(image_folder, cateid, objid, i) 199 | for i in cateid_to_objects[cateid][objid]["images"]] 200 | 201 | def sample_image(self, rng, objid): 202 | return rng.choice(self.objid_to_images[objid]) 203 | 204 | 205 | class Objaverse3DMetaData(ObjaverseMetaData): 206 | def __init__(self, path_to_metadata, blender_path, assets_path, render_device='cpu', blender_cache='./blender_cache'): 207 | super().__init__(path_to_metadata) 208 | self.assets_path = assets_path 209 | self.blender_path = blender_path 210 | self.blender_cache = blender_cache 211 | self.render_device = render_device 212 | plane_dir = os.path.join(assets_path, "plane_glbs") 213 | self.plane_texture_path = [os.path.join(plane_dir, f) for f in os.listdir(plane_dir) if f.endswith(".glb")] 214 | hdri_dir = os.path.join(assets_path, "hdri") 215 | self.hdri_path = [os.path.join(hdri_dir, f) for f in os.listdir(hdri_dir) if f.endswith(".exr")] 216 | 217 | cateid_to_objects = json.load(open(os.path.join(path_to_metadata, 'cateid_to_objects.json'))) 218 | self.object_to_angles = {objid: cateid_to_objects[cateid][objid]['angles'] 219 | for cateid in cateid_to_objects for objid in cateid_to_objects[cateid]} 220 | 221 | def get_object_path(self, objid): 222 | return os.path.join(self.assets_path, "objects", objid + ".glb") 223 | 224 | def sample_object_angle(self, rng, objid): 225 | angles = self.object_to_angles[objid] 226 | return angles[rng.choice(len(angles))] 227 | 228 | def sample_blender_configuration(self, rng): 229 | orientation = rng.choice([-1, 1]) 230 | key_light_horizontal_angle = orientation * 
radians(rng.uniform(15, 45)) 231 | fill_light_horizontal_angle = - orientation * radians(rng.uniform(15, 60)) 232 | key_light_vertical_angle = -radians(rng.uniform(15, 45)) 233 | fill_light_vertical_angle = -radians(rng.uniform(0, 30)) 234 | 235 | sun_x, sun_y = radians(rng.uniform(0, 45)), radians(rng.uniform(0, 45)) 236 | sun_energy = rng.uniform(1.0, 6.0) 237 | 238 | plane_texture_path = rng.choice(self.plane_texture_path) 239 | hdri_path = rng.choice(self.hdri_path) 240 | 241 | return { 242 | "key_light_horizontal_angle" : key_light_horizontal_angle, 243 | "fill_light_horizontal_angle": fill_light_horizontal_angle, 244 | "key_light_vertical_angle" : key_light_vertical_angle, 245 | "fill_light_vertical_angle" : fill_light_vertical_angle, 246 | "sun_x" : sun_x, 247 | "sun_y" : sun_y, 248 | "sun_energy" : sun_energy, 249 | "plane_texture_path" : plane_texture_path, 250 | "hdri_path" : hdri_path 251 | } 252 | 253 | 254 | def load_scene_graph(scene_graph_folder): 255 | image_folder = os.path.join(scene_graph_folder, "images/images") 256 | sg_json_folder = os.path.join(scene_graph_folder, "sceneGraphs") 257 | # train_scene_graphs = json.load(open(os.path.join(sg_json_folder, "train_sceneGraphs.json"))) 258 | val_scene_graphs = json.load(open(os.path.join(sg_json_folder, "val_sceneGraphs.json"))) 259 | scene_graphs = val_scene_graphs # TODO: first only use val_scene_graphs 260 | return image_folder, scene_graphs 261 | 262 | 263 | class SceneGraphMetaData(CategoryMetaData): 264 | def __init__(self, path_to_metadata, scene_graph_folder): 265 | super().__init__() 266 | self.taxonomy, self.categories, self.category_info, self.categories_with_object = \ 267 | build_taxonomy(path_to_metadata, 'scene_graph') 268 | 269 | self.type_to_attribute = json.load(open(os.path.join(path_to_metadata, 'attribute_category.json'))) 270 | self.attribute_to_type = {attr: k for k, vs in self.type_to_attribute.items() for attr in vs} 271 | 272 | self.image_folder, self.scene_graphs = load_scene_graph(scene_graph_folder) 273 | self.scene_graphs_list = list(self.scene_graphs.keys()) 274 | self.sg_object_to_cateid = {} 275 | for k, v in self.category_info.items(): 276 | if k in self.categories_with_object: 277 | for sg_object in v['scene_graph']: 278 | self.sg_object_to_cateid[sg_object] = k 279 | 280 | relations = set() 281 | for sg in self.scene_graphs.values(): 282 | for obj in sg['objects'].values(): 283 | for rel in obj['relations']: 284 | relations.add(rel['name']) 285 | self.relations = list(relations) 286 | 287 | def check_object_in_category(self, object_name): 288 | return object_name in self.sg_object_to_cateid 289 | 290 | def object_name_to_cateid(self, object_name): 291 | return self.sg_object_to_cateid[object_name] 292 | 293 | def get_attribute_type(self, attribute): 294 | return self.attribute_to_type.get(attribute, "other") 295 | 296 | def get_image_path(self, scene_graph_id): 297 | return os.path.join(self.image_folder, scene_graph_id + ".jpg") 298 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /annotations/attribute_category.json: -------------------------------------------------------------------------------- 1 | { 2 | "color": [ 3 | "white", 4 | "yellow", 5 | "maroon", 6 | "navy", 7 | "purple", 8 | "light brown", 9 | "green", 10 | "pink", 11 | "blue", 12 | "light blue", 13 | "red", 14 | "dark", 15 | "black and white", 16 | "dark colored", 17 | "brunette", 18 | "dark brown", 19 | "transparent", 20 | "bronze", 21 | "gold", 22 | "beige", 23 | "gray", 24 | "brown", 25 | "opaque", 26 | "copper", 27 | "dark blue", 28 | "orange", 29 | "neon", 30 | "cream colored", 31 | "rainbow colored", 32 | "silver", 33 | "blond", 34 | "khaki", 35 | "black", 36 | "tan", 37 | "translucent" 38 | ], 39 | "other": [ 40 | "sweet", 41 | "clear", 42 | "wii", 43 | "tinted", 44 | "analog", 45 | "powerful", 46 | "made", 47 | "scarce", 48 | "power", 49 | "electric", 50 | "christmas", 51 | "public", 52 | "wine", 53 | "tennis", 54 | "urban", 55 | "roman", 56 | "abundant", 57 | "commercial", 58 | "deciduous", 59 | "bright", 60 | "toy", 61 | "cordless", 62 | "real", 63 | "tail", 64 | "computer", 65 | "mixed", 66 | "evergreen", 67 | "portable", 68 | "fluorescent", 69 | "strong", 70 | "regular", 71 | "kitchen", 72 | "digital", 73 | "exterior", 74 | "oriental", 75 | "abstract", 76 | "adidas", 77 | "telephone", 78 | "baseball", 79 | "support", 80 | "chinese", 81 | "soccer", 82 | "wireless", 83 | "asian", 84 | "tropical", 85 | "railroad", 86 | "wired", 87 | "rustic", 88 | "professional", 89 | "toilet", 90 | "military", 91 | "simple", 92 | "bathroom", 93 | "safety", 94 | "disposable", 95 | "license", 96 | "calico", 97 | "birthday", 98 | "directional", 99 | "fancy", 100 | "nike", 101 | "sharp", 102 | "industrial", 103 | "ski", 104 | "american", 105 | "office", 106 | "capital", 107 | "garbage", 108 | "assorted", 109 | "electronic", 110 | "tasty", 111 | "ocean", 112 | "artificial", 113 | "caucasian", 114 | "protective", 115 | "foreign", 116 | "double decker", 117 | "french", 118 | "fake", 119 | "formal", 120 | "designed", 121 | "tabby", 122 | "delicious", 123 | "polar", 124 | "typical", 125 | "trash", 126 | "wrist", 127 | "street", 128 | "park", 129 | "wild", 130 | "sparse", 131 | "wedding", 132 | "ugly", 133 | "winter", 134 | "polo", 135 | "sturdy", 136 | "traffic", 137 | "new", 138 | "burning", 139 | "lined", 140 | "intricate", 141 | "on", 142 | "dangling", 143 | "breaking", 144 | "paved", 145 | "loose", 146 | "high", 147 | "beautiful", 148 | "short", 149 | "long", 150 | "outdoor", 151 | "crouched", 152 | "mature", 153 | "checkered", 154 | "chain-link", 155 | "gloomy", 156 | "attached", 157 | "pastel", 158 | "wide", 159 | "slanted", 160 | "fine", 161 | "weathered", 162 | "healthy", 163 | "cracked", 164 | "heavy", 165 | "athletic", 166 | "used", 167 | "rocky", 168 | "floating", 169 | "plain", 170 | "lush", 171 | "halved", 172 | "pointing", 173 | "outstretched", 174 | "still", 175 | "old fashioned", 176 | "shallow", 177 | "cut", 178 | "chocolate", 179 | "off", 180 | "young", 181 | "eaten", 182 | 
"ivory", 183 | "discolored", 184 | "light", 185 | "decorative", 186 | "dense", 187 | "baby", 188 | "low", 189 | "pulled back", 190 | "teal", 191 | "alert", 192 | "spread", 193 | "perched", 194 | "immature", 195 | "textured", 196 | "outdoors", 197 | "collared", 198 | "shaped", 199 | "inflatable", 200 | "elevated", 201 | "strawberry", 202 | "narrow", 203 | "reflected", 204 | "thin", 205 | "vanilla", 206 | "parked", 207 | "indoors", 208 | "sheer", 209 | "rippling", 210 | "pale", 211 | "hard", 212 | "antique", 213 | "warm", 214 | "dull", 215 | "pretty", 216 | "comfortable", 217 | "wooded", 218 | "funny", 219 | "colorful", 220 | "handmade", 221 | "curly", 222 | "groomed", 223 | "displayed", 224 | "corded", 225 | "straight", 226 | "uneven", 227 | "tilted", 228 | "complete", 229 | "modern", 230 | "vibrant", 231 | "homemade", 232 | "vintage", 233 | "rippled", 234 | "balding", 235 | "adult", 236 | "forested", 237 | "deep", 238 | "tall", 239 | "tangled", 240 | "wavy", 241 | "elderly", 242 | "sandy", 243 | "thick", 244 | "manicured", 245 | "ornamental", 246 | "light colored", 247 | "old" 248 | ], 249 | "size": [ 250 | "huge", 251 | "miniature", 252 | "tiny", 253 | "giant", 254 | "little", 255 | "massive", 256 | "oversized", 257 | "large", 258 | "small", 259 | "skinny", 260 | "chubby", 261 | "vast", 262 | "fat" 263 | ], 264 | "activity": [ 265 | "walking", 266 | "sliding", 267 | "having meeting", 268 | "posing", 269 | "skiing", 270 | "sitting", 271 | "squatting", 272 | "hitting", 273 | "blowing", 274 | "drinking", 275 | "waving", 276 | "looking up", 277 | "blooming", 278 | "driving", 279 | "crashing", 280 | "staring", 281 | "laughing", 282 | "standing", 283 | "cooking", 284 | "riding", 285 | "skating", 286 | "performing trick", 287 | "snowboarding", 288 | "kneeling", 289 | "crouching", 290 | "talking", 291 | "batting", 292 | "smiling", 293 | "looking down", 294 | "bending", 295 | "hanging", 296 | "playing", 297 | "skateboarding", 298 | "running", 299 | "flying", 300 | "eating", 301 | "grazing", 302 | "waiting", 303 | "jumping", 304 | "splashing", 305 | "spinning", 306 | "resting", 307 | "swinging", 308 | "reading", 309 | "spraying", 310 | "surfing", 311 | "sleeping", 312 | "lying", 313 | "swimming" 314 | ], 315 | "state": [ 316 | "sliced", 317 | "lighted", 318 | "toasted", 319 | "ripe", 320 | "fried", 321 | "shaved", 322 | "abandoned", 323 | "trimmed", 324 | "fenced", 325 | "painted", 326 | "dried", 327 | "juicy", 328 | "diced", 329 | "barefoot", 330 | "bunched", 331 | "drawn", 332 | "suspended", 333 | "seasoned", 334 | "shirtless", 335 | "rolled", 336 | "potted", 337 | "uncomfortable", 338 | "overcast", 339 | "grated", 340 | "stained", 341 | "chopped", 342 | "messy", 343 | "crowded", 344 | "raised", 345 | "vacant", 346 | "crossed", 347 | "cushioned", 348 | "faded", 349 | "decorated", 350 | "shadowed", 351 | "piled", 352 | "powdered", 353 | "padded", 354 | "shredded", 355 | "wrapped", 356 | "sealed", 357 | "mowed", 358 | "barren", 359 | "clean", 360 | "turned", 361 | "overgrown", 362 | "framed", 363 | "breakable", 364 | "chipped", 365 | "damaged", 366 | "crumbled", 367 | "hazy", 368 | "edged", 369 | "sunny", 370 | "partly cloudy", 371 | "cloudy", 372 | "gloved", 373 | "clumped", 374 | "patched", 375 | "dirty", 376 | "full", 377 | "inflated", 378 | "snowy", 379 | "short sleeved", 380 | "packed", 381 | "sunlit", 382 | "uncooked", 383 | "roasted", 384 | "rotten", 385 | "glazed", 386 | "scattered", 387 | "bald", 388 | "grouped", 389 | "torn", 390 | "glowing", 391 | "unoccupied", 392 | "hollow", 393 | 
"scrambled", 394 | "illuminated", 395 | "rimmed", 396 | "tied", 397 | "leafless", 398 | "peeled", 399 | "sculpted", 400 | "fallen", 401 | "upholstered", 402 | "fresh", 403 | "unpeeled", 404 | "half full", 405 | "packaged", 406 | "open", 407 | "melting", 408 | "closed", 409 | "unripe", 410 | "covered", 411 | "mounted", 412 | "worn", 413 | "sprinkled", 414 | "foggy", 415 | "sleeveless", 416 | "unlit", 417 | "cluttered", 418 | "carved", 419 | "grilled", 420 | "frozen", 421 | "baked", 422 | "iced", 423 | "incomplete", 424 | "steamed", 425 | "blurry", 426 | "boiled", 427 | "stormy", 428 | "lit", 429 | "shut", 430 | "written", 431 | "unhealthy", 432 | "blank", 433 | "neat", 434 | "bare", 435 | "connected", 436 | "folding", 437 | "wet", 438 | "shaded", 439 | "peeling", 440 | "folded", 441 | "muscular", 442 | "filled", 443 | "stuffed", 444 | "tight", 445 | "empty", 446 | "shining", 447 | "long sleeved", 448 | "stacked", 449 | "browned", 450 | "cloudless", 451 | "printed", 452 | "busy", 453 | "misty", 454 | "rainy", 455 | "murky", 456 | "raw", 457 | "burnt", 458 | "recessed", 459 | "choppy", 460 | "melted", 461 | "cooked", 462 | "broken", 463 | "docked" 464 | ], 465 | "material": [ 466 | "water", 467 | "rock", 468 | "bamboo", 469 | "soap", 470 | "paper", 471 | "wood", 472 | "metal", 473 | "hardwood", 474 | "cardboard", 475 | "cheese", 476 | "tomato", 477 | "apple", 478 | "gas", 479 | "tin", 480 | "aluminum", 481 | "cotton", 482 | "asphalt", 483 | "mesh", 484 | "styrofoam", 485 | "silk", 486 | "banana", 487 | "granite", 488 | "wicker", 489 | "steel", 490 | "crystal", 491 | "vinyl", 492 | "concrete", 493 | "leather", 494 | "porcelain", 495 | "beer", 496 | "plastic", 497 | "diamond", 498 | "straw", 499 | "rubber", 500 | "fire", 501 | "iron", 502 | "pine", 503 | "glass", 504 | "palm", 505 | "wire", 506 | "cobblestone", 507 | "wool", 508 | "jeans", 509 | "gravel", 510 | "soda", 511 | "cloth", 512 | "stainless steel", 513 | "denim", 514 | "brick", 515 | "pepper", 516 | "coffee", 517 | "lace", 518 | "brass", 519 | "ceramic", 520 | "clay", 521 | "chrome", 522 | "marble", 523 | "chalk", 524 | "pizza", 525 | "snow", 526 | "stone" 527 | ], 528 | "texture": [ 529 | "plaid", 530 | "frosted", 531 | "crumpled", 532 | "braided", 533 | "quilted", 534 | "wrinkled", 535 | "paneled", 536 | "knotted", 537 | "crispy", 538 | "crusty", 539 | "beaded", 540 | "muddy", 541 | "barbed", 542 | "foamy", 543 | "reflective", 544 | "unpaved", 545 | "bushy", 546 | "creamy", 547 | "ruffled", 548 | "furry", 549 | "carpeted", 550 | "flowered", 551 | "polished", 552 | "jagged", 553 | "coarse", 554 | "fuzzy", 555 | "dusty", 556 | "soft", 557 | "puffy", 558 | "dry", 559 | "wrinkly", 560 | "glossy", 561 | "wispy", 562 | "tiled", 563 | "shaggy", 564 | "greasy", 565 | "patchy", 566 | "hairy", 567 | "fluffy", 568 | "plush", 569 | "woven", 570 | "floral", 571 | "shiny", 572 | "shingled", 573 | "rugged", 574 | "ridged", 575 | "rusty", 576 | "dotted", 577 | "spiky", 578 | "patterned", 579 | "speckled", 580 | "grassy", 581 | "feathered", 582 | "smooth", 583 | "crisp", 584 | "floppy", 585 | "ornate", 586 | "knit", 587 | "leafy", 588 | "rough", 589 | "striped" 590 | ], 591 | "mood": [ 592 | "sad", 593 | "angry", 594 | "sleepy", 595 | "happy", 596 | "unhappy", 597 | "curious", 598 | "calm" 599 | ], 600 | "shape": [ 601 | "crooked", 602 | "triangular", 603 | "pointy", 604 | "elongated", 605 | "oblong", 606 | "octagonal", 607 | "sloped", 608 | "curved", 609 | "round", 610 | "domed", 611 | "rounded", 612 | "bent", 613 | "curled", 614 | "winding", 615 | 
"angled", 616 | "spiral", 617 | "rectangular", 618 | "twisted", 619 | "irregular", 620 | "steep", 621 | "square", 622 | "flat", 623 | "cylindrical", 624 | "arched", 625 | "curvy" 626 | ], 627 | "orientation": [ 628 | "horizontal", 629 | "down", 630 | "upside down", 631 | "overhead", 632 | "upper", 633 | "vertical", 634 | "up", 635 | "lower" 636 | ], 637 | "gender": [ 638 | "female", 639 | "male" 640 | ] 641 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TaskMeAnything 2 | 3 | 4 | 5 | 6 | 7 |

Task Me Anything

8 | 9 |

10 | 11 |

12 | 13 |

🌐 Website | 📑 Paper | 🤗 Huggingface | 💻 Interface

14 | 15 |
If you like our project, please give us a star ⭐ on GitHub for the latest updates.
16 | 17 | 18 | 19 | ## 🔔News 20 | **🔥[2024-09-26]: Task Me Anything was accepted to the NeurIPS 2024 Datasets & Benchmarks track!** 21 | 22 | **🔥[2024-08-03]: TaskMeAnything-v1-2024 released! A benchmark that reflects the current progress of MLMs by `automatically` finding tasks that popular MLMs struggle with, using the `TaskMeAnything Top-K query and query approximation algorithms`. This includes [12,270 ImageQA](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-2024) and [3,567 VideoQA](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-videoqa-2024) questions that TaskMeAnything automatically approximated as challenging.** 23 | 24 | **🔥[2024-07-04]: Demo for TaskMeAnything released! Check out our demos for [generating customized ImageQA, VideoQA benchmarks](https://github.com/JieyuZ2/TaskMeAnything/tree/main/demo/generate) and [model evaluation query](https://github.com/JieyuZ2/TaskMeAnything/tree/main/demo/query)!** 25 | 26 | **🔥[2024-06-17]: Paper arXived!** 27 | 28 | **🔥[2024-06-01]: Code released!** 29 | 30 | ## What's TaskMeAnything? 31 | TaskMeAnything is a benchmark generation engine that produces a benchmark for large multimodal language models (MLMs) tailored to a user's needs. 32 | In particular, TaskMeAnything maintains an extendable taxonomy of visual assets and can programmatically generate a vast number of task instances. 33 | Additionally, it algorithmically addresses user queries regarding MLM performance efficiently within a computational budget. 34 | The current version can generate > 750M image/video question-answering pairs, which focus on evaluating MLM perceptual capabilities. 35 | 36 | :exclamation: **TaskMeAnything does NOT involve any AI model during image/video, question, and answer generation, so the generated tasks do NOT suffer from model imperfection or hallucinations.** 37 | 38 | We release the following resources: 39 | 1. [**TaskMeAnything-v1**](https://github.com/JieyuZ2/TaskMeAnything): the first version of TaskMeAnything, which includes 28 task generators that can generate over 750M VQA tasks. 40 | 2. **TaskMeAnything-v1-Random**[[ImageQA](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-random)|[VideoQA](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-videoqa-random)]: A randomly selected subset of TaskMeAnything-v1, including 5,700 ImageQA and 1,800 VideoQA task instances. 41 | 3. **TaskMeAnything-v1-2024**[[ImageQA](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-2924)|[VideoQA](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-videoqa-2024)]: A benchmark that reflects the current progress of MLMs by `automatically` finding tasks that popular MLMs struggle with, using the TaskMeAnything Top-K query and query approximation algorithms. This includes [12,270 ImageQA](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-2024) and [3,567 VideoQA](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-videoqa-2024) questions that TaskMeAnything automatically approximated as challenging for over 20 popular MLMs. 42 | 4. [**TaskMeAnything-DB**](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-eval-db): A database for TaskMeAnything, which stores the evaluation results of 13 open-source MLMs over 1M VQA task instances. 43 | 5. [**TaskMeAnything-UI**](): An interactive graphical interface built upon TaskMeAnything-DB, which allows users to interact with the performance of models on TaskMeAnything-v1 in an intuitive way.
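For a sense of how these resources fit together, here is a minimal evaluation sketch over the TaskMeAnything-v1-Random ImageQA benchmark listed above. It is illustrative only: it combines the Hugging Face `datasets` library with the repo's `ImageQAModel` wrapper (introduced later in this README), and it assumes the ImageQA splits expose `image`, `question`, `options`, and `answer` fields and that `model.qa` returns free-form text; check the dataset cards and the demo notebooks for the exact schema and the official option-matching logic.

```python
# Minimal evaluation sketch (assumptions: the split exposes 'image', 'question',
# 'options', and 'answer' fields, and ImageQAModel.qa returns a text response).
import datasets

from tma.models.qa_model import ImageQAModel
from tma.models.qa_model.prompt import detailed_imageqa_prompt

# One task-generator split of the random benchmark, e.g. '2d_how_many'.
dataset = datasets.load_dataset('weikaih/TaskMeAnything-v1-imageqa-random', split='2d_how_many')

model = ImageQAModel(
    model_name="llava-v1.5-7b",
    prompt_name="detailed",
    prompt_func=detailed_imageqa_prompt
)

correct = 0
for example in dataset:
    # Fold the multiple-choice options into the question text.
    choices = ', '.join(example['options'])
    prompt = f"{example['question']} Choose one of: {choices}."
    prediction = model.qa(example['image'], prompt)
    # Naive string match; the demo notebooks use a more careful option matcher.
    correct += int(example['answer'].lower() in prediction.lower())

print(f"accuracy: {correct / len(dataset):.3f}")
```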
44 | 45 | 46 | 47 | ## TaskMeAnything-v1 48 | 49 | ### Usage 50 | Demo for TaskMeAnything released! Check out our demos for 51 | * [generating customized ImageQA, VideoQA benchmarks](https://github.com/JieyuZ2/TaskMeAnything/tree/main/demo/generate) 52 | * [model evaluation query](https://github.com/JieyuZ2/TaskMeAnything/tree/main/demo/query) 53 | 54 | Note: if you want to evaluate VideoQA models, please check our [videoqa model branch](https://github.com/JieyuZ2/TaskMeAnything/tree/videoqa_model) 55 | 56 | 57 | ### Installation 58 | You can download the repo and set up the environment via: 59 | ``` 60 | git clone https://github.com/JieyuZ2/TaskMeAnything.git 61 | cd ./TaskMeAnything 62 | 63 | pip install -r requirements.txt 64 | ``` 65 | 66 | Note: if you want to render 3D images/videos with `Blender` locally, or use `Internvl-chat-v1.5-24B`, which requires `flash-attn` (hard to install via pip), you can use the Docker image we provide. 67 | You can pull the Docker image from DockerHub; it includes all the dependencies, such as `Blender`, `flash-attn`, the `cuda driver`, `nvcc`, etc. 68 | ``` 69 | docker pull weikaih/ubuntu20.4_internvl_blender_v1.2:latest 70 | docker run --gpus all -it weikaih/ubuntu20.4_internvl_blender_v1.2:latest /bin/bash # run the docker image with GPU support 71 | 72 | git clone https://github.com/JieyuZ2/TaskMeAnything.git 73 | cd ./TaskMeAnything 74 | 75 | pip install -r requirements.txt 76 | ``` 77 | 78 | 79 | ### Source data 80 | Source data is stored on [HuggingFace](https://huggingface.co/datasets/jieyuz2/TaskMeAnything-v1-source). It includes `3d_assets`, `agqa_video`, and `object_images`. 81 | 82 | For real images with scene graphs, please download the images and scene graphs from the following links: [SceneGraph](https://downloads.cs.stanford.edu/nlp/data/gqa/sceneGraphs.zip), [Image](https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip). 83 | After downloading, move the scene graphs and images into the source data folder and arrange them in the format below. 84 | ``` 85 | TaskMeAnything-v1-source/vg/sceneGraphs: move scene graph files to this folder (e.g. TaskMeAnything-v1-source/vg/sceneGraphs/train_sceneGraphs.json). 86 | TaskMeAnything-v1-source/vg/images/images: move all the images to this folder (e.g. TaskMeAnything-v1-source/vg/images/images/2323739.jpg). 87 | ``` 88 | 89 | 90 | ### Task Generator 91 | We have 28 task generators in TaskMeAnything-v1, across 5 scenarios: 92 | 1. `2D Sticker Image`: grid-how-many, grid-what, grid-where, grid-what-attribute, grid-where-attribute 93 | 2. `3D Tabletop Image`: 3d-what, 3d-where, 3d-what-attribute, 3d-where-attribute, 3d-how-many, 3d-what-size, 3d-where-size, 3d-what-attribute-size, 3d-what-distance, 3d-where-distance, 3d-what-attribute-distance 94 | 3. `3D Tabletop Video`: video-3d-what-move, video-3d-where-move, video-3d-what-attribute-move, video-3d-what-rotate, video-3d-where-rotate, video-3d-what-attribute-rotate 95 | 4. `Real Images`: sg-what-object, sg-what-relation, sg-what-attribute 96 | 5. 
`Real Videos`: video-sg-what-object, video-sg-what-relation, video-sg-what-action 97 | 98 | ### Tested Models 99 | We support the following ImageQA and VideoQA models: 100 | - `ImageQA`: qwenvl-chat, qwenvl, llavav1.5-7b, llavav1.5-13b, instructblip-vicuna7b, instructblip-vicuna13b, internvl-chat-v1.5, gemini-vision-pro, qwen-vl-max, gpt4v, gpt4o 101 | - `VideoQA`: video-llama2-7b, video-llama2-13b, video-llava-7b, chat-univi-7b, chat-univi-13b, video-chatgpt-7b, video-chat2-7b 102 | 103 | 104 | 105 | 106 | You can also use our unified VQA interface for inference: 107 | ```python 108 | from PIL import Image 109 | from tma.models.qa_model import ImageQAModel 110 | # from tma.models.qa_model.prompt import succinct_prompt 111 | from tma.models.qa_model.prompt import detailed_imageqa_prompt 112 | 113 | model = ImageQAModel( 114 |     model_name="llava-v1.5-7b", 115 |     prompt_name="detailed", 116 |     prompt_func=detailed_imageqa_prompt 117 | ) 118 | 119 | image = './path/to/image.jpg' 120 | # or image = Image.open(image_path) 121 | question = "Describe the image." 122 | 123 | model.qa(image, question) 124 | ``` 125 | Or check the [videoqa model branch](https://github.com/JieyuZ2/TaskMeAnything/tree/videoqa_model) for VideoQA model inference. 126 | 127 | ## TaskMeAnything-v1 Benchmark 128 | Currently, we provide two versions of the TaskMeAnything-v1 benchmark: 129 | * TaskMeAnything-v1-Random: [[ImageQA](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-random)|[VideoQA](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-videoqa-random)]: A randomly selected subset of TaskMeAnything-v1, including 5,700 ImageQA and 1,800 VideoQA task instances. 130 | * TaskMeAnything-v1-2024: [[ImageQA](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-2924)|[VideoQA](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-videoqa-2024)]: A benchmark that reflects the current progress of MLMs by `automatically` finding tasks that popular MLMs struggle with, using the TaskMeAnything Top-K query and query approximation algorithms. 131 | 132 | ### Load TaskMeAnything-v1 ImageQA Dataset 133 | ```python 134 | import datasets 135 | dataset_name = 'weikaih/TaskMeAnything-v1-imageqa-random' 136 | # dataset_name = 'weikaih/TaskMeAnything-v1-imageqa-2024' 137 | dataset = datasets.load_dataset(dataset_name, split=TASK_GENERATOR_SPLIT) 138 | ``` 139 | where `TASK_GENERATOR_SPLIT` is one of the task generators, e.g., `2d_how_many`. 140 | 141 | 142 | ### Load TaskMeAnything-v1 VideoQA Dataset and Convert Video Binary Stream to mp4 143 | * Since Hugging Face does not support storing .mp4 files in datasets, we save videos as binary streams. After loading, you can convert a video binary stream to .mp4 as follows. 144 | ```python 145 | import datasets 146 | 147 | dataset_name = 'weikaih/TaskMeAnything-v1-videoqa-random' 148 | # dataset_name = 'weikaih/TaskMeAnything-v1-videoqa-2024' 149 | dataset = datasets.load_dataset(dataset_name, split=TASK_GENERATOR_SPLIT) 150 | 151 | # example: convert a binary stream in the dataset to an .mp4 file 152 | video_binary = dataset[0]['video'] 153 | with open('/path/save/video.mp4', 'wb') as f: 154 |     f.write(video_binary) 155 | ``` 156 | 157 | ### Evaluation results on the TaskMeAnything-v1 benchmark 158 | * ImageQA in Random 159 | 

160 | 161 |

162 | 163 | * VideoQA in Random 164 |

165 | 166 |

167 | 168 | * ImageQA in 2024 169 |

170 | 171 |

172 | 173 | * VideoQA in 2024 174 |

175 | 176 |

177 | 178 | * TaskMeAnything-v1-2024 v.s TaskMeAnything-v1-Random 179 |

180 | 181 |

182 |

183 | 184 |

185 | 186 | * we can see that the performance drops are more significant in the 2024 version, which indicates that the 2024 version is more challenging for the models. 187 | 188 | 189 | For more details, please check out the [paper](https://arxiv.org/abs/2406.11775). 190 | 191 | ## TaskMeAnything-DB 192 | **TaskMeAnything-DB** are stored in [HuggingFace](https://huggingface.co/datasets/jieyuz2/TaskMeAnything-v1-db) 193 | 194 | ## TaskMeAnything-UI 195 | **TaskMeAnything-UI** are hosted in [HuggingFace](todo), check out our interactive interface to explore the performance of models on TaskMeAnything-v1 in your own way! 196 | 197 | ## Disclaimers 198 | **TaskMeAnything** and its associated resources are provided for research and educational purposes only. 199 | The authors and contributors make no warranties regarding the accuracy or reliability of the data and software. 200 | Users are responsible for ensuring their use complies with applicable laws and regulations. 201 | The project is not liable for any damages or losses resulting from the use of these resources. 202 | 203 | 204 | ## Contact 205 | 206 | - Jieyu Zhang: jieyuz2@cs.washington.edu 207 | 208 | ## Citation 209 | 210 | **BibTeX:** 211 | 212 | ```bibtex 213 | @inproceedings{zhang2024task, 214 | title={Task Me Anything}, 215 | author={Zhang, Jieyu and Huang, Weikai and Ma, Zixian and Michel, Oscar and He, Dong and Gupta, Tanmay and Ma, Wei-Chiu and Farhadi, Ali and Kembhavi, Aniruddha and Krishna, Ranjay}, 216 | booktitle={Thirty-Eighth Annual Conference on Neural Information Processing Systems Datasets and Benchmarks Track}, 217 | year={2024} 218 | } 219 | ``` 220 | 221 | -------------------------------------------------------------------------------- /tma/videoqa/scene_graph/single_video_task.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import numpy as np 4 | from tqdm import tqdm 5 | 6 | from ..metadata import VideoSceneGraphMetaData 7 | from ...base import TaskGenerator 8 | from ...task_store import TaskStore 9 | 10 | 11 | def load_mp4_video(video_path): 12 | with open(video_path, "rb") as file: 13 | mp4_data = file.read() 14 | return mp4_data 15 | 16 | 17 | def enumerate_target_relation_to_possible_reference_actions(video_scene_graph, relation_type, temporal_reference_type): 18 | relation_to_actions = {} 19 | video_scene_graph_keyframes = list(video_scene_graph.keys()) 20 | 21 | if temporal_reference_type == "before": 22 | for idx, keyframe_name in enumerate(video_scene_graph_keyframes[:-1]): 23 | next_keyframe_name = video_scene_graph_keyframes[idx + 1] 24 | for relation, obj in video_scene_graph[keyframe_name][relation_type].items(): 25 | if relation not in video_scene_graph[next_keyframe_name][relation_type]: 26 | if relation not in relation_to_actions: 27 | relation_to_actions[(relation, obj)] = set() 28 | for after_keyframe in video_scene_graph_keyframes[idx + 1:]: 29 | for action in video_scene_graph[after_keyframe]['actions']: 30 | if action not in video_scene_graph[keyframe_name]['actions']: 31 | relation_to_actions[(relation, obj)].add(action) 32 | 33 | elif temporal_reference_type == "after": 34 | for idx, keyframe_name in enumerate(video_scene_graph_keyframes[1:], start=1): 35 | previous_keyframe_name = video_scene_graph_keyframes[idx - 1] 36 | for relation, obj in video_scene_graph[keyframe_name][relation_type].items(): 37 | if relation not in video_scene_graph[previous_keyframe_name][relation_type]: 38 | if relation not in 
relation_to_actions: 39 | relation_to_actions[(relation, obj)] = set() 40 | for before_keyframe in video_scene_graph_keyframes[:idx]: 41 | for action in video_scene_graph[before_keyframe]['actions']: 42 | if action not in video_scene_graph[keyframe_name]['actions']: 43 | relation_to_actions[(relation, obj)].add(action) 44 | 45 | elif temporal_reference_type == "while": 46 | for idx, keyframe_name in enumerate(video_scene_graph_keyframes): 47 | for relation, obj in video_scene_graph[keyframe_name][relation_type].items(): 48 | if relation not in relation_to_actions: 49 | relation_to_actions[(relation, obj)] = set() 50 | for action in video_scene_graph[keyframe_name]['actions']: 51 | relation_to_actions[(relation, obj)].add(action) 52 | 53 | # Convert sets to lists for the output 54 | relation_to_actions = {k: list(v) for k, v in relation_to_actions.items()} 55 | return relation_to_actions 56 | 57 | 58 | def enumerate_target_action_to_possible_reference_actions(video_scene_graph, temporal_reference_type): 59 | action_to_actions = {} 60 | video_scene_graph_keyframes = list(video_scene_graph.keys()) 61 | 62 | if temporal_reference_type == "before": 63 | for idx, keyframe_name in enumerate(video_scene_graph_keyframes[:-1]): 64 | next_keyframe_name = video_scene_graph_keyframes[idx + 1] 65 | for action in video_scene_graph[keyframe_name]['actions']: 66 | if action not in video_scene_graph[next_keyframe_name]['actions']: 67 | if action not in action_to_actions: 68 | action_to_actions[action] = set() 69 | for after_keyframe in video_scene_graph_keyframes[idx + 1:]: 70 | for reference_action in video_scene_graph[after_keyframe]['actions']: 71 | if reference_action not in video_scene_graph[keyframe_name]['actions'] and reference_action != action: 72 | action_to_actions[action].add(reference_action) 73 | 74 | elif temporal_reference_type == "after": 75 | for idx, keyframe_name in enumerate(video_scene_graph_keyframes[1:], start=1): 76 | previous_keyframe_name = video_scene_graph_keyframes[idx - 1] 77 | for action in video_scene_graph[keyframe_name]['actions']: 78 | if action not in video_scene_graph[previous_keyframe_name]['actions']: 79 | if action not in action_to_actions: 80 | action_to_actions[action] = set() 81 | for before_keyframe in video_scene_graph_keyframes[:idx]: 82 | for reference_action in video_scene_graph[before_keyframe]['actions']: 83 | if reference_action not in video_scene_graph[keyframe_name]['actions'] and reference_action != action: 84 | action_to_actions[action].add(reference_action) 85 | 86 | elif temporal_reference_type == "while": 87 | for idx, keyframe_name in enumerate(video_scene_graph_keyframes): 88 | for action in video_scene_graph[keyframe_name]['actions']: 89 | if action not in action_to_actions: 90 | action_to_actions[action] = set() 91 | for reference_action in video_scene_graph[keyframe_name]['actions']: 92 | if reference_action != action: 93 | action_to_actions[action].add(reference_action) 94 | 95 | # Convert sets to lists for the output 96 | action_to_actions = {k: list(v) for k, v in action_to_actions.items()} 97 | return action_to_actions 98 | 99 | 100 | def get_all_spatial_relations(video_scene_graph): 101 | relations = set() 102 | for keyframe_name, keyframe in video_scene_graph.items(): 103 | relations.update(keyframe['spatial']) 104 | return relations 105 | 106 | 107 | def get_all_contact_relations(video_scene_graph): 108 | relations = set() 109 | for keyframe_name, keyframe in video_scene_graph.items(): 110 | relations.update(keyframe['contact']) 111 | 
return relations 112 | 113 | 114 | def get_all_objects(video_scene_graph): 115 | objects = set() 116 | for keyframe_name, keyframe in video_scene_graph.items(): 117 | for relation in keyframe['spatial']: 118 | objects.add(keyframe['spatial'][relation]) 119 | for relation in keyframe['contact']: 120 | objects.add(keyframe['contact'][relation]) 121 | return objects 122 | 123 | 124 | def get_all_actions(video_scene_graph): 125 | actions = set() 126 | for keyframe_name, keyframe in video_scene_graph.items(): 127 | actions.update(keyframe['actions']) 128 | return actions 129 | 130 | 131 | class VideoSceneGraphTaskGenerator(TaskGenerator): 132 | metadata: VideoSceneGraphMetaData 133 | 134 | embed_schema = [ 135 | "task type", 136 | "object", 137 | "relation", 138 | "action", 139 | "reference action", 140 | "relation type", 141 | "temporal reference type", 142 | ] 143 | 144 | def __init__(self, metadata: VideoSceneGraphMetaData, seed=42): 145 | super().__init__(metadata, seed=seed) 146 | 147 | def _generate_task(self, task_plan) -> Tuple[str, str, List[str], str]: 148 | "(Abstract method) generate task" 149 | 150 | def _task_plan_to_str(self, task_plan) -> str: 151 | t = [] 152 | for k, v in task_plan.items(): 153 | if k in self.embed_schema: 154 | assert isinstance(v, str) 155 | t.append(f'{k}: {v}') 156 | return '\n'.join(t) 157 | 158 | def generate(self, task_plan, return_data=True, seed=None): 159 | if seed is not None: 160 | self.rng = np.random.default_rng(seed=seed) 161 | 162 | question, answer, options, video_scene_graph_id = self._generate_task(task_plan) 163 | 164 | task = { 165 | "question" : question, 166 | "answer" : answer, 167 | "options" : options, 168 | "task_plan" : self._task_plan_to_str(task_plan), 169 | "video_scene_graph_id": video_scene_graph_id, 170 | 'video' : load_mp4_video(self.metadata.get_video_path(video_scene_graph_id)) if return_data else None 171 | } 172 | return task 173 | 174 | 175 | class WhatObjectVideoSceneGraphTaskGenerator(VideoSceneGraphTaskGenerator): 176 | schema = { 177 | "task type" : "str", 178 | "object" : "str", 179 | "relation" : "str", 180 | "reference action" : "str", 181 | "relation type" : "str", 182 | "temporal reference type": "str", 183 | "video scene graph id" : "str", 184 | } 185 | 186 | def enumerate_task_plans(self, task_store: TaskStore): 187 | for video_scene_graph_id, video_scene_graph in tqdm(self.metadata.video_scene_graphs.items(), desc="enumerating [what object video] task"): 188 | for relation_type in ["spatial", "contact"]: 189 | for temporal_reference_type in ["before", "after", "while"]: 190 | target_relation_to_possible_reference_actions = enumerate_target_relation_to_possible_reference_actions(video_scene_graph, relation_type, temporal_reference_type) 191 | for (target_relation, target_object), possible_reference_actions in target_relation_to_possible_reference_actions.items(): 192 | for reference_action in possible_reference_actions: 193 | task_plan = { 194 | "task type" : "what object video", 195 | "video scene graph id" : video_scene_graph_id, 196 | "object" : self.metadata.idx2name[target_object], 197 | "relation" : self.metadata.idx2name[target_relation], 198 | 'relation type' : relation_type, 199 | "reference action" : self.metadata.idx2name[reference_action], 200 | "temporal reference type": temporal_reference_type, 201 | } 202 | task_store.add(task_plan) 203 | 204 | def _generate_task(self, task_plan): 205 | question = f"What is the object that the person is {task_plan['relation']} {task_plan['temporal reference 
type']} the person {task_plan['reference action']}?" 206 | 207 | answer = task_plan["object"] 208 | negatives = list(set(self.metadata.objects) - get_all_objects(self.metadata.video_scene_graphs[task_plan["video scene graph id"]])) 209 | negatives = [self.metadata.idx2name[neg] for neg in negatives] 210 | 211 | options = self._compose_options(answer, negatives) 212 | return question, answer, options, task_plan["video scene graph id"] 213 | 214 | 215 | class WhatRelationVideoSceneGraphTaskGenerator(VideoSceneGraphTaskGenerator): 216 | schema = { 217 | "task type" : "str", 218 | "object" : "str", 219 | "relation" : "str", 220 | "reference action" : "str", 221 | "relation type" : "str", 222 | "temporal reference type": "str", 223 | "video scene graph id" : "str", 224 | } 225 | 226 | def enumerate_task_plans(self, task_store: TaskStore): 227 | for video_scene_graph_id, video_scene_graph in tqdm(self.metadata.video_scene_graphs.items(), desc="enumerating [what relation video] task"): 228 | for relation_type in ["spatial", "contact"]: 229 | for temporal_reference_type in ["before", "after", "while"]: 230 | target_relation_to_possible_reference_actions = enumerate_target_relation_to_possible_reference_actions(video_scene_graph, relation_type, temporal_reference_type) 231 | for (target_relation, target_object), possible_reference_actions in target_relation_to_possible_reference_actions.items(): 232 | for reference_action in possible_reference_actions: 233 | task_plan = { 234 | "task type" : "what relation video", 235 | "video scene graph id" : video_scene_graph_id, 236 | "object" : self.metadata.idx2name[target_object], 237 | "relation" : self.metadata.idx2name[target_relation], 238 | 'relation type' : relation_type, 239 | "reference action" : self.metadata.idx2name[reference_action], 240 | "temporal reference type": temporal_reference_type, 241 | } 242 | task_store.add(task_plan) 243 | 244 | def _generate_task(self, task_plan): 245 | if task_plan["relation type"] == "spatial": 246 | question = f"What is the spatial relation of the person to the {task_plan['object']} {task_plan['temporal reference type']} the person {task_plan['reference action']}?" 247 | negatives = list(set(self.metadata.spatial_relations) - get_all_spatial_relations(self.metadata.video_scene_graphs[task_plan["video scene graph id"]])) 248 | elif task_plan["relation type"] == "contact": 249 | question = f"What is the person doing to the {task_plan['object']} {task_plan['temporal reference type']} the person {task_plan['reference action']}?" 
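            # Illustrative example (hypothetical values): with object "cup", temporal reference type
            # "before", and reference action "sitting down", the spatial branch above renders
            # "What is the spatial relation of the person to the cup before the person sitting down?"
            # while this contact branch renders
            # "What is the person doing to the cup before the person sitting down?".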
250 | negatives = list(set(self.metadata.contact_relations) - get_all_contact_relations(self.metadata.video_scene_graphs[task_plan["video scene graph id"]])) 251 | else: 252 | raise ValueError(f"Unknown relation type: {task_plan['relation type']}") 253 | 254 | answer = task_plan['relation'] 255 | negatives = [self.metadata.idx2name[neg] for neg in negatives] 256 | 257 | options = self._compose_options(answer, negatives) 258 | return question, answer, options, task_plan["video scene graph id"] 259 | 260 | 261 | class WhatActionVideoSceneGraphTaskGenerator(VideoSceneGraphTaskGenerator): 262 | schema = { 263 | "task type" : "str", 264 | "action" : "str", 265 | "reference action" : "str", 266 | "relation type" : "str", 267 | "temporal reference type": "str", 268 | "video scene graph id" : "str", 269 | } 270 | 271 | def enumerate_task_plans(self, task_store: TaskStore): 272 | for video_scene_graph_id, video_scene_graph in tqdm(self.metadata.video_scene_graphs.items(), desc="enumerating [what action video] task"): 273 | for temporal_reference_type in ["before", "after", "while"]: 274 | target_action_to_possible_reference_actions = enumerate_target_action_to_possible_reference_actions(video_scene_graph, temporal_reference_type) 275 | for target_action, possible_reference_actions in target_action_to_possible_reference_actions.items(): 276 | for reference_action in possible_reference_actions: 277 | task_plan = { 278 | "task type" : "what action video", 279 | "video scene graph id" : video_scene_graph_id, 280 | "action" : self.metadata.idx2name[target_action], 281 | "reference action" : self.metadata.idx2name[reference_action], 282 | "temporal reference type": temporal_reference_type, 283 | } 284 | task_store.add(task_plan) 285 | 286 | def _generate_task(self, task_plan): 287 | question = f"What action is the person doing {task_plan['temporal reference type']} the person {task_plan['reference action']}?" 
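        # Illustrative example (hypothetical values): with temporal reference type "after" and
        # reference action "opening a door", the question renders as
        # "What action is the person doing after the person opening a door?".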
288 | 289 | answer = task_plan["action"] 290 | negatives = list(set(self.metadata.actions) - get_all_actions(self.metadata.video_scene_graphs[task_plan["video scene graph id"]])) 291 | negatives = [self.metadata.idx2name[neg] for neg in negatives] 292 | 293 | options = self._compose_options(answer, negatives) 294 | return question, answer, options, task_plan["video scene graph id"] 295 | -------------------------------------------------------------------------------- /tma/videoqa/tabletop_3d/movement_single_video_task.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | 3 | from .single_video_task import GridVideoTaskGenerator 4 | from .utils import relative_positions 5 | from ..metadata import ObjaverseVideoMetaData as MetaData 6 | from ...constant import NUM_OPTIONS, VIDEO_FPS, VIDEO_NUM_FRAMES 7 | from ...task_store import TaskStore 8 | 9 | grid_options = [2] 10 | DEFAULT_OBJECT_SIZE_MULTIPLIER = 1.3 11 | 12 | moving_options = {'left', 'right', 'up', 'down'} 13 | 14 | 15 | def direction_to_keyframes(direction): 16 | if direction == 'left': 17 | return [{}, {}, {}, {}, {'movement': (0, 0.35)}] 18 | elif direction == 'right': 19 | return [{}, {}, {}, {}, {'movement': (0, -0.35)}] 20 | elif direction == 'up': 21 | return [{}, {}, {}, {}, {'movement': (0.45, 0)}] 22 | elif direction == 'down': 23 | return [{}, {}, {}, {}, {'movement': (-0.45, 0)}] 24 | 25 | 26 | class MovementVideoGridTaskGenerator(GridVideoTaskGenerator): 27 | def _make_video_metadata(self, grid_size, grids, queries, remaining_query=..., target_object_moving_direction='left', are_other_objects_moving="No", object_size_multiplier=DEFAULT_OBJECT_SIZE_MULTIPLIER): 28 | objects = [self.metadata.sample(self.rng, 1, "object", q) for q in queries] 29 | remaining_grids = [g for g in range(grid_size ** 2) if g not in grids] 30 | for _ in remaining_grids: 31 | uid = self.metadata.sample(self.rng, 1, "object", remaining_query) 32 | objects.append(uid) 33 | 34 | remaining_moving_direction = list(moving_options - {target_object_moving_direction}) 35 | keyframes = [direction_to_keyframes(target_object_moving_direction)] 36 | if are_other_objects_moving == "Yes": 37 | remaining_keyframes = [direction_to_keyframes(self.rng.choice(remaining_moving_direction, size=1)) for _ in range(len(remaining_grids))] 38 | else: 39 | remaining_keyframes = [[{}, {}, {}, {}, {}] for _ in range(len(remaining_grids))] 40 | 41 | object_path = {k: self.metadata.get_object_path(k) for k in objects} 42 | angles = [self.metadata.sample_object_angle(self.rng, obj) for obj in objects] 43 | 44 | video_metadata = { 45 | 'grid number' : grid_size, 46 | 'objects' : objects, 47 | 'object_path' : object_path, 48 | 'object_angles' : angles, 49 | 'grids' : grids + remaining_grids, 50 | 'blender_config' : self.metadata.sample_blender_configuration(self.rng), 51 | 'fps' : VIDEO_FPS, 52 | 'total_num_frames': VIDEO_NUM_FRAMES, 53 | 'sizes' : [object_size_multiplier for _ in objects], 54 | 'keyframes' : keyframes + remaining_keyframes, 55 | } 56 | return video_metadata 57 | 58 | 59 | class WhatMovementVideoGridTaskGenerator(MovementVideoGridTaskGenerator): 60 | schema = { 61 | 'task type' : 'str', 62 | 'grid number' : 'int', 63 | 'target category' : 'str', 64 | 'absolute position' : 'str', 65 | 'attribute type' : 'str', 66 | 'attribute value' : 'str', 67 | 'moving direction' : 'str', 68 | 'are other objects moving': 'str' 69 | } 70 | 71 | def enumerate_task_plans(self, task_store: TaskStore): 72 | for 
target_category in tqdm(self.metadata.categories, desc="enumerating [what move video] task"): 73 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 74 | for attribute_type, attribute_values in attribute_dict.items(): 75 | for attribute_value in attribute_values: 76 | for grid_size in grid_options: 77 | for absolute_pos in self.grid_mappings[grid_size]: 78 | for target_object_moving_direction in moving_options: 79 | task_plan = { 80 | 'task type' : 'what move video', 81 | 'grid number' : grid_size, 82 | 'target category' : target_category, 83 | 'absolute position' : absolute_pos, 84 | 'attribute type' : attribute_type, 85 | 'attribute value' : attribute_value, 86 | 'moving direction' : target_object_moving_direction, 87 | 'are other objects moving': "Yes" 88 | } 89 | task_store.add(task_plan) 90 | 91 | task_plan = { 92 | 'task type' : 'what move video', 93 | 'grid number' : grid_size, 94 | 'target category' : target_category, 95 | 'absolute position' : absolute_pos, 96 | 'attribute type' : attribute_type, 97 | 'attribute value' : attribute_value, 98 | 'moving direction' : target_object_moving_direction, 99 | 'are other objects moving': "No" 100 | } 101 | task_store.add(task_plan) 102 | 103 | for grid_size in grid_options: 104 | for absolute_pos in self.grid_mappings[grid_size]: 105 | for target_object_moving_direction in moving_options: 106 | task_plan = { 107 | 'task type' : 'what move video', 108 | 'grid number' : grid_size, 109 | 'target category' : target_category, 110 | 'absolute position' : absolute_pos, 111 | 'moving direction' : target_object_moving_direction, 112 | 'are other objects moving': "Yes" 113 | } 114 | task_store.add(task_plan) 115 | 116 | task_plan = { 117 | 'task type' : 'what move video', 118 | 'grid number' : grid_size, 119 | 'target category' : target_category, 120 | 'absolute position' : absolute_pos, 121 | 'moving direction' : target_object_moving_direction, 122 | 'are other objects moving': "No" 123 | } 124 | task_store.add(task_plan) 125 | 126 | def _generate_task(self, task_plan): 127 | grid_size = task_plan['grid number'] 128 | target_category = task_plan['target category'] 129 | absolute_pos = task_plan['absolute position'] 130 | grids = [self.grid_mappings[grid_size][absolute_pos]] 131 | target_object_moving_direction = task_plan['moving direction'] 132 | 133 | if task_plan['are other objects moving'] == "Yes": 134 | question = f"What is the object that is moving {target_object_moving_direction} in the video?" 135 | else: 136 | question = f"What is the moving object in the video?" 
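        # Illustrative note: the 'are other objects moving' flag decides how specific the question
        # must be. When distractor objects also move, the target is disambiguated by its direction,
        # e.g. (hypothetically) "What is the object that is moving left in the video?"; when only
        # the target moves, the generic phrasing above is sufficient.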
137 | 138 | queries = [self._get_target_object_query(task_plan)] 139 | 140 | remaining_query = self.metadata.and_query([("category", target_category, False)]) 141 | 142 | video_metadata = self._make_video_metadata( 143 | grid_size, 144 | grids, 145 | queries=queries, 146 | remaining_query=remaining_query, 147 | target_object_moving_direction=target_object_moving_direction, 148 | are_other_objects_moving=task_plan['are other objects moving'], 149 | object_size_multiplier=DEFAULT_OBJECT_SIZE_MULTIPLIER 150 | ) 151 | 152 | answer = self.metadata.get_surfacename(target_category) 153 | negatives = [self.metadata.get_surfacename(self.metadata.sample_category_for_object(self.rng, o, target_category)) 154 | for o in video_metadata['objects'][1:]] 155 | options = self._compose_options(answer, negatives) 156 | 157 | return question, answer, options, video_metadata 158 | 159 | 160 | class WhatAttributeMovementVideoGridTaskGenerator(MovementVideoGridTaskGenerator): 161 | schema = { 162 | 'task type' : 'str', 163 | 'grid number' : 'int', 164 | 'target category' : 'str', 165 | 'absolute position' : 'str', 166 | 'attribute type' : 'str', 167 | 'attribute value' : 'str', 168 | 'moving direction' : 'str', 169 | 'are other objects moving': 'str' 170 | } 171 | 172 | def enumerate_task_plans(self, task_store: TaskStore): 173 | for target_category in tqdm(self.metadata.categories, desc="enumerating [what attribute move video] task"): 174 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 175 | for attribute_type, attribute_values in attribute_dict.items(): 176 | for attribute_value in attribute_values: 177 | for grid_size in grid_options: 178 | for absolute_pos in self.grid_mappings[grid_size]: 179 | for target_object_moving_direction in moving_options: 180 | task_plan = { 181 | 'task type' : 'what attribute move video', 182 | 'grid number' : grid_size, 183 | 'target category' : target_category, 184 | 'absolute position' : absolute_pos, 185 | 'attribute type' : attribute_type, 186 | 'attribute value' : attribute_value, 187 | 'moving direction' : target_object_moving_direction, 188 | 'are other objects moving': "Yes" 189 | } 190 | task_store.add(task_plan) 191 | 192 | task_plan = { 193 | 'task type' : 'what attribute move video', 194 | 'grid number' : grid_size, 195 | 'target category' : target_category, 196 | 'absolute position' : absolute_pos, 197 | 'attribute type' : attribute_type, 198 | 'attribute value' : attribute_value, 199 | 'moving direction' : target_object_moving_direction, 200 | 'are other objects moving': "No" 201 | } 202 | task_store.add(task_plan) 203 | 204 | def _generate_task(self, task_plan): 205 | grid_size = task_plan['grid number'] 206 | 207 | attribute_type = task_plan['attribute type'] 208 | absolute_pos = task_plan['absolute position'] 209 | target_object_moving_direction = task_plan['moving direction'] 210 | grids = [self.grid_mappings[grid_size][absolute_pos]] 211 | 212 | queries = [self._get_target_object_query(task_plan)] 213 | if task_plan['are other objects moving'] == "Yes": 214 | question = f"What is the {attribute_type} of the object that is moving {target_object_moving_direction} in the video?" 215 | else: 216 | question = f"What is the {attribute_type} of the moving object in the video?" 
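        # Illustrative note (hypothetical attribute type): for attribute_type "color" and moving
        # direction "down", the disambiguated variant reads
        # "What is the color of the object that is moving down in the video?".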
217 | 218 | video_metadata = self._make_video_metadata( 219 | grid_size, 220 | grids, 221 | queries=queries, 222 | target_object_moving_direction=target_object_moving_direction, 223 | are_other_objects_moving=task_plan['are other objects moving'], 224 | object_size_multiplier=DEFAULT_OBJECT_SIZE_MULTIPLIER 225 | ) 226 | 227 | answer = task_plan['attribute value'] 228 | target_object = video_metadata['objects'][0] 229 | negative_query = self.metadata.and_query([ 230 | (attribute_type, a, False) for a in self.metadata.query_metadata(attribute_type, self.metadata.and_query([("object", target_object, True)])) 231 | ]) 232 | negatives = self.metadata.sample( 233 | self.rng, 234 | NUM_OPTIONS - 1, 235 | attribute_type, 236 | query=negative_query, 237 | ) 238 | options = [answer] + negatives 239 | return question, answer, options, video_metadata 240 | 241 | 242 | class WhereMovementVideoGridTaskGenerator(MovementVideoGridTaskGenerator): 243 | schema = { 244 | 'task type' : 'str', 245 | 'grid number' : 'int', 246 | 'target category' : 'str', 247 | 'absolute position' : 'str', 248 | 'attribute type' : 'str', 249 | 'attribute value' : 'str', 250 | 'moving direction' : 'str', 251 | 'are other objects moving': 'str' 252 | } 253 | 254 | def __init__(self, metadata: MetaData, seed=42): 255 | super().__init__(metadata, seed=seed) 256 | self.relative_positions = relative_positions 257 | 258 | def enumerate_task_plans(self, task_store: TaskStore): 259 | for target_category in tqdm(self.metadata.categories, desc="enumerating [where move video] task"): 260 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 261 | for attribute_type, attribute_values in attribute_dict.items(): 262 | for attribute_value in attribute_values: 263 | for grid_size in grid_options: 264 | for absolute_pos in self.grid_mappings[grid_size]: 265 | for target_object_moving_direction in moving_options: 266 | task_plan = { 267 | 'task type' : 'where move video', 268 | 'grid number' : grid_size, 269 | 'target category' : target_category, 270 | 'absolute position' : absolute_pos, 271 | 'attribute type' : attribute_type, 272 | 'attribute value' : attribute_value, 273 | 'moving direction' : target_object_moving_direction, 274 | 'are other objects moving': "Yes" 275 | } 276 | task_store.add(task_plan) 277 | 278 | task_plan = { 279 | 'task type' : 'where move video', 280 | 'grid number' : grid_size, 281 | 'target category' : target_category, 282 | 'absolute position' : absolute_pos, 283 | 'attribute type' : attribute_type, 284 | 'attribute value' : attribute_value, 285 | 'moving direction' : target_object_moving_direction, 286 | 'are other objects moving': "No" 287 | } 288 | task_store.add(task_plan) 289 | 290 | for grid_size in grid_options: 291 | for absolute_pos in self.grid_mappings[grid_size]: 292 | for target_object_moving_direction in moving_options: 293 | task_plan = { 294 | 'task type' : 'where move video', 295 | 'grid number' : grid_size, 296 | 'target category' : target_category, 297 | 'absolute position' : absolute_pos, 298 | 'moving direction' : target_object_moving_direction, 299 | 'are other objects moving': "Yes" 300 | } 301 | task_store.add(task_plan) 302 | 303 | task_plan = { 304 | 'task type' : 'where move video', 305 | 'grid number' : grid_size, 306 | 'target category' : target_category, 307 | 'absolute position' : absolute_pos, 308 | 'moving direction' : target_object_moving_direction, 309 | 'are other objects moving': "No" 310 | } 311 | task_store.add(task_plan) 312 | 313 | def 
_generate_task(self, task_plan): 314 | grid_size = task_plan['grid number'] 315 | 316 | target_category = task_plan['target category'] 317 | categories = [target_category] 318 | queries = [self._get_target_object_query(task_plan)] 319 | absolute_pos = task_plan['absolute position'] 320 | grids = [self.grid_mappings[grid_size][absolute_pos]] 321 | target_object_moving_direction = task_plan['moving direction'] 322 | 323 | if task_plan['are other objects moving'] == "Yes": 324 | question = f"Where is the object that is moving {target_object_moving_direction} located in the video?" 325 | else: 326 | question = f"Where is the moving object located in the video?" 327 | answer = absolute_pos 328 | negatives = [o for o in self.grid_mappings[grid_size].keys() if o != answer] 329 | 330 | options = self._compose_options(answer, negatives) 331 | video_metadata = self._make_video_metadata( 332 | grid_size, 333 | grids, 334 | queries=queries, 335 | remaining_query=self.metadata.and_query([("category", c, False) for c in categories]), 336 | target_object_moving_direction=target_object_moving_direction, 337 | are_other_objects_moving=task_plan['are other objects moving'], 338 | object_size_multiplier=DEFAULT_OBJECT_SIZE_MULTIPLIER 339 | ) 340 | 341 | return question, answer, options, video_metadata 342 | -------------------------------------------------------------------------------- /tma/imageqa/tabletop_3d/run_blender.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import math 4 | import os 5 | import sys 6 | import urllib.request 7 | from math import radians 8 | 9 | try: 10 | import bpy 11 | from mathutils import Vector, Matrix, Quaternion, Euler 12 | except ImportError: 13 | pass 14 | 15 | 16 | def rotate(obj, degree): 17 | """Rotates around the z axis by theta""" 18 | degree = -degree 19 | bpy.ops.object.select_all(action='DESELECT') 20 | obj.select_set(True) 21 | bpy.context.view_layer.objects.active = obj 22 | radian = radians(degree) 23 | bpy.context.object.rotation_mode = 'XYZ' 24 | rot_x, rot_y, rot_z = obj.rotation_euler 25 | obj.rotation_euler = Euler((rot_x, rot_y, rot_z + radian)) 26 | freeze_transformation(obj) 27 | 28 | 29 | def reset_scene(): 30 | # delete everything that isn't part of a camera or a light 31 | bpy.ops.object.select_all(action="SELECT") 32 | for obj in bpy.data.objects: 33 | bpy.data.objects.remove(obj, do_unlink=True) 34 | bpy.ops.ptcache.free_bake_all() 35 | 36 | 37 | def select_hierarchy(obj): 38 | """Recursively select an object and all of its descendants.""" 39 | obj.select_set(True) 40 | for child in obj.children: 41 | select_hierarchy(child) 42 | 43 | 44 | def load_object(object_path: str) -> None: 45 | """Loads a glb model into the scene.""" 46 | bpy.ops.object.select_all(action='DESELECT') 47 | if object_path.endswith(".glb"): 48 | bpy.ops.import_scene.gltf(filepath=object_path, merge_vertices=True) 49 | elif object_path.endswith(".fbx"): 50 | bpy.ops.import_scene.fbx(filepath=object_path) 51 | else: 52 | raise ValueError(f"Unsupported file type: {object_path}") 53 | 54 | base_name = os.path.basename(object_path) 55 | object_name, _ = os.path.splitext(base_name) 56 | bpy.context.view_layer.objects.active.name = object_name 57 | bpy.ops.object.select_all(action='DESELECT') 58 | 59 | obj = bpy.data.objects.get(object_name) 60 | # bpy.context.view_layer.objects.active = obj 61 | select_hierarchy(obj) 62 | bpy.ops.object.transform_apply(location=True, rotation=True, scale=True) 63 | meshes = 
[obj for obj in bpy.context.selected_objects if obj.type == "MESH"] 64 | non_meshes = [obj for obj in bpy.context.selected_objects if obj.type != "MESH"] 65 | bpy.ops.object.select_all(action="DESELECT") 66 | 67 | # delete non-mesh and consolidate 68 | 69 | for obj in non_meshes: 70 | obj.select_set(True) 71 | bpy.ops.object.delete() 72 | bpy.ops.object.select_all(action="DESELECT") 73 | for obj in meshes: 74 | obj.select_set(True) 75 | bpy.context.view_layer.objects.active = meshes[0] 76 | bpy.ops.object.join() 77 | bpy.context.view_layer.objects.active.name = object_name 78 | bpy.ops.object.origin_set(type='GEOMETRY_ORIGIN', center='BOUNDS') 79 | bpy.ops.object.select_all(action="DESELECT") 80 | 81 | return object_name 82 | 83 | 84 | def scene_meshes(): 85 | for obj in bpy.context.scene.objects.values(): 86 | if isinstance(obj.data, (bpy.types.Mesh)): 87 | yield obj 88 | 89 | 90 | def download_uid(uid_path, save_dir): 91 | return download_object(uid_path, save_dir) 92 | 93 | 94 | def download_object(object_url, save_dir) -> str: 95 | """Download the object and return the path.""" 96 | # uid = uuid.uuid4() 97 | uid = object_url.split("/")[-1].split(".")[0] 98 | tmp_local_path = os.path.join(save_dir, f"{uid}.glb" + ".tmp") 99 | local_path = os.path.join(save_dir, f"{uid}.glb") 100 | # wget the file and put it in local_path 101 | os.makedirs(os.path.dirname(tmp_local_path), exist_ok=True) 102 | urllib.request.urlretrieve(object_url, tmp_local_path) 103 | os.rename(tmp_local_path, local_path) 104 | # get the absolute path 105 | local_path = os.path.abspath(local_path) 106 | return local_path 107 | 108 | 109 | def scene_bbox(single_obj=None, ignore_matrix=False): 110 | bbox_min = (math.inf,) * 3 111 | bbox_max = (-math.inf,) * 3 112 | found = False 113 | for obj in scene_meshes() if single_obj is None else [single_obj]: 114 | found = True 115 | for coord in obj.bound_box: 116 | coord = Vector(coord) 117 | if not ignore_matrix: 118 | coord = obj.matrix_world @ coord 119 | bbox_min = tuple(min(x, y) for x, y in zip(bbox_min, coord)) 120 | bbox_max = tuple(max(x, y) for x, y in zip(bbox_max, coord)) 121 | if not found: 122 | raise RuntimeError("no objects in scene to compute bounding box for") 123 | return Vector(bbox_min), Vector(bbox_max) 124 | 125 | 126 | def scene_root_objects(): 127 | for obj in bpy.context.scene.objects.values(): 128 | if not obj.parent: 129 | yield obj 130 | 131 | 132 | def freeze_transformation(obj): 133 | bpy.context.view_layer.objects.active = obj 134 | obj.select_set(True) 135 | bpy.ops.object.transform_apply(location=True, rotation=True, scale=True) 136 | bpy.ops.object.select_all(action='DESELECT') 137 | 138 | 139 | def scale(obj, scale_factor): 140 | bpy.ops.object.select_all(action='DESELECT') 141 | obj.select_set(True) 142 | bpy.ops.transform.resize(value=(scale_factor, scale_factor, scale_factor)) 143 | bpy.ops.object.select_all(action='DESELECT') 144 | freeze_transformation(obj) 145 | 146 | 147 | def get_3d_dimensions(obj): 148 | # pdb.set_trace() 149 | max_x, max_y, max_z = float("-inf"), float("-inf"), float("-inf") 150 | min_x, min_y, min_z = float("inf"), float("inf"), float("inf") 151 | 152 | for vertex in obj.data.vertices: 153 | v_world = obj.matrix_world @ vertex.co 154 | max_x, max_y, max_z = max(max_x, v_world.x), max(max_y, v_world.y), max(max_z, v_world.z) 155 | min_x, min_y, min_z = min(min_x, v_world.x), min(min_y, v_world.y), min(min_z, v_world.z) 156 | 157 | return (max_x - min_x, max_y - min_y, max_z - min_z) 158 | 159 | 160 | def 
normalize_object(obj, factor=1.0): 161 | max_dimension = max(get_3d_dimensions(obj)) 162 | scale_factor = factor * (1 / max_dimension) 163 | scale(obj, scale_factor) 164 | 165 | 166 | def move_to_xy(obj, x, y): 167 | min_z = float('inf') 168 | for vertex in obj.data.vertices: 169 | z = obj.matrix_world @ vertex.co 170 | min_z = min(min_z, z.z) 171 | obj.location -= Vector((0, 0, min_z)) 172 | freeze_transformation(obj) 173 | 174 | # move location x,y to sampled box center 175 | new_location = Vector((x, y, obj.location[2])) 176 | obj.location = new_location 177 | freeze_transformation(obj) 178 | 179 | 180 | def normalize_scene(): 181 | bbox_min, bbox_max = scene_bbox() 182 | scale = 1 / max(bbox_max - bbox_min) 183 | for obj in scene_root_objects(): 184 | obj.scale = obj.scale * scale 185 | # Apply scale to matrix_world. 186 | bpy.context.view_layer.update() 187 | bbox_min, bbox_max = scene_bbox() 188 | offset = -(bbox_min + bbox_max) / 2 189 | for obj in scene_root_objects(): 190 | obj.matrix_world.translation += offset 191 | bpy.ops.object.select_all(action="DESELECT") 192 | 193 | 194 | def setup_plane_and_background(plane_texture_path, hdri_path): 195 | # load plane 196 | plane_name = load_object(plane_texture_path) 197 | plane = bpy.data.objects.get(plane_name) 198 | scale(plane, 0.5) 199 | 200 | # load light map 201 | print(f"HDRI PATH: {hdri_path}") 202 | bpy.ops.image.open(filepath=hdri_path) 203 | if bpy.data.worlds.get("World") is None: 204 | bpy.data.worlds.new("World") 205 | 206 | bpy.context.scene.world = bpy.data.worlds["World"] 207 | 208 | bpy.context.scene.world.use_nodes = True 209 | tree = bpy.context.scene.world.node_tree 210 | tree.nodes.clear() 211 | 212 | tex_env = tree.nodes.new(type="ShaderNodeTexEnvironment") 213 | tex_env.image = bpy.data.images[hdri_path.split('/')[-1]] # Image name is typically the last part of the path 214 | background = tree.nodes.new(type="ShaderNodeBackground") 215 | output = tree.nodes.new(type="ShaderNodeOutputWorld") 216 | 217 | tree.links.new(tex_env.outputs[0], background.inputs[0]) 218 | tree.links.new(background.outputs[0], output.inputs[0]) 219 | 220 | return plane_texture_path + " " + hdri_path 221 | 222 | 223 | def setup_camera_and_lights( 224 | sun_x, 225 | sun_y, 226 | sun_energy, 227 | key_light_horizontal_angle, 228 | fill_light_horizontal_angle, 229 | key_light_vertical_angle, 230 | fill_light_vertical_angle 231 | ): 232 | # for seeting up the three point lighting, we mostly follow https://courses.cs.washington.edu/courses/cse458/05au/reading/3point_lighting.pdf 233 | # in order to keep lights and camera on the hemisphere pointing to origin, we use a hierarchy of empties 234 | 235 | # create the sun 236 | 237 | bpy.ops.object.light_add(type="SUN") 238 | sun = bpy.context.active_object 239 | sun.rotation_euler = Euler((sun_x, sun_y, 0), "XYZ") 240 | sun.data.energy = sun_energy 241 | 242 | # create global empty 243 | 244 | bpy.ops.object.empty_add(type='PLAIN_AXES', align='WORLD', location=(0, 0, 0), scale=(1, 1, 1)) 245 | x_rot, y_rot, z_rot = radians(90), radians(0), radians(-90) 246 | empty = bpy.context.scene.objects.get("Empty") 247 | 248 | # create camera 249 | 250 | # radius = random.uniform(1.8,2.2) 251 | radius = 2.5 252 | 253 | bpy.ops.object.camera_add(enter_editmode=False, align='VIEW', location=(-radius, 0, 0), rotation=Euler((x_rot, y_rot, z_rot), "XYZ"), scale=(1, 1, 1)) 254 | cam = bpy.context.scene.objects.get("Camera") 255 | cam.data.lens = 35 256 | cam.data.sensor_width = 32 257 | bpy.context.scene.camera = 
cam 258 | 259 | # create camera empty 260 | 261 | bpy.ops.object.empty_add(type='PLAIN_AXES', align='WORLD', location=(0, 0, 0), scale=(1, 1, 1)) 262 | x_rot, y_rot, z_rot = radians(90), radians(0), radians(-90) 263 | cam_empty = bpy.context.scene.objects.get("Empty.001") 264 | cam_empty.name = "camera_empty" 265 | 266 | # make camera empty parent of camera 267 | 268 | bpy.ops.object.select_all(action='DESELECT') 269 | cam.select_set(True) 270 | cam_empty.select_set(True) 271 | bpy.context.view_layer.objects.active = cam_empty 272 | bpy.ops.object.parent_set() 273 | bpy.ops.object.select_all(action='DESELECT') 274 | 275 | # make camera empty parent of global empty 276 | 277 | bpy.ops.object.select_all(action='DESELECT') 278 | cam_empty.select_set(True) 279 | empty.select_set(True) 280 | bpy.context.view_layer.objects.active = empty 281 | bpy.ops.object.parent_set() 282 | bpy.ops.object.select_all(action='DESELECT') 283 | 284 | light_names = ["key_light", "fill_light", "back_light"] 285 | light_energies = [1000., 300., 500.] 286 | 287 | for light_name, light_energy in zip(light_names, light_energies): 288 | # create light empty 289 | 290 | empty_name = light_name + "_empty" 291 | bpy.ops.object.empty_add(type='PLAIN_AXES', align='WORLD', location=(0, 0, 0), scale=(1, 1, 1)) 292 | x_rot, y_rot, z_rot = radians(90), radians(0), radians(-90) 293 | light_empty = bpy.context.scene.objects.get("Empty.001") 294 | light_empty.name = empty_name 295 | 296 | # parent light empty to main (camera) empty 297 | 298 | bpy.ops.object.select_all(action='DESELECT') 299 | light_empty.select_set(True) 300 | empty.select_set(True) 301 | bpy.context.view_layer.objects.active = empty 302 | bpy.ops.object.parent_set() 303 | bpy.ops.object.select_all(action='DESELECT') 304 | 305 | # create light 306 | 307 | x_loc, y_loc, z_loc = -radius, 0, 0 308 | bpy.ops.object.light_add(type='POINT', radius=1, align='WORLD', location=(x_loc, y_loc, z_loc), rotation=Euler((x_rot, y_rot, z_rot), "XYZ"), scale=(1, 1, 1)) 309 | bpy.data.objects["Point"].name = light_name 310 | light = bpy.data.objects[light_name] 311 | light.data.energy = light_energy 312 | # light.data.size = 0.5 313 | 314 | # parent light empty to light 315 | 316 | bpy.ops.object.select_all(action='DESELECT') 317 | light.select_set(True) 318 | light_empty.select_set(True) 319 | bpy.context.view_layer.objects.active = light_empty 320 | bpy.ops.object.parent_set() 321 | bpy.ops.object.select_all(action='DESELECT') 322 | 323 | # rotate camera and lights around the z-axis 324 | 325 | z_random_rot = radians(90) # radians(random.uniform(0,360)) 326 | empty.rotation_euler = Euler((0, 0, z_random_rot)) 327 | 328 | # # raise the camera while having it point to origin 329 | 330 | # cam_y_random_rot = radians(random.uniform(10,50)) 331 | # cam_empty.rotation_euler = Euler((0,cam_y_random_rot,0),"XYZ") 332 | 333 | bpy.context.view_layer.update() 334 | 335 | back_light_horizontal_angle = radians(180) 336 | light_horizontal_angles = [key_light_horizontal_angle, fill_light_horizontal_angle, back_light_horizontal_angle] 337 | for light_angle, light_name in zip(light_horizontal_angles, light_names): 338 | light_empty = bpy.data.objects[light_name + "_empty"] 339 | global_z = (light_empty.matrix_world.inverted() @ Vector((0.0, 0.0, 1.0, 0.0)))[:3] 340 | quat = Quaternion(global_z, light_angle) 341 | light_empty.rotation_euler = quat.to_euler() 342 | 343 | back_light_vertical_angle = 0 344 | light_vertical_angles = [key_light_vertical_angle, fill_light_vertical_angle, 
back_light_vertical_angle] 345 | # light_vertical_angles = [radians(-45)]*3 346 | 347 | for light_angle, light_name in zip(light_vertical_angles, light_names): 348 | light_empty = bpy.data.objects[light_name + "_empty"] 349 | global_x = (light_empty.matrix_world.inverted() @ Vector((1.0, 0.0, 0.0, 0.0)))[:3] 350 | quat = Quaternion(global_x, light_angle) 351 | euler_add = quat.to_euler() 352 | euler_current = light_empty.rotation_euler 353 | new_euler = Euler((euler_add[0] + euler_current[0], euler_add[1] + euler_current[1], euler_add[2] + euler_current[2])) 354 | light_empty.rotation_euler = new_euler 355 | 356 | # bpy.context.view_layer.update() 357 | 358 | return cam, empty 359 | 360 | 361 | def render(fp): 362 | # Render image 363 | bpy.context.view_layer.update() 364 | bpy.context.scene.render.filepath = fp 365 | bpy.ops.render.render(write_still=True) 366 | 367 | 368 | def setup_renderer(H, W, use_cpu=False): 369 | scene = bpy.context.scene 370 | render = bpy.context.scene.render 371 | 372 | render.engine = "CYCLES" 373 | render.image_settings.file_format = "PNG" 374 | render.image_settings.color_mode = "RGBA" 375 | render.resolution_x = W 376 | render.resolution_y = H 377 | render.resolution_percentage = 100 378 | 379 | scene.cycles.device = "CPU" if use_cpu else "GPU" 380 | scene.cycles.samples = 10 if use_cpu else 128 381 | scene.cycles.diffuse_bounces = 1 382 | scene.cycles.glossy_bounces = 1 383 | scene.cycles.transparent_max_bounces = 3 384 | scene.cycles.transmission_bounces = 3 385 | scene.cycles.filter_width = 0.01 386 | scene.cycles.use_denoising = True 387 | scene.render.film_transparent = False 388 | 389 | bpy.context.preferences.addons["cycles"].preferences.get_devices() 390 | # Set the device_type 391 | bpy.context.preferences.addons[ 392 | "cycles" 393 | ].preferences.compute_device_type = "METAL" if use_cpu else "CUDA" 394 | bpy.context.scene.view_settings.view_transform = 'Filmic' 395 | 396 | 397 | # def randomize_camera_view(axis): 398 | # euler_y = radians(random.uniform(-90, 90)) 399 | # euler_z = radians(random.uniform(0, 360)) 400 | # axis.rotation_euler = Euler((0, euler_y, euler_z)) 401 | 402 | 403 | def run_render(metadata, save_image_path, use_cpu): 404 | reset_scene() 405 | 406 | objs = [] 407 | for uid in metadata["objects"]: 408 | object_path = metadata["object_path"][uid] 409 | objs.append(bpy.data.objects.get(load_object(object_path))) 410 | 411 | grid_number = metadata["grid number"] 412 | 413 | if grid_number == 2: 414 | locations = { 415 | 0: [0.7, 0.5], 416 | 1: [0.7, -0.5], 417 | 2: [-0.6, 0.5], 418 | 3: [-0.6, -0.5] 419 | } 420 | scale_factor = 1 / 2 421 | elif grid_number == 3: 422 | locations = { 423 | 0: [0.9, 0.6], 424 | 1: [0.9, 0], 425 | 2: [0.9, -0.6], 426 | 3: [0.0, 0.6], 427 | 4: [0.0, 0.0], 428 | 5: [0.0, -0.6], 429 | 6: [-0.9, 0.6], 430 | 7: [-0.9, 0.0], 431 | 8: [-0.9, -0.6] 432 | } 433 | scale_factor = 1 / 3 434 | else: 435 | raise ValueError(f"Expected grid number to be 2 or 3 but got {grid_number}") 436 | 437 | # process rotate 438 | for idx, obj in enumerate(objs): 439 | rotate(obj, degree=metadata['object_angles'][idx]) 440 | 441 | # process scale 442 | if "sizes" in metadata: 443 | for idx, obj in enumerate(objs): 444 | normalize_object(obj, factor=metadata['sizes'][idx] * scale_factor) 445 | else: 446 | for obj in objs: 447 | normalize_object(obj, factor=scale_factor) 448 | 449 | for pos, obj in zip(metadata["grids"], objs): 450 | x, y = locations[pos] 451 | move_to_xy(obj, x, y) 452 | 453 | blender_config = 
metadata["blender_config"] 454 | 455 | setup_plane_and_background(blender_config["plane_texture_path"], blender_config["hdri_path"]) 456 | cam, axis = setup_camera_and_lights( 457 | blender_config["sun_x"], 458 | blender_config["sun_y"], 459 | blender_config["sun_energy"], 460 | blender_config["key_light_horizontal_angle"], 461 | blender_config["fill_light_horizontal_angle"], 462 | blender_config["key_light_vertical_angle"], 463 | blender_config["fill_light_vertical_angle"] 464 | ) 465 | axis.rotation_euler = Euler((0, radians(45), 0)) 466 | setup_renderer(H=metadata["H"], W=metadata["W"], use_cpu=use_cpu) 467 | render(save_image_path) 468 | 469 | 470 | if __name__ == "__main__": 471 | parser = argparse.ArgumentParser() 472 | parser.add_argument( 473 | "--save_local", 474 | type=str, 475 | default="" 476 | ) 477 | parser.add_argument( 478 | "--save_image_path", 479 | type=str, 480 | default="render.png" 481 | ) 482 | parser.add_argument( 483 | "--json_file", 484 | type=str, 485 | default="image_metadata.json" 486 | ) 487 | 488 | parser.add_argument( 489 | "--use_cpu", 490 | action="store_true", 491 | default=False 492 | ) 493 | 494 | argv = sys.argv[sys.argv.index("--") + 1:] 495 | args = parser.parse_args(argv) 496 | 497 | with open(args.json_file, "r") as f: 498 | metadata = json.load(f) 499 | 500 | run_render(metadata, args.save_image_path, args.use_cpu) 501 | -------------------------------------------------------------------------------- /tma/imageqa/scene_graph/single_image_task.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import json 3 | from itertools import combinations 4 | from typing import List, Tuple 5 | 6 | import numpy as np 7 | from PIL import Image 8 | from tqdm import tqdm 9 | 10 | from ..metadata import SceneGraphMetaData 11 | from ...base import TaskGenerator 12 | from ...task_store import TaskStore 13 | 14 | 15 | def scene_graph_adjacent_objects(scene_graph, node): 16 | adjacent_objects = {} 17 | for edge in scene_graph["objects"][node]['relations']: 18 | obj = edge['object'] 19 | if obj not in adjacent_objects: 20 | adjacent_objects[obj] = [] 21 | adjacent_objects[obj].append((edge['name'], 0)) 22 | 23 | for obj, edges in scene_graph["objects"].items(): 24 | for edge in edges["relations"]: 25 | if edge['object'] == node: 26 | if obj not in adjacent_objects: 27 | adjacent_objects[obj] = [] 28 | adjacent_objects[obj].append((edge['name'], 1)) 29 | return adjacent_objects 30 | 31 | 32 | def subgraph_to_json_str(subgraph, scene_graph): 33 | subgraph_json = { 34 | "attributes" : [], 35 | "adjacent_objects": [], 36 | } 37 | adjacent_object_info = {} 38 | for element in subgraph: 39 | if isinstance(element, str): 40 | subgraph_json["attributes"].append(element) 41 | else: 42 | if len(element) == 2: 43 | obj, attr = element 44 | if obj not in adjacent_object_info: 45 | adjacent_object_info[obj] = { 46 | "attributes": [attr], 47 | "relation" : None 48 | } 49 | else: 50 | adjacent_object_info[obj]["attributes"].append(attr) 51 | else: 52 | obj, rel, direction = element 53 | if obj not in adjacent_object_info: 54 | adjacent_object_info[obj] = { 55 | "attributes": [], 56 | "relation" : (rel, direction) 57 | } 58 | else: 59 | adjacent_object_info[obj]["relation"] = (rel, direction) 60 | 61 | for obj, info in adjacent_object_info.items(): 62 | subgraph_json["adjacent_objects"].append({ 63 | "object" : scene_graph["objects"][obj]["name"], 64 | "attributes": sorted(info["attributes"]), 65 | "relation" : info["relation"] 
66 | }) 67 | subgraph_json["attributes"] = sorted(subgraph_json["attributes"]) 68 | subgraph_json["adjacent_objects"] = sorted(subgraph_json["adjacent_objects"], key=lambda x: json.dumps(x)) 69 | return json.dumps(subgraph_json) 70 | 71 | 72 | def constrained_combinations(n, k, constraints): 73 | """ 74 | Generate all combinations of k elements from n elements that satisfy the constraints 75 | :param n: 76 | :param k: 77 | :param constraints: a list of tuples (i, j) that means that when i is not selected, i+1 ~ j should not be selected 78 | :return: a binary array of shape (x, n) where each row represents a valid combination 79 | """ 80 | combo = np.array(list(combinations(range(n), k))) 81 | selection = np.zeros((len(combo), n), dtype=bool) 82 | selection[np.arange(len(combo))[:, None], combo] = 1 83 | for start, end in constraints: 84 | selection = selection[~((selection[:, start] == 0) & (np.any(selection[:, start + 1:end], axis=1)))] 85 | return selection 86 | 87 | 88 | def compose_parallel_phrase(phrases): 89 | if len(phrases) == 0: 90 | return "" 91 | elif len(phrases) == 1: 92 | return phrases[0] 93 | elif len(phrases) == 2: 94 | return f"{phrases[0]} and {phrases[1]}" 95 | else: 96 | phrases[-1] = "and " + phrases[-1] 97 | return ", ".join(phrases) 98 | 99 | 100 | def compose_attributed_name(attributes, name): 101 | if len(attributes) > 0: 102 | attributes = compose_parallel_phrase(attributes) 103 | return f"{attributes} {name}" 104 | else: 105 | return name 106 | 107 | 108 | @functools.lru_cache(maxsize=100000) 109 | def compose_object_reference(subgraph: str): 110 | subgraph = json.loads(subgraph) 111 | 112 | # Helper function to create relation phrases 113 | def create_relation_phrase(attributed_name, relation_name, is_forward=True): 114 | return f"is {relation_name} the {attributed_name}" if is_forward else f"the {attributed_name} is {relation_name}" 115 | 116 | # Process relations 117 | forward_relations, backward_relations = [], [] 118 | 119 | for idx, node in enumerate(subgraph['adjacent_objects']): 120 | rel = node['relation'] 121 | attributed_name = compose_attributed_name(node.get("attributes", []), node['object']) 122 | if rel[1] == 0: 123 | forward_relations.append(create_relation_phrase(attributed_name, rel[0], True)) 124 | else: 125 | backward_relations.append(create_relation_phrase(attributed_name, rel[0], False)) 126 | 127 | # Combine relations into reference string 128 | reference = "" 129 | if forward_relations: 130 | reference += compose_parallel_phrase(forward_relations) 131 | if backward_relations: 132 | if forward_relations: 133 | reference += ", and also, " 134 | reference += compose_parallel_phrase(backward_relations) 135 | return reference 136 | 137 | 138 | def subgraph_contain_multiple_same_direction_relations(subgraph): 139 | out_rel = False 140 | in_rel = False 141 | for item in subgraph: 142 | if len(item) == 3: 143 | if item[2] == 0: 144 | if out_rel: 145 | return True 146 | out_rel = True 147 | else: 148 | if in_rel: 149 | return True 150 | in_rel = True 151 | return False 152 | 153 | 154 | def subgraph_contain_multiple_relations(subgraph): 155 | rel = False 156 | for item in subgraph: 157 | if isinstance(item, tuple) and len(item) == 3: 158 | if rel: 159 | return True 160 | rel = True 161 | return False 162 | 163 | 164 | class SceneGraphTaskGenerator(TaskGenerator): 165 | metadata: SceneGraphMetaData 166 | 167 | embed_schema = [ 168 | "task type", 169 | "object", 170 | "attribute value", 171 | "attribute type", 172 | "relation", 173 | "source 
object", 174 | "target object" 175 | ] 176 | 177 | def __init__(self, metadata: SceneGraphMetaData, subgraph_size=4, n_subgraph_per_answer=1, max_scene_graph_size=10000, seed=42): 178 | super().__init__(metadata, seed=seed) 179 | self.subgraph_size = subgraph_size 180 | self.n_subgraph_per_answer = n_subgraph_per_answer 181 | self.max_scene_graph_size = max_scene_graph_size 182 | 183 | def _enumerate_subgraphs_w_object( 184 | self, 185 | scene_graph, 186 | start_node, 187 | exclude_attribute_type=None, 188 | exclude_object=[] 189 | ): 190 | 191 | stamp = [] 192 | elements = [ 193 | attr for attr in scene_graph["objects"][start_node]['attributes'] 194 | if exclude_attribute_type is None or self.metadata.get_attribute_type(attr) != exclude_attribute_type 195 | ] 196 | adjacent_objects = scene_graph_adjacent_objects(scene_graph, start_node) 197 | for obj in adjacent_objects: 198 | if obj not in exclude_object: 199 | start = len(elements) 200 | elements.append(obj) 201 | elements += [(obj, attr) for attr in scene_graph["objects"][obj]['attributes']] 202 | stamp.append((start, len(elements))) 203 | if len(elements) < self.subgraph_size: 204 | return [] 205 | 206 | # sample all subgraphs that contain the start node with the given size 207 | selection = constrained_combinations(len(elements), self.subgraph_size, stamp) 208 | 209 | # distinguish subgraphs with and without the objects 210 | with_object_mask = np.any(selection[:, [start for start, _ in stamp]], axis=1) 211 | subgraphs_w_objects = [[elements[i] for i in np.where(indices)[0]] for indices in selection[with_object_mask]] 212 | subgraphs_wo_objects = [[elements[i] for i in np.where(indices)[0]] for indices in selection[~with_object_mask]] 213 | 214 | # for subgraph with object, add its all possible relations to the start node 215 | for obj, rels in adjacent_objects.items(): 216 | new_subgraphs = [] 217 | for subgraph in subgraphs_w_objects: 218 | if obj in subgraph: 219 | obj_id = subgraph.index(obj) 220 | for rel, direction in rels: 221 | subgraph_rel = subgraph.copy() 222 | subgraph_rel[obj_id] = (obj, rel, direction) 223 | # remove subgraphs with multiple same-direction relations 224 | if not subgraph_contain_multiple_relations(subgraph_rel): 225 | new_subgraphs.append(subgraph_rel) 226 | else: 227 | new_subgraphs.append(subgraph) 228 | subgraphs_w_objects = new_subgraphs 229 | 230 | subgraph_json_strs = [subgraph_to_json_str(subgraph, scene_graph) 231 | for subgraph in subgraphs_w_objects + subgraphs_wo_objects] 232 | 233 | return set(subgraph_json_strs) 234 | 235 | def _task_plan_to_str(self, task_plan) -> str: 236 | t = [] 237 | for k, v in task_plan.items(): 238 | if k in self.embed_schema: 239 | assert isinstance(v, str) 240 | t.append(f'{k}: {v}') 241 | return '\n'.join(t) 242 | 243 | def _generate_task(self, task_plan) -> Tuple[str, str, List[str], str]: 244 | "(Abstract method) generate task" 245 | 246 | def generate(self, task_plan, return_data=True, seed=None): 247 | if seed is not None: 248 | self.rng = np.random.default_rng(seed=seed) 249 | 250 | question, answer, options, scene_graph_id = self._generate_task(task_plan) 251 | 252 | task = { 253 | "question" : question.replace("_", " "), 254 | "answer" : answer.replace("_", " "), 255 | "options" : [o.replace("_", " ") for o in options], 256 | "task_plan" : self._task_plan_to_str(task_plan), 257 | "scene_graph_id": scene_graph_id, 258 | 'image' : Image.open(self.metadata.get_image_path(scene_graph_id)) if return_data else None 259 | } 260 | return task 261 | 262 | 263 | 
class WhatObjectSceneGraphTaskGenerator(SceneGraphTaskGenerator): 264 | schema = { 265 | "task type" : "str", 266 | "object" : "str", 267 | "subgraph" : "str", 268 | "scene graph id": "str", 269 | "answers" : "list", 270 | } 271 | 272 | def enumerate_object_subgraphs(self, scene_graph): 273 | subgraph_to_objects = {} 274 | for object, info in scene_graph["objects"].items(): 275 | obj_name = info['name'] 276 | if self.metadata.check_object_in_category(obj_name): 277 | subgraphs = self._enumerate_subgraphs_w_object(scene_graph, object) 278 | # subgraphs = self.rng.choice(list(subgraphs), min(self.n_subgraph_per_answer, len(subgraphs)), replace=False) 279 | for subgraph in subgraphs: 280 | if subgraph not in subgraph_to_objects: 281 | subgraph_to_objects[subgraph] = set() 282 | subgraph_to_objects[subgraph].add(obj_name) 283 | return subgraph_to_objects 284 | 285 | def enumerate_task_plans(self, task_store: TaskStore): 286 | 287 | for scene_graph_id, scene_graph in tqdm(self.metadata.scene_graphs.items(), desc="enumerating [what object] task"): 288 | 289 | if len(scene_graph["objects"]) < self.max_scene_graph_size: 290 | subgraph_to_nodes = self.enumerate_object_subgraphs(scene_graph) 291 | 292 | for subgraph_str, nodes in subgraph_to_nodes.items(): 293 | answers = sorted(list(nodes)) 294 | for node in nodes: 295 | task_plan = { 296 | "task type" : "what object", 297 | "scene graph id": scene_graph_id, 298 | "subgraph" : subgraph_str, 299 | "object" : node, 300 | "answers" : answers, 301 | } 302 | task_store.add(task_plan) 303 | 304 | def _generate_task(self, task_plan): 305 | obj_reference = compose_object_reference(task_plan["subgraph"]) 306 | subgraph = json.loads(task_plan["subgraph"]) 307 | object = task_plan["object"] 308 | scene_graph_id = task_plan["scene graph id"] 309 | 310 | attributed_name = compose_attributed_name(subgraph.get("attributes", []), "object") 311 | 312 | if obj_reference != "": 313 | obj_reference = f" that {obj_reference}" 314 | question = f"What is the {attributed_name}{obj_reference}?" 
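        # Illustrative example (hypothetical subgraph): with subgraph attributes ["red"] and a
        # composed reference "is on the wooden table", the question renders as
        # "What is the red object that is on the wooden table?".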
315 | 316 | answer = object 317 | exclude_categories = [self.metadata.sg_object_to_cateid[obj] for obj in task_plan["answers"]] 318 | negative_objects = [self.metadata.get_surfacename(cateid) for cateid in self.metadata.get_irrelevant_categories(exclude_categories)] 319 | options = self._compose_options(answer, negative_objects) 320 | 321 | return question, answer, options, scene_graph_id 322 | 323 | 324 | class WhatAttributeSceneGraphTaskGenerator(SceneGraphTaskGenerator): 325 | schema = { 326 | "task type" : "str", 327 | "attribute type" : "str", 328 | "attribute value": "str", 329 | "subgraph" : "str", 330 | "scene graph id" : "str", 331 | "answers" : "list", 332 | } 333 | 334 | def enumerate_attribute_subgraphs(self, scene_graph): 335 | subgraph_to_nodes = {} 336 | for node, info in scene_graph["objects"].items(): 337 | for attr in info['attributes']: 338 | attr_type = self.metadata.get_attribute_type(attr) 339 | subgraphs = self._enumerate_subgraphs_w_object(scene_graph, node, exclude_attribute_type=attr_type) 340 | # subgraphs = self.rng.choice(list(subgraphs), min(self.n_subgraph_per_answer, len(subgraphs)), replace=False) 341 | for subgraph in subgraphs: 342 | if subgraph not in subgraph_to_nodes: 343 | subgraph_to_nodes[subgraph] = {} 344 | if attr_type not in subgraph_to_nodes[subgraph]: 345 | subgraph_to_nodes[subgraph][attr_type] = set() 346 | subgraph_to_nodes[subgraph][attr_type].add(attr) 347 | return subgraph_to_nodes 348 | 349 | def enumerate_task_plans(self, task_store: TaskStore): 350 | for scene_graph_id, scene_graph in tqdm(self.metadata.scene_graphs.items(), desc="enumerating [what attribute] task"): 351 | if len(scene_graph["objects"]) < self.max_scene_graph_size: 352 | 353 | subgraphs_to_attrs = self.enumerate_attribute_subgraphs(scene_graph) 354 | for subgraph_str, attributes in subgraphs_to_attrs.items(): 355 | for attribute_type, attribute_set in attributes.items(): 356 | answers = sorted(list(attribute_set)) 357 | for attribute in attribute_set: 358 | task_plan = { 359 | "task type" : "what attribute", 360 | "scene graph id" : scene_graph_id, 361 | "subgraph" : subgraph_str, 362 | "attribute value": attribute, 363 | "answers" : answers, 364 | "attribute type" : attribute_type 365 | } 366 | task_store.add(task_plan) 367 | 368 | def _generate_task(self, task_plan): 369 | 370 | obj_reference = compose_object_reference(task_plan["subgraph"]) 371 | subgraph = json.loads(task_plan["subgraph"]) 372 | 373 | scene_graph_id = task_plan["scene graph id"] 374 | attribute = task_plan["attribute value"] 375 | attribute_type = task_plan["attribute type"] 376 | 377 | attributed_name = compose_attributed_name(subgraph.get("attributes", []), "object") 378 | 379 | if obj_reference != "": 380 | obj_reference = f" that {obj_reference}" 381 | 382 | attribute_type_word = lambda x: "attribute value" if x == "other" else x 383 | question = f"What is the {attribute_type_word(attribute_type)} of the {attributed_name}{obj_reference}?" 
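        # Illustrative example (hypothetical values): for attribute type "color" the question may
        # read "What is the color of the object that is on the wooden table?"; the catch-all type
        # "other" is rendered as "attribute value" instead, e.g.
        # "What is the attribute value of the object that is on the wooden table?".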
384 | answer = attribute 385 | negative_attributes = list(set(self.metadata.type_to_attribute[attribute_type]) - set(task_plan["answers"])) 386 | options = self._compose_options(answer, negative_attributes) 387 | 388 | return question, answer, options, scene_graph_id 389 | 390 | 391 | class WhatRelationSceneGraphTaskGenerator(SceneGraphTaskGenerator): 392 | schema = { 393 | "task type" : "str", 394 | "relation" : "str", 395 | "source object" : "str", 396 | "target object" : "str", 397 | "source subgraph": "str", 398 | "target subgraph": "str", 399 | "scene graph id" : "str", 400 | "answers" : "list" 401 | 402 | } 403 | 404 | def enumerate_relation_subgraphs(self, scene_graph): 405 | subgraph_to_nodes_cnt = {} 406 | for node, info in scene_graph["objects"].items(): 407 | subgraphs = self._enumerate_subgraphs_w_object(scene_graph, node) 408 | for subgraph in subgraphs: 409 | if subgraph not in subgraph_to_nodes_cnt: 410 | subgraph_to_nodes_cnt[subgraph] = 0 411 | subgraph_to_nodes_cnt[subgraph] += 1 412 | 413 | relations = {} 414 | for node, info in scene_graph["objects"].items(): 415 | for rel in info['relations']: 416 | obj2 = rel['object'] 417 | if (node, obj2) not in relations: 418 | relations[(node, obj2)] = set() 419 | relations[(node, obj2)].add(rel['name']) 420 | 421 | subgraph_to_relation = {} 422 | for (obj1, obj2), rels in relations.items(): 423 | 424 | subgraphs1 = self._enumerate_subgraphs_w_object(scene_graph, obj1, exclude_object=[obj2]) 425 | subgraphs1 = [subgraph for subgraph in subgraphs1 if subgraph_to_nodes_cnt[subgraph] == 1] 426 | subgraphs1 = self.rng.choice(list(subgraphs1), min(self.n_subgraph_per_answer, len(subgraphs1)), replace=False) 427 | 428 | subgraphs2 = self._enumerate_subgraphs_w_object(scene_graph, obj2, exclude_object=[obj1]) 429 | subgraphs2 = [subgraph for subgraph in subgraphs2 if subgraph_to_nodes_cnt[subgraph] == 1] 430 | subgraphs2 = self.rng.choice(list(subgraphs2), min(self.n_subgraph_per_answer, len(subgraphs2)), replace=False) 431 | 432 | obj1_name = scene_graph["objects"][obj1]["name"] 433 | obj2_name = scene_graph["objects"][obj2]["name"] 434 | for subgraph1 in subgraphs1: 435 | for subgraph2 in subgraphs2: 436 | subgraph_to_relation[(subgraph1, subgraph2)] = (rels, obj1_name, obj2_name) 437 | 438 | return subgraph_to_relation 439 | 440 | def enumerate_task_plans(self, task_store: TaskStore): 441 | for scene_graph_id, scene_graph in tqdm(self.metadata.scene_graphs.items(), desc="enumerating [what relation] task"): 442 | if len(scene_graph["objects"]) < self.max_scene_graph_size: 443 | subgraphs_to_rels = self.enumerate_relation_subgraphs(scene_graph) 444 | for subgraph, (rels, obj1, obj2) in subgraphs_to_rels.items(): 445 | answers = sorted(list(rels)) 446 | for rel in rels: 447 | task_plan = { 448 | "task type" : "what relation", 449 | "relation" : rel, 450 | "source object" : obj1, 451 | "target object" : obj2, 452 | "scene graph id" : scene_graph_id, 453 | "source subgraph": subgraph[0], 454 | "target subgraph": subgraph[1], 455 | "answers" : answers, 456 | } 457 | task_store.add(task_plan) 458 | 459 | def _generate_task(self, task_plan): 460 | source_obj_reference = compose_object_reference(task_plan["source subgraph"]) 461 | target_obj_reference = compose_object_reference(task_plan["target subgraph"]) 462 | 463 | source_subgraph = json.loads(task_plan["source subgraph"]) 464 | target_subgraph = json.loads(task_plan["target subgraph"]) 465 | relation = task_plan["relation"] 466 | scene_graph_id = task_plan["scene graph id"] 467 | 468 | 
source_attributed_name = compose_attributed_name(source_subgraph.get("attributes", []), "object") 469 | target_attributed_name = compose_attributed_name(target_subgraph.get("attributes", []), "object") 470 | 471 | if source_obj_reference != "": 472 | source_obj_reference = f", which {source_obj_reference}" 473 | if target_obj_reference != "": 474 | target_obj_reference = f", which {target_obj_reference}" 475 | 476 | question = f"What is the relation from the {source_attributed_name}{source_obj_reference}, to the {target_attributed_name}{target_obj_reference}?" 477 | answer = relation 478 | negatives = list(set(self.metadata.relations) - set(task_plan["answers"])) 479 | options = self._compose_options(answer, negatives) 480 | 481 | return question, answer, options, scene_graph_id 482 | -------------------------------------------------------------------------------- /tma/imageqa/tabletop_3d/size_single_image_task.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | 3 | from .single_image_task import _3DGridTaskGenerator 4 | from .utils import relative_positions, reverse_relative_positions 5 | from ..metadata import Objaverse3DMetaData 6 | from ...constant import NUM_OPTIONS 7 | from ...task_store import TaskStore 8 | 9 | largest_size = 1.5 10 | smallest_size = 0.5 11 | all_size_options = set([0.5, 1.0, 1.5]) 12 | grid_options = [2] 13 | 14 | 15 | class Size3DGridTaskGenerator(_3DGridTaskGenerator): 16 | def __init__(self, metadata: Objaverse3DMetaData, seed=42): 17 | super().__init__(metadata, seed=seed) 18 | self.grid_options = grid_options 19 | 20 | def _make_image_metadata(self, grid_size, sizes, size_options, grids, queries, remaining_query=...): 21 | objects = [self.metadata.sample(self.rng, 1, "object", q) for q in queries] 22 | 23 | remaining_grids = [g for g in range(grid_size ** 2) if g not in grids] 24 | for _ in remaining_grids: 25 | uid = self.metadata.sample(self.rng, 1, "object", remaining_query) 26 | objects.append(uid) 27 | remaining_sizes = list(self.rng.choice(size_options, replace=True, size=len(remaining_grids))) 28 | 29 | object_path = {k: self.metadata.get_object_path(k) for k in objects} 30 | angles = [self.metadata.sample_object_angle(self.rng, obj) for obj in objects] 31 | 32 | image_metadata = { 33 | 'grid number' : grid_size, 34 | 'objects' : objects, 35 | 'object_path' : object_path, 36 | 'object_angles' : angles, 37 | 'grids' : grids + remaining_grids, 38 | 'blender_config': self.metadata.sample_blender_configuration(self.rng), 39 | 'sizes' : sizes + remaining_sizes 40 | } 41 | return image_metadata 42 | 43 | 44 | class WhatSize3DGridTaskGenerator(Size3DGridTaskGenerator): 45 | schema = { 46 | 'task type' : 'str', 47 | 'size' : 'str', 48 | 'grid number' : 'int', 49 | 'target category' : 'str', 50 | 'absolute position': 'str', 51 | 'attribute type' : 'str', 52 | 'attribute value' : 'str', 53 | } 54 | 55 | def enumerate_task_plans(self, task_store: TaskStore): 56 | for target_category in tqdm(self.metadata.categories, desc="enumerating [what size] task"): 57 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 58 | for attribute_type, attribute_values in attribute_dict.items(): 59 | for attribute_value in attribute_values: 60 | for grid_size in self.grid_options: 61 | for absolute_pos in self.grid_mappings[grid_size]: 62 | task_plan = { 63 | 'task type' : 'what size', 64 | 'size' : 'largest', 65 | 'grid number' : grid_size, 66 | 'target category' : target_category, 67 | 'absolute 
position': absolute_pos, 68 | 'attribute type' : attribute_type, 69 | 'attribute value' : attribute_value, 70 | } 71 | task_store.add(task_plan) 72 | 73 | task_plan = { 74 | 'task type' : 'what size', 75 | 'size' : 'smallest', 76 | 'grid number' : grid_size, 77 | 'target category' : target_category, 78 | 'absolute position': absolute_pos, 79 | 'attribute type' : attribute_type, 80 | 'attribute value' : attribute_value, 81 | } 82 | task_store.add(task_plan) 83 | 84 | for grid_size in self.grid_options: 85 | for absolute_pos in self.grid_mappings[grid_size]: 86 | task_plan = { 87 | 'task type' : 'what size', 88 | 'size' : 'largest', 89 | 'grid number' : grid_size, 90 | 'target category' : target_category, 91 | 'absolute position': absolute_pos, 92 | } 93 | task_store.add(task_plan) 94 | 95 | task_plan = { 96 | 'task type' : 'what size', 97 | 'size' : 'smallest', 98 | 'grid number' : grid_size, 99 | 'target category' : target_category, 100 | 'absolute position': absolute_pos, 101 | } 102 | task_store.add(task_plan) 103 | 104 | def _generate_task(self, task_plan): 105 | grid_size = task_plan['grid number'] 106 | target_category = task_plan['target category'] 107 | absolute_pos = task_plan['absolute position'] 108 | grids = [self.grid_mappings[grid_size][absolute_pos]] 109 | 110 | if task_plan['size'] == 'largest': 111 | sizes = [largest_size] 112 | size_options = list(all_size_options - {largest_size}) 113 | question = f"What is the largest object in the image?" 114 | else: 115 | sizes = [smallest_size] 116 | size_options = list(all_size_options - {smallest_size}) 117 | question = f"What is the smallest object in the image?" 118 | 119 | queries = [self._get_target_object_query(task_plan)] 120 | 121 | remaining_query = self.metadata.and_query([("category", target_category, False)]) 122 | 123 | image_metadata = self._make_image_metadata( 124 | grid_size, 125 | sizes, 126 | size_options, 127 | grids, 128 | queries=queries, 129 | remaining_query=remaining_query, 130 | ) 131 | 132 | answer = self.metadata.get_surfacename(target_category) 133 | negatives = [self.metadata.get_surfacename(self.metadata.sample_category_for_object(self.rng, o, target_category)) 134 | for o in image_metadata['objects'][1:]] 135 | options = self._compose_options(answer, negatives) 136 | 137 | return question, answer, options, image_metadata 138 | 139 | 140 | class WhatAttributeSize3DGridTaskGenerator(Size3DGridTaskGenerator): 141 | schema = { 142 | 'task type' : 'str', 143 | 'size' : 'str', 144 | 'grid number' : 'int', 145 | 'target category' : 'str', 146 | 'absolute position': 'str', 147 | 'attribute type' : 'str', 148 | 'attribute value' : 'str', 149 | } 150 | 151 | def enumerate_task_plans(self, task_store: TaskStore): 152 | for target_category in tqdm(self.metadata.categories, desc="enumerating [what size attribute] task"): 153 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 154 | for attribute_type, attribute_values in attribute_dict.items(): 155 | for attribute_value in attribute_values: 156 | for grid_size in self.grid_options: 157 | for absolute_pos in self.grid_mappings[grid_size]: 158 | task_plan = { 159 | 'task type' : 'what attribute size', 160 | 'size' : 'largest', 161 | 'grid number' : grid_size, 162 | 'target category' : target_category, 163 | 'absolute position': absolute_pos, 164 | 'attribute type' : attribute_type, 165 | 'attribute value' : attribute_value, 166 | } 167 | task_store.add(task_plan) 168 | 169 | task_plan = { 170 | 'task type' : 'what attribute size', 171 | 
'size' : 'smallest', 172 | 'grid number' : grid_size, 173 | 'target category' : target_category, 174 | 'absolute position': absolute_pos, 175 | 'attribute type' : attribute_type, 176 | 'attribute value' : attribute_value, 177 | } 178 | task_store.add(task_plan) 179 | 180 | def _generate_task(self, task_plan): 181 | grid_size = task_plan['grid number'] 182 | 183 | attribute_type = task_plan['attribute type'] 184 | 185 | absolute_pos = task_plan['absolute position'] 186 | grids = [self.grid_mappings[grid_size][absolute_pos]] 187 | 188 | queries = [self._get_target_object_query(task_plan)] 189 | 190 | if task_plan['size'] == 'largest': 191 | sizes = [largest_size] 192 | size_options = list(all_size_options - {largest_size}) 193 | question = f"What is the {attribute_type} of the largest object in the image?" 194 | else: 195 | sizes = [smallest_size] 196 | size_options = list(all_size_options - {smallest_size}) 197 | question = f"What is the {attribute_type} of the smallest object in the image?" 198 | 199 | image_metadata = self._make_image_metadata( 200 | grid_size, 201 | sizes, 202 | size_options, 203 | grids, 204 | queries=queries, 205 | ) 206 | 207 | answer = task_plan['attribute value'] 208 | target_object = image_metadata['objects'][0] 209 | negative_query = self.metadata.and_query([ 210 | (attribute_type, a, False) for a in self.metadata.query_metadata(attribute_type, self.metadata.and_query([("object", target_object, True)])) 211 | ]) 212 | negatives = self.metadata.sample( 213 | self.rng, 214 | NUM_OPTIONS - 1, 215 | attribute_type, 216 | query=negative_query, 217 | ) 218 | options = [answer] + negatives 219 | 220 | return question, answer, options, image_metadata 221 | 222 | 223 | class WhereSize3DGridTaskGenerator(Size3DGridTaskGenerator): 224 | schema = { 225 | 'task type' : 'str', 226 | 'size' : 'str', 227 | 'grid number' : 'int', 228 | 'target category' : 'str', 229 | 'absolute position' : 'str', 230 | 'attribute type' : 'str', 231 | 'attribute value' : 'str', 232 | 'reference category' : 'str', 233 | 'reference position' : 'str', 234 | 'target-reference order': 'str' 235 | } 236 | 237 | def __init__(self, metadata: Objaverse3DMetaData, seed=42): 238 | super().__init__(metadata, seed=seed) 239 | self.relative_positions = relative_positions 240 | 241 | def enumerate_task_plans(self, task_store: TaskStore): 242 | for target_category in tqdm(self.metadata.categories, desc="enumerating [where size] task"): 243 | irrelevant_categories = self.metadata.get_irrelevant_categories(target_category) 244 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 245 | for attribute_type, attribute_values in attribute_dict.items(): 246 | for attribute_value in attribute_values: 247 | for grid_size in self.grid_options: 248 | for absolute_pos in self.grid_mappings[grid_size]: 249 | task_plan = { 250 | 'task type' : 'where size', 251 | 'size' : 'largest', 252 | 'grid number' : grid_size, 253 | 'target category' : target_category, 254 | 'absolute position': absolute_pos, 255 | 'attribute type' : attribute_type, 256 | 'attribute value' : attribute_value, 257 | } 258 | task_store.add(task_plan) 259 | 260 | task_plan = { 261 | 'task type' : 'where size', 262 | 'size' : 'smallest', 263 | 'grid number' : grid_size, 264 | 'target category' : target_category, 265 | 'absolute position': absolute_pos, 266 | 'attribute type' : attribute_type, 267 | 'attribute value' : attribute_value, 268 | } 269 | task_store.add(task_plan) 270 | 271 | grid = self.grid_mappings[grid_size][absolute_pos] 
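# For every irrelevant reference category and every relative position that stays on the
# board, the loop below emits four plans: {largest, smallest} x {target first, reference first}.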
272 | 273 | for reference_category in irrelevant_categories: 274 | for reference_pos in self.relative_positions: 275 | ref_grid = self._relative_grid(grid_size, grid, reference_pos) 276 | if ref_grid >= 0: 277 | task_plan = { 278 | 'task type' : 'where size', 279 | 'size' : 'largest', 280 | 'grid number' : grid_size, 281 | 'target category' : target_category, 282 | 'absolute position' : absolute_pos, 283 | 'reference category' : reference_category, 284 | 'reference position' : reference_pos, 285 | 'attribute type' : attribute_type, 286 | 'attribute value' : attribute_value, 287 | 'target-reference order': 'target first' 288 | } 289 | task_store.add(task_plan) 290 | 291 | task_plan = { 292 | 'task type' : 'where size', 293 | 'size' : 'largest', 294 | 'grid number' : grid_size, 295 | 'target category' : target_category, 296 | 'absolute position' : absolute_pos, 297 | 'reference category' : reference_category, 298 | 'reference position' : reference_pos, 299 | 'attribute type' : attribute_type, 300 | 'attribute value' : attribute_value, 301 | 'target-reference order': 'reference first' 302 | } 303 | task_store.add(task_plan) 304 | 305 | task_plan = { 306 | 'task type' : 'where size', 307 | 'size' : 'smallest', 308 | 'grid number' : grid_size, 309 | 'target category' : target_category, 310 | 'absolute position' : absolute_pos, 311 | 'reference category' : reference_category, 312 | 'reference position' : reference_pos, 313 | 'attribute type' : attribute_type, 314 | 'attribute value' : attribute_value, 315 | 'target-reference order': 'target first' 316 | } 317 | task_store.add(task_plan) 318 | 319 | task_plan = { 320 | 'task type' : 'where size', 321 | 'size' : 'smallest', 322 | 'grid number' : grid_size, 323 | 'target category' : target_category, 324 | 'absolute position' : absolute_pos, 325 | 'reference category' : reference_category, 326 | 'reference position' : reference_pos, 327 | 'attribute type' : attribute_type, 328 | 'attribute value' : attribute_value, 329 | 'target-reference order': 'reference first' 330 | } 331 | task_store.add(task_plan) 332 | 333 | for grid_size in self.grid_options: 334 | for absolute_pos in self.grid_mappings[grid_size]: 335 | task_plan = { 336 | 'task type' : 'where size', 337 | 'size' : 'largest', 338 | 'grid number' : grid_size, 339 | 'target category' : target_category, 340 | 'absolute position': absolute_pos, 341 | } 342 | task_store.add(task_plan) 343 | 344 | task_plan = { 345 | 'task type' : 'where size', 346 | 'size' : 'smallest', 347 | 'grid number' : grid_size, 348 | 'target category' : target_category, 349 | 'absolute position': absolute_pos, 350 | } 351 | task_store.add(task_plan) 352 | 353 | grid = self.grid_mappings[grid_size][absolute_pos] 354 | 355 | for reference_category in irrelevant_categories: 356 | for reference_pos in self.relative_positions: 357 | ref_grid = self._relative_grid(grid_size, grid, reference_pos) 358 | if ref_grid >= 0: 359 | task_plan = { 360 | 'task type' : 'where size', 361 | 'size' : 'largest', 362 | 'grid number' : grid_size, 363 | 'target category' : target_category, 364 | 'absolute position' : absolute_pos, 365 | 'reference category' : reference_category, 366 | 'reference position' : reference_pos, 367 | 'target-reference order': 'target first' 368 | } 369 | task_store.add(task_plan) 370 | 371 | task_plan = { 372 | 'task type' : 'where size', 373 | 'size' : 'largest', 374 | 'grid number' : grid_size, 375 | 'target category' : target_category, 376 | 'absolute position' : absolute_pos, 377 | 'reference category' : 
reference_category, 378 | 'reference position' : reference_pos, 379 | 'target-reference order': 'reference first' 380 | } 381 | task_store.add(task_plan) 382 | 383 | task_plan = { 384 | 'task type' : 'where size', 385 | 'size' : 'smallest', 386 | 'grid number' : grid_size, 387 | 'target category' : target_category, 388 | 'absolute position' : absolute_pos, 389 | 'reference category' : reference_category, 390 | 'reference position' : reference_pos, 391 | 'target-reference order': 'target first' 392 | } 393 | task_store.add(task_plan) 394 | 395 | task_plan = { 396 | 'task type' : 'where size', 397 | 'size' : 'smallest', 398 | 'grid number' : grid_size, 399 | 'target category' : target_category, 400 | 'absolute position' : absolute_pos, 401 | 'reference category' : reference_category, 402 | 'reference position' : reference_pos, 403 | 'target-reference order': 'reference first' 404 | } 405 | task_store.add(task_plan) 406 | 407 | def _generate_task(self, task_plan): 408 | grid_size = task_plan['grid number'] 409 | 410 | target_category = task_plan['target category'] 411 | categories = [target_category] 412 | queries = [self._get_target_object_query(task_plan)] 413 | absolute_pos = task_plan['absolute position'] 414 | grids = [self.grid_mappings[grid_size][absolute_pos]] 415 | 416 | if 'reference category' in task_plan: 417 | reference_pos = task_plan['reference position'] 418 | reference_category = task_plan['reference category'] 419 | categories.append(reference_category) 420 | queries.append(self.metadata.and_query([("category", reference_category, True)])) 421 | 422 | ref_grid = self._relative_grid(grid_size, grids[0], reference_pos) 423 | assert ref_grid >= 0, "reference grid {} not allowed".format(ref_grid) 424 | grids.append(ref_grid) 425 | 426 | if task_plan['target-reference order'] == 'target first': 427 | if task_plan['size'] == 'largest': 428 | question = f"Where is the largest object in the image with respect to the {self.metadata.get_surfacename(reference_category)}?" 429 | else: 430 | question = f"Where is the smallest object in the image with respect to the {self.metadata.get_surfacename(reference_category)}?" 431 | answer = reference_pos 432 | else: 433 | if task_plan['size'] == 'largest': 434 | question = f"Where is the {self.metadata.get_surfacename(reference_category)} with respect to the largest object in the image?" 435 | else: 436 | question = f"Where is the {self.metadata.get_surfacename(reference_category)} with respect to the smallest object in the image?" 437 | answer = reverse_relative_positions[reference_pos] 438 | negatives = [o for o in self.relative_positions if o != answer] 439 | else: 440 | if task_plan['size'] == 'largest': 441 | question = f"Where is the largest object in the image?" 442 | else: 443 | question = f"Where is the smallest object in the image?" 
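# Without a reference object, the expected answer is the target's absolute grid position,
# and the other position names for this grid size serve as negatives.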
444 | answer = absolute_pos 445 | negatives = [o for o in self.grid_mappings[grid_size].keys() if o != answer] 446 | 447 | if task_plan['size'] == 'largest': 448 | sizes = [largest_size] 449 | size_options = list(all_size_options - {largest_size}) 450 | else: 451 | sizes = [smallest_size] 452 | size_options = list(all_size_options - {smallest_size}) 453 | sizes += list(self.rng.choice(size_options, replace=True, size=1)) 454 | 455 | options = self._compose_options(answer, negatives) 456 | image_metadata = self._make_image_metadata( 457 | grid_size, 458 | sizes, 459 | size_options, 460 | grids, 461 | queries=queries, 462 | remaining_query=self.metadata.and_query([("category", c, False) for c in categories]) 463 | ) 464 | 465 | return question, answer, options, image_metadata 466 | -------------------------------------------------------------------------------- /tma/imageqa/tabletop_3d/distance_single_image_task.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | 3 | from .single_image_task import _3DGridTaskGenerator 4 | from .utils import relative_positions 5 | from ..metadata import Objaverse3DMetaData 6 | from ...constant import NUM_OPTIONS 7 | from ...task_store import TaskStore 8 | 9 | grid_options = [3] 10 | 11 | relative_distance = { 12 | '0': [[1, 3], [4], [2, 6], [5, 7], [8]], 13 | '1': [[0, 4, 2], [3, 5], [7], [6, 8]], 14 | '2': [[1, 5], [4], [0, 8], [3, 7], [6]], 15 | '3': [[0, 4, 6], [1, 7], [5], [2, 8]], 16 | '4': [[1, 3, 5, 7], [0, 2, 6, 8]], 17 | '5': [[2, 4, 8], [1, 7], [3], [0, 6]], 18 | '6': [[3, 7], [4], [0, 8], [1, 5], [2]], 19 | '7': [[4, 6, 8], [3, 5], [1], [2, 0]], 20 | '8': [[5, 7], [4], [2, 6], [1, 3], [0]], 21 | } 22 | 23 | 24 | def _get_relative_distance_level(ref, target): 25 | for idx, level in enumerate(relative_distance[str(ref)]): 26 | if target in level: 27 | return idx 28 | 29 | 30 | def _get_max_distance_level(ref): 31 | return len(relative_distance[str(ref)]) - 1 32 | 33 | 34 | def _get_farther_grids(ref, target): 35 | ref_level = _get_relative_distance_level(ref, target) 36 | farther_grids = [] 37 | for level in relative_distance[str(ref)][ref_level + 1:]: 38 | farther_grids.extend(level) 39 | return farther_grids 40 | 41 | 42 | def _get_closer_grids(ref, target): 43 | ref_level = _get_relative_distance_level(ref, target) 44 | closer_grids = [] 45 | for level in relative_distance[str(ref)][:ref_level]: 46 | closer_grids.extend(level) 47 | return closer_grids 48 | 49 | 50 | class Distance3DGridTaskGenerator(_3DGridTaskGenerator): 51 | 52 | def __init__(self, metadata: Objaverse3DMetaData, max_num_distracting_object=2, seed=42): 53 | super().__init__(metadata, seed=seed) 54 | self.grid_options = grid_options 55 | self.max_num_distracting_object = max_num_distracting_object 56 | 57 | def _make_image_metadata(self, grid_size, distance_type, grids, queries, remaining_query=...): 58 | target_grid = grids[0] 59 | ref_grid = grids[1] 60 | objects = [self.metadata.sample(self.rng, 1, "object", q) for q in queries] 61 | if distance_type == 'farthest': 62 | possible_closer_grids = _get_closer_grids(ref_grid, target_grid) 63 | remaining_grids = self.rng.choice(possible_closer_grids, replace=False, size=min(self.max_num_distracting_object, len(possible_closer_grids))) 64 | else: 65 | possible_farther_grids = _get_farther_grids(ref_grid, target_grid) 66 | remaining_grids = self.rng.choice(possible_farther_grids, replace=False, size=min(self.max_num_distracting_object, len(possible_farther_grids))) 67 | 68 
| remaining_grids = [int(grid) for grid in remaining_grids] # convert numpy.int64 to int to feed into json 69 | 70 | for _ in remaining_grids: 71 | uid = self.metadata.sample(self.rng, 1, "object", remaining_query) 72 | objects.append(uid) 73 | 74 | object_path = {k: self.metadata.get_object_path(k) for k in objects} 75 | angles = [self.metadata.sample_object_angle(self.rng, obj) for obj in objects] 76 | 77 | image_metadata = { 78 | 'grid number' : grid_size, 79 | 'objects' : objects, 80 | 'object_path' : object_path, 81 | 'object_angles' : angles, 82 | 'grids' : grids + remaining_grids, 83 | 'blender_config': self.metadata.sample_blender_configuration(self.rng), 84 | } 85 | return image_metadata 86 | 87 | 88 | class WhatDistance3DGridTaskGenerator(Distance3DGridTaskGenerator): 89 | schema = { 90 | 'task type' : 'str', 91 | 'distance type' : 'str', 92 | 'grid number' : 'int', 93 | 'target category' : 'str', 94 | 'absolute position' : 'str', 95 | 'attribute type' : 'str', 96 | 'attribute value' : 'str', 97 | 'reference category': 'str', 98 | 'reference position': 'str', 99 | } 100 | 101 | def __init__(self, metadata: Objaverse3DMetaData, seed=42): 102 | super().__init__(metadata, seed=seed) 103 | self.relative_positions = relative_positions 104 | 105 | def enumerate_task_plans(self, task_store: TaskStore): 106 | for target_category in tqdm(self.metadata.categories, desc="enumerating [what distance] task"): 107 | irrelevant_categories = self.metadata.get_irrelevant_categories(target_category) 108 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 109 | for attribute_type, attribute_values in attribute_dict.items(): 110 | for attribute_value in attribute_values: 111 | for grid_size in self.grid_options: 112 | for absolute_pos in self.grid_mappings[grid_size]: 113 | grid = self.grid_mappings[grid_size][absolute_pos] 114 | for reference_category in irrelevant_categories: 115 | for reference_pos in self.relative_positions: 116 | ref_grid = self._relative_grid(grid_size, grid, reference_pos) 117 | if ref_grid >= 0: 118 | if (_get_relative_distance_level(ref_grid, grid) > 0): 119 | task_plan = { 120 | 'task type' : 'what distance', 121 | 'distance type' : 'farthest', 122 | 'grid number' : grid_size, 123 | 'target category' : target_category, 124 | 'absolute position' : absolute_pos, 125 | 'reference category': reference_category, 126 | 'reference position': reference_pos, 127 | 'attribute type' : attribute_type, 128 | 'attribute value' : attribute_value, 129 | } 130 | task_store.add(task_plan) 131 | if (_get_relative_distance_level(ref_grid, grid) < _get_max_distance_level(ref_grid)): 132 | task_plan = { 133 | 'task type' : 'what distance', 134 | 'distance type' : 'closest', 135 | 'grid number' : grid_size, 136 | 'target category' : target_category, 137 | 'absolute position' : absolute_pos, 138 | 'reference category': reference_category, 139 | 'reference position': reference_pos, 140 | 'attribute type' : attribute_type, 141 | 'attribute value' : attribute_value, 142 | } 143 | task_store.add(task_plan) 144 | 145 | for grid_size in self.grid_options: 146 | for absolute_pos in self.grid_mappings[grid_size]: 147 | grid = self.grid_mappings[grid_size][absolute_pos] 148 | for reference_category in irrelevant_categories: 149 | for reference_pos in self.relative_positions: 150 | ref_grid = self._relative_grid(grid_size, grid, reference_pos) 151 | if ref_grid >= 0: 152 | if (_get_relative_distance_level(ref_grid, grid) > 0): 153 | task_plan = { 154 | 'task type' : 'what 
distance', 155 | 'distance type' : 'farthest', 156 | 'grid number' : grid_size, 157 | 'target category' : target_category, 158 | 'absolute position' : absolute_pos, 159 | 'reference category': reference_category, 160 | 'reference position': reference_pos, 161 | } 162 | task_store.add(task_plan) 163 | if (_get_relative_distance_level(ref_grid, grid) < _get_max_distance_level(ref_grid)): 164 | task_plan = { 165 | 'task type' : 'what distance', 166 | 'distance type' : 'closest', 167 | 'grid number' : grid_size, 168 | 'target category' : target_category, 169 | 'absolute position' : absolute_pos, 170 | 'reference category': reference_category, 171 | 'reference position': reference_pos, 172 | } 173 | task_store.add(task_plan) 174 | 175 | def _generate_task(self, task_plan): 176 | grid_size = task_plan['grid number'] 177 | 178 | target_category = task_plan['target category'] 179 | absolute_pos = task_plan['absolute position'] 180 | grids = [self.grid_mappings[grid_size][absolute_pos]] 181 | queries = [self._get_target_object_query(task_plan)] 182 | 183 | remaining_query = [("category", target_category, False)] 184 | 185 | reference_pos = task_plan['reference position'] 186 | reference_category = task_plan['reference category'] 187 | 188 | queries.append(self.metadata.and_query([("category", reference_category, True)])) 189 | remaining_query += [("category", reference_category, False)] 190 | 191 | ref_grid = self._relative_grid(grid_size, grids[0], reference_pos) 192 | assert ref_grid >= 0, "reference grid {} not allowed".format(ref_grid) 193 | grids.append(ref_grid) 194 | 195 | if task_plan['distance type'] == 'farthest': 196 | question = f"What is the object that is farthest from the {self.metadata.get_surfacename(reference_category)}?" 197 | else: 198 | question = f"What is the object that is closest to the {self.metadata.get_surfacename(reference_category)}?" 
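# _make_image_metadata places distractors strictly closer (for 'farthest') or strictly
# farther (for 'closest') than the target, so the correct answer remains unambiguous.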
199 | 200 | image_metadata = self._make_image_metadata( 201 | grid_size, 202 | distance_type=task_plan['distance type'], 203 | grids=grids, 204 | queries=queries, 205 | remaining_query=self.metadata.and_query(remaining_query) 206 | ) 207 | 208 | answer = self.metadata.get_surfacename(target_category) 209 | negatives = [self.metadata.get_surfacename(self.metadata.sample_category_for_object(self.rng, o, target_category)) 210 | for o in image_metadata['objects'][1:]] 211 | options = self._compose_options(answer, negatives) 212 | 213 | return question, answer, options, image_metadata 214 | 215 | 216 | class WhatAttributeDistance3DGridTaskGenerator(Distance3DGridTaskGenerator): 217 | schema = { 218 | 'task type' : 'str', 219 | 'distance type' : 'str', 220 | 'grid number' : 'int', 221 | 'target category' : 'str', 222 | 'absolute position' : 'str', 223 | 'attribute type' : 'str', 224 | 'attribute value' : 'str', 225 | 'reference category': 'str', 226 | 'reference position': 'str', 227 | } 228 | 229 | def __init__(self, metadata: Objaverse3DMetaData, seed=42): 230 | super().__init__(metadata, seed=seed) 231 | self.relative_positions = relative_positions 232 | 233 | def enumerate_task_plans(self, task_store: TaskStore): 234 | for target_category in tqdm(self.metadata.categories, desc="enumerating [what attribute distance] task"): 235 | irrelevant_categories = self.metadata.get_irrelevant_categories(target_category) 236 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 237 | for attribute_type, attribute_values in attribute_dict.items(): 238 | for attribute_value in attribute_values: 239 | for grid_size in self.grid_options: 240 | for absolute_pos in self.grid_mappings[grid_size]: 241 | grid = self.grid_mappings[grid_size][absolute_pos] 242 | for reference_category in irrelevant_categories: 243 | for reference_pos in self.relative_positions: 244 | ref_grid = self._relative_grid(grid_size, grid, reference_pos) 245 | if ref_grid >= 0: 246 | if (_get_relative_distance_level(ref_grid, grid) > 0): 247 | task_plan = { 248 | 'task type' : 'what attribute distance', 249 | 'distance type' : 'farthest', 250 | 'grid number' : grid_size, 251 | 'target category' : target_category, 252 | 'absolute position' : absolute_pos, 253 | 'reference category': reference_category, 254 | 'reference position': reference_pos, 255 | 'attribute type' : attribute_type, 256 | 'attribute value' : attribute_value, 257 | } 258 | task_store.add(task_plan) 259 | if (_get_relative_distance_level(ref_grid, grid) < _get_max_distance_level(ref_grid)): 260 | task_plan = { 261 | 'task type' : 'what attribute distance', 262 | 'distance type' : 'closest', 263 | 'grid number' : grid_size, 264 | 'target category' : target_category, 265 | 'absolute position' : absolute_pos, 266 | 'reference category': reference_category, 267 | 'reference position': reference_pos, 268 | 'attribute type' : attribute_type, 269 | 'attribute value' : attribute_value, 270 | } 271 | task_store.add(task_plan) 272 | 273 | def _generate_task(self, task_plan): 274 | grid_size = task_plan['grid number'] 275 | 276 | attribute_type = task_plan['attribute type'] 277 | 278 | absolute_pos = task_plan['absolute position'] 279 | grids = [self.grid_mappings[grid_size][absolute_pos]] 280 | 281 | queries = [self._get_target_object_query(task_plan)] 282 | 283 | reference_pos = task_plan['reference position'] 284 | reference_category = task_plan['reference category'] 285 | queries.append(self.metadata.and_query([("category", reference_category, True)])) 286 | 
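# Resolve the reference grid from the target grid and the relative position; plans are
# only enumerated when this grid exists, so the assert below is a pure sanity check.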
287 | ref_grid = self._relative_grid(grid_size, grids[0], reference_pos) 288 | assert ref_grid >= 0, "reference grid {} not allowed".format(ref_grid) 289 | 290 | grids.append(ref_grid) 291 | if task_plan['distance type'] == 'farthest': 292 | question = f"What is the {attribute_type} of the object that is farthest from the {self.metadata.get_surfacename(reference_category)}?" 293 | else: 294 | question = f"What is the {attribute_type} of the object that is closest to the {self.metadata.get_surfacename(reference_category)}?" 295 | 296 | image_metadata = self._make_image_metadata( 297 | grid_size, 298 | distance_type=task_plan['distance type'], 299 | grids=grids, 300 | queries=queries, 301 | ) 302 | 303 | answer = task_plan['attribute value'] 304 | target_object = image_metadata['objects'][0] 305 | negative_query = self.metadata.and_query([ 306 | (attribute_type, a, False) for a in self.metadata.query_metadata(attribute_type, self.metadata.and_query([("object", target_object, True)])) 307 | ]) 308 | negatives = self.metadata.sample( 309 | self.rng, 310 | NUM_OPTIONS - 1, 311 | attribute_type, 312 | query=negative_query, 313 | ) 314 | options = [answer] + negatives 315 | 316 | return question, answer, options, image_metadata 317 | 318 | 319 | class WhereDistance3DGridTaskGenerator(Distance3DGridTaskGenerator): 320 | schema = { 321 | 'task type' : 'str', 322 | 'distance type' : 'str', 323 | 'grid number' : 'int', 324 | 'target category' : 'str', 325 | 'absolute position' : 'str', 326 | 'attribute type' : 'str', 327 | 'attribute value' : 'str', 328 | 'reference category': 'str', 329 | 'reference position': 'str', 330 | } 331 | 332 | def __init__(self, metadata: Objaverse3DMetaData, seed=42): 333 | super().__init__(metadata, seed=seed) 334 | self.relative_positions = relative_positions 335 | 336 | def enumerate_task_plans(self, task_store: TaskStore): 337 | for target_category in tqdm(self.metadata.categories, desc="enumerating [where distance] task"): 338 | irrelevant_categories = self.metadata.get_irrelevant_categories(target_category) 339 | attribute_dict = self.metadata.get_category_attribute_dict(target_category) 340 | for attribute_type, attribute_values in attribute_dict.items(): 341 | for attribute_value in attribute_values: 342 | for grid_size in self.grid_options: 343 | for absolute_pos in self.grid_mappings[grid_size]: 344 | grid = self.grid_mappings[grid_size][absolute_pos] 345 | for reference_category in irrelevant_categories: 346 | for reference_pos in self.relative_positions: 347 | ref_grid = self._relative_grid(grid_size, grid, reference_pos) 348 | if ref_grid >= 0: 349 | if (_get_relative_distance_level(ref_grid, grid) > 0): 350 | task_plan = { 351 | 'task type' : 'where distance', 352 | 'distance type' : 'farthest', 353 | 'grid number' : grid_size, 354 | 'target category' : target_category, 355 | 'absolute position' : absolute_pos, 356 | 'reference category': reference_category, 357 | 'reference position': reference_pos, 358 | 'attribute type' : attribute_type, 359 | 'attribute value' : attribute_value, 360 | } 361 | task_store.add(task_plan) 362 | if (_get_relative_distance_level(ref_grid, grid) < _get_max_distance_level(ref_grid)): 363 | task_plan = { 364 | 'task type' : 'where distance', 365 | 'distance type' : 'closest', 366 | 'grid number' : grid_size, 367 | 'target category' : target_category, 368 | 'absolute position' : absolute_pos, 369 | 'reference category': reference_category, 370 | 'reference position': reference_pos, 371 | 'attribute type' : attribute_type, 372 | 
'attribute value' : attribute_value, 373 | } 374 | task_store.add(task_plan) 375 | 376 | for grid_size in self.grid_options: 377 | for absolute_pos in self.grid_mappings[grid_size]: 378 | grid = self.grid_mappings[grid_size][absolute_pos] 379 | for reference_category in irrelevant_categories: 380 | for reference_pos in self.relative_positions: 381 | ref_grid = self._relative_grid(grid_size, grid, reference_pos) 382 | if ref_grid >= 0: 383 | if (_get_relative_distance_level(ref_grid, grid) > 0): 384 | task_plan = { 385 | 'task type' : 'where distance', 386 | 'distance type' : 'farthest', 387 | 'grid number' : grid_size, 388 | 'target category' : target_category, 389 | 'absolute position' : absolute_pos, 390 | 'reference category': reference_category, 391 | 'reference position': reference_pos, 392 | } 393 | task_store.add(task_plan) 394 | if (_get_relative_distance_level(ref_grid, grid) < _get_max_distance_level(ref_grid)): 395 | task_plan = { 396 | 'task type' : 'where distance', 397 | 'distance type' : 'closest', 398 | 'grid number' : grid_size, 399 | 'target category' : target_category, 400 | 'absolute position' : absolute_pos, 401 | 'reference category': reference_category, 402 | 'reference position': reference_pos, 403 | } 404 | task_store.add(task_plan) 405 | 406 | def _generate_task(self, task_plan): 407 | grid_size = task_plan['grid number'] 408 | 409 | target_category = task_plan['target category'] 410 | categories = [target_category] 411 | queries = [self._get_target_object_query(task_plan)] 412 | absolute_pos = task_plan['absolute position'] 413 | grids = [self.grid_mappings[grid_size][absolute_pos]] 414 | 415 | reference_pos = task_plan['reference position'] 416 | reference_category = task_plan['reference category'] 417 | categories.append(reference_category) 418 | queries.append(self.metadata.and_query([("category", reference_category, True)])) 419 | 420 | ref_grid = self._relative_grid(grid_size, grids[0], reference_pos) 421 | assert ref_grid >= 0, "reference grid {} not allowed".format(ref_grid) 422 | grids.append(ref_grid) 423 | 424 | if task_plan['distance type'] == 'farthest': 425 | question = f"Where is the object that is farthest from the {self.metadata.get_surfacename(reference_category)} in the image?" 426 | else: 427 | question = f"Where is the object that is closest to the {self.metadata.get_surfacename(reference_category)} in the image?" 
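# As in the size variant, the answer is the target's absolute position; the remaining
# image objects are later sampled from categories other than the target and reference ones.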
428 | answer = absolute_pos 429 | negatives = [o for o in self.grid_mappings[grid_size].keys() if o != answer] 430 | 431 | options = self._compose_options(answer, negatives) 432 | image_metadata = self._make_image_metadata( 433 | grid_size, 434 | distance_type=task_plan['distance type'], 435 | grids=grids, 436 | queries=queries, 437 | remaining_query=self.metadata.and_query([("category", c, False) for c in categories]) 438 | ) 439 | 440 | return question, answer, options, image_metadata 441 | -------------------------------------------------------------------------------- /tma/videoqa/tabletop_3d/run_blender.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import math 4 | import os 5 | import sys 6 | import urllib.request 7 | from math import radians 8 | 9 | try: 10 | import bpy 11 | from mathutils import Vector, Matrix, Quaternion, Euler 12 | except ImportError: 13 | pass 14 | 15 | 16 | def get_exact_frame(current_keyframe_idx, total_num_frames): 17 | return (current_keyframe_idx) * (total_num_frames // 4) + 1 18 | 19 | 20 | def rotate(obj, degree): 21 | """Rotates around the z axis by theta""" 22 | degree = -degree 23 | bpy.ops.object.select_all(action='DESELECT') 24 | obj.select_set(True) 25 | bpy.context.view_layer.objects.active = obj 26 | radian = radians(degree) 27 | bpy.context.object.rotation_mode = 'XYZ' 28 | rot_x, rot_y, rot_z = obj.rotation_euler 29 | obj.rotation_euler = Euler((rot_x, rot_y, rot_z + radian)) 30 | freeze_transformation(obj) 31 | 32 | 33 | def rotate_and_keyframe(obj, degree, frame): 34 | degree = -degree 35 | bpy.ops.object.select_all(action='DESELECT') 36 | obj.select_set(True) 37 | bpy.context.scene.frame_set(frame) 38 | 39 | bpy.context.view_layer.objects.active = obj 40 | bpy.ops.object.origin_set(type='ORIGIN_GEOMETRY') 41 | bpy.context.object.rotation_mode = 'XYZ' 42 | radian = radians(degree) 43 | rot_x, rot_y, rot_z = obj.rotation_euler 44 | obj.rotation_euler = Euler((rot_x, rot_y, rot_z + radian)) 45 | obj.keyframe_insert(data_path="rotation_euler", frame=frame) 46 | bpy.ops.object.select_all(action='DESELECT') 47 | 48 | 49 | def reset_scene(): 50 | # delete everything that isn't part of a camera or a light 51 | bpy.ops.object.select_all(action="SELECT") 52 | for obj in bpy.data.objects: 53 | bpy.data.objects.remove(obj, do_unlink=True) 54 | bpy.ops.ptcache.free_bake_all() 55 | 56 | 57 | def select_hierarchy(obj): 58 | """Recursively select an object and all of its descendants.""" 59 | obj.select_set(True) 60 | for child in obj.children: 61 | select_hierarchy(child) 62 | 63 | 64 | def load_object(object_path: str) -> None: 65 | """Loads a glb model into the scene.""" 66 | bpy.ops.object.select_all(action='DESELECT') 67 | if object_path.endswith(".glb"): 68 | bpy.ops.import_scene.gltf(filepath=object_path, merge_vertices=True) 69 | elif object_path.endswith(".fbx"): 70 | bpy.ops.import_scene.fbx(filepath=object_path) 71 | else: 72 | raise ValueError(f"Unsupported file type: {object_path}") 73 | 74 | base_name = os.path.basename(object_path) 75 | object_name, _ = os.path.splitext(base_name) 76 | bpy.context.view_layer.objects.active.name = object_name 77 | bpy.ops.object.select_all(action='DESELECT') 78 | 79 | obj = bpy.data.objects.get(object_name) 80 | # bpy.context.view_layer.objects.active = obj 81 | select_hierarchy(obj) 82 | bpy.ops.object.transform_apply(location=True, rotation=True, scale=True) 83 | meshes = [obj for obj in bpy.context.selected_objects if obj.type == 
"MESH"] 84 | non_meshes = [obj for obj in bpy.context.selected_objects if obj.type != "MESH"] 85 | bpy.ops.object.select_all(action="DESELECT") 86 | 87 | # delete non-mesh and consolidate 88 | 89 | for obj in non_meshes: 90 | obj.select_set(True) 91 | bpy.ops.object.delete() 92 | bpy.ops.object.select_all(action="DESELECT") 93 | for obj in meshes: 94 | obj.select_set(True) 95 | bpy.context.view_layer.objects.active = meshes[0] 96 | bpy.ops.object.join() 97 | bpy.context.view_layer.objects.active.name = object_name 98 | bpy.ops.object.origin_set(type='GEOMETRY_ORIGIN', center='BOUNDS') 99 | 100 | bpy.ops.object.select_all(action="DESELECT") 101 | 102 | return object_name 103 | 104 | 105 | def scene_meshes(): 106 | for obj in bpy.context.scene.objects.values(): 107 | if isinstance(obj.data, (bpy.types.Mesh)): 108 | yield obj 109 | 110 | 111 | def download_uid(uid_path, save_dir): 112 | return download_object(uid_path, save_dir) 113 | 114 | 115 | def download_object(object_url, save_dir) -> str: 116 | """Download the object and return the path.""" 117 | # uid = uuid.uuid4() 118 | uid = object_url.split("/")[-1].split(".")[0] 119 | tmp_local_path = os.path.join(save_dir, f"{uid}.glb" + ".tmp") 120 | local_path = os.path.join(save_dir, f"{uid}.glb") 121 | # wget the file and put it in local_path 122 | os.makedirs(os.path.dirname(tmp_local_path), exist_ok=True) 123 | urllib.request.urlretrieve(object_url, tmp_local_path) 124 | os.rename(tmp_local_path, local_path) 125 | # get the absolute path 126 | local_path = os.path.abspath(local_path) 127 | return local_path 128 | 129 | 130 | def scene_bbox(single_obj=None, ignore_matrix=False): 131 | bbox_min = (math.inf,) * 3 132 | bbox_max = (-math.inf,) * 3 133 | found = False 134 | for obj in scene_meshes() if single_obj is None else [single_obj]: 135 | found = True 136 | for coord in obj.bound_box: 137 | coord = Vector(coord) 138 | if not ignore_matrix: 139 | coord = obj.matrix_world @ coord 140 | bbox_min = tuple(min(x, y) for x, y in zip(bbox_min, coord)) 141 | bbox_max = tuple(max(x, y) for x, y in zip(bbox_max, coord)) 142 | if not found: 143 | raise RuntimeError("no objects in scene to compute bounding box for") 144 | return Vector(bbox_min), Vector(bbox_max) 145 | 146 | 147 | def scene_root_objects(): 148 | for obj in bpy.context.scene.objects.values(): 149 | if not obj.parent: 150 | yield obj 151 | 152 | 153 | def freeze_transformation(obj): 154 | bpy.context.view_layer.objects.active = obj 155 | obj.select_set(True) 156 | bpy.ops.object.transform_apply(location=True, rotation=True, scale=True) 157 | bpy.ops.object.select_all(action='DESELECT') 158 | 159 | 160 | def scale(obj, scale_factor): 161 | bpy.ops.object.select_all(action='DESELECT') 162 | obj.select_set(True) 163 | bpy.ops.transform.resize(value=(scale_factor, scale_factor, scale_factor)) 164 | bpy.ops.object.select_all(action='DESELECT') 165 | freeze_transformation(obj) 166 | 167 | 168 | def get_3d_dimensions(obj): 169 | # pdb.set_trace() 170 | max_x, max_y, max_z = float("-inf"), float("-inf"), float("-inf") 171 | min_x, min_y, min_z = float("inf"), float("inf"), float("inf") 172 | 173 | for vertex in obj.data.vertices: 174 | v_world = obj.matrix_world @ vertex.co 175 | max_x, max_y, max_z = max(max_x, v_world.x), max(max_y, v_world.y), max(max_z, v_world.z) 176 | min_x, min_y, min_z = min(min_x, v_world.x), min(min_y, v_world.y), min(min_z, v_world.z) 177 | 178 | return (max_x - min_x, max_y - min_y, max_z - min_z) 179 | 180 | 181 | def normalize_object(obj, factor=1.0): 182 | 
max_dimension = max(get_3d_dimensions(obj)) 183 | scale_factor = factor * (1 / max_dimension) 184 | scale(obj, scale_factor) 185 | 186 | 187 | def move_to_xy(obj, x, y): 188 | min_z = float('inf') 189 | for vertex in obj.data.vertices: 190 | z = obj.matrix_world @ vertex.co 191 | min_z = min(min_z, z.z) 192 | obj.location -= Vector((0, 0, min_z)) 193 | freeze_transformation(obj) 194 | 195 | # move location x,y to sampled box center 196 | new_location = Vector((x, y, obj.location[2])) 197 | obj.location = new_location 198 | freeze_transformation(obj) 199 | 200 | 201 | def move_to_xy_at_frame(obj, movement, frame): 202 | # Set the scene to the specific frame 203 | x, y = movement[0], movement[1] 204 | bpy.context.scene.frame_set(frame) 205 | new_location = Vector((x, y, 0)) 206 | obj.location = obj.location + new_location 207 | obj.keyframe_insert(data_path="location", frame=frame) 208 | 209 | 210 | def normalize_scene(): 211 | bbox_min, bbox_max = scene_bbox() 212 | scale = 1 / max(bbox_max - bbox_min) 213 | for obj in scene_root_objects(): 214 | obj.scale = obj.scale * scale 215 | # Apply scale to matrix_world. 216 | bpy.context.view_layer.update() 217 | bbox_min, bbox_max = scene_bbox() 218 | offset = -(bbox_min + bbox_max) / 2 219 | for obj in scene_root_objects(): 220 | obj.matrix_world.translation += offset 221 | bpy.ops.object.select_all(action="DESELECT") 222 | 223 | 224 | def setup_plane_and_background(plane_texture_path, hdri_path): 225 | # load plane 226 | plane_name = load_object(plane_texture_path) 227 | plane = bpy.data.objects.get(plane_name) 228 | scale(plane, 0.5) 229 | 230 | # load light map 231 | print(f"HDRI PATH: {hdri_path}") 232 | bpy.ops.image.open(filepath=hdri_path) 233 | if bpy.data.worlds.get("World") is None: 234 | bpy.data.worlds.new("World") 235 | 236 | bpy.context.scene.world = bpy.data.worlds["World"] 237 | 238 | bpy.context.scene.world.use_nodes = True 239 | tree = bpy.context.scene.world.node_tree 240 | tree.nodes.clear() 241 | 242 | tex_env = tree.nodes.new(type="ShaderNodeTexEnvironment") 243 | tex_env.image = bpy.data.images[hdri_path.split('/')[-1]] # Image name is typically the last part of the path 244 | background = tree.nodes.new(type="ShaderNodeBackground") 245 | output = tree.nodes.new(type="ShaderNodeOutputWorld") 246 | 247 | tree.links.new(tex_env.outputs[0], background.inputs[0]) 248 | tree.links.new(background.outputs[0], output.inputs[0]) 249 | 250 | return plane_texture_path + " " + hdri_path 251 | 252 | 253 | def setup_camera_and_lights( 254 | sun_x, 255 | sun_y, 256 | sun_energy, 257 | key_light_horizontal_angle, 258 | fill_light_horizontal_angle, 259 | key_light_vertical_angle, 260 | fill_light_vertical_angle 261 | ): 262 | # for seeting up the three point lighting, we mostly follow https://courses.cs.washington.edu/courses/cse458/05au/reading/3point_lighting.pdf 263 | # in order to keep lights and camera on the hemisphere pointing to origin, we use a hierarchy of empties 264 | 265 | # create the sun 266 | 267 | bpy.ops.object.light_add(type="SUN") 268 | sun = bpy.context.active_object 269 | sun.rotation_euler = Euler((sun_x, sun_y, 0), "XYZ") 270 | sun.data.energy = sun_energy 271 | 272 | # create global empty 273 | 274 | bpy.ops.object.empty_add(type='PLAIN_AXES', align='WORLD', location=(0, 0, 0), scale=(1, 1, 1)) 275 | x_rot, y_rot, z_rot = radians(90), radians(0), radians(-90) 276 | empty = bpy.context.scene.objects.get("Empty") 277 | 278 | # create camera 279 | 280 | # radius = random.uniform(1.8,2.2) 281 | radius = 2.5 282 | 283 | 
bpy.ops.object.camera_add(enter_editmode=False, align='VIEW', location=(-radius, 0, 0), rotation=Euler((x_rot, y_rot, z_rot), "XYZ"), scale=(1, 1, 1)) 284 | cam = bpy.context.scene.objects.get("Camera") 285 | cam.data.lens = 35 286 | cam.data.sensor_width = 32 287 | bpy.context.scene.camera = cam 288 | 289 | # create camera empty 290 | 291 | bpy.ops.object.empty_add(type='PLAIN_AXES', align='WORLD', location=(0, 0, 0), scale=(1, 1, 1)) 292 | x_rot, y_rot, z_rot = radians(90), radians(0), radians(-90) 293 | cam_empty = bpy.context.scene.objects.get("Empty.001") 294 | cam_empty.name = "camera_empty" 295 | 296 | # make camera empty parent of camera 297 | 298 | bpy.ops.object.select_all(action='DESELECT') 299 | cam.select_set(True) 300 | cam_empty.select_set(True) 301 | bpy.context.view_layer.objects.active = cam_empty 302 | bpy.ops.object.parent_set() 303 | bpy.ops.object.select_all(action='DESELECT') 304 | 305 | # make camera empty parent of global empty 306 | 307 | bpy.ops.object.select_all(action='DESELECT') 308 | cam_empty.select_set(True) 309 | empty.select_set(True) 310 | bpy.context.view_layer.objects.active = empty 311 | bpy.ops.object.parent_set() 312 | bpy.ops.object.select_all(action='DESELECT') 313 | 314 | light_names = ["key_light", "fill_light", "back_light"] 315 | light_energies = [1000., 300., 500.] 316 | 317 | for light_name, light_energy in zip(light_names, light_energies): 318 | # create light empty 319 | 320 | empty_name = light_name + "_empty" 321 | bpy.ops.object.empty_add(type='PLAIN_AXES', align='WORLD', location=(0, 0, 0), scale=(1, 1, 1)) 322 | x_rot, y_rot, z_rot = radians(90), radians(0), radians(-90) 323 | light_empty = bpy.context.scene.objects.get("Empty.001") 324 | light_empty.name = empty_name 325 | 326 | # parent light empty to main (camera) empty 327 | 328 | bpy.ops.object.select_all(action='DESELECT') 329 | light_empty.select_set(True) 330 | empty.select_set(True) 331 | bpy.context.view_layer.objects.active = empty 332 | bpy.ops.object.parent_set() 333 | bpy.ops.object.select_all(action='DESELECT') 334 | 335 | # create light 336 | 337 | x_loc, y_loc, z_loc = -radius, 0, 0 338 | bpy.ops.object.light_add(type='POINT', radius=1, align='WORLD', location=(x_loc, y_loc, z_loc), rotation=Euler((x_rot, y_rot, z_rot), "XYZ"), scale=(1, 1, 1)) 339 | bpy.data.objects["Point"].name = light_name 340 | light = bpy.data.objects[light_name] 341 | light.data.energy = light_energy 342 | # light.data.size = 0.5 343 | 344 | # parent light empty to light 345 | 346 | bpy.ops.object.select_all(action='DESELECT') 347 | light.select_set(True) 348 | light_empty.select_set(True) 349 | bpy.context.view_layer.objects.active = light_empty 350 | bpy.ops.object.parent_set() 351 | bpy.ops.object.select_all(action='DESELECT') 352 | 353 | # rotate camera and lights around the z-axis 354 | 355 | z_random_rot = radians(90) # radians(random.uniform(0,360)) 356 | empty.rotation_euler = Euler((0, 0, z_random_rot)) 357 | 358 | # # raise the camera while having it point to origin 359 | 360 | # cam_y_random_rot = radians(random.uniform(10,50)) 361 | # cam_empty.rotation_euler = Euler((0,cam_y_random_rot,0),"XYZ") 362 | 363 | bpy.context.view_layer.update() 364 | 365 | back_light_horizontal_angle = radians(180) 366 | light_horizontal_angles = [key_light_horizontal_angle, fill_light_horizontal_angle, back_light_horizontal_angle] 367 | for light_angle, light_name in zip(light_horizontal_angles, light_names): 368 | light_empty = bpy.data.objects[light_name + "_empty"] 369 | global_z = 
(light_empty.matrix_world.inverted() @ Vector((0.0, 0.0, 1.0, 0.0)))[:3] 370 | quat = Quaternion(global_z, light_angle) 371 | light_empty.rotation_euler = quat.to_euler() 372 | 373 | back_light_vertical_angle = 0 374 | light_vertical_angles = [key_light_vertical_angle, fill_light_vertical_angle, back_light_vertical_angle] 375 | # light_vertical_angles = [radians(-45)]*3 376 | 377 | for light_angle, light_name in zip(light_vertical_angles, light_names): 378 | light_empty = bpy.data.objects[light_name + "_empty"] 379 | global_x = (light_empty.matrix_world.inverted() @ Vector((1.0, 0.0, 0.0, 0.0)))[:3] 380 | quat = Quaternion(global_x, light_angle) 381 | euler_add = quat.to_euler() 382 | euler_current = light_empty.rotation_euler 383 | new_euler = Euler((euler_add[0] + euler_current[0], euler_add[1] + euler_current[1], euler_add[2] + euler_current[2])) 384 | light_empty.rotation_euler = new_euler 385 | 386 | # bpy.context.view_layer.update() 387 | 388 | return cam, empty 389 | 390 | 391 | def render_animation(fp): 392 | bpy.context.scene.render.filepath = fp 393 | bpy.context.scene.render.image_settings.file_format = 'FFMPEG' 394 | bpy.context.scene.render.ffmpeg.format = 'MPEG4' 395 | bpy.context.scene.render.ffmpeg.codec = 'H264' 396 | bpy.context.scene.render.ffmpeg.constant_rate_factor = 'MEDIUM' 397 | bpy.ops.render.render(animation=True) 398 | 399 | 400 | def setup_renderer(H, W, use_cpu=False): 401 | scene = bpy.context.scene 402 | render = bpy.context.scene.render 403 | 404 | render.engine = "CYCLES" 405 | render.image_settings.file_format = "PNG" 406 | render.image_settings.color_mode = "RGBA" 407 | render.resolution_x = W 408 | render.resolution_y = H 409 | render.resolution_percentage = 100 410 | 411 | scene.cycles.device = "CPU" if use_cpu else "GPU" 412 | scene.cycles.samples = 10 if use_cpu else 128 413 | scene.cycles.diffuse_bounces = 1 414 | scene.cycles.glossy_bounces = 1 415 | scene.cycles.transparent_max_bounces = 3 416 | scene.cycles.transmission_bounces = 3 417 | scene.cycles.filter_width = 0.01 418 | scene.cycles.use_denoising = True 419 | scene.render.film_transparent = False 420 | 421 | bpy.context.preferences.addons["cycles"].preferences.get_devices() 422 | # Set the device_type 423 | bpy.context.preferences.addons[ 424 | "cycles" 425 | ].preferences.compute_device_type = "METAL" if use_cpu else "CUDA" 426 | bpy.context.scene.view_settings.view_transform = 'Filmic' 427 | 428 | 429 | # def randomize_camera_view(axis): 430 | # euler_y = radians(random.uniform(-90, 90)) 431 | # euler_z = radians(random.uniform(0, 360)) 432 | # axis.rotation_euler = Euler((0, euler_y, euler_z)) 433 | 434 | 435 | def run_render(metadata, save_image_path, use_cpu): 436 | reset_scene() 437 | 438 | bpy.context.scene.render.fps = metadata['fps'] 439 | bpy.context.scene.frame_start = 1 440 | bpy.context.scene.frame_end = metadata['total_num_frames'] 441 | 442 | objs = [] 443 | for uid in metadata["objects"]: 444 | object_path = metadata["object_path"][uid] 445 | objs.append(bpy.data.objects.get(load_object(object_path))) 446 | 447 | grid_number = metadata["grid number"] 448 | 449 | if grid_number == 2: 450 | locations = { 451 | 0: [0.7, 0.5], 452 | 1: [0.7, -0.5], 453 | 2: [-0.6, 0.5], 454 | 3: [-0.6, -0.5] 455 | } 456 | scale_factor = 1 / 2 457 | elif grid_number == 3: 458 | locations = { 459 | 0: [0.9, 0.6], 460 | 1: [0.9, 0], 461 | 2: [0.9, -0.6], 462 | 3: [0.0, 0.6], 463 | 4: [0.0, 0.0], 464 | 5: [0.0, -0.6], 465 | 6: [-0.9, 0.6], 466 | 7: [-0.9, 0.0], 467 | 8: [-0.9, -0.6] 468 | } 469 | 
scale_factor = 1 / 3 470 | else: 471 | raise ValueError(f"Expected grid number to be 2 or 3 but got {grid_number}") 472 | 473 | # process rotate 474 | for idx, obj in enumerate(objs): 475 | rotate(obj, degree=metadata['object_angles'][idx]) 476 | 477 | # process scale 478 | if "sizes" in metadata: 479 | for idx, obj in enumerate(objs): 480 | normalize_object(obj, factor=metadata['sizes'][idx] * scale_factor) 481 | else: 482 | for obj in objs: 483 | normalize_object(obj, factor=scale_factor) 484 | 485 | for pos, obj in zip(metadata["grids"], objs): 486 | x, y = locations[pos] 487 | move_to_xy(obj, x, y) 488 | 489 | # set the first keyframe of video 490 | for obj in objs: 491 | rotate_and_keyframe(obj, 0, 1) 492 | move_to_xy_at_frame(obj, (0, 0), 1) 493 | 494 | # set other keyframes based on the metadata 495 | for idx, obj in enumerate(objs): 496 | for keyframe_order, keyframe_info in enumerate(metadata["keyframes"][idx]): 497 | if "rotation" in keyframe_info: 498 | rotate_and_keyframe(obj, keyframe_info["rotation"], get_exact_frame(keyframe_order, metadata['total_num_frames'])) 499 | if "movement" in keyframe_info: 500 | move_to_xy_at_frame(obj, keyframe_info["movement"], get_exact_frame(keyframe_order, metadata['total_num_frames'])) 501 | 502 | blender_config = metadata["blender_config"] 503 | 504 | setup_plane_and_background(blender_config["plane_texture_path"], blender_config["hdri_path"]) 505 | cam, axis = setup_camera_and_lights( 506 | blender_config["sun_x"], 507 | blender_config["sun_y"], 508 | blender_config["sun_energy"], 509 | blender_config["key_light_horizontal_angle"], 510 | blender_config["fill_light_horizontal_angle"], 511 | blender_config["key_light_vertical_angle"], 512 | blender_config["fill_light_vertical_angle"] 513 | ) 514 | axis.rotation_euler = Euler((0, radians(45), 0)) 515 | setup_renderer(H=metadata["VIDEO_H"], W=metadata["VIDEO_W"], use_cpu=use_cpu) 516 | render_animation(save_image_path) 517 | 518 | 519 | if __name__ == "__main__": 520 | parser = argparse.ArgumentParser() 521 | parser.add_argument( 522 | "--save_local", 523 | type=str, 524 | default="" 525 | ) 526 | parser.add_argument( 527 | "--save_video_path", 528 | type=str, 529 | default="render.png" 530 | ) 531 | parser.add_argument( 532 | "--json_file", 533 | type=str, 534 | default="video_metadata.json" 535 | ) 536 | 537 | parser.add_argument( 538 | "--use_cpu", 539 | action="store_true", 540 | default=False 541 | ) 542 | 543 | argv = sys.argv[sys.argv.index("--") + 1:] 544 | args = parser.parse_args(argv) 545 | 546 | with open(args.json_file, "r") as f: 547 | metadata = json.load(f) 548 | 549 | run_render(metadata, args.save_video_path, args.use_cpu) 550 | --------------------------------------------------------------------------------
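The script above is meant to run inside Blender rather than as a plain Python program (note the guarded `import bpy`). Below is a minimal sketch of one way to drive it headlessly, assuming Blender is on PATH; the object uid, every file path, and the numeric lighting values are placeholders, and `blender_config` would normally come from the metadata sampler used by the task generators. The metadata keys mirror what run_render() reads, and only the arguments after "--" are parsed by the script itself.

import json
import subprocess

metadata = {
    "fps": 4,                      # example video settings; the videoqa pipeline
    "total_num_frames": 16,        # defines its own in tma/constant.py
    "VIDEO_H": 224,
    "VIDEO_W": 224,
    "grid number": 2,                                        # 2x2 layout; 3 is also supported
    "objects": ["obj_uid_0"],                                # placeholder uid
    "object_path": {"obj_uid_0": "/path/to/obj_uid_0.glb"},  # placeholder path
    "object_angles": [0],                                    # degrees, applied before keyframing
    "grids": [0],                                            # grid cell index per object
    # One keyframe list per object; each entry may carry "rotation" and/or "movement".
    "keyframes": [[{"rotation": 90}, {"movement": [0.0, -1.0]}]],
    "blender_config": {                                      # placeholder values
        "plane_texture_path": "/path/to/plane.glb",
        "hdri_path": "/path/to/environment.hdr",
        "sun_x": 0.0, "sun_y": 0.0, "sun_energy": 1.0,
        "key_light_horizontal_angle": 0.5,
        "fill_light_horizontal_angle": -0.5,
        "key_light_vertical_angle": -0.8,
        "fill_light_vertical_angle": -0.8,
    },
}

with open("video_metadata.json", "w") as f:
    json.dump(metadata, f)

# run_blender.py only parses arguments after "--", so Blender's own flags and the
# script's flags do not collide.
subprocess.run([
    "blender", "--background",
    "--python", "tma/videoqa/tabletop_3d/run_blender.py",
    "--",
    "--json_file", "video_metadata.json",
    "--save_video_path", "render.mp4",
    "--use_cpu",
], check=True)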