
Disentangled VQA

This module implements the Disentangled VQA metric, inspired by Section 4.1 of the paper T2I-CompBench: A Comprehensive Benchmark for Open-world Compositional Text-to-image Generation.

Uses the disentangled BLIP-VQA model for attribute-binding evaluation, as proposed in T2I-CompBench.
Weave gives us a holistic view of the evaluations, making it easy to drill into individual outputs and scores.
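
Concretely, the disentangled evaluation splits each compositional prompt into one yes/no VQA question per noun phrase and scores each question independently, which is what the BlipVQAJudge documented below does. A short illustrative sketch of the idea (not library code):

prompt = "a red car and a blue bicycle"                  # "a {adj_1} {noun_1} and a {adj_2} {noun_2}"
question_1 = "is red car present in the picture?"        # answered by BLIP-VQA; scored by the probability of "yes"
question_2 = "is blue bicycle present in the picture?"   # answered independently of question_1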

Example

Step 1: Generate evaluation dataset

Generate the dataset consisting of prompts in the format “a {adj_1} {noun_1} and a {adj_2} {noun_2}” and the corresponding metadata, using an LLM capable of generating JSON objects, such as GPT-4o. The dataset is then published both as a W&B dataset artifact and as a Weave dataset.

from hemm.metrics.vqa import AttributeBindingDatasetGenerator

dataset_generator = AttributeBindingDatasetGenerator(
    openai_model="gpt-4o",
    openai_seed=42,
    num_prompts_in_single_call=20,
    num_api_calls=50,
    project_name="disentangled_vqa",
)

dataset_generator(dump_dir="./dump")
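
With the settings above, the generator requests 20 prompts per API call across 50 calls, so a fully successful run yields up to 20 × 50 = 1,000 rows. The underlying AttributeBindingModel instantiates an OpenAI client, so the OPENAI_API_KEY environment variable must be set before running. Each published row contains the prompt and its parsed attributes; the row below is purely illustrative (the sentence is taken from the few-shot example in the generation prompt):

{
    "prompt": "a long balloon and a short giraffe",
    "adj_1": "long",
    "noun_1": "balloon",
    "adj_2": "short",
    "noun_2": "giraffe",
}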

Step 2: Evaluate

import wandb
import weave

from hemm.eval_pipelines import BaseDiffusionModel, EvaluationPipeline
from hemm.metrics.vqa import DisentangledVQAMetric
from hemm.metrics.vqa.judges import BlipVQAJudge

wandb.init(project=project, entity=entity, job_type="evaluation")
weave.init(project_name=project)

diffusion_model = BaseDiffusionModel(
    diffusion_model_name_or_path=diffusion_model_address,
    enable_cpu_offfload=diffusion_model_enable_cpu_offfload,
    image_height=image_size[0],
    image_width=image_size[1],
)
evaluation_pipeline = EvaluationPipeline(model=diffusion_model)

judge = BlipVQAJudge()
metric = DisentangledVQAMetric(judge=judge, name="disentangled_blip_metric")
evaluation_pipeline.add_metric(metric)

evaluation_pipeline(dataset=dataset)
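
The dataset variable above refers to the attribute-binding dataset published in Step 1. A minimal sketch of fetching it back from Weave, assuming the default object name used by publish_dataset:

dataset = weave.ref("attribute_binding_dataset").get()

Each evaluated sample then carries one judgement per entity, with the shape returned by BlipVQAJudge.predict. The values below are made up, and the combined score is only one possible aggregation, not something the library computes for you:

judgement = {
    "entity_1": {"yes_proba": 0.91, "no_proba": 0.05, "present": True},
    "entity_2": {"yes_proba": 0.34, "no_proba": 0.52, "present": False},
}
combined_score = (judgement["entity_1"]["yes_proba"] + judgement["entity_2"]["yes_proba"]) / 2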

Metrics

DisentangledVQAMetric

Bases: BaseMetric

Disentangled VQA metric to evaluate the attribute-binding capability for image generation models as proposed in Section 4.1 from the paper T2I-CompBench: A Comprehensive Benchmark for Open-world Compositional Text-to-image Generation.

Sample usage
import wandb
import weave
from hemm.eval_pipelines import BaseDiffusionModel, EvaluationPipeline
from hemm.metrics.vqa import DisentangledVQAMetric
from hemm.metrics.vqa.judges import BlipVQAJudge

wandb.init(project=project, entity=entity, job_type="evaluation")
weave.init(project_name=project)

diffusion_model = BaseDiffusionModel(
    diffusion_model_name_or_path=diffusion_model_address,
    enable_cpu_offfload=diffusion_model_enable_cpu_offfload,
    image_height=image_size[0],
    image_width=image_size[1],
)
evaluation_pipeline = EvaluationPipeline(model=diffusion_model)

judge = BlipVQAJudge()
metric = DisentangledVQAMetric(judge=judge, name="disentangled_blip_metric")
evaluation_pipeline.add_metric(metric)

evaluation_pipeline(dataset=dataset)

Parameters:

    judge (Union[weave.Model, BlipVQAJudge]): The judge model to evaluate the attribute-binding capability. Required.
    name (Optional[str]): The name of the metric. Defaults to "disentangled_vlm_metric".
Source code in hemm/metrics/vqa/disentangled_vqa.py
class DisentangledVQAMetric(BaseMetric):
    """Disentangled VQA metric to evaluate the attribute-binding capability
    for image generation models as proposed in Section 4.1 from the paper
    [T2I-CompBench: A Comprehensive Benchmark for Open-world Compositional Text-to-image Generation](https://arxiv.org/pdf/2307.06350).

    ??? example "Sample usage"
        ```python
        import wandb
        import weave
        from hemm.eval_pipelines import BaseDiffusionModel, EvaluationPipeline
        from hemm.metrics.vqa import DisentangledVQAMetric
        from hemm.metrics.vqa.judges import BlipVQAJudge

        wandb.init(project=project, entity=entity, job_type="evaluation")
        weave.init(project_name=project)

        diffusion_model = BaseDiffusionModel(
            diffusion_model_name_or_path=diffusion_model_address,
            enable_cpu_offfload=diffusion_model_enable_cpu_offfload,
            image_height=image_size[0],
            image_width=image_size[1],
        )
        evaluation_pipeline = EvaluationPipeline(model=diffusion_model)

        judge = BlipVQAJudge()
        metric = DisentangledVQAMetric(judge=judge, name="disentangled_blip_metric")
        evaluation_pipeline.add_metric(metric)

        evaluation_pipeline(dataset=dataset)
        ```

    Args:
        judge (Union[weave.Model, BlipVQAJudge]): The judge model to evaluate the attribute-binding capability.
        name (Optional[str]): The name of the metric. Defaults to "disentangled_vlm_metric".
    """

    def __init__(
        self,
        judge: Union[weave.Model, BlipVQAJudge],
        name: Optional[str] = "disentangled_vlm_metric",
    ) -> None:
        super().__init__()
        self.judge = judge
        self.config = self.judge.model_dump()
        self.scores = []
        self.name = name

    @weave.op()
    def evaluate(
        self,
        prompt: str,
        adj_1: str,
        noun_1: str,
        adj_2: str,
        noun_2: str,
        model_output: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Evaluate the attribute-binding capability of the model.

        Args:
            prompt (str): The prompt for the model.
            adj_1 (str): The first adjective.
            noun_1 (str): The first noun.
            adj_2 (str): The second adjective.
            noun_2 (str): The second noun.
            model_output (Dict[str, Any]): The model output.

        Returns:
            Dict[str, Any]: The evaluation result.
        """
        _ = prompt
        judgement = self.judge.predict(
            adj_1, noun_1, adj_2, noun_2, model_output["image"]
        )
        self.scores.append(judgement)
        return judgement

    @weave.op()
    async def evaluate_async(
        self,
        prompt: str,
        adj_1: str,
        noun_1: str,
        adj_2: str,
        noun_2: str,
        model_output: Dict[str, Any],
    ) -> Dict[str, Any]:
        return self.evaluate(prompt, adj_1, noun_1, adj_2, noun_2, model_output)

evaluate(prompt, adj_1, noun_1, adj_2, noun_2, model_output)

Evaluate the attribute-binding capability of the model.

Parameters:

    prompt (str): The prompt for the model. Required.
    adj_1 (str): The first adjective. Required.
    noun_1 (str): The first noun. Required.
    adj_2 (str): The second adjective. Required.
    noun_2 (str): The second noun. Required.
    model_output (Dict[str, Any]): The model output. Required.

Returns:

    Dict[str, Any]: The evaluation result.

Source code in hemm/metrics/vqa/disentangled_vqa.py
@weave.op()
def evaluate(
    self,
    prompt: str,
    adj_1: str,
    noun_1: str,
    adj_2: str,
    noun_2: str,
    model_output: Dict[str, Any],
) -> Dict[str, Any]:
    """Evaluate the attribute-binding capability of the model.

    Args:
        prompt (str): The prompt for the model.
        adj_1 (str): The first adjective.
        noun_1 (str): The first noun.
        adj_2 (str): The second adjective.
        noun_2 (str): The second noun.
        model_output (Dict[str, Any]): The model output.

    Returns:
        Dict[str, Any]: The evaluation result.
    """
    _ = prompt
    judgement = self.judge.predict(
        adj_1, noun_1, adj_2, noun_2, model_output["image"]
    )
    self.scores.append(judgement)
    return judgement

Judges

BlipVQAJudge

Bases: Model

Weave Model to judge the presence of entities in an image using the Blip-VQA model.

Parameters:

    blip_processor_address (str): The address of the BlipProcessor model. Defaults to "Salesforce/blip-vqa-base".
    blip_vqa_address (str): The address of the BlipForQuestionAnswering model. Defaults to "Salesforce/blip-vqa-base".
    device (str): The device to use for inference. Defaults to "cuda".
Source code in hemm/metrics/vqa/judges/blip_vqa.py
class BlipVQAJudge(weave.Model):
    """Weave Model to judge the presence of entities in an image using the
    [Blip-VQA model](https://huggingface.co/Salesforce/blip-vqa-base).

    Args:
        blip_processor_address (str): The address of the BlipProcessor model.
        blip_vqa_address (str): The address of the BlipForQuestionAnswering model.
        device (str): The device to use for inference
    """

    blip_processor_address: str
    blip_vqa_address: str
    device: str
    _torch_dtype: torch.dtype = torch.float32
    _blip_processor_model: BlipProcessor = None
    _blip_vqa_model: BlipForQuestionAnswering = None

    def __init__(
        self,
        blip_processor_address: str = "Salesforce/blip-vqa-base",
        blip_vqa_address: str = "Salesforce/blip-vqa-base",
        device: str = "cuda",
    ):
        super().__init__(
            blip_processor_address=blip_processor_address,
            blip_vqa_address=blip_vqa_address,
            device=device,
        )
        self._blip_processor_model = BlipProcessor.from_pretrained(
            self.blip_processor_address
        )
        self._blip_vqa_model = BlipForQuestionAnswering.from_pretrained(
            self.blip_vqa_address, torch_dtype=self._torch_dtype
        ).to(self.device)

    def _get_probability(self, target_token: str, scores: List[torch.Tensor]) -> float:
        target_token_id = self._blip_processor_model.tokenizer.convert_tokens_to_ids(
            target_token
        )
        probabilities = [F.softmax(score, dim=-1) for score in scores]
        target_token_probabilities = [
            prob[:, target_token_id].item() for prob in probabilities
        ]
        max_target_token_probability = max(target_token_probabilities)
        return max_target_token_probability

    @weave.op()
    def get_target_token_probability(
        self, question: str, image: Image.Image
    ) -> Dict[str, float]:
        inputs = self._blip_processor_model(image, question, return_tensors="pt").to(
            self.device
        )
        with torch.no_grad():
            generated_ids = self._blip_vqa_model.generate(
                **inputs, output_scores=True, return_dict_in_generate=True
            )
        scores = generated_ids.scores
        yes_probability = self._get_probability("yes", scores)
        no_probability = self._get_probability("no", scores)
        return {
            "yes_proba": yes_probability,
            "no_proba": no_probability,
            "present": yes_probability > no_probability,
        }

    @weave.op()
    def predict(
        self, adj_1: str, noun_1: str, adj_2: str, noun_2: str, image: Image.Image
    ) -> Dict:
        """Predict the probabilities presence of entities in an image using the Blip-VQA model.

        Args:
            adj_1 (str): The adjective of the first entity.
            noun_1 (str): The noun of the first entity.
            adj_2 (str): The adjective of the second entity.
            noun_2 (str): The noun of the second entity.
            image (Image.Image): The input image.

        Returns:
            Dict: The probabilities of the presence of the entities.
        """
        question_1 = f"is {adj_1} {noun_1} present in the picture?"
        question_2 = f"is {adj_2} {noun_2} present in the picture?"
        return {
            "entity_1": self.get_target_token_probability(question_1, image),
            "entity_2": self.get_target_token_probability(question_2, image),
        }

predict(adj_1, noun_1, adj_2, noun_2, image)

Predict the probability of the presence of each entity in an image using the BLIP-VQA model.

Parameters:

    adj_1 (str): The adjective of the first entity. Required.
    noun_1 (str): The noun of the first entity. Required.
    adj_2 (str): The adjective of the second entity. Required.
    noun_2 (str): The noun of the second entity. Required.
    image (Image.Image): The input image. Required.

Returns:

    Dict: The probabilities of the presence of the entities.

Source code in hemm/metrics/vqa/judges/blip_vqa.py
@weave.op()
def predict(
    self, adj_1: str, noun_1: str, adj_2: str, noun_2: str, image: Image.Image
) -> Dict:
    """Predict the probabilities presence of entities in an image using the Blip-VQA model.

    Args:
        adj_1 (str): The adjective of the first entity.
        noun_1 (str): The noun of the first entity.
        adj_2 (str): The adjective of the second entity.
        noun_2 (str): The noun of the second entity.
        image (Image.Image): The input image.

    Returns:
        Dict: The probabilities of the presence of the entities.
    """
    question_1 = f"is {adj_1} {noun_1} present in the picture?"
    question_2 = f"is {adj_2} {noun_2} present in the picture?"
    return {
        "entity_1": self.get_target_token_probability(question_1, image),
        "entity_2": self.get_target_token_probability(question_2, image),
    }
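
The judge can also be exercised on its own, outside an EvaluationPipeline, which is handy for spot-checking a single image. A minimal sketch, assuming a local file named sample.png and a CUDA device:

from PIL import Image

from hemm.metrics.vqa.judges import BlipVQAJudge

judge = BlipVQAJudge(device="cuda")  # loads Salesforce/blip-vqa-base by default
image = Image.open("sample.png")     # hypothetical local image
result = judge.predict("red", "car", "blue", "bicycle", image)
print(result["entity_1"]["yes_proba"], result["entity_1"]["present"])
print(result["entity_2"]["yes_proba"], result["entity_2"]["present"])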

Dataset Generation

AttributeBindingDatasetGenerator

Dataset generator for evaluating the attribute-binding capability of image-generation models. This class generates a dataset of prompts in the format “a {adj_1} {noun_1} and a {adj_2} {noun_2}” and the corresponding metadata, using an LLM capable of generating JSON objects, such as GPT-4o. The dataset is then published both as a W&B dataset artifact and as a Weave dataset.

Sample usage
from hemm.metrics.vqa import AttributeBindingDatasetGenerator

dataset_generator = AttributeBindingDatasetGenerator(
    openai_model="gpt-4o",
    openai_seed=42,
    num_prompts_in_single_call=20,
    num_api_calls=50,
    project_name="disentangled_vqa",
)

dataset_generator(dump_dir="./dump")

Parameters:

    openai_model (Optional[str]): The OpenAI model to use for generating prompts. Defaults to "gpt-3.5-turbo".
    openai_seed (Optional[Union[int, List[int]]]): Seed to use for generating prompts. If not provided, seeds will be auto-generated. Defaults to None.
    num_prompts_in_single_call (Optional[int]): Number of prompts to generate in a single API call. Defaults to 20.
    num_api_calls (Optional[int]): Number of API calls to make. Defaults to 50.
    project_name (Optional[str]): Name of the Weave project to use for logging the dataset. Defaults to "diffusion_leaderboard".
Source code in hemm/metrics/vqa/dataset_generator/attribute_binding.py
class AttributeBindingDatasetGenerator:
    """Dataset generator for evaluation of attribute binding capability of image-generation models.
    This class enables us to generate the dataset consisting of prompts in the format
    `“a {adj_1} {noun_1} and a {adj_2} {noun_2}”` and the corresponding metadata using an LLM capable
    of generating json objects like GPT4-O. The dataset is then published both as a
    [W&B dataset artifact](https://docs.wandb.ai/guides/artifacts) and as a
    [weave dataset](https://wandb.github.io/weave/guides/core-types/datasets).

    ??? example "Sample usage"
        ```python
        from hemm.metrics.vqa import AttributeBindingDatasetGenerator

        dataset_generator = AttributeBindingDatasetGenerator(
            openai_model="gpt-4o",
            openai_seed=42,
            num_prompts_in_single_call=20,
            num_api_calls=50,
            project_name="disentangled_vqa",
        )

        dataset_generator(dump_dir="./dump")
        ```

    Args:
        openai_model (Optional[str]): The OpenAI model to use for generating prompts.
        openai_seed (Optional[Union[int, List[int]]]): Seed to use for generating prompts.
            If not provided, seeds will be auto-generated.
        num_prompts_in_single_call (Optional[int]): Number of prompts to generate in a single API call.
        num_api_calls (Optional[int]): Number of API calls to make.
        project_name (Optional[str]): Name of the Weave project to use for logging the dataset.
    """

    def __init__(
        self,
        openai_model: Optional[str] = "gpt-3.5-turbo",
        openai_seed: Optional[Union[int, List[int]]] = None,
        num_prompts_in_single_call: Optional[int] = 20,
        num_api_calls: Optional[int] = 50,
        project_name: Optional[str] = "diffusion_leaderboard",
    ) -> None:
        if not openai_seed:
            self.openai_seeds = [autogenerate_seed() for _ in range(num_api_calls)]
        elif isinstance(openai_seed, int):
            self.openai_seeds = [openai_seed] * num_api_calls
        elif isinstance(openai_seed, list) and len(openai_seed) != num_api_calls:
            raise ValueError(
                "Length of `openai_seed` should be equal to `num_api_calls`"
            )
        self.attribute_binding_model = AttributeBindingModel(
            openai_model=openai_model,
            num_prompts=num_prompts_in_single_call,
        )
        self.model_configs = self.attribute_binding_model.model_dump()
        self.attribute_binding_model._initialize()
        self.num_prompts_in_single_call = num_prompts_in_single_call
        self.num_api_calls = num_api_calls
        self.project_name = project_name
        self.dataset_rows: List[Dict] = []
        self.wandb_table = wandb.Table(
            columns=["prompt", "adj_1", "noun_1", "adj_2", "noun_2"]
        )

    def publish_dataset(self, dump_dir: str):
        os.makedirs(dump_dir, exist_ok=True)
        weave_dataset = weave.Dataset(
            name="attribute_binding_dataset", rows=self.dataset_rows
        )
        weave.publish(weave_dataset)
        with jsonlines.open(
            os.path.join(dump_dir, "dataset.jsonl"), mode="w"
        ) as writer:
            writer.write(self.dataset_rows)
        artifact = wandb.Artifact(name="attribute_binding_dataset", type="dataset")
        artifact.add_file(local_path=os.path.join(dump_dir, "dataset.jsonl"))
        wandb.log_artifact(artifact)
        wandb.log({"dataset/attribute_binding": self.wandb_table})

    @weave.op()
    async def evaluate_generated_response(
        self, prompt: str, model_output: Dict
    ) -> Dict:
        eval_response = AttributeBindingEvaluationResponse()
        model_output = str_to_json(model_output["response"])
        if model_output:
            if "data" not in model_output:
                return eval_response.model_dump()
            model_output = model_output["data"]
            eval_response.is_correct_json = True
            for idx in model_output:
                prompt = idx["sentence"]
                adj_1 = idx["metadata"]["adj_1"]
                adj_2 = idx["metadata"]["adj_2"]
                noun_1 = idx["metadata"]["noun_1"]
                noun_2 = idx["metadata"]["noun_2"]
                self.wandb_table.add_data(prompt, adj_1, noun_1, adj_2, noun_2)
                self.dataset_rows.append(
                    {
                        "prompt": prompt,
                        "adj_1": adj_1,
                        "noun_1": noun_1,
                        "adj_2": adj_2,
                        "noun_2": noun_2,
                    }
                )
                if f"a {adj_1} {noun_1} and a {adj_2} {noun_2}" == prompt:
                    eval_response.num_correct_predictions += 1
            eval_response.total_predictions = len(model_output)
            eval_response.accuracy = eval_response.num_correct_predictions / len(
                model_output
            )
        return eval_response.model_dump()

    def __call__(self, dump_dir: Optional[str] = "./dump") -> None:
        """Generate the dataset and publish it to Weave.

        Args:
            dump_dir (Optional[str]): Directory to dump the dataset.
        """
        wandb.init(
            project=self.project_name,
            job_type="attribute_binding_dataset",
            config=self.model_configs,
        )
        weave.init(project_name=self.project_name)
        evaluation = weave.Evaluation(
            dataset=[{"prompt": "", "seed": seed} for seed in self.openai_seeds],
            scorers=[self.evaluate_generated_response],
        )
        with weave.attributes(self.model_configs):
            asyncio.run(evaluation.evaluate(self.attribute_binding_model.predict))
        self.publish_dataset(dump_dir)

__call__(dump_dir='./dump')

Generate the dataset and publish it to Weave.

Parameters:

    dump_dir (Optional[str]): Directory to dump the dataset. Defaults to "./dump".
Source code in hemm/metrics/vqa/dataset_generator/attribute_binding.py
def __call__(self, dump_dir: Optional[str] = "./dump") -> None:
    """Generate the dataset and publish it to Weave.

    Args:
        dump_dir (Optional[str]): Directory to dump the dataset.
    """
    wandb.init(
        project=self.project_name,
        job_type="attribute_binding_dataset",
        config=self.model_configs,
    )
    weave.init(project_name=self.project_name)
    evaluation = weave.Evaluation(
        dataset=[{"prompt": "", "seed": seed} for seed in self.openai_seeds],
        scorers=[self.evaluate_generated_response],
    )
    with weave.attributes(self.model_configs):
        asyncio.run(evaluation.evaluate(self.attribute_binding_model.predict))
    self.publish_dataset(dump_dir)

AttributeBindingModel

Bases: Model

Weave Model to generate prompts for evaluation of attribute binding capability of image-generation models using an OpenAI model.

Parameters:

    openai_model (Optional[str]): The OpenAI model to use for generating prompts. Defaults to "gpt-3.5-turbo".
    num_prompts (Optional[int]): Number of prompts to generate. Defaults to 20.
Source code in hemm/metrics/vqa/dataset_generator/attribute_binding.py
class AttributeBindingModel(weave.Model):
    """Weave Model to generate prompts for evaluation of attribute binding capability of
    image-generation models using an OpenAI model.

    Args:
        openai_model (Optional[str]): The OpenAI model to use for generating prompts.
        num_prompts (Optional[int]): Number of prompts to generate.
    """

    openai_model: Optional[str] = "gpt-3.5-turbo"
    num_prompts: Optional[int] = 20
    _openai_client: Optional[OpenAI] = None
    _system_prompt: Optional[str] = None
    _user_prompt: Optional[str] = None

    def _initialize(self):
        if self._system_prompt is None:
            self._system_prompt = """
            You are a helpful assistant designed to generate some sentences and additional metadata in JSON format.
            """
        if self._user_prompt is None:
            self._user_prompt = (
                """
            Please generate prompts in the format of “a {adj_1} {noun_1} and a {adj_2} {noun_2}”
            by using the shape adj.: long, tall, short, big, small, cubic, cylindrical,
            pyramidal, round, circular, oval, oblong, spherical, triangular, square, rectangular,
            conical, pentagonal, teardrop, crescent, and diamond.

            The output should be a list of """
                + str(self.num_prompts)
                + """ JSONs like the following:

            \{
                "data": \[
                    \{
                        "sentence": "a long balloon and a short giraffe",
                        "metadata": \{
                            "adj_1": "long",
                            "noun_1": "balloon",
                            "adj_2": "short",
                            "noun_2": "giraffe"
                        \}
                    \},
                    \{
                        "sentence": "a tall suitcase and a small frog",
                        "metadata": \{
                            "adj_1": "tall",
                            "noun_1": "suitcase",
                            "adj_2": "small",
                            "noun_2": "frog"
                        \}
                    \},
                    \{
                        "sentence": "a big horse and a small man",
                        "metadata": \{
                            "adj_1": "big",
                            "noun_1": "horse",
                            "adj_2": "small",
                            "noun_2": "man",
                        \}
                    \}
                \],
            \}
            """
            )
        self._openai_client = OpenAI()

    @weave.op()
    def predict(self, seed: int) -> Dict[str, str]:
        """Generate prompts and corresponding metadata for evaluation of attribute binding
        capability of image-generation models.

        Args:
            seed (int): OpenAI seed to use for generating prompts.
        """
        return {
            "response": self._openai_client.chat.completions.create(
                model=self.openai_model,
                response_format={"type": "json_object"},
                seed=seed,
                messages=[
                    {
                        "role": "system",
                        "content": self._system_prompt,
                    },
                    {
                        "role": "user",
                        "content": self._user_prompt,
                    },
                ],
            )
            .choices[0]
            .message.content
        }

predict(seed)

Generate prompts and corresponding metadata for evaluation of attribute binding capability of image-generation models.

Parameters:

    seed (int): OpenAI seed to use for generating prompts. Required.
Source code in hemm/metrics/vqa/dataset_generator/attribute_binding.py
@weave.op()
def predict(self, seed: int) -> Dict[str, str]:
    """Generate prompts and corresponding metadata for evaluation of attribute binding
    capability of image-generation models.

    Args:
        seed (int): OpenAI seed to use for generating prompts.
    """
    return {
        "response": self._openai_client.chat.completions.create(
            model=self.openai_model,
            response_format={"type": "json_object"},
            seed=seed,
            messages=[
                {
                    "role": "system",
                    "content": self._system_prompt,
                },
                {
                    "role": "user",
                    "content": self._user_prompt,
                },
            ],
        )
        .choices[0]
        .message.content
    }
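
For debugging the prompt-generation step in isolation, the model can also be called directly and its raw JSON response parsed by hand. A minimal sketch; the import path is assumed from the source file location, and json.loads stands in for the library's own str_to_json helper:

import json

from hemm.metrics.vqa.dataset_generator.attribute_binding import AttributeBindingModel

model = AttributeBindingModel(openai_model="gpt-4o", num_prompts=5)
model._initialize()  # builds the system/user prompts and the OpenAI client, as the generator does
raw = model.predict(seed=42)["response"]
parsed = json.loads(raw)  # stand-in for the library's str_to_json helper
for item in parsed.get("data", []):
    print(item["sentence"], item["metadata"])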