Skip to content

Hemm utilities

base64_decode_image(image)

Decodes a base64-encoded image string produced by the function hemm.utils.base64_encode_image.

Parameters:

Name Type Description Default
image str

Base64-encoded image string produced by the function hemm.utils.base64_encode_image.

required

Returns:

Type Description
Image

Image.Image: PIL Image object.

Source code in hemm/utils.py
def base64_decode_image(image: str) -> Image.Image:
    """Rebuild a PIL image from a base64 string.

    The input is expected to be in the form emitted by
    `hemm.utils.base64_encode_image`: a `data:<mimetype>;base64,` header
    followed by the base64 payload. A string without that header is decoded
    as a bare payload.

    Args:
        image (str): Base64 image string, with or without the data-URI header.

    Returns:
        Image.Image: PIL Image object opened from the decoded bytes.
    """
    # Everything after the last ";base64," marker is the payload; when the
    # marker is absent the whole string is kept.
    payload = image.rsplit(";base64,", 1)[-1]
    raw_bytes = base64.b64decode(payload)
    buffer = io.BytesIO(raw_bytes)
    return Image.open(buffer)

base64_encode_image(image_path, mimetype=None)

Converts an image to base64 encoded string to be logged and rendered on Weave dashboard.

Parameters:

Name Type Description Default
image_path Union[str, Image]

Path to the image or PIL Image object.

required
mimetype Optional[str]

Mimetype of the image. Defaults to None.

None

Returns:

Name Type Description
str str

Base64 encoded image string.

Source code in hemm/utils.py
def base64_encode_image(
    image_path: Union[str, Image.Image], mimetype: Optional[str] = None
) -> str:
    """Converts an image to base64 encoded string to be logged and rendered on Weave dashboard.

    The image is always re-encoded as PNG before being base64 encoded; the
    result is a data URI of the form `data:<mimetype>;base64,<payload>`.

    Args:
        image_path (Union[str, Image.Image]): Path to the image or PIL Image object.
        mimetype (Optional[str], optional): Mimetype to place in the data-URI
            header. When None, it is looked up in `EXT_TO_MIMETYPE` from the
            file extension for a path input (falling back to `"image/png"` for
            an extension that is not listed), and is `"image/png"` for a PIL
            Image input. Defaults to None.

    Returns:
        str: Base64 encoded image string.
    """
    loaded_from_path = isinstance(image_path, str)
    image = Image.open(image_path) if loaded_from_path else image_path

    # Only infer the mimetype when the caller did not supply one; previously
    # an explicit `mimetype` argument was silently overwritten.
    if mimetype is None:
        if loaded_from_path:
            # The payload below is PNG regardless of the source extension, so
            # "image/png" is an accurate fallback for unlisted extensions.
            mimetype = EXT_TO_MIMETYPE.get(Path(image_path).suffix, "image/png")
        else:
            mimetype = "image/png"

    byte_arr = io.BytesIO()
    image.save(byte_arr, format="PNG")
    encoded_string = base64.b64encode(byte_arr.getvalue()).decode("utf-8")
    return f"data:{mimetype};base64,{encoded_string}"

publish_dataset_to_weave(dataset_path, dataset_name=None, prompt_column=None, ground_truth_image_column=None, split=None, data_limit=None, get_weave_dataset_reference=True, dataset_transforms=None, dump_dir='./dump', *args, **kwargs)

Publishes one split of a HuggingFace dataset as a Weave dataset.

Publish a subset of MSCOCO from Huggingface as a Weave Dataset
import weave
from hemm.utils import publish_dataset_to_weave

if __name__ == "__main__":
    weave.init(project_name="t2i_eval")

    def preprocess_sentences_column(example):
        # Replace the nested "sentences" record with its "raw" entry so the
        # column can be used directly as the prompt.
        example["sentences"] = example["sentences"]["raw"]
        return example


    dataset_reference = publish_dataset_to_weave(
        dataset_path="HuggingFaceM4/COCO",
        prompt_column="sentences",
        ground_truth_image_column="image",
        split="validation",
        # `dataset_transforms` is iterated over, so pass the transform in a list.
        dataset_transforms=[preprocess_sentences_column],
        data_limit=10,
    )

Parameters:

Name Type Description Default
dataset_path str

Path to the HuggingFace dataset.

required
dataset_name Optional[str]

Name of the Weave dataset.

None
prompt_column Optional[str]

Column name for prompt.

None
ground_truth_image_column Optional[str]

Column name for ground truth image.

None
split Optional[str]

Split to be used.

None
data_limit Optional[int]

Limit the number of data items.

None
get_weave_dataset_reference bool

Whether to return the Weave dataset reference.

True
dataset_transforms Optional[List[Callable]]

List of dataset transforms.

None
dump_dir Optional[str]

Directory to dump the results.

'./dump'

Returns:

Type Description
Union[ObjectRef, None]

Union[ObjectRef, None]: Weave dataset reference if get_weave_dataset_reference is True, otherwise None.

Source code in hemm/utils.py
def publish_dataset_to_weave(
    dataset_path,
    dataset_name: Optional[str] = None,
    prompt_column: Optional[str] = None,
    ground_truth_image_column: Optional[str] = None,
    split: Optional[str] = None,
    data_limit: Optional[int] = None,
    get_weave_dataset_reference: bool = True,
    dataset_transforms: Optional[Union[Callable, List[Callable]]] = None,
    dump_dir: Optional[str] = "./dump",
    *args,
    **kwargs,
) -> Union[ObjectRef, None]:
    """Publishes one split of a HuggingFace dataset as a Weave dataset.

    The dataset is loaded with `load_dataset`, reduced to a single split,
    optionally truncated and transformed, has its prompt / ground-truth image
    columns renamed to `"prompt"` / `"ground_truth_image"`, and is then
    published with `weave.publish`.

    ??? example "Publish a subset of MSCOCO from Huggingface as a Weave Dataset"
        ```python
        import weave
        from hemm.utils import publish_dataset_to_weave

        if __name__ == "__main__":
            weave.init(project_name="t2i_eval")

            def preprocess_sentences_column(example):
                example["sentences"] = example["sentences"]["raw"]
                return example


            dataset_reference = publish_dataset_to_weave(
                dataset_path="HuggingFaceM4/COCO",
                prompt_column="sentences",
                ground_truth_image_column="image",
                split="validation",
                dataset_transforms=[preprocess_sentences_column],
                data_limit=10,
            )
        ```

    Args:
        dataset_path: Path to the HuggingFace dataset, forwarded to `load_dataset`.
        dataset_name (Optional[str], optional): Name of the Weave dataset.
            Defaults to the stem of `dataset_path`.
        prompt_column (Optional[str], optional): Column name for prompt; renamed
            to `"prompt"` when given.
        ground_truth_image_column (Optional[str], optional): Column name for ground
            truth image; renamed to `"ground_truth_image"` when given.
        split (Optional[str], optional): Split to be used. Falls back to `"train"`.
        data_limit (Optional[int], optional): Limit the number of data items. Only
            applied when it is smaller than the size of the split.
        get_weave_dataset_reference (bool, optional): Whether to return the Weave
            dataset reference.
        dataset_transforms (Optional[Union[Callable, List[Callable]]], optional):
            A transform, or a list of transforms, applied in order with the
            dataset's `map` method before the columns are renamed.
        dump_dir (Optional[str], optional): Directory to dump the results. It is
            created if missing; skipped when None or empty.
        *args: Extra positional arguments forwarded to `load_dataset`.
        **kwargs: Extra keyword arguments forwarded to `load_dataset`.

    Returns:
        Union[ObjectRef, None]: Weave dataset reference if get_weave_dataset_reference
            is True, otherwise None.
    """
    # `dump_dir` is annotated Optional, so do not hand None to os.makedirs.
    if dump_dir:
        os.makedirs(dump_dir, exist_ok=True)

    dataset_name = dataset_name or Path(dataset_path).stem
    dataset_dict = load_dataset(dataset_path, *args, **kwargs)
    dataset = dataset_dict[split or "train"]

    if data_limit is not None and data_limit < len(dataset):
        dataset = dataset.select(range(data_limit))

    if dataset_transforms:
        # A bare callable is accepted as shorthand for a one-element list, so
        # that iterating over it below does not raise TypeError.
        transforms = (
            [dataset_transforms] if callable(dataset_transforms) else dataset_transforms
        )
        for transform in transforms:
            dataset = dataset.map(transform)

    # Renaming happens after the transforms, so transforms see original column names.
    if prompt_column:
        dataset = dataset.rename_column(prompt_column, "prompt")
    if ground_truth_image_column:
        dataset = dataset.rename_column(ground_truth_image_column, "ground_truth_image")

    weave_dataset_rows = list(tqdm(dataset))

    weave_dataset = weave.Dataset(name=dataset_name, rows=weave_dataset_rows)
    weave.publish(weave_dataset)
    return weave.ref(dataset_name).get() if get_weave_dataset_reference else None