Source code for deepvisiontools.models.yolo.yolo

from ultralytics.cfg import get_cfg
from ultralytics.nn.tasks import attempt_load_one_weight, DetectionModel
from ultralytics.utils import DEFAULT_CFG
from deepvisiontools.models.basemodel import BaseModel
from typing import Literal
from deepvisiontools import Configuration
import deepvisiontools.models.yolo.errors as er
import deepvisiontools.models.yolo.utils as ut
from deepvisiontools.formats import (
    BatchedFormat,
    BboxData,
    InstanceMaskFormat,
    BboxFormat,
)
import torch
from typing import Dict, Tuple, Union, List
from torch import Tensor
import torchvision.transforms.v2.functional as F



[docs]
class Yolo(DetectionModel, BaseModel):
    """Yolo detection model. data_type must be either bbox or instance_mask to use this model.

    Args:
        architecture (``Literal[&quot;yolon&quot;, &quot;yolom&quot;, &quot;yolol&quot;, &quot;yolox&quot;]``, **optional**): Yolo model size. You can add "-p2" or "-p6" to load the p2 or p6 variants. Defaults to "yolon".
        pretrained (``bool``, **optional**): Use pretrained weights. Defaults to True.
        reg_max (``int``, **optional**): reg_max argument of yolo models (impacts object size detection). See ultralytics for more information. Defaults to 16.
        loss_factor (``float``, **optional**): divide yolo loss value (important for mixed precision to keep it below a certain range). Defaults to 1.

    Attributes
    ----------

    Attributes:
        - criterion (``v8DetectionLoss``): Yolo loss from ultralytics.
        - args (``Any``) : ultralytics Yolo's configuration params.
        - pad_requirements (``int``) : pad requirements as per yolo (image shape multiple of 32 is the basic, but depends for p2 or p6). Note that is set automatically.

    Attributes
    ----------

    Properties:
        - device (``Literal["cuda", "cpu"]``): model's device


    **Methods**
    """

    def __init__(
        self,
        architecture: Literal[
            "yolo11n",
            "yolo11m",
            "yolo11l",
            "yolo11x",
            "yolov8n",
            "yolov8m",
            "yolov8l",
            "yoloxv8",
            "yolov8n-p6",
            "yolov8m-p6",
            "yolov8l-p6",
            "yolov8x-p6",
            "yolov8n-p2",
            "yolov8m-p2",
            "yolov8l-p2",
            "yolov8x-p2",
        ] = "yolov8n",
        pretrained: bool = True,
        reg_max=16,
        loss_factor: float = 1.0,
        *args,
        **kwargs,
    ):

        er.check_config(architecture, pretrained)
        config = Configuration()
        super().__init__(f"{architecture}.yaml", nc=config.num_classes, *args, **kwargs)
        self.args = get_cfg(DEFAULT_CFG)
        if pretrained:
            architecture = attempt_load_one_weight(
                f"{architecture}.pt",
            )
            self.load(architecture[0])
        self.criterion = self.init_criterion()
        self.device = config.device
        self.model[-1].reg_max = reg_max
        if "p6" in architecture:
            self.pad_requirements = 64
        elif "p2" in architecture:
            self.pad_requirements = 16
        else:
            self.pad_requirements = 32
        self.loss_factor = loss_factor

    # overwrite
    @property
    def device(self):
        return self._device

    # overwrite
    @device.setter
    def device(self, val):
        self.to(val)
        self.criterion = self.init_criterion()


[docs]
    def prepare_target(
        self, targets: BatchedFormat, img_size: Tuple[int, int]
    ) -> Dict[str, Tensor]:
        """Return target from BatchedFormat to ultralytics yolo format.

        Args:
            targets (BatchedFormat)
            img_size (Tuple[int, int])

        Returns:
            Dict[str, Tensor]: target as per ultralytics Yolo format.
        """
        # Convert to BboxFormat if there are InstanceMasks
        if any([isinstance(targ, InstanceMaskFormat) for targ in targets.formats]):
            forms = [
                (
                    BboxFormat.from_instance_mask(targ)
                    if isinstance(targ, InstanceMaskFormat)
                    else targ
                )
                for targ in targets
            ]
            targets = BatchedFormat(forms)
        targets.set_bboxes_format("CXCYWH")
        boxes = torch.cat([targ.data.value for targ in targets])
        boxes = ut.normalize_boxes(boxes, img_size)
        labels = torch.cat([targ.labels for targ in targets])[..., None]
        batch_idx = torch.cat(
            [torch.ones(targ.nb_object) * i for i, targ in enumerate(targets)]
        )[..., None]
        batch_idx = batch_idx.to(Configuration().device)
        return {"batch_idx": batch_idx, "cls": labels, "bboxes": boxes}



[docs]
    def prepare(
        self, images: Tensor, targets: Union[BatchedFormat, None] = None
    ) -> Union[Tuple[Tensor, Dict], Tensor]:
        """Pad image / targets to fit yolo divisibility by 32 criterium and move targets to yolo format.
        If no targets passed simply returns images

        Args:
            images (``Tensor``): batched images [N, 3, H, W]
            targets (``Union[BatchedFormat, None]``)

        Returns:
            ``Union[Tuple[Tensor, Dict], Tensor]``:
                - Either : images_padded, yolo_targets OR images_padded
        """
        h, w = images.shape[-2], images.shape[-1]
        (t, l, r, b) = ut.yolo_pad_requirements(h, w, required=self.pad_requirements)
        # Note the inversion for torchvision pad coord ordinates : t <-> l
        images = F.pad(images, list((l, t, r, b)))
        if targets != None:
            targets = BatchedFormat([targ.pad(t, l, b, r)[0] for targ in targets])
            targets = self.prepare_target(targets, (h, w))
            return images, targets
        else:
            return images



[docs]
    def build_results(
        self, raw_outputs: List[Tensor], prebuild_outputs: Tensor
    ) -> BatchedFormat:
        """Transform model outputs into Batch BboxFormat for results.

        Args:
            raw_outputs (``List[Tensor]``): Model outputs.
            prebuild_outputs (``Tensor``): Extracted boxes from outputs in eval mode.

        Returns:
            ``BatchedFormats``:
                - Batched predictions.
        """

        prebuild_outputs = prebuild_outputs.unbind()
        h, w = self.retrieve_spatial_size(raw_outputs)
        results = []
        # for each prediction
        for prediction in prebuild_outputs:
            # send pred in good pshape
            prediction = prediction.permute(1, 0)
            # get best class and corresponding score
            best_class = torch.argmax(prediction[:, 4:], dim=1)
            confidence, _ = torch.max(prediction[:, 4:], dim=1)
            # gather box cxcywh coordinates
            boxes = BboxData(prediction[:, :4], "CXCYWH", (h, w))
            # build result
            result = BboxFormat(boxes, best_class, scores=confidence)
            # objects selections
            result = ut.confidence_filter(result)
            result = ut.box_nms_filter(result)
            result, _ = result[: Configuration().model_max_detection]
            # stack batch results
            results.append(result)

        if len(results) == 0:
            results = []

        results = BatchedFormat(results)
        return results



[docs]
    def compute_loss(
        self, raw_outputs: Tensor, targets: Dict[str, Tensor]
    ) -> Dict[str, Tensor]:
        """Compute loss with predictions & targets.

        Args:
            raw_outputs (``Any``): Raw output of model.
            targets (``DetectionFormat``): Targets in YOLO format.

        Returns:
            ``Dict[str, Tensor]``:
                - Loss dict with total loss (key: "loss") & sublosses.
        """
        # yolo scale loss with batch size -> normalize it here and apply loss factor to keep it in the unit range
        # (for mixed precision optim it's important)
        batch_factor = targets["batch_idx"].unique().shape[0]
        loss, loss_detail = self.criterion(raw_outputs, targets)
        loss /= self.loss_factor * batch_factor
        loss_detail /= self.loss_factor * batch_factor
        loss_dict = {
            "loss": loss,
            "loss_box": loss_detail[0],
            "loss_cls": loss_detail[1],
            "loss_dfl": loss_detail[2],
        }
        return loss_dict



[docs]
    def run_forward(
        self,
        images: Tensor,
        targets: BatchedFormat,
    ) -> Union[Dict[str, Tensor], Tuple[Dict[str, Tensor], BatchedFormat]]:
        """Compute loss from images and if target passed, compute loss & return both loss dict
        and results.

        Args:
            images (``Tensor``): Batch RGB images.
            targets (``BatchedFormat``): Batch targets.

        Returns:
            ``Union[Dict[str, Tensor], Tuple[Dict[str, Tensor], BatchedFormat]]``:
                - Loss dict.
                - If predict: predictions.
        """
        # prepare inputs
        prepared_images, prepared_targets = self.prepare(images, targets=targets)
        # run forward pass
        if self.training:
            raw_outputs = self(prepared_images)
        else:
            prebuild_output, raw_outputs = self(prepared_images)
        # compute loss
        loss_dict = self.compute_loss(raw_outputs, prepared_targets)
        # return predictions if needed
        if not (self.training):
            predictions = self.build_results(raw_outputs, prebuild_output)
            # retrieve the padding from original img
            t, l, _, _ = ut.yolo_pad_requirements(
                images.shape[-2], images.shape[-1], required=self.pad_requirements
            )
            h, w = images.shape[-2:]
            # crop to original size
            predictions = BatchedFormat(
                [targ.crop(t, l, h, w)[0] for targ in predictions]
            )
            return loss_dict, predictions
        else:
            return loss_dict



[docs]
    def get_predictions(self, images: Tensor) -> BatchedFormat:
        """Prepare images, Apply YOLO forward pass and build results.

        Args:
            images (``Tensor``): RGB images Tensor.

        Returns:
            ``BatchedFormats``:
                - Predictions for images as BatchedFormats.
        """
        self.eval()
        # get original spatial size
        ori_h, ori_w = images.shape[-2:]
        # pad coord to return back to non yolo required / 32 criterium afterward
        top, left, _, _ = ut.yolo_pad_requirements(
            ori_h, ori_w, required=self.pad_requirements
        )
        # pad images
        images = self.prepare(images)
        # predict
        prebuild_output, raw_outputs = self(images)
        results = self.build_results(raw_outputs, prebuild_output)
        # crop to back at original spatial size
        results = BatchedFormat(
            [pred.crop(top, left, ori_h, ori_w)[0] for pred in results]
        )
        return results



[docs]
    def retrieve_spatial_size(self, raw_outputs: List[Tensor]) -> Tuple[int]:
        """Retrieve image shape from raw_outputs and stride values.

        Args:
            raw_outputs (``List[Tensor]``): Raw ouptuts from YOLO model.

        Returns:
            ``Tuple[int]``:
                - Size of input image (H, W).
        """
        h = int(raw_outputs[0].shape[-2] * self.stride[0])
        w = int(raw_outputs[0].shape[-1] * self.stride[0])
        return (h, w)