# Source code for deepvisiontools.models.yolo.yolo

from ultralytics.cfg import get_cfg
from ultralytics.nn.tasks import attempt_load_one_weight, DetectionModel
from ultralytics.utils import DEFAULT_CFG
from deepvisiontools.models.basemodel import BaseModel
from typing import Literal
from deepvisiontools import Configuration
import deepvisiontools.models.yolo.errors as er
import deepvisiontools.models.yolo.utils as ut
from deepvisiontools.formats import (
    BatchedFormat,
    BboxData,
    InstanceMaskFormat,
    BboxFormat,
)
import torch
from typing import Dict, Tuple, Union, List
from torch import Tensor
import torchvision.transforms.v2.functional as F


class Yolo(DetectionModel, BaseModel):
    """Yolo detection model.

    ``data_type`` must be either bbox or instance_mask to use this model.

    Args:
        architecture (``Literal["yolo11n", ..., "yolov8x-p2"]``, **optional**): Yolo model
            name (family + size). The yolov8 family also accepts the "-p2" or "-p6"
            suffix to load the p2 or p6 variants. Defaults to "yolov8n".
        pretrained (``bool``, **optional**): Use pretrained weights. Defaults to True.
        reg_max (``int``, **optional**): reg_max argument of yolo models (impacts object
            size detection). See ultralytics for more information. Defaults to 16.
        loss_factor (``float``, **optional**): divide yolo loss value (important for mixed
            precision to keep it below a certain range). Defaults to 1.


    Attributes
    ----------

    Attributes:
        - criterion (``v8DetectionLoss``): Yolo loss from ultralytics.
        - args (``Any``) : ultralytics Yolo's configuration params.
        - pad_requirements (``int``) : pad requirements as per yolo (image shape multiple
          of 32 is the basic, but depends for p2 or p6). Note that is set automatically.
        - loss_factor (``float``) : divisor applied to every loss value.


    Attributes
    ----------

    Properties:
        - device (``Literal["cuda", "cpu"]``): model's device


    **Methods**
    """

    def __init__(
        self,
        architecture: Literal[
            "yolo11n",
            "yolo11m",
            "yolo11l",
            "yolo11x",
            "yolov8n",
            "yolov8m",
            "yolov8l",
            "yolov8x",
            "yolov8n-p6",
            "yolov8m-p6",
            "yolov8l-p6",
            "yolov8x-p6",
            "yolov8n-p2",
            "yolov8m-p2",
            "yolov8l-p2",
            "yolov8x-p2",
        ] = "yolov8n",
        pretrained: bool = True,
        reg_max: int = 16,
        loss_factor: float = 1.0,
        *args,
        **kwargs,
    ):
        er.check_config(architecture, pretrained)
        config = Configuration()
        super().__init__(
            f"{architecture}.yaml", *args, nc=config.num_classes, **kwargs
        )
        self.args = get_cfg(DEFAULT_CFG)
        if pretrained:
            # Keep the loaded checkpoint under its own name: ``architecture`` must
            # stay a string, otherwise the "p6" / "p2" tests below would silently
            # run against the returned tuple and always fall back to 32.
            loaded = attempt_load_one_weight(f"{architecture}.pt")
            self.load(loaded[0])
        self.criterion = self.init_criterion()
        # The device setter moves the model and rebuilds the criterion.
        self.device = config.device
        # NOTE(review): reg_max is overridden after the head and the criterion
        # were built -- confirm this has the intended effect for values that
        # differ from the one in the yaml definition.
        self.model[-1].reg_max = reg_max
        if "p6" in architecture:
            self.pad_requirements = 64
        elif "p2" in architecture:
            self.pad_requirements = 16
        else:
            self.pad_requirements = 32
        self.loss_factor = loss_factor

    # overwrite
    @property
    def device(self):
        """Device the model was last moved to through the ``device`` setter."""
        return self._device

    # overwrite
    @device.setter
    def device(self, val):
        self.to(val)
        # Record the value so that the getter has something to return.
        self._device = val
        # Rebuild the criterion after the move so it matches the new device.
        self.criterion = self.init_criterion()

    def prepare_target(
        self, targets: BatchedFormat, img_size: Tuple[int, int]
    ) -> Dict[str, Tensor]:
        """Return target from BatchedFormat to ultralytics yolo format.

        Args:
            targets (``BatchedFormat``): Batch targets.
            img_size (``Tuple[int, int]``): Image size passed to
                ``ut.normalize_boxes`` to normalize the boxes.

        Returns:
            ``Dict[str, Tensor]``:
                - Target as per ultralytics Yolo format, with keys "batch_idx",
                  "cls" and "bboxes".
        """
        # Convert to BboxFormat if there are InstanceMasks
        if any(isinstance(targ, InstanceMaskFormat) for targ in targets.formats):
            forms = [
                (
                    BboxFormat.from_instance_mask(targ)
                    if isinstance(targ, InstanceMaskFormat)
                    else targ
                )
                for targ in targets
            ]
            targets = BatchedFormat(forms)
        targets.set_bboxes_format("CXCYWH")
        boxes = torch.cat([targ.data.value for targ in targets])
        boxes = ut.normalize_boxes(boxes, img_size)
        labels = torch.cat([targ.labels for targ in targets])[..., None]
        # One entry per object holding the index of the image it belongs to.
        batch_idx = torch.cat(
            [torch.ones(targ.nb_object) * i for i, targ in enumerate(targets)]
        )[..., None]
        batch_idx = batch_idx.to(Configuration().device)
        return {"batch_idx": batch_idx, "cls": labels, "bboxes": boxes}

    def prepare(
        self, images: Tensor, targets: Union[BatchedFormat, None] = None
    ) -> Union[Tuple[Tensor, Dict], Tensor]:
        """Pad image / targets to fit yolo divisibility criterium and move targets to yolo format.

        The required multiple is ``self.pad_requirements``. If no targets passed
        simply returns images.

        Args:
            images (``Tensor``): batched images [N, 3, H, W]
            targets (``Union[BatchedFormat, None]``): Batch targets. Defaults to None.

        Returns:
            ``Union[Tuple[Tensor, Dict], Tensor]``:
                - Either : images_padded, yolo_targets OR images_padded
        """
        h, w = images.shape[-2], images.shape[-1]
        (t, l, r, b) = ut.yolo_pad_requirements(h, w, required=self.pad_requirements)
        # Note the inversion for torchvision pad coord ordinates : t <-> l
        images = F.pad(images, [l, t, r, b])
        if targets is None:
            return images
        padded_targets = BatchedFormat([targ.pad(t, l, b, r)[0] for targ in targets])
        # Targets now live in the padded frame, so they are normalized with the
        # padded image size (the size the model actually receives), not the
        # original one.
        padded_size = (images.shape[-2], images.shape[-1])
        return images, self.prepare_target(padded_targets, padded_size)

    def build_results(
        self, raw_outputs: List[Tensor], prebuild_outputs: Tensor
    ) -> BatchedFormat:
        """Transform model outputs into Batch BboxFormat for results.

        Args:
            raw_outputs (``List[Tensor]``): Model outputs.
            prebuild_outputs (``Tensor``): Extracted boxes from outputs in eval mode.

        Returns:
            ``BatchedFormats``:
                - Batched predictions.
        """
        h, w = self.retrieve_spatial_size(raw_outputs)
        results = []
        # for each prediction of the batch
        for prediction in prebuild_outputs.unbind():
            # send pred in good shape: one row per candidate box
            prediction = prediction.permute(1, 0)
            # get best class and corresponding score in a single pass
            confidence, best_class = torch.max(prediction[:, 4:], dim=1)
            # gather box cxcywh coordinates
            boxes = BboxData(prediction[:, :4], "CXCYWH", (h, w))
            # build result
            result = BboxFormat(boxes, best_class, scores=confidence)
            # objects selections
            result = ut.confidence_filter(result)
            result = ut.box_nms_filter(result)
            result, _ = result[: Configuration().model_max_detection]
            # stack batch results
            results.append(result)
        return BatchedFormat(results)

    def compute_loss(
        self, raw_outputs: List[Tensor], targets: Dict[str, Tensor]
    ) -> Dict[str, Tensor]:
        """Compute loss with predictions & targets.

        Args:
            raw_outputs (``List[Tensor]``): Raw output of model.
            targets (``Dict[str, Tensor]``): Targets in YOLO format.

        Returns:
            ``Dict[str, Tensor]``:
                - Loss dict with total loss (key: "loss") & sublosses.
        """
        # yolo scale loss with batch size -> normalize it here and apply loss factor
        # to keep it in the unit range (for mixed precision optim it's important).
        # Clamp to 1 so that a batch without any object does not divide by zero.
        batch_factor = max(targets["batch_idx"].unique().shape[0], 1)
        loss, loss_detail = self.criterion(raw_outputs, targets)
        scale = self.loss_factor * batch_factor
        # Out-of-place divisions: the tensors returned by the criterion are left
        # untouched.
        loss = loss / scale
        loss_detail = loss_detail / scale
        return {
            "loss": loss,
            "loss_box": loss_detail[0],
            "loss_cls": loss_detail[1],
            "loss_dfl": loss_detail[2],
        }

    def run_forward(
        self,
        images: Tensor,
        targets: BatchedFormat,
    ) -> Union[Dict[str, Tensor], Tuple[Dict[str, Tensor], BatchedFormat]]:
        """Compute loss from images and targets; in eval mode also return predictions.

        Args:
            images (``Tensor``): Batch RGB images.
            targets (``BatchedFormat``): Batch targets.

        Returns:
            ``Union[Dict[str, Tensor], Tuple[Dict[str, Tensor], BatchedFormat]]``:
                - Loss dict.
                - If the model is not in training mode: predictions.
        """
        # prepare inputs
        prepared_images, prepared_targets = self.prepare(images, targets=targets)
        # run forward pass
        if self.training:
            raw_outputs = self(prepared_images)
        else:
            prebuild_output, raw_outputs = self(prepared_images)
        # compute loss
        loss_dict = self.compute_loss(raw_outputs, prepared_targets)
        if self.training:
            return loss_dict
        # return predictions if needed
        predictions = self.build_results(raw_outputs, prebuild_output)
        # retrieve the padding from original img
        t, l, _, _ = ut.yolo_pad_requirements(
            images.shape[-2], images.shape[-1], required=self.pad_requirements
        )
        h, w = images.shape[-2:]
        # crop to original size
        predictions = BatchedFormat([pred.crop(t, l, h, w)[0] for pred in predictions])
        return loss_dict, predictions

    def get_predictions(self, images: Tensor) -> BatchedFormat:
        """Prepare images, Apply YOLO forward pass and build results.

        The model is switched to eval mode by this call.

        Args:
            images (``Tensor``): RGB images Tensor.

        Returns:
            ``BatchedFormats``:
                - Predictions for images as BatchedFormats.
        """
        self.eval()
        # get original spatial size
        ori_h, ori_w = images.shape[-2:]
        # pad coord to return back to the original (non padded) frame afterward
        top, left, _, _ = ut.yolo_pad_requirements(
            ori_h, ori_w, required=self.pad_requirements
        )
        # pad images
        images = self.prepare(images)
        # predict
        prebuild_output, raw_outputs = self(images)
        results = self.build_results(raw_outputs, prebuild_output)
        # crop back to original spatial size
        return BatchedFormat(
            [pred.crop(top, left, ori_h, ori_w)[0] for pred in results]
        )

    def retrieve_spatial_size(self, raw_outputs: List[Tensor]) -> Tuple[int, int]:
        """Retrieve image shape from raw_outputs and stride values.

        Args:
            raw_outputs (``List[Tensor]``): Raw outputs from YOLO model.

        Returns:
            ``Tuple[int, int]``:
                - Size of input image (H, W).
        """
        # First output's spatial dims multiplied by the first stride value.
        h = int(raw_outputs[0].shape[-2] * self.stride[0])
        w = int(raw_outputs[0].shape[-1] * self.stride[0])
        return (h, w)