Compare commits

...

1 Commit

Author SHA1 Message Date
midzelis
ed04d87273 feat(web): OCR overlay interactivity during zoom
Change-Id: Id62e1a0264df2de0f3177a59b24bc5176a6a6964
2026-03-23 13:16:08 +00:00
14 changed files with 770 additions and 246 deletions

View File

@@ -0,0 +1,55 @@
import { faker } from '@faker-js/faker';
import type { AssetOcrResponseDto } from '@immich/sdk';
import { BrowserContext } from '@playwright/test';
// One recognized text region, described as a four-corner quadrilateral.
// Coordinates are normalized fractions of the image size (0-1 range, judging
// by the fixture values used in the e2e tests). The corners appear to run
// clockwise from the top-left: (x1,y1) -> (x2,y2) -> (x3,y3) -> (x4,y4)
// — NOTE(review): confirm the corner order against the OCR API contract.
export type MockOcrBox = {
  // The recognized text content rendered inside the box.
  text: string;
  x1: number;
  y1: number;
  x2: number;
  y2: number;
  x3: number;
  y3: number;
  x4: number;
  y4: number;
};
/**
 * Builds OCR API response payloads for a single asset from simplified box fixtures.
 *
 * Each result keeps the fixture's text and corner coordinates verbatim; ids are
 * freshly generated and the confidence scores are fixed, deterministic values so
 * assertions in tests stay stable.
 */
export const createMockOcrData = (assetId: string, boxes: MockOcrBox[]): AssetOcrResponseDto[] =>
  boxes.map(({ text, ...corners }) => ({
    id: faker.string.uuid(),
    assetId,
    // Spreads x1..y4 from the fixture box unchanged.
    ...corners,
    boxScore: 0.95,
    textScore: 0.9,
    text,
  }));
export const setupOcrMockApiRoutes = async (
context: BrowserContext,
ocrDataByAssetId: Map<string, AssetOcrResponseDto[]>,
) => {
await context.route('**/assets/*/ocr', async (route, request) => {
if (request.method() !== 'GET') {
return route.fallback();
}
const url = new URL(request.url());
const segments = url.pathname.split('/');
const assetIdIndex = segments.indexOf('assets') + 1;
const assetId = segments[assetIdIndex];
const ocrData = ocrDataByAssetId.get(assetId) ?? [];
return route.fulfill({
status: 200,
contentType: 'application/json',
json: ocrData,
});
});
};

View File

@@ -0,0 +1,300 @@
import type { AssetOcrResponseDto, AssetResponseDto } from '@immich/sdk';
import { expect, test } from '@playwright/test';
import { toAssetResponseDto } from 'src/ui/generators/timeline';
import {
createMockStack,
createMockStackAsset,
MockStack,
setupBrokenAssetMockApiRoutes,
} from 'src/ui/mock-network/broken-asset-network';
import { createMockOcrData, setupOcrMockApiRoutes } from 'src/ui/mock-network/ocr-network';
import { assetViewerUtils } from '../timeline/utils';
import { setupAssetViewerFixture } from './utils';
// Run every test in this file concurrently; each describe block below creates
// its own fixture (distinct ids 920-923), so they do not share state.
test.describe.configure({ mode: 'parallel' });

// Normalized (0-1) corner coordinates for two mocked text regions on the
// primary asset: two horizontal lines near the top and middle of the image.
const PRIMARY_OCR_BOXES = [
  { text: 'Hello World', x1: 0.1, y1: 0.1, x2: 0.4, y2: 0.1, x3: 0.4, y3: 0.15, x4: 0.1, y4: 0.15 },
  { text: 'Immich Photo', x1: 0.2, y1: 0.3, x2: 0.6, y2: 0.3, x3: 0.6, y3: 0.36, x4: 0.2, y4: 0.36 },
];

// A single distinct text region for the second (stacked) asset, so tests can
// tell which asset's OCR data is currently displayed.
const SECONDARY_OCR_BOXES = [
  { text: 'Second Asset Text', x1: 0.15, y1: 0.2, x2: 0.55, y2: 0.2, x3: 0.55, y3: 0.26, x4: 0.15, y4: 0.26 },
];
test.describe('OCR bounding boxes', () => {
  const fixture = setupAssetViewerFixture(920);

  // Serve the primary asset's mocked OCR payload before every test.
  test.beforeEach(async ({ context }) => {
    const { id } = toAssetResponseDto(fixture.primaryAsset);
    await setupOcrMockApiRoutes(context, new Map([[id, createMockOcrData(id, PRIMARY_OCR_BOXES)]]));
  });

  test('OCR bounding boxes appear when clicking OCR button', async ({ page }) => {
    await page.goto(`/photos/${fixture.primaryAsset.id}`);
    await assetViewerUtils.waitForViewerLoad(page, fixture.primaryAsset);

    const toggle = page.getByLabel('Text recognition');
    await expect(toggle).toBeVisible();
    await toggle.click();

    // Both mocked boxes should render, in fixture order.
    const boxes = page.locator('[data-viewer-content] [data-testid="ocr-box"]');
    await expect(boxes).toHaveCount(2);
    await expect(boxes.first()).toContainText('Hello World');
    await expect(boxes.last()).toContainText('Immich Photo');
  });

  test('OCR bounding boxes toggle off on second click', async ({ page }) => {
    await page.goto(`/photos/${fixture.primaryAsset.id}`);
    await assetViewerUtils.waitForViewerLoad(page, fixture.primaryAsset);

    const toggle = page.getByLabel('Text recognition');
    const boxes = page.locator('[data-viewer-content] [data-testid="ocr-box"]');

    await toggle.click();
    await expect(boxes.first()).toBeVisible();

    // Second click hides the overlay entirely.
    await toggle.click();
    await expect(boxes).toHaveCount(0);
  });
});
test.describe('OCR with stacked assets', () => {
  const fixture = setupAssetViewerFixture(921);
  // Populated once in beforeAll; read by every beforeEach below.
  let mockStack: MockStack;
  let primaryAssetDto: AssetResponseDto;
  let secondAssetDto: AssetResponseDto;

  // Build a two-asset stack: the fixture's primary asset plus one synthetic asset.
  test.beforeAll(async () => {
    primaryAssetDto = toAssetResponseDto(fixture.primaryAsset);
    secondAssetDto = createMockStackAsset(fixture.adminUserId);
    secondAssetDto.originalFileName = 'second-ocr-asset.jpg';
    mockStack = createMockStack(primaryAssetDto, [secondAssetDto], new Set());
  });

  // Mock both the stack endpoints and distinct OCR payloads per stacked asset,
  // so switching assets in the stack should switch the displayed OCR boxes.
  test.beforeEach(async ({ context }) => {
    await setupBrokenAssetMockApiRoutes(context, mockStack);
    const ocrDataByAssetId = new Map<string, AssetOcrResponseDto[]>([
      [primaryAssetDto.id, createMockOcrData(primaryAssetDto.id, PRIMARY_OCR_BOXES)],
      [secondAssetDto.id, createMockOcrData(secondAssetDto.id, SECONDARY_OCR_BOXES)],
    ]);
    await setupOcrMockApiRoutes(context, ocrDataByAssetId);
  });

  test('different OCR boxes shown for different stacked assets', async ({ page }) => {
    await page.goto(`/photos/${fixture.primaryAsset.id}`);
    await assetViewerUtils.waitForViewerLoad(page, fixture.primaryAsset);
    const ocrButton = page.getByLabel('Text recognition');
    await expect(ocrButton).toBeVisible();
    await ocrButton.click();
    // Primary asset: the two PRIMARY_OCR_BOXES regions.
    const ocrBoxes = page.locator('[data-viewer-content] [data-testid="ocr-box"]');
    await expect(ocrBoxes).toHaveCount(2);
    await expect(ocrBoxes.nth(0)).toContainText('Hello World');
    // Switch to the second asset via the stack slideshow thumbnails.
    const stackThumbnails = page.locator('#stack-slideshow [data-asset]');
    await expect(stackThumbnails).toHaveCount(2);
    await stackThumbnails.nth(1).click();
    // refreshOcr() clears showOverlay when switching assets, so re-enable it
    await expect(ocrBoxes).toHaveCount(0);
    await expect(ocrButton).toBeVisible();
    await ocrButton.click();
    // Second asset: only the single SECONDARY_OCR_BOXES region.
    await expect(ocrBoxes).toHaveCount(1);
    await expect(ocrBoxes.first()).toContainText('Second Asset Text');
  });
});
test.describe('OCR boxes and zoom', () => {
  const fixture = setupAssetViewerFixture(922);

  // Serve the primary asset's mocked OCR payload before every test.
  test.beforeEach(async ({ context }) => {
    const primaryAssetDto = toAssetResponseDto(fixture.primaryAsset);
    const ocrDataByAssetId = new Map<string, AssetOcrResponseDto[]>([
      [primaryAssetDto.id, createMockOcrData(primaryAssetDto.id, PRIMARY_OCR_BOXES)],
    ]);
    await setupOcrMockApiRoutes(context, ocrDataByAssetId);
  });

  test('OCR boxes scale with zoom', async ({ page }) => {
    await page.goto(`/photos/${fixture.primaryAsset.id}`);
    await assetViewerUtils.waitForViewerLoad(page, fixture.primaryAsset);
    const ocrButton = page.getByLabel('Text recognition');
    await expect(ocrButton).toBeVisible();
    await ocrButton.click();
    const ocrBox = page.locator('[data-viewer-content] [data-testid="ocr-box"]').first();
    await expect(ocrBox).toBeVisible();
    // Capture the box's on-screen size before zooming, for comparison below.
    const initialBox = await ocrBox.boundingBox();
    expect(initialBox).toBeTruthy();
    // Zoom in with a wheel-up gesture at the viewport center.
    const { width, height } = page.viewportSize()!;
    await page.mouse.move(width / 2, height / 2);
    await page.mouse.wheel(0, -3);
    // The zoom transform may animate, so poll until the box has grown in both
    // dimensions rather than asserting immediately after the wheel event.
    await expect(async () => {
      const zoomedBox = await ocrBox.boundingBox();
      expect(zoomedBox).toBeTruthy();
      expect(zoomedBox!.width).toBeGreaterThan(initialBox!.width);
      expect(zoomedBox!.height).toBeGreaterThan(initialBox!.height);
    }).toPass({ timeout: 2000 });
  });
});
test.describe('OCR text interaction', () => {
  const fixture = setupAssetViewerFixture(923);

  // Serve the primary asset's mocked OCR payload before every test.
  test.beforeEach(async ({ context }) => {
    const primaryAssetDto = toAssetResponseDto(fixture.primaryAsset);
    const ocrDataByAssetId = new Map<string, AssetOcrResponseDto[]>([
      [primaryAssetDto.id, createMockOcrData(primaryAssetDto.id, PRIMARY_OCR_BOXES)],
    ]);
    await setupOcrMockApiRoutes(context, ocrDataByAssetId);
  });

  // The zoom action identifies interactive overlay elements via this attribute
  // and routes pointer events around the zoom handler for them.
  test('OCR text box has data-overlay-interactive attribute', async ({ page }) => {
    await page.goto(`/photos/${fixture.primaryAsset.id}`);
    await assetViewerUtils.waitForViewerLoad(page, fixture.primaryAsset);
    await page.getByLabel('Text recognition').click();
    const ocrBox = page.locator('[data-viewer-content] [data-testid="ocr-box"]').first();
    await expect(ocrBox).toBeVisible();
    await expect(ocrBox).toHaveAttribute('data-overlay-interactive');
  });

  test('OCR text box receives focus on click', async ({ page }) => {
    await page.goto(`/photos/${fixture.primaryAsset.id}`);
    await assetViewerUtils.waitForViewerLoad(page, fixture.primaryAsset);
    await page.getByLabel('Text recognition').click();
    const ocrBox = page.locator('[data-viewer-content] [data-testid="ocr-box"]').first();
    await expect(ocrBox).toBeVisible();
    await ocrBox.click();
    await expect(ocrBox).toBeFocused();
  });

  test('dragging on OCR text box does not trigger image pan', async ({ page }) => {
    await page.goto(`/photos/${fixture.primaryAsset.id}`);
    await assetViewerUtils.waitForViewerLoad(page, fixture.primaryAsset);
    await page.getByLabel('Text recognition').click();
    const ocrBox = page.locator('[data-viewer-content] [data-testid="ocr-box"]').first();
    await expect(ocrBox).toBeVisible();
    // Read the computed transform of the nearest transformed ancestor of the
    // image; a pan would change it.
    const imgLocator = page.locator('[data-viewer-content] img[draggable="false"]');
    const initialTransform = await imgLocator.evaluate((element) => {
      return getComputedStyle(element.closest('[style*="transform"]') ?? element).transform;
    });
    // Drag from the center of the OCR box (text-selection territory).
    const box = await ocrBox.boundingBox();
    expect(box).toBeTruthy();
    const centerX = box!.x + box!.width / 2;
    const centerY = box!.y + box!.height / 2;
    await page.mouse.move(centerX, centerY);
    await page.mouse.down();
    await page.mouse.move(centerX + 50, centerY + 30, { steps: 5 });
    await page.mouse.up();
    // The drag must not have panned the image: transform is unchanged.
    const afterTransform = await imgLocator.evaluate((element) => {
      return getComputedStyle(element.closest('[style*="transform"]') ?? element).transform;
    });
    expect(afterTransform).toBe(initialTransform);
  });

  test('split touch gesture across zoom container does not trigger zoom', async ({ page }) => {
    await page.goto(`/photos/${fixture.primaryAsset.id}`);
    await assetViewerUtils.waitForViewerLoad(page, fixture.primaryAsset);
    await page.getByLabel('Text recognition').click();
    const ocrBox = page.locator('[data-viewer-content] [data-testid="ocr-box"]').first();
    await expect(ocrBox).toBeVisible();
    // Baseline transform before the gesture; a zoom would change it.
    const imgLocator = page.locator('[data-viewer-content] img[draggable="false"]');
    const initialTransform = await imgLocator.evaluate((element) => {
      return getComputedStyle(element.closest('[style*="transform"]') ?? element).transform;
    });
    const viewerContent = page.locator('[data-viewer-content]');
    const viewerBox = await viewerContent.boundingBox();
    expect(viewerBox).toBeTruthy();
    // Dispatch a synthetic split gesture: one touch inside the viewer, one outside
    await page.evaluate(
      ({ viewerCenterX, viewerCenterY, outsideY }) => {
        const viewer = document.querySelector('[data-viewer-content]');
        if (!viewer) {
          return;
        }
        // Helper to build a Touch anchored to the viewer element.
        const createTouch = (id: number, x: number, y: number) => {
          return new Touch({
            identifier: id,
            target: viewer,
            clientX: x,
            clientY: y,
          });
        };
        const insideTouch = createTouch(0, viewerCenterX, viewerCenterY);
        const outsideTouch = createTouch(1, viewerCenterX, outsideY);
        // touchstart: both touches active, but only the inside touch targets
        // the viewer (targetTouches) — that mismatch is the "split" condition.
        const touchStartEvent = new TouchEvent('touchstart', {
          touches: [insideTouch, outsideTouch],
          targetTouches: [insideTouch],
          changedTouches: [insideTouch, outsideTouch],
          bubbles: true,
          cancelable: true,
        });
        // touchmove: the fingers move apart vertically, resembling a pinch.
        const touchMoveEvent = new TouchEvent('touchmove', {
          touches: [createTouch(0, viewerCenterX, viewerCenterY - 30), createTouch(1, viewerCenterX, outsideY + 30)],
          targetTouches: [createTouch(0, viewerCenterX, viewerCenterY - 30)],
          changedTouches: [
            createTouch(0, viewerCenterX, viewerCenterY - 30),
            createTouch(1, viewerCenterX, outsideY + 30),
          ],
          bubbles: true,
          cancelable: true,
        });
        // touchend: all fingers lifted.
        const touchEndEvent = new TouchEvent('touchend', {
          touches: [],
          targetTouches: [],
          changedTouches: [insideTouch, outsideTouch],
          bubbles: true,
          cancelable: true,
        });
        viewer.dispatchEvent(touchStartEvent);
        viewer.dispatchEvent(touchMoveEvent);
        viewer.dispatchEvent(touchEndEvent);
      },
      {
        viewerCenterX: viewerBox!.x + viewerBox!.width / 2,
        viewerCenterY: viewerBox!.y + viewerBox!.height / 2,
        outsideY: 10, // near the top of the page, outside the viewer
      },
    );
    // The split gesture must have been intercepted: no zoom occurred.
    const afterTransform = await imgLocator.evaluate((element) => {
      return getComputedStyle(element.closest('[style*="transform"]') ?? element).transform;
    });
    expect(afterTransform).toBe(initialTransform);
  });
});

View File

@@ -1,11 +1,18 @@
import { assetViewerManager } from '$lib/managers/asset-viewer-manager.svelte';
import { createZoomImageWheel } from '@zoom-image/core';
export const zoomImageAction = (node: HTMLElement, options?: { disabled?: boolean }) => {
// Minimal touch shape — avoids importing DOM TouchEvent which isn't available in all TS targets.
type TouchEventLike = {
touches: Iterable<{ clientX: number; clientY: number }> & { length: number };
targetTouches: ArrayLike<unknown>;
};
const asTouchEvent = (event: Event) => event as unknown as TouchEventLike;
export const zoomImageAction = (node: HTMLElement, options?: { zoomTarget?: HTMLElement }) => {
const zoomInstance = createZoomImageWheel(node, {
maxZoom: 10,
initialState: assetViewerManager.zoomState,
zoomTarget: null,
zoomTarget: options?.zoomTarget,
});
const unsubscribes = [
@@ -13,47 +20,130 @@ export const zoomImageAction = (node: HTMLElement, options?: { disabled?: boolea
zoomInstance.subscribe(({ state }) => assetViewerManager.onZoomChange(state)),
];
const onInteractionStart = (event: Event) => {
if (options?.disabled) {
event.stopImmediatePropagation();
const controller = new AbortController();
const { signal } = controller;
node.addEventListener('pointerdown', () => assetViewerManager.cancelZoomAnimation(), { capture: true, signal });
// Intercept events in capture phase to prevent zoom-image from seeing interactions on
// overlay elements (e.g. OCR text boxes), preserving browser defaults like text selection.
const isOverlayEvent = (event: Event) => !!(event.target as HTMLElement).closest('[data-overlay-interactive]');
const isOverlayAtPoint = (x: number, y: number) =>
!!document.elementFromPoint(x, y)?.closest('[data-overlay-interactive]');
// Pointer event interception: track pointers that start on overlays and intercept the entire gesture.
const overlayPointers = new Set<number>();
const interceptedPointers = new Set<number>();
const interceptOverlayPointerDown = (event: PointerEvent) => {
if (isOverlayEvent(event) || isOverlayAtPoint(event.clientX, event.clientY)) {
overlayPointers.add(event.pointerId);
interceptedPointers.add(event.pointerId);
event.stopPropagation();
} else if (overlayPointers.size > 0) {
// Split gesture (e.g. pinch with one finger on overlay) — intercept entirely.
interceptedPointers.add(event.pointerId);
event.stopPropagation();
}
assetViewerManager.cancelZoomAnimation();
};
const interceptOverlayPointerEvent = (event: PointerEvent) => {
if (interceptedPointers.has(event.pointerId)) {
event.stopPropagation();
}
};
const interceptOverlayPointerEnd = (event: PointerEvent) => {
overlayPointers.delete(event.pointerId);
if (interceptedPointers.delete(event.pointerId)) {
event.stopPropagation();
}
};
node.addEventListener('pointerdown', interceptOverlayPointerDown, { capture: true, signal });
node.addEventListener('pointermove', interceptOverlayPointerEvent, { capture: true, signal });
node.addEventListener('pointerup', interceptOverlayPointerEnd, { capture: true, signal });
node.addEventListener('pointerleave', interceptOverlayPointerEnd, { capture: true, signal });
node.addEventListener('wheel', onInteractionStart, { capture: true });
node.addEventListener('pointerdown', onInteractionStart, { capture: true });
// Touch event interception for overlay touches or split gestures (pinch across container boundary).
// Once intercepted, stays intercepted until all fingers are lifted.
let touchGestureIntercepted = false;
const interceptOverlayTouchEvent = (event: Event) => {
if (touchGestureIntercepted) {
event.stopPropagation();
return;
}
const { touches, targetTouches } = asTouchEvent(event);
if (touches && targetTouches) {
if (touches.length > targetTouches.length) {
touchGestureIntercepted = true;
event.stopPropagation();
return;
}
for (const touch of touches) {
if (isOverlayAtPoint(touch.clientX, touch.clientY)) {
touchGestureIntercepted = true;
event.stopPropagation();
return;
}
}
} else if (isOverlayEvent(event)) {
event.stopPropagation();
}
};
const resetTouchGesture = (event: Event) => {
const { touches } = asTouchEvent(event);
if (touches.length === 0) {
touchGestureIntercepted = false;
}
};
node.addEventListener('touchstart', interceptOverlayTouchEvent, { capture: true, signal });
node.addEventListener('touchmove', interceptOverlayTouchEvent, { capture: true, signal });
node.addEventListener('touchend', resetTouchGesture, { capture: true, signal });
// Suppress Safari's synthetic dblclick on double-tap. Without this, zoom-image's touchstart
// handler zooms to maxZoom (10x), then Safari's synthetic dblclick triggers photo-viewer's
// handler which conflicts. Chrome does not fire synthetic dblclick on touch.
// Wheel and dblclick interception on overlay elements.
// Dblclick also intercepted for all touch double-taps (Safari fires synthetic dblclick
// on double-tap, which conflicts with zoom-image's touch zoom handler).
let lastPointerWasTouch = false;
const trackPointerType = (event: PointerEvent) => {
lastPointerWasTouch = event.pointerType === 'touch';
};
const suppressTouchDblClick = (event: MouseEvent) => {
if (lastPointerWasTouch) {
event.stopImmediatePropagation();
}
};
node.addEventListener('pointerdown', trackPointerType, { capture: true });
node.addEventListener('dblclick', suppressTouchDblClick, { capture: true });
node.addEventListener('pointerdown', (event) => (lastPointerWasTouch = event.pointerType === 'touch'), {
capture: true,
signal,
});
node.addEventListener(
'wheel',
(event) => {
if (isOverlayEvent(event)) {
event.stopPropagation();
}
},
{ capture: true, signal },
);
node.addEventListener(
'dblclick',
(event) => {
if (lastPointerWasTouch || isOverlayEvent(event)) {
event.stopImmediatePropagation();
}
},
{ capture: true, signal },
);
// Allow zoomed content to render outside the container bounds
if (options?.zoomTarget) {
options.zoomTarget.style.willChange = 'transform';
}
node.style.overflow = 'visible';
// Prevent browser handling of touch gestures so zoom-image can manage them
node.style.touchAction = 'none';
return {
update(newOptions?: { disabled?: boolean }) {
update(newOptions?: { zoomTarget?: HTMLElement }) {
options = newOptions;
if (newOptions?.zoomTarget !== undefined) {
zoomInstance.setState({ zoomTarget: newOptions.zoomTarget });
}
},
destroy() {
controller.abort();
if (options?.zoomTarget) {
options.zoomTarget.style.willChange = '';
}
for (const unsubscribe of unsubscribes) {
unsubscribe();
}
node.removeEventListener('wheel', onInteractionStart, { capture: true });
node.removeEventListener('pointerdown', onInteractionStart, { capture: true });
node.removeEventListener('pointerdown', trackPointerType, { capture: true });
node.removeEventListener('dblclick', suppressTouchDblClick, { capture: true });
zoomInstance.cleanup();
},
};

View File

@@ -7,7 +7,7 @@
import { assetViewerManager } from '$lib/managers/asset-viewer-manager.svelte';
import { getAssetUrls } from '$lib/utils';
import { AdaptiveImageLoader, type QualityList } from '$lib/utils/adaptive-image-loader.svelte';
import { scaleToCover, scaleToFit } from '$lib/utils/container-utils';
import { scaleToCover, scaleToFit, type Size } from '$lib/utils/container-utils';
import { getAltText } from '$lib/utils/thumbnail-util';
import { toTimelineAsset } from '$lib/utils/timeline-util';
import type { AssetResponseDto, SharedLinkResponseDto } from '@immich/sdk';
@@ -17,10 +17,7 @@
asset: AssetResponseDto;
sharedLink?: SharedLinkResponseDto;
objectFit?: 'contain' | 'cover';
container: {
width: number;
height: number;
};
container: Size;
onUrlChange?: (url: string) => void;
onImageReady?: () => void;
onError?: () => void;
@@ -149,81 +146,66 @@
(quality.preview === 'success' ? previewElement : undefined) ??
(quality.thumbnail === 'success' ? thumbnailElement : undefined);
});
const zoomTransform = $derived.by(() => {
const { currentZoom, currentPositionX, currentPositionY } = assetViewerManager.zoomState;
if (currentZoom === 1 && currentPositionX === 0 && currentPositionY === 0) {
return undefined;
}
return `translate(${currentPositionX}px, ${currentPositionY}px) scale(${currentZoom})`;
});
</script>
<div class="relative h-full w-full overflow-hidden will-change-transform" bind:this={ref}>
{@render backdrop?.()}
<!-- pointer-events-none so events pass through to the container where zoom-image listens -->
<div
class="absolute inset-0 pointer-events-none"
style:transform={zoomTransform}
style:transform-origin={zoomTransform ? '0 0' : undefined}
>
<div class="absolute" style:left style:top style:width style:height>
{#if show.alphaBackground}
<AlphaBackground />
{/if}
<div class="absolute inset-0 pointer-events-none" style:left style:top style:width style:height>
{#if show.alphaBackground}
<AlphaBackground />
{/if}
{#if show.thumbhash}
{#if asset.thumbhash}
<!-- Thumbhash / spinner layer -->
<canvas use:thumbhash={{ base64ThumbHash: asset.thumbhash }} class="h-full w-full absolute"></canvas>
{:else if show.spinner}
<DelayedLoadingSpinner />
{/if}
{#if show.thumbhash}
{#if asset.thumbhash}
<!-- Thumbhash / spinner layer -->
<canvas use:thumbhash={{ base64ThumbHash: asset.thumbhash }} class="h-full w-full absolute"></canvas>
{:else if show.spinner}
<DelayedLoadingSpinner />
{/if}
{/if}
{#if show.thumbnail}
<ImageLayer
{adaptiveImageLoader}
{width}
{height}
quality="thumbnail"
src={status.urls.thumbnail}
alt=""
role="presentation"
bind:ref={thumbnailElement}
/>
{/if}
{#if show.thumbnail}
<ImageLayer
{adaptiveImageLoader}
{width}
{height}
quality="thumbnail"
src={status.urls.thumbnail}
alt=""
role="presentation"
bind:ref={thumbnailElement}
/>
{/if}
{#if show.brokenAsset}
<BrokenAsset class="text-xl h-full w-full absolute" />
{/if}
{#if show.brokenAsset}
<BrokenAsset class="text-xl h-full w-full absolute" />
{/if}
{#if show.preview}
<ImageLayer
{adaptiveImageLoader}
{alt}
{width}
{height}
{overlays}
quality="preview"
src={status.urls.preview}
bind:ref={previewElement}
/>
{/if}
{#if show.preview}
<ImageLayer
{adaptiveImageLoader}
{alt}
{width}
{height}
{overlays}
quality="preview"
src={status.urls.preview}
bind:ref={previewElement}
/>
{/if}
{#if show.original}
<ImageLayer
{adaptiveImageLoader}
{alt}
{width}
{height}
{overlays}
quality="original"
src={status.urls.original}
bind:ref={originalElement}
/>
{/if}
</div>
{#if show.original}
<ImageLayer
{adaptiveImageLoader}
{alt}
{width}
{height}
{overlays}
quality="original"
src={status.urls.original}
bind:ref={originalElement}
/>
{/if}
</div>
</div>

View File

@@ -13,12 +13,12 @@
import { onMount } from 'svelte';
import { t } from 'svelte-i18n';
interface Props {
type Props = {
htmlElement: HTMLImageElement | HTMLVideoElement;
containerWidth: number;
containerHeight: number;
assetId: string;
}
};
let { htmlElement, containerWidth, containerHeight, assetId }: Props = $props();
@@ -295,6 +295,7 @@
<div
id="face-editor-data"
class="absolute start-0 top-0 z-5 h-full w-full overflow-hidden"
data-overlay-interactive
data-face-left={faceBoxPosition.left}
data-face-top={faceBoxPosition.top}
data-face-width={faceBoxPosition.width}

View File

@@ -1,4 +1,5 @@
<script lang="ts">
import { mediaQueryManager } from '$lib/stores/media-query-manager.svelte';
import type { OcrBox } from '$lib/utils/ocr-utils';
import { calculateBoundingBoxMatrix, calculateFittedFontSize } from '$lib/utils/ocr-utils';
@@ -8,6 +9,7 @@
let { ocrBox }: Props = $props();
const isTouch = $derived(mediaQueryManager.pointerCoarse);
const dimensions = $derived(calculateBoundingBoxMatrix(ocrBox.points));
const transform = $derived(`matrix3d(${dimensions.matrix.join(',')})`);
@@ -15,13 +17,23 @@
calculateFittedFontSize(ocrBox.text, dimensions.width, dimensions.height, ocrBox.verticalMode) + 'px',
);
const handleSelectStart = (event: Event) => {
const target = event.currentTarget as HTMLElement;
requestAnimationFrame(() => {
const selection = globalThis.getSelection();
if (selection) {
selection.selectAllChildren(target);
}
});
};
const verticalStyle = $derived.by(() => {
switch (ocrBox.verticalMode) {
case 'cjk': {
return ' writing-mode: vertical-rl;';
return 'writing-mode: vertical-rl;';
}
case 'rotated': {
return ' writing-mode: vertical-rl; text-orientation: sideways;';
return 'writing-mode: vertical-rl; text-orientation: sideways;';
}
default: {
return '';
@@ -30,17 +42,23 @@
});
</script>
<div class="absolute left-0 top-0">
<div
class="absolute flex items-center justify-center text-transparent border-2 border-blue-500 bg-blue-500/10 pointer-events-auto cursor-text select-text transition-colors hover:z-1 hover:text-white hover:bg-black/60 hover:border-blue-600 hover:border-3 focus:z-1 focus:text-white focus:bg-black/60 focus:border-blue-600 focus:border-3 focus:outline-none {ocrBox.verticalMode ===
'none'
? 'px-2 py-1 whitespace-nowrap'
: 'px-1 py-2'}"
style="font-size: {fontSize}; width: {dimensions.width}px; height: {dimensions.height}px; transform: {transform}; transform-origin: 0 0;{verticalStyle}"
tabindex="0"
role="button"
aria-label={ocrBox.text}
>
{ocrBox.text}
</div>
<div
class={[
'absolute left-0 top-0 flex items-center justify-center',
'border-2 border-blue-500 pointer-events-auto cursor-text',
'focus:z-1 focus:border-blue-600 focus:border-3 focus:outline-none',
isTouch
? 'text-white bg-black/60 select-all'
: 'select-text text-transparent bg-blue-500/10 transition-colors hover:z-1 hover:text-white hover:bg-black/60 hover:border-blue-600 hover:border-3',
ocrBox.verticalMode === 'none' ? 'px-2 py-1 whitespace-nowrap' : 'px-1 py-2',
]}
style="font-size: {fontSize}; width: {dimensions.width}px; height: {dimensions.height}px; transform: {transform}; transform-origin: 0 0; touch-action: none; {verticalStyle}"
data-testid="ocr-box"
data-overlay-interactive
tabindex="0"
role="button"
aria-label={ocrBox.text}
onselectstart={isTouch ? handleSelectStart : undefined}
>
{ocrBox.text}
</div>

View File

@@ -128,10 +128,8 @@
}
const boxes = getOcrBoundingBoxes(ocrData, {
contentWidth: viewer.state.textureData.panoData.croppedWidth,
contentHeight: viewer.state.textureData.panoData.croppedHeight,
offsetX: 0,
offsetY: 0,
width: viewer.state.textureData.panoData.croppedWidth,
height: viewer.state.textureData.panoData.croppedHeight,
});
for (const [index, box] of boxes.entries()) {

View File

@@ -14,7 +14,7 @@
import { SlideshowLook, SlideshowState, slideshowStore } from '$lib/stores/slideshow.store';
import { handlePromiseError } from '$lib/utils';
import { canCopyImageToClipboard, copyImageToClipboard } from '$lib/utils/asset-utils';
import { getNaturalSize, scaleToFit, type ContentMetrics } from '$lib/utils/container-utils';
import { getNaturalSize, scaleToFit, type Size } from '$lib/utils/container-utils';
import { handleError } from '$lib/utils/handle-error';
import { getOcrBoundingBoxes } from '$lib/utils/ocr-utils';
import { getBoundingBox } from '$lib/utils/people-utils';
@@ -25,14 +25,14 @@
import { t } from 'svelte-i18n';
import type { AssetCursor } from './asset-viewer.svelte';
interface Props {
type Props = {
cursor: AssetCursor;
element?: HTMLDivElement;
sharedLink?: SharedLinkResponseDto;
onReady?: () => void;
onError?: () => void;
onSwipe?: (event: SwipeCustomEvent) => void;
}
};
let { cursor, element = $bindable(), sharedLink, onReady, onError, onSwipe }: Props = $props();
@@ -67,23 +67,15 @@
height: containerHeight,
});
const overlayMetrics = $derived.by((): ContentMetrics => {
const overlaySize = $derived.by((): Size => {
if (!assetViewerManager.imgRef || !visibleImageReady) {
return { contentWidth: 0, contentHeight: 0, offsetX: 0, offsetY: 0 };
return { width: 0, height: 0 };
}
const natural = getNaturalSize(assetViewerManager.imgRef);
const scaled = scaleToFit(natural, { width: containerWidth, height: containerHeight });
return {
contentWidth: scaled.width,
contentHeight: scaled.height,
offsetX: 0,
offsetY: 0,
};
return scaleToFit(getNaturalSize(assetViewerManager.imgRef), { width: containerWidth, height: containerHeight });
});
const ocrBoxes = $derived(ocrManager.showOverlay ? getOcrBoundingBoxes(ocrManager.data, overlayMetrics) : []);
const ocrBoxes = $derived(ocrManager.showOverlay ? getOcrBoundingBoxes(ocrManager.data, overlaySize) : []);
const onCopy = async () => {
if (!canCopyImageToClipboard() || !assetViewerManager.imgRef) {
@@ -151,6 +143,8 @@
$slideshowState !== SlideshowState.None && $slideshowLook === SlideshowLook.BlurredBackground && !!asset.thumbhash,
);
let adaptiveImage = $state<HTMLDivElement | undefined>();
const faceToNameMap = $derived.by(() => {
// eslint-disable-next-line svelte/prefer-svelte-reactivity
const map = new Map<Faces, string>();
@@ -181,7 +175,7 @@
const mouseX = (event.clientX - containerRect.left - contentOffsetX * currentZoom - currentPositionX) / currentZoom;
const mouseY = (event.clientY - containerRect.top - contentOffsetY * currentZoom - currentPositionY) / currentZoom;
const faceBoxes = getBoundingBox(faces, overlayMetrics);
const faceBoxes = getBoundingBox(faces, overlaySize);
for (const [index, box] of faceBoxes.entries()) {
if (mouseX >= box.left && mouseX <= box.left + box.width && mouseY >= box.top && mouseY <= box.top + box.height) {
@@ -215,7 +209,7 @@
ondblclick={onZoom}
onmousemove={handleImageMouseMove}
onmouseleave={handleImageMouseLeave}
use:zoomImageAction={{ disabled: isFaceEditMode.value || ocrManager.showOverlay }}
use:zoomImageAction={{ zoomTarget: adaptiveImage }}
{...useSwipe((event) => onSwipe?.(event))}
>
<AdaptiveImage
@@ -233,6 +227,7 @@
onReady?.();
}}
bind:imgRef={assetViewerManager.imgRef}
bind:ref={adaptiveImage}
>
{#snippet backdrop()}
{#if blurredSlideshow}
@@ -243,7 +238,7 @@
{/if}
{/snippet}
{#snippet overlays()}
{#each getBoundingBox($boundingBoxesArray, overlayMetrics) as boundingbox, index (boundingbox.id)}
{#each getBoundingBox($boundingBoxesArray, overlaySize) as boundingbox, index (boundingbox.id)}
<div
class="absolute border-solid border-white border-3 rounded-lg"
style="top: {boundingbox.top}px; left: {boundingbox.left}px; height: {boundingbox.height}px; width: {boundingbox.width}px;"

View File

@@ -1,4 +1,11 @@
import { getContentMetrics, getNaturalSize, scaleToFit } from '$lib/utils/container-utils';
import {
getContentMetrics,
getNaturalSize,
mapNormalizedRectToContent,
mapNormalizedToContent,
scaleToCover,
scaleToFit,
} from '$lib/utils/container-utils';
const mockImage = (props: {
naturalWidth: number;
@@ -92,3 +99,81 @@ describe('getNaturalSize', () => {
expect(getNaturalSize(video)).toEqual({ width: 1920, height: 1080 });
});
});
describe('scaleToCover', () => {
it('should scale up to cover container when image is smaller', () => {
expect(scaleToCover({ width: 400, height: 300 }, { width: 800, height: 600 })).toEqual({
width: 800,
height: 600,
});
});
it('should use height scale when image is wider than container', () => {
expect(scaleToCover({ width: 2000, height: 1000 }, { width: 800, height: 600 })).toEqual({
width: 1200,
height: 600,
});
});
it('should use width scale when image is taller than container', () => {
expect(scaleToCover({ width: 1000, height: 2000 }, { width: 800, height: 600 })).toEqual({
width: 800,
height: 1600,
});
});
});
describe('mapNormalizedToContent', () => {
  // Content area of 800x400 pixels, vertically centered with a 100px top offset.
  const metrics = { contentWidth: 800, contentHeight: 400, offsetX: 0, offsetY: 100 };
  it('should map top-left corner', () => {
    expect(mapNormalizedToContent({ x: 0, y: 0 }, metrics)).toEqual({ x: 0, y: 100 });
  });
  it('should map bottom-right corner', () => {
    expect(mapNormalizedToContent({ x: 1, y: 1 }, metrics)).toEqual({ x: 800, y: 500 });
  });
  it('should map center point', () => {
    expect(mapNormalizedToContent({ x: 0.5, y: 0.5 }, metrics)).toEqual({ x: 400, y: 300 });
  });
  it('should apply offsets correctly for letterboxed content', () => {
    // Narrow content horizontally centered: 250px bars on each side.
    const letterboxed = { contentWidth: 300, contentHeight: 600, offsetX: 250, offsetY: 0 };
    expect(mapNormalizedToContent({ x: 0, y: 0 }, letterboxed)).toEqual({ x: 250, y: 0 });
    expect(mapNormalizedToContent({ x: 1, y: 1 }, letterboxed)).toEqual({ x: 550, y: 600 });
  });
  it('should accept Size (zero offsets)', () => {
    // A plain Size (no offsetX/offsetY) behaves like metrics with zero offsets.
    const size = { width: 800, height: 400 };
    expect(mapNormalizedToContent({ x: 0, y: 0 }, size)).toEqual({ x: 0, y: 0 });
    expect(mapNormalizedToContent({ x: 1, y: 1 }, size)).toEqual({ x: 800, y: 400 });
    expect(mapNormalizedToContent({ x: 0.5, y: 0.5 }, size)).toEqual({ x: 400, y: 200 });
  });
});
describe('mapNormalizedRectToContent', () => {
  // 800×400 content area pushed 100px down (horizontal letterbox bars).
  const metrics = { contentWidth: 800, contentHeight: 400, offsetX: 0, offsetY: 100 };

  it('should map a normalized rect to content pixel coordinates', () => {
    const mapped = mapNormalizedRectToContent({ x: 0.25, y: 0.25 }, { x: 0.75, y: 0.75 }, metrics);
    expect(mapped).toEqual({ left: 200, top: 200, width: 400, height: 200 });
  });

  it('should map full image rect', () => {
    const mapped = mapNormalizedRectToContent({ x: 0, y: 0 }, { x: 1, y: 1 }, metrics);
    expect(mapped).toEqual({ left: 0, top: 100, width: 800, height: 400 });
  });

  it('should handle letterboxed content with horizontal offsets', () => {
    // Narrow content centered horizontally: the full-image rect lands on the content area.
    const pillarboxed = { contentWidth: 300, contentHeight: 600, offsetX: 250, offsetY: 0 };
    const mapped = mapNormalizedRectToContent({ x: 0, y: 0 }, { x: 1, y: 1 }, pillarboxed);
    expect(mapped).toEqual({ left: 250, top: 0, width: 300, height: 600 });
  });

  it('should accept Size (zero offsets)', () => {
    // Plain Size input: same quarter-inset rect, but with no vertical offset.
    const plainSize = { width: 800, height: 400 };
    const mapped = mapNormalizedRectToContent({ x: 0.25, y: 0.25 }, { x: 0.75, y: 0.75 }, plainSize);
    expect(mapped).toEqual({ left: 200, top: 100, width: 400, height: 200 });
  });
});

View File

@@ -1,14 +1,35 @@
export interface ContentMetrics {
// Coordinate spaces used throughout the viewer:
//
// "Normalized": 0–1 range, (0,0) = top-left, (1,1) = bottom-right. Resolution-independent.
// Example: OCR coordinates, or face coords after dividing by metadata dimensions.
//
// "Content": pixel position within the container after scaling (scaleToFit/scaleToCover)
// and centering. Used for DOM overlay positioning (face boxes, OCR text).
//
// "Natural": pixel position in the original full-resolution image file (e.g. 4000×3000).
// Used when cropping or drawing on the source image.
//
// "Metadata pixel space": coordinates from face detection / OCR models, in pixels relative
// to face.imageWidth/imageHeight. Divide by those dimensions to get normalized coords.
// A 2-D point; the coordinate space (normalized, content, or natural)
// depends on the caller — see the coordinate-space overview above.
export type Point = {
x: number;
y: number;
};
// A width/height pair, e.g. an element's natural size or a container's size.
export type Size = {
width: number;
height: number;
};
export type ContentMetrics = {
contentWidth: number;
contentHeight: number;
offsetX: number;
offsetY: number;
}
};
export const scaleToCover = (
dimensions: { width: number; height: number },
container: { width: number; height: number },
): { width: number; height: number } => {
export const scaleToCover = (dimensions: Size, container: Size): Size => {
const scaleX = container.width / dimensions.width;
const scaleY = container.height / dimensions.height;
const scale = Math.max(scaleX, scaleY);
@@ -18,10 +39,7 @@ export const scaleToCover = (
};
};
export const scaleToFit = (
dimensions: { width: number; height: number },
container: { width: number; height: number },
): { width: number; height: number } => {
export const scaleToFit = (dimensions: Size, container: Size): Size => {
const scaleX = container.width / dimensions.width;
const scaleY = container.height / dimensions.height;
const scale = Math.min(scaleX, scaleY);
@@ -31,14 +49,14 @@ export const scaleToFit = (
};
};
const getElementSize = (element: HTMLImageElement | HTMLVideoElement): { width: number; height: number } => {
const getElementSize = (element: HTMLImageElement | HTMLVideoElement): Size => {
if (element instanceof HTMLVideoElement) {
return { width: element.clientWidth, height: element.clientHeight };
}
return { width: element.width, height: element.height };
};
export const getNaturalSize = (element: HTMLImageElement | HTMLVideoElement): { width: number; height: number } => {
export const getNaturalSize = (element: HTMLImageElement | HTMLVideoElement): Size => {
if (element instanceof HTMLVideoElement) {
return { width: element.videoWidth, height: element.videoHeight };
}
@@ -56,3 +74,38 @@ export const getContentMetrics = (element: HTMLImageElement | HTMLVideoElement):
offsetY: (client.height - contentHeight) / 2,
};
};
/**
 * Maps a normalized point (0–1 range, top-left origin) into content pixel
 * coordinates.
 *
 * Accepts either a plain `Size` (content fills the area, zero offsets) or a
 * full `ContentMetrics` (scaled content size plus centering offsets, e.g. a
 * letterboxed image). Exported as a `const` arrow function for consistency
 * with the other exports in this module (`scaleToFit`, `scaleToCover`, …).
 *
 * @param point - Normalized point to convert.
 * @param sizeOrMetrics - Target `Size`, or `ContentMetrics` with offsets.
 * @returns The point in content pixel coordinates.
 */
export const mapNormalizedToContent = (point: Point, sizeOrMetrics: Size | ContentMetrics): Point => {
  // ContentMetrics is distinguished structurally by its `contentWidth` field.
  if ('contentWidth' in sizeOrMetrics) {
    return {
      x: point.x * sizeOrMetrics.contentWidth + sizeOrMetrics.offsetX,
      y: point.y * sizeOrMetrics.contentHeight + sizeOrMetrics.offsetY,
    };
  }
  return {
    x: point.x * sizeOrMetrics.width,
    y: point.y * sizeOrMetrics.height,
  };
};

/** Axis-aligned rectangle in content pixel coordinates (CSS-style top/left). */
export type Rect = {
  top: number;
  left: number;
  width: number;
  height: number;
};

/**
 * Maps a normalized rectangle, given by its top-left and bottom-right corners,
 * to a content-pixel `Rect`. Both corners are converted with
 * {@link mapNormalizedToContent}, so the same `Size`/`ContentMetrics`
 * handling applies. Assumes `topLeft` is above/left of `bottomRight`.
 *
 * @param topLeft - Normalized top-left corner.
 * @param bottomRight - Normalized bottom-right corner.
 * @param sizeOrMetrics - Target `Size`, or `ContentMetrics` with offsets.
 * @returns The rectangle in content pixel coordinates.
 */
export const mapNormalizedRectToContent = (
  topLeft: Point,
  bottomRight: Point,
  sizeOrMetrics: Size | ContentMetrics,
): Rect => {
  const tl = mapNormalizedToContent(topLeft, sizeOrMetrics);
  const br = mapNormalizedToContent(bottomRight, sizeOrMetrics);
  return {
    top: tl.y,
    left: tl.x,
    width: br.x - tl.x,
    height: br.y - tl.y,
  };
};

View File

@@ -1,5 +1,5 @@
import type { OcrBoundingBox } from '$lib/stores/ocr.svelte';
import type { ContentMetrics } from '$lib/utils/container-utils';
import type { Size } from '$lib/utils/container-utils';
import { getOcrBoundingBoxes } from '$lib/utils/ocr-utils';
describe('getOcrBoundingBoxes', () => {
@@ -21,9 +21,9 @@ describe('getOcrBoundingBoxes', () => {
text: 'hello',
},
];
const metrics: ContentMetrics = { contentWidth: 1000, contentHeight: 500, offsetX: 0, offsetY: 0 };
const imageSize: Size = { width: 1000, height: 500 };
const boxes = getOcrBoundingBoxes(ocrData, metrics);
const boxes = getOcrBoundingBoxes(ocrData, imageSize);
expect(boxes).toHaveLength(1);
expect(boxes[0].id).toBe('box1');
@@ -37,7 +37,7 @@ describe('getOcrBoundingBoxes', () => {
]);
});
it('should apply offsets for letterboxed images', () => {
it('should map full-image box to full display area', () => {
const ocrData: OcrBoundingBox[] = [
{
id: 'box1',
@@ -55,21 +55,20 @@ describe('getOcrBoundingBoxes', () => {
text: 'test',
},
];
const metrics: ContentMetrics = { contentWidth: 600, contentHeight: 400, offsetX: 100, offsetY: 50 };
const imageSize: Size = { width: 600, height: 400 };
const boxes = getOcrBoundingBoxes(ocrData, metrics);
const boxes = getOcrBoundingBoxes(ocrData, imageSize);
expect(boxes[0].points).toEqual([
{ x: 100, y: 50 },
{ x: 700, y: 50 },
{ x: 700, y: 450 },
{ x: 100, y: 450 },
{ x: 0, y: 0 },
{ x: 600, y: 0 },
{ x: 600, y: 400 },
{ x: 0, y: 400 },
]);
});
it('should return empty array for empty input', () => {
const metrics: ContentMetrics = { contentWidth: 800, contentHeight: 600, offsetX: 0, offsetY: 0 };
expect(getOcrBoundingBoxes([], metrics)).toEqual([]);
expect(getOcrBoundingBoxes([], { width: 800, height: 600 })).toEqual([]);
});
it('should handle multiple boxes', () => {
@@ -105,9 +104,9 @@ describe('getOcrBoundingBoxes', () => {
text: 'second',
},
];
const metrics: ContentMetrics = { contentWidth: 200, contentHeight: 200, offsetX: 0, offsetY: 0 };
const imageSize: Size = { width: 200, height: 200 };
const boxes = getOcrBoundingBoxes(ocrData, metrics);
const boxes = getOcrBoundingBoxes(ocrData, imageSize);
expect(boxes).toHaveLength(2);
expect(boxes[0].text).toBe('first');

View File

@@ -1,23 +1,19 @@
import type { OcrBoundingBox } from '$lib/stores/ocr.svelte';
import type { ContentMetrics } from '$lib/utils/container-utils';
import { mapNormalizedToContent, type Point, type Size } from '$lib/utils/container-utils';
import { clamp } from 'lodash-es';
export type Point = {
x: number;
y: number;
};
export type { Point } from '$lib/utils/container-utils';
const distance = (p1: Point, p2: Point) => Math.hypot(p2.x - p1.x, p2.y - p1.y);
export type VerticalMode = 'none' | 'cjk' | 'rotated';
export interface OcrBox {
export type OcrBox = {
id: string;
points: Point[];
text: string;
confidence: number;
verticalMode: VerticalMode;
}
};
const CJK_PATTERN =
/[\u3000-\u303F\u3040-\u309F\u30A0-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF\uAC00-\uD7AF\uFF00-\uFFEF]/;
@@ -38,7 +34,7 @@ const getVerticalMode = (width: number, height: number, text: string): VerticalM
* @param points - Array of 4 corner points of the bounding box
* @returns 4x4 matrix to transform the div with text onto the polygon defined by the corner points, and size to set on the source div.
*/
export const calculateBoundingBoxMatrix = (points: Point[]): { matrix: number[]; width: number; height: number } => {
export const calculateBoundingBoxMatrix = (points: Point[]): Size & { matrix: number[] } => {
const [topLeft, topRight, bottomRight, bottomLeft] = points;
const width = Math.max(distance(topLeft, topRight), distance(bottomLeft, bottomRight));
@@ -163,7 +159,7 @@ export const calculateFittedFontSize = (
return clamp(Math.min(scaleFromWidth, scaleFromHeight), MIN_FONT_SIZE, MAX_FONT_SIZE);
};
export const getOcrBoundingBoxes = (ocrData: OcrBoundingBox[], metrics: ContentMetrics): OcrBox[] => {
export const getOcrBoundingBoxes = (ocrData: OcrBoundingBox[], imageSize: Size): OcrBox[] => {
const boxes: OcrBox[] = [];
for (const ocr of ocrData) {
const points = [
@@ -171,10 +167,7 @@ export const getOcrBoundingBoxes = (ocrData: OcrBoundingBox[], metrics: ContentM
{ x: ocr.x2, y: ocr.y2 },
{ x: ocr.x3, y: ocr.y3 },
{ x: ocr.x4, y: ocr.y4 },
].map((point) => ({
x: point.x * metrics.contentWidth + metrics.offsetX,
y: point.y * metrics.contentHeight + metrics.offsetY,
}));
].map((point) => mapNormalizedToContent(point, imageSize));
const boxWidth = Math.max(distance(points[0], points[1]), distance(points[3], points[2]));
const boxHeight = Math.max(distance(points[0], points[3]), distance(points[1], points[2]));
@@ -188,7 +181,7 @@ export const getOcrBoundingBoxes = (ocrData: OcrBoundingBox[], metrics: ContentM
});
}
const rowThreshold = metrics.contentHeight * 0.02;
const rowThreshold = imageSize.height * 0.02;
boxes.sort((a, b) => {
const yDifference = a.points[0].y - b.points[0].y;
if (Math.abs(yDifference) < rowThreshold) {

View File

@@ -1,5 +1,5 @@
import type { Faces } from '$lib/stores/people.store';
import type { ContentMetrics } from '$lib/utils/container-utils';
import type { Size } from '$lib/utils/container-utils';
import { getBoundingBox } from '$lib/utils/people-utils';
const makeFace = (overrides: Partial<Faces> = {}): Faces => ({
@@ -16,21 +16,21 @@ const makeFace = (overrides: Partial<Faces> = {}): Faces => ({
describe('getBoundingBox', () => {
it('should scale face coordinates to display dimensions', () => {
const face = makeFace();
const metrics: ContentMetrics = { contentWidth: 800, contentHeight: 600, offsetX: 0, offsetY: 0 };
const imageSize: Size = { width: 800, height: 600 };
const boxes = getBoundingBox([face], metrics);
const boxes = getBoundingBox([face], imageSize);
expect(boxes).toHaveLength(1);
expect(boxes[0]).toEqual({
id: 'face-1',
top: Math.round(600 * (750 / 3000)),
left: Math.round(800 * (1000 / 4000)),
width: Math.round(800 * (2000 / 4000) - 800 * (1000 / 4000)),
height: Math.round(600 * (1500 / 3000) - 600 * (750 / 3000)),
top: 600 * (750 / 3000),
left: 800 * (1000 / 4000),
width: 800 * (2000 / 4000) - 800 * (1000 / 4000),
height: 600 * (1500 / 3000) - 600 * (750 / 3000),
});
});
it('should apply offsets for letterboxed display', () => {
it('should map full-image face to full display area', () => {
const face = makeFace({
imageWidth: 1000,
imageHeight: 1000,
@@ -39,49 +39,21 @@ describe('getBoundingBox', () => {
boundingBoxX2: 1000,
boundingBoxY2: 1000,
});
const metrics: ContentMetrics = { contentWidth: 600, contentHeight: 600, offsetX: 100, offsetY: 0 };
const imageSize: Size = { width: 600, height: 600 };
const boxes = getBoundingBox([face], metrics);
const boxes = getBoundingBox([face], imageSize);
expect(boxes[0]).toEqual({
id: 'face-1',
top: 0,
left: 100,
left: 0,
width: 600,
height: 600,
});
});
it('should handle zoom by pre-scaled metrics', () => {
const face = makeFace({
imageWidth: 1000,
imageHeight: 1000,
boundingBoxX1: 0,
boundingBoxY1: 0,
boundingBoxX2: 500,
boundingBoxY2: 500,
});
const metrics: ContentMetrics = {
contentWidth: 1600,
contentHeight: 1200,
offsetX: -200,
offsetY: -100,
};
const boxes = getBoundingBox([face], metrics);
expect(boxes[0]).toEqual({
id: 'face-1',
top: -100,
left: -200,
width: 800,
height: 600,
});
});
it('should return empty array for empty faces', () => {
const metrics: ContentMetrics = { contentWidth: 800, contentHeight: 600, offsetX: 0, offsetY: 0 };
expect(getBoundingBox([], metrics)).toEqual([]);
expect(getBoundingBox([], { width: 800, height: 600 })).toEqual([]);
});
it('should handle multiple faces', () => {
@@ -89,9 +61,8 @@ describe('getBoundingBox', () => {
makeFace({ id: 'face-1', boundingBoxX1: 0, boundingBoxY1: 0, boundingBoxX2: 1000, boundingBoxY2: 1000 }),
makeFace({ id: 'face-2', boundingBoxX1: 2000, boundingBoxY1: 1500, boundingBoxX2: 3000, boundingBoxY2: 2500 }),
];
const metrics: ContentMetrics = { contentWidth: 800, contentHeight: 600, offsetX: 0, offsetY: 0 };
const boxes = getBoundingBox(faces, metrics);
const boxes = getBoundingBox(faces, { width: 800, height: 600 });
expect(boxes).toHaveLength(2);
expect(boxes[0].left).toBeLessThan(boxes[1].left);

View File

@@ -1,37 +1,21 @@
import type { Faces } from '$lib/stores/people.store';
import { getAssetMediaUrl } from '$lib/utils';
import type { ContentMetrics } from '$lib/utils/container-utils';
import { mapNormalizedRectToContent, type Rect, type Size } from '$lib/utils/container-utils';
import { AssetTypeEnum, type AssetFaceResponseDto } from '@immich/sdk';
export interface BoundingBox {
id: string;
top: number;
left: number;
width: number;
height: number;
}
export type BoundingBox = Rect & { id: string };
export const getBoundingBox = (faces: Faces[], metrics: ContentMetrics): BoundingBox[] => {
export const getBoundingBox = (faces: Faces[], imageSize: Size): BoundingBox[] => {
const boxes: BoundingBox[] = [];
for (const face of faces) {
const scaleX = metrics.contentWidth / face.imageWidth;
const scaleY = metrics.contentHeight / face.imageHeight;
const rect = mapNormalizedRectToContent(
{ x: face.boundingBoxX1 / face.imageWidth, y: face.boundingBoxY1 / face.imageHeight },
{ x: face.boundingBoxX2 / face.imageWidth, y: face.boundingBoxY2 / face.imageHeight },
imageSize,
);
const coordinates = {
x1: scaleX * face.boundingBoxX1 + metrics.offsetX,
x2: scaleX * face.boundingBoxX2 + metrics.offsetX,
y1: scaleY * face.boundingBoxY1 + metrics.offsetY,
y2: scaleY * face.boundingBoxY2 + metrics.offsetY,
};
boxes.push({
id: face.id,
top: Math.round(coordinates.y1),
left: Math.round(coordinates.x1),
width: Math.round(coordinates.x2 - coordinates.x1),
height: Math.round(coordinates.y2 - coordinates.y1),
});
boxes.push({ id: face.id, ...rect });
}
return boxes;