# Docker Compose definition for a single GPU-backed PyTorch container.
# The container starts a bash shell instead of a training command.

# Compose file format version declaration.
version: "3.8"

services:
  llm_trainer:
    # Fixed container name instead of a Compose-generated one.
    container_name: llm_trainer

    # Image tag pins PyTorch 2.1.2 with CUDA 11.8 / cuDNN 8 (devel variant).
    image: pytorch/pytorch:2.1.2-cuda11.8-cudnn8-devel

    # Start a bash shell with a pseudo-TTY allocated.
    # NOTE(review): stdin_open is not set, so this presumably relies on
    # `docker exec -it llm_trainer bash` for interactive use - confirm.
    entrypoint:
      - /bin/bash
    tty: true
    working_dir: /workspace

    # Use the host's network stack directly (no port mappings are declared).
    network_mode: host

    # Size of /dev/shm inside the container.
    # NOTE(review): presumably sized for PyTorch DataLoader workers - confirm.
    shm_size: "16gb"

    # Bind mounts, all relative to the directory holding this file.
    volumes:
      # Project root, which is also the working directory.
      - "./:/workspace"
      - "./data:/datasets"
      - "./model:/model"

    environment:
      # PyTorch CUDA caching-allocator option. Quoted so the embedded colon
      # can never be read as a nested mapping.
      PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"

    # Reserve every NVIDIA GPU on the host for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities:
                - gpu
              count: all