# clusterforge/kaiwo/CustomResourceDefinition_kaiwoconfigs.config.kaiwo.silogen.ai.yaml

---
# CustomResourceDefinition that registers the KaiwoConfig resource type.
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
  # A CRD name has the form <plural>.<group>; it matches spec.names.plural and spec.group.
  name: kaiwoconfigs.config.kaiwo.silogen.ai
  annotations:
    # controller-gen version annotation.
    controller-gen.kubebuilder.io/version: "v0.17.1"
spec:
  # API group of the KaiwoConfig resource.
  group: config.kaiwo.silogen.ai
  # Cluster scope: KaiwoConfig objects are not namespaced.
  scope: Cluster
  # Names under which the resource is exposed by the API.
  names:
    kind: KaiwoConfig
    listKind: KaiwoConfigList
    singular: kaiwoconfig
    plural: kaiwoconfigs
  versions:
    - name: v1alpha1
      schema:
        openAPIV3Schema:
          description: KaiwoConfig manages the Kaiwo operator's configuration which can be modified during runtime.
          properties:
            # Standard object fields: apiVersion and kind identify the type, metadata is an untyped object here.
            apiVersion:
              type: string
              description: |-
                APIVersion defines the versioned schema of this representation of an object.
                Servers should convert recognized schemas to the latest internal value, and
                may reject unrecognized values.
                More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
            kind:
              type: string
              description: |-
                Kind is a string value representing the REST resource this object represents.
                Servers may infer this from the endpoint the client submits requests to.
                Cannot be updated.
                In CamelCase.
                More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
            metadata:
              type: object
            spec:
              description: Spec defines the desired state for the Kaiwo operator configuration.
              properties:
                # Cohort name for the default cluster queue; defaults to "kaiwo".
                defaultClusterQueueCohortName:
                  type: string
                  default: "kaiwo"
                  description: |-
                    DefaultClusterQueueCohortName is the name of the default cohort that is used for the default cluster queue.
                    ClusterQueues in the same cohort can share resources.
                # Cluster queue used by workloads that do not name one; defaults to "kaiwo".
                defaultClusterQueueName:
                  type: string
                  default: "kaiwo"
                  description: >-
                    DefaultClusterQueueName is the name of the default cluster queue that is used for
                    workloads that don't explicitly specify a cluster queue.
                # Opt-in switch (off by default) for keeping the default cluster queue in sync with the cluster.
                dynamicallyUpdateDefaultClusterQueue:
                  type: boolean
                  default: false
                  description: |-
                    DynamicallyUpdateDefaultClusterQueue defines whether the Kaiwo operator should dynamically update default "kaiwo" clusterqueue.
                    If set to true, the operator will make sure that the default clusterqueue is always up to date and reflects total resources available.
                    If nodes are added or removed, the operator will update the default clusterqueue to reflect the current state of the cluster.
                # Node-level settings: GPU resource and taint keys, GPU tainting, and node-pool exclusion.
                nodes:
                  type: object
                  description: Nodes defines the node configuration settings
                  # `{}` default lets the nested property defaults apply when this section is omitted.
                  default: {}
                  properties:
                    addTaintsToGpuNodes:
                      type: boolean
                      default: false
                      description: >-
                        AddTaintsToGpuNodes if set to true, will add the DefaultGpuTaintKey taint to
                        the GPU nodes
                    defaultGpuResourceKey:
                      type: string
                      default: "amd.com/gpu"
                      description: >-
                        DefaultGpuResourceKey defines the default GPU resource key that is used to
                        reserve GPU capacity for pods
                    defaultGpuTaintKey:
                      type: string
                      default: "kaiwo.silogen.ai/gpu"
                      description: DefaultGpuTaintKey is the key that is used to taint GPU nodes
                    excludeMasterNodesFromNodePools:
                      type: boolean
                      default: false
                      description: >-
                        ExcludeMasterNodesFromNodePools allows excluding the master node(s) from the
                        node pools
                # Ray settings: fallback container image and the head pod's memory request.
                ray:
                  type: object
                  description: Ray defines the Ray-specific settings
                  # `{}` default lets the nested property defaults apply when this section is omitted.
                  default: {}
                  properties:
                    defaultRayImage:
                      type: string
                      # Quoted so the tag separator and numeric-looking tag are unambiguously part of one string.
                      default: "ghcr.io/silogen/rocm-ray:6.4"
                      description: >-
                        DefaultRayImage is the image that is used for Ray workloads if no image is
                        provided in the workload CRD
                    headPodMemory:
                      type: string
                      default: "16Gi"
                      description: HeadPodMemory is the amount of memory that is requested for the Ray head pod
                # Utilization monitoring and optional termination of underutilized workloads.
                resourceMonitoring:
                  type: object
                  description: ResourceMonitoring defines the resource-monitoring specific settings
                  # `{}` default lets the nested property defaults apply when this section is omitted.
                  default: {}
                  properties:
                    lowUtilizationThreshold:
                      # NOTE(review): the description speaks of a percentage, but only a lower bound is
                      # enforced; confirm whether values above 100 are meant to be accepted.
                      type: number
                      minimum: 0
                      default: 1
                      description: >-
                        LowUtilizationThreshold is the threshold which, if the metric goes under, the
                        workload is considered underutilized. The threshold is interpreted as the
                        percentage utilization versus the requested capacity.
                    profile:
                      type: string
                      # "gpu" is the only value the schema currently allows.
                      enum:
                        - gpu
                      default: gpu
                      description: Profile chooses the target resource to monitor.
                    targetNamespaces:
                      type: array
                      items:
                        type: string
                      description: >-
                        TargetNamespaces is a list of namespaces to apply the monitoring to. If not
                        supplied or empty, all namespaces apart from kube-system will be inspected.
                        However, only pods associated with KaiwoJobs or KaiwoServices are impacted.
                    terminateUnderutilized:
                      type: boolean
                      default: false
                      description: >-
                        TerminateUnderutilized will terminate workloads that are underutilizing
                        resources if set to `true`
                    terminateUnderutilizedAfter:
                      type: string
                      # One or more <integer><unit> groups, where unit is s, m or h (e.g. the default 24h).
                      pattern: '^([0-9]+(s|m|h))+$'
                      default: "24h"
                      description: >-
                        TerminateUnderutilizedAfter specifies the duration after which the workload
                        will be terminated if it has been underutilizing resources (for this amount
                        of time)
                # Scheduling defaults: scheduler name and the pending-time threshold used for preemption.
                scheduling:
                  type: object
                  description: Scheduling contains the configuration Kaiwo uses for workload scheduling
                  # `{}` default lets the nested property defaults apply when this section is omitted.
                  default: {}
                  properties:
                    kubeSchedulerName:
                      type: string
                      default: "kaiwo-scheduler"
                      description: >-
                        KubeSchedulerName defines the default scheduler name that is used to schedule
                        the workload
                    pendingThresholdForPreemption:
                      # NOTE(review): unlike resourceMonitoring.terminateUnderutilizedAfter, this
                      # duration-like string has no `pattern`; confirm its format is validated elsewhere.
                      type: string
                      default: "5m"
                      description: |-
                        PendingThresholdForPreemption is the threshold that is used to determine if a workload is awaiting for compute resources to be available.
                        If the workload is requesting GPUs and pending for longer than this threshold, kaiwo will start preempting workloads that have exceeded their duration deadline and are using GPUs of the same vendor as the pending workload.
                # Storage defaults: mount paths inside workload pods and the fallback storage class.
                storage:
                  type: object
                  description: Storage defines the storage-specific settings
                  # `{}` default lets the nested property defaults apply when this section is omitted.
                  default: {}
                  properties:
                    defaultDataMountPath:
                      type: string
                      default: "/workload"
                      description: |-
                        DefaultDataMountPath is the default path for the data storage and downloads that gets mounted in the workload pods.
                        This value can be overwritten in the workload CRD.
                    defaultHfMountPath:
                      type: string
                      default: "/hf_cache"
                      description: |-
                        DefaultHfMountPath is the default path for the HuggingFace that gets mounted in the workload pods. The `HF_HOME` environmental variable
                        is also set to this value. This value can be overwritten in the workload CRD.
                    defaultStorageClass:
                      # No schema default is declared for this field.
                      type: string
                      description: >-
                        DefaultStorageClass is the storage class that is used for workloads that
                        don't explicitly specify a storage class.
              type: object
          type: object
      served: true
      storage: true