diff --git a/docs/sphinx/user-docs/cluster-configuration.rst b/docs/sphinx/user-docs/cluster-configuration.rst index 7ca871e70..0dc54930d 100644 --- a/docs/sphinx/user-docs/cluster-configuration.rst +++ b/docs/sphinx/user-docs/cluster-configuration.rst @@ -44,6 +44,21 @@ requirements for creating the Ray Cluster. documentation on building a custom image `here `__. +Ray Usage Statistics +-------------------- + +By default, Ray usage statistics collection is disabled in CodeFlare SDK clusters. This prevents usage data from being sent to Anyscale. To enable usage statistics collection, set the ``RAY_USAGE_STATS_ENABLED`` environment variable to ``1`` in your cluster configuration: + +.. code:: python + + from codeflare_sdk import Cluster, ClusterConfiguration + + cluster = Cluster(ClusterConfiguration( + name='ray-example', + namespace='default', + envs={'RAY_USAGE_STATS_ENABLED': '1'} # Enable usage statistics + )) + The ``labels={"exampleLabel": "example"}`` parameter can be used to apply additional labels to the RayCluster resource. 
diff --git a/src/codeflare_sdk/ray/cluster/config.py b/src/codeflare_sdk/ray/cluster/config.py index 4f646baaa..8ea97d550 100644 --- a/src/codeflare_sdk/ray/cluster/config.py +++ b/src/codeflare_sdk/ray/cluster/config.py @@ -161,6 +161,10 @@ def __post_init__(self): "Warning: TLS verification has been disabled - Endpoint checks will be bypassed" ) + # Set default environment variable to disable Ray usage stats if not already set + if "RAY_USAGE_STATS_ENABLED" not in self.envs: + self.envs["RAY_USAGE_STATS_ENABLED"] = "0" + if self.enable_gcs_ft: if not self.redis_address: raise ValueError( diff --git a/src/codeflare_sdk/ray/cluster/test_cluster.py b/src/codeflare_sdk/ray/cluster/test_cluster.py index 298c416ed..ce684607c 100644 --- a/src/codeflare_sdk/ray/cluster/test_cluster.py +++ b/src/codeflare_sdk/ray/cluster/test_cluster.py @@ -465,11 +465,10 @@ def test_get_cluster_no_appwrapper(mocker): return_value=expected_rc, ) get_cluster("test-all-params", "ns", write_to_file=True) - assert filecmp.cmp( - f"{aw_dir}test-all-params.yaml", - f"{expected_clusters_dir}/ray/unit-test-all-params.yaml", - shallow=True, - ) + + with open(f"{aw_dir}test-all-params.yaml") as f: + generated_rc = yaml.load(f, Loader=yaml.FullLoader) + assert generated_rc == expected_rc def test_get_cluster_with_appwrapper(mocker): @@ -487,11 +486,10 @@ def test_get_cluster_with_appwrapper(mocker): return_value=expected_aw, ) get_cluster("aw-all-params", "ns", write_to_file=True) - assert filecmp.cmp( - f"{aw_dir}aw-all-params.yaml", - f"{expected_clusters_dir}/appwrapper/unit-test-all-params.yaml", - shallow=True, - ) + + with open(f"{aw_dir}aw-all-params.yaml") as f: + generated_aw = yaml.load(f, Loader=yaml.FullLoader) + assert generated_aw == expected_aw def test_wait_ready(mocker, capsys): diff --git a/src/codeflare_sdk/ray/cluster/test_config.py b/src/codeflare_sdk/ray/cluster/test_config.py index 6007f60b3..6c990c193 100644 --- a/src/codeflare_sdk/ray/cluster/test_config.py +++ 
b/src/codeflare_sdk/ray/cluster/test_config.py @@ -24,6 +24,7 @@ import filecmp import pytest import os +import yaml parent = Path(__file__).resolve().parents[4] # project directory expected_clusters_dir = f"{parent}/tests/test_cluster_yamls" @@ -85,7 +86,11 @@ def test_config_creation_all_parameters(mocker): assert cluster.config.worker_memory_requests == "12G" assert cluster.config.worker_memory_limits == "16G" assert cluster.config.appwrapper == False - assert cluster.config.envs == {"key1": "value1", "key2": "value2"} + assert cluster.config.envs == { + "key1": "value1", + "key2": "value2", + "RAY_USAGE_STATS_ENABLED": "0", + } assert cluster.config.image == "example/ray:tag" assert cluster.config.image_pull_secrets == ["secret1", "secret2"] assert cluster.config.write_to_file == True @@ -206,6 +211,46 @@ def test_gcs_fault_tolerance_config_validation(): ) +def test_ray_usage_stats_default(mocker): + mocker.patch("kubernetes.client.ApisApi.get_api_versions") + mocker.patch("kubernetes.client.CustomObjectsApi.list_namespaced_custom_object") + + cluster = Cluster( + ClusterConfiguration(name="default-usage-stats-cluster", namespace="ns") + ) + + # Verify that usage stats are disabled by default + assert cluster.config.envs["RAY_USAGE_STATS_ENABLED"] == "0" + + # Check that the environment variable is set in the YAML + head_container = cluster.resource_yaml["spec"]["headGroupSpec"]["template"]["spec"][ + "containers" + ][0] + env_vars = {env["name"]: env["value"] for env in head_container["env"]} + assert env_vars["RAY_USAGE_STATS_ENABLED"] == "0" + + +def test_ray_usage_stats_enabled(mocker): + mocker.patch("kubernetes.client.ApisApi.get_api_versions") + mocker.patch("kubernetes.client.CustomObjectsApi.list_namespaced_custom_object") + + cluster = Cluster( + ClusterConfiguration( + name="usage-stats-enabled-cluster", + namespace="ns", + envs={"RAY_USAGE_STATS_ENABLED": "1"}, + ) + ) + + assert cluster.config.envs["RAY_USAGE_STATS_ENABLED"] == "1" + + 
head_container = cluster.resource_yaml["spec"]["headGroupSpec"]["template"]["spec"][ + "containers" + ][0] + env_vars = {env["name"]: env["value"] for env in head_container["env"]} + assert env_vars["RAY_USAGE_STATS_ENABLED"] == "1" + + # Make sure to always keep this function last def test_cleanup(): os.remove(f"{aw_dir}test-all-params.yaml") diff --git a/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml b/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml index f1b754102..af5acbadc 100644 --- a/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml +++ b/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml @@ -54,6 +54,8 @@ spec: value: value1 - name: key2 value: value2 + - name: RAY_USAGE_STATS_ENABLED + value: '0' image: example/ray:tag imagePullPolicy: Always lifecycle: @@ -159,6 +161,8 @@ spec: value: value1 - name: key2 value: value2 + - name: RAY_USAGE_STATS_ENABLED + value: '0' image: example/ray:tag imagePullPolicy: Always lifecycle: diff --git a/tests/test_cluster_yamls/kueue/aw_kueue.yaml b/tests/test_cluster_yamls/kueue/aw_kueue.yaml index fd78f0709..7101f6a85 100644 --- a/tests/test_cluster_yamls/kueue/aw_kueue.yaml +++ b/tests/test_cluster_yamls/kueue/aw_kueue.yaml @@ -75,6 +75,9 @@ spec: - mountPath: /etc/ssl/certs/odh-ca-bundle.crt name: odh-ca-cert subPath: odh-ca-bundle.crt + env: + - name: RAY_USAGE_STATS_ENABLED + value: '0' volumes: - configMap: items: @@ -133,6 +136,9 @@ spec: - mountPath: /etc/ssl/certs/odh-ca-bundle.crt name: odh-ca-cert subPath: odh-ca-bundle.crt + env: + - name: RAY_USAGE_STATS_ENABLED + value: '0' volumes: - configMap: items: diff --git a/tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml b/tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml index a6dd81d7d..f8b3aa46d 100644 --- a/tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml +++ b/tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml @@ -75,6 +75,9 @@ spec: - mountPath: /etc/ssl/certs/odh-ca-bundle.crt name: 
odh-ca-cert subPath: odh-ca-bundle.crt + env: + - name: RAY_USAGE_STATS_ENABLED + value: '0' volumes: - configMap: items: @@ -133,6 +136,9 @@ spec: - mountPath: /etc/ssl/certs/odh-ca-bundle.crt name: odh-ca-cert subPath: odh-ca-bundle.crt + env: + - name: RAY_USAGE_STATS_ENABLED + value: '0' volumes: - configMap: items: diff --git a/tests/test_cluster_yamls/ray/default-appwrapper.yaml b/tests/test_cluster_yamls/ray/default-appwrapper.yaml index 6d1cdcd53..1532c0e8c 100644 --- a/tests/test_cluster_yamls/ray/default-appwrapper.yaml +++ b/tests/test_cluster_yamls/ray/default-appwrapper.yaml @@ -53,6 +53,9 @@ spec: name: dashboard - containerPort: 10001 name: client + env: + - name: RAY_USAGE_STATS_ENABLED + value: '0' resources: limits: cpu: 2 @@ -111,6 +114,9 @@ spec: - -c - ray stop name: machine-learning + env: + - name: RAY_USAGE_STATS_ENABLED + value: '0' resources: limits: cpu: 1 diff --git a/tests/test_cluster_yamls/ray/default-ray-cluster.yaml b/tests/test_cluster_yamls/ray/default-ray-cluster.yaml index 38e02f8f3..db4010265 100644 --- a/tests/test_cluster_yamls/ray/default-ray-cluster.yaml +++ b/tests/test_cluster_yamls/ray/default-ray-cluster.yaml @@ -45,6 +45,9 @@ spec: name: dashboard - containerPort: 10001 name: client + env: + - name: RAY_USAGE_STATS_ENABLED + value: '0' resources: limits: cpu: 2 @@ -110,6 +113,9 @@ spec: requests: cpu: 1 memory: 2G + env: + - name: RAY_USAGE_STATS_ENABLED + value: '0' volumeMounts: - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert diff --git a/tests/test_cluster_yamls/ray/unit-test-all-params.yaml b/tests/test_cluster_yamls/ray/unit-test-all-params.yaml index d5d8059df..6900b0584 100644 --- a/tests/test_cluster_yamls/ray/unit-test-all-params.yaml +++ b/tests/test_cluster_yamls/ray/unit-test-all-params.yaml @@ -45,6 +45,8 @@ spec: value: value1 - name: key2 value: value2 + - name: RAY_USAGE_STATS_ENABLED + value: '0' image: example/ray:tag imagePullPolicy: Always lifecycle: @@ -150,6 
+152,8 @@ spec: value: value1 - name: key2 value: value2 + - name: RAY_USAGE_STATS_ENABLED + value: '0' image: example/ray:tag imagePullPolicy: Always lifecycle: