Run PR check for guided notebooks

sutaakar · sutaakar · commit 1e84d8903c3b · 2024-07-25T20:09:46.000+02:00
diff --git a/.github/resources/minio_remote_config_cell.json b/.github/resources/minio_remote_config_cell.json
@@ -0,0 +1,20 @@
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@ray.remote\n",
+    "def get_minio_run_config():\n",
+    "    import s3fs\n",
+    "    import pyarrow\n",
+    "    s3_fs = s3fs.S3FileSystem(\n",
+    "        key = \"minio\",\n",
+    "        secret = \"minio123\",\n",
+    "        endpoint_url = \"http://minio-service.default.svc.cluster.local:9000\"\n",
+    "    )\n",
+    "    custom_fs = pyarrow.fs.PyFileSystem(pyarrow.fs.FSSpecHandler(s3_fs))\n",
+    "    run_config = ray.train.RunConfig(storage_path='training', storage_filesystem=custom_fs)\n",
+    "    return run_config"
+   ]
+  }
diff --git a/.github/resources/wait_for_job_cell.json b/.github/resources/wait_for_job_cell.json
@@ -0,0 +1,20 @@
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from time import sleep\n",
+    "\n",
+    "finished = False\n",
+    "while not finished:\n",
+    "    sleep(5)\n",
+    "    status = client.get_job_status(submission_id)\n",
+    "    finished = (status == \"SUCCEEDED\" or status == \"FAILED\" or status == \"STOPPED\")\n",
+    "    print(status)\n",
+    "print(\"Job status \" + status)\n",
+    "print(\"Logs: \")\n",
+    "print(client.get_job_logs(submission_id))\n",
+    "assert status == \"SUCCEEDED\", \"Job failed or was stopped!\""
+   ]
+  }
diff --git a/.github/workflows/guided_notebook_tests.yaml b/.github/workflows/guided_notebook_tests.yaml
@@ -1,4 +1,4 @@
-name: e2e
+name: Guided notebooks tests
 
 on:
   pull_request:
@@ -27,6 +27,10 @@ concurrency:
 env:
   CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
 
+permissions:
+   id-token: write   # This is required for requesting the JWT
+   contents: read
+
 jobs:
   kubernetes:
 
@@ -76,6 +80,8 @@ jobs:
 
       - name: Install NVidia GPU operator for KinD
         uses: ./common/github-actions/nvidia-gpu-operator
+        with:
+          enable-time-slicing: 'true'
 
       - name: Deploy CodeFlare stack
         id: deploy
@@ -88,6 +94,11 @@ jobs:
           kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
           cd ..
 
+      - name: Install MINIO
+        run: |
+          kubectl apply -f ./tests/e2e/minio_deployment.yaml
+          kubectl wait --timeout=120s --for=condition=Available=true deployment -n default minio
+
       - name: Add user to KinD
         uses: ./common/github-actions/kind-add-user
         with:
@@ -113,46 +124,99 @@ jobs:
           kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
           kubectl config use-context sdk-user
 
-      - name: Run e2e tests
+      - name: Setup Guided notebooks execution
         run: |
-          export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
-          echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
+          echo "Installing papermill and dependencies..."
+          pip install poetry papermill ipython ipykernel
+          # Disable virtualenv creation, as papermill has problems using packages installed in a virtualenv
+          poetry config virtualenvs.create false
 
-          set -euo pipefail
-          pip install poetry
+          echo "Installing SDK..."
           poetry install --with test,docs
-          echo "Running e2e tests..."
-          poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
+
+      - name: Run 0_basic_ray.ipynb
+        run: |
+          set -euo pipefail
+
+          # Remove login/logout cells, as KinD doesn't support authentication using token
+          jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 0_basic_ray.ipynb > 0_basic_ray.ipynb.tmp && mv 0_basic_ray.ipynb.tmp 0_basic_ray.ipynb
+          jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 0_basic_ray.ipynb > 0_basic_ray.ipynb.tmp && mv 0_basic_ray.ipynb.tmp 0_basic_ray.ipynb
+          # Run notebook
+#          poetry run papermill 0_basic_ray.ipynb 0_basic_ray_out.ipynb --log-output --execution-timeout 600
+        working-directory: demo-notebooks/guided-demos
+
+      - name: Run 1_cluster_job_client.ipynb
+        run: |
+          set -euo pipefail
+
+          # Remove login/logout cells, as KinD doesn't support authentication using token
+          jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb
+          jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb
+          # Replace async log tailing with waiting for the job to finish, as async logs don't work properly in papermill
+          JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json)
+          jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb
+          # Run notebook
+#          poetry run papermill 1_cluster_job_client.ipynb 1_cluster_job_client_out.ipynb --log-output --execution-timeout 1200
+        working-directory: demo-notebooks/guided-demos
+
+      - name: Run 2_basic_interactive.ipynb
+        run: |
+          set -euo pipefail
+
+          # Remove login/logout cells, as KinD doesn't support authentication using token
+          jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb
+          jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb
+          # Rewrite cluster_uri() to local_client_url() to retrieve a client URL reachable from outside the cluster, as the test is executed outside of the cluster
+          sed -i "s/cluster_uri()/local_client_url()/" 2_basic_interactive.ipynb
+          # Set an explicit namespace, as the SDK (currently) needs it to resolve local queues
+          sed -i "s/head_cpus=1,/head_cpus=1, namespace='default',/" 2_basic_interactive.ipynb
+          # Add MINIO related modules to runtime environment
+          sed -i "s/transformers/s3fs\\\\\", \\\\\"pyarrow\\\\\", \\\\\"transformers/" 2_basic_interactive.ipynb
+          # Replace markdown cell with remote configuration for MINIO
+          MINIO_CONFIG=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/minio_remote_config_cell.json)
+          jq --argjson minio_config "$MINIO_CONFIG" -r '(.cells[] | select(.source[] | contains("Now that we are connected"))) |= $minio_config' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb
+          # Configure persistent storage for Ray trainer
+          sed -i -E "s/# run_config.*\)/, run_config=ray.get(get_minio_run_config.remote())/" 2_basic_interactive.ipynb
+          # Run notebook
+          poetry run papermill 2_basic_interactive.ipynb 2_basic_interactive_out.ipynb --log-output --execution-timeout 1200
         env:
           GRPC_DNS_RESOLVER: "native"
+        working-directory: demo-notebooks/guided-demos
 
       - name: Switch to kind-cluster context to print logs
         if: always() && steps.deploy.outcome == 'success'
         run: kubectl config use-context kind-cluster
 
-      - name: Print Pytest output log
+      - name: Print debug info
         if: always() && steps.deploy.outcome == 'success'
         run: |
-          echo "Printing Pytest output logs"
-          cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log
+          echo "Printing debug info"
+          kubectl describe pods -n default
 
       - name: Print CodeFlare operator logs
         if: always() && steps.deploy.outcome == 'success'
         run: |
           echo "Printing CodeFlare operator logs"
-          kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log
+          kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${TEMP_DIR}/codeflare-operator.log
+
+      - name: Print Kueue operator logs
+        if: always() && steps.deploy.outcome == 'success'
+        run: |
+          echo "Printing Kueue operator logs"
+          KUEUE_CONTROLLER_POD=$(kubectl get pods -n kueue-system | grep kueue-controller | awk '{print $1}')
+          kubectl logs -n kueue-system --tail -1 ${KUEUE_CONTROLLER_POD} | tee ${TEMP_DIR}/kueue.log
 
       - name: Print KubeRay operator logs
         if: always() && steps.deploy.outcome == 'success'
         run: |
           echo "Printing KubeRay operator logs"
-          kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log
+          kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${TEMP_DIR}/kuberay.log
 
       - name: Export all KinD pod logs
         uses: ./common/github-actions/kind-export-logs
         if: always() && steps.deploy.outcome == 'success'
         with:
-          output-directory: ${CODEFLARE_TEST_OUTPUT_DIR}
+          output-directory: ${TEMP_DIR}
 
       - name: Upload logs
         uses: actions/upload-artifact@v4
@@ -161,4 +225,4 @@ jobs:
           name: logs
           retention-days: 10
           path: |
-            ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
+            ${{ env.TEMP_DIR }}/**/*.log
diff --git a/demo-notebooks/guided-demos/0_basic_ray.ipynb b/demo-notebooks/guided-demos/0_basic_ray.ipynb
@@ -62,10 +62,12 @@
     "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
     "cluster = Cluster(ClusterConfiguration(\n",
     "    name='raytest', \n",
+    "    head_cpus='500m',\n",
+    "    head_memory=2,\n",
     "    head_gpus=0, # For GPU enabled workloads set the head_gpus and num_gpus\n",
     "    num_gpus=0,\n",
     "    num_workers=2,\n",
-    "    min_cpus=1,\n",
+    "    min_cpus='250m',\n",
     "    max_cpus=1,\n",
     "    min_memory=4,\n",
     "    max_memory=4,\n",
diff --git a/demo-notebooks/guided-demos/1_cluster_job_client.ipynb b/demo-notebooks/guided-demos/1_cluster_job_client.ipynb
@@ -44,10 +44,12 @@
     "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
     "cluster = Cluster(ClusterConfiguration(\n",
     "    name='jobtest',\n",
+    "    head_cpus=1,\n",
+    "    head_memory=4,\n",
     "    head_gpus=1, # For GPU enabled workloads set the head_gpus and num_gpus\n",
     "    num_gpus=1,\n",
     "    num_workers=2,\n",
-    "    min_cpus=1,\n",
+    "    min_cpus='250m',\n",
     "    max_cpus=1,\n",
     "    min_memory=4,\n",
     "    max_memory=4,\n",
diff --git a/demo-notebooks/guided-demos/2_basic_interactive.ipynb b/demo-notebooks/guided-demos/2_basic_interactive.ipynb
@@ -60,13 +60,15 @@
     "cluster_name = \"interactivetest\"\n",
     "cluster = Cluster(ClusterConfiguration(\n",
     "    name=cluster_name,\n",
+    "    head_cpus=1,\n",
+    "    head_memory=4,\n",
     "    head_gpus=1, # For GPU enabled workloads set the head_gpus and num_gpus\n",
     "    num_gpus=1,\n",
     "    num_workers=2,\n",
-    "    min_cpus=2,\n",
-    "    max_cpus=2,\n",
-    "    min_memory=8,\n",
-    "    max_memory=8,\n",
+    "    min_cpus='250m',\n",
+    "    max_cpus=1,\n",
+    "    min_memory=4,\n",
+    "    max_memory=4,\n",
     "    image=\"quay.io/rhoai/ray:2.23.0-py39-cu121\",\n",
     "    write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
     "    # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
@@ -251,7 +253,17 @@
     "\n",
     "    ray_trainer = TorchTrainer(\n",
     "        train_func,\n",
-    "        scaling_config=ScalingConfig(num_workers=3, use_gpu=True),\n",
+    "        scaling_config=ScalingConfig(\n",
+    "            # num_workers = number of worker nodes with the ray head node included\n",
+    "            num_workers=3,\n",
+    "            use_gpu=True,\n",
+    "            resources_per_worker={\n",
+    "                \"CPU\": 1,\n",
+    "            },\n",
+    "            trainer_resources={\n",
+    "                \"CPU\": 0,\n",
+    "            }\n",
+    "        )\n",
     "        # Configure persistent storage that is accessible across \n",
     "        # all worker nodes.\n",
     "        # Uncomment and update the RunConfig below to include your storage details.\n",
diff --git a/demo-notebooks/guided-demos/mnist_fashion.py b/demo-notebooks/guided-demos/mnist_fashion.py
@@ -78,8 +78,16 @@ def train_func_distributed():
 trainer = TorchTrainer(
     train_func_distributed,
     scaling_config=ScalingConfig(
-        num_workers=3, use_gpu=use_gpu
-    ),  # num_workers = number of worker nodes with the ray head node included
+        # num_workers = number of worker nodes with the ray head node included
+        num_workers=3,
+        use_gpu=use_gpu,
+        resources_per_worker={
+            "CPU": 1,
+        },
+        trainer_resources={
+            "CPU": 0,
+        },
+    ),
 )
 
 results = trainer.fit()
diff --git a/tests/e2e/minio_deployment.yaml b/tests/e2e/minio_deployment.yaml
@@ -88,10 +88,7 @@ spec:
               mountPath: /data
               subPath: minio
           terminationMessagePolicy: File
-          image: >-
-            quay.io/minio/minio:RELEASE.2024-06-22T05-26-45Z
-            # In case of disconnected environment, use image digest instead of tag
-            # For example : <mirror_registry_endpoint>/minio/minio@sha256:6b3abf2f59286b985bfde2b23e37230b466081eda5dccbf971524d54c8e406b5
+          image: quay.io/minio/minio:RELEASE.2024-06-22T05-26-45Z
           args:
             - server
             - /data
@@ -129,35 +126,3 @@ spec:
   sessionAffinity: None
   selector:
     app: minio
----
-kind: Route
-apiVersion: route.openshift.io/v1
-metadata:
-  name: minio-api
-spec:
-  to:
-    kind: Service
-    name: minio-service
-    weight: 100
-  port:
-    targetPort: api
-  wildcardPolicy: None
-  tls:
-    termination: edge
-    insecureEdgeTerminationPolicy: Redirect
----
-kind: Route
-apiVersion: route.openshift.io/v1
-metadata:
-  name: minio-ui
-spec:
-  to:
-    kind: Service
-    name: minio-service
-    weight: 100
-  port:
-    targetPort: ui
-  wildcardPolicy: None
-  tls:
-    termination: edge
-    insecureEdgeTerminationPolicy: Redirect