Skip to content

Commit 18a81df

Browse files
committed
Debug
1 parent 1965a8e commit 18a81df

File tree

3 files changed

+89
-9
lines changed

3 files changed

+89
-9
lines changed

.github/workflows/additional_demo_notebook_tests.yaml

+49-5
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ env:
1414

1515
jobs:
1616
verify-hf_interactive:
17-
# if: ${{ github.event.label.name == 'test-additional-notebooks' }}
17+
if: ${{ github.event.label.name == 'test' }}
1818
runs-on: ubuntu-20.04-4core
1919

2020
steps:
@@ -85,6 +85,8 @@ jobs:
8585
# Remove login/logout cells, as KinD doesn't support authentication using token
8686
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' hf_interactive.ipynb > hf_interactive.ipynb.tmp && mv hf_interactive.ipynb.tmp hf_interactive.ipynb
8787
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' hf_interactive.ipynb > hf_interactive.ipynb.tmp && mv hf_interactive.ipynb.tmp hf_interactive.ipynb
88+
# Rewrite cluster_uri() to local_client_url() to get a client URL reachable from outside the cluster, since the test runs outside the cluster
89+
sed -i "s/cluster_uri()/local_client_url()/g" hf_interactive.ipynb
8890
# Replace async logs with waiting for job to finish, async logs don't work properly in papermill
8991
JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json)
9092
jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' hf_interactive.ipynb > hf_interactive.ipynb.tmp && mv hf_interactive.ipynb.tmp hf_interactive.ipynb
@@ -136,7 +138,7 @@ jobs:
136138
verify-local_interactive:
137139
# if: ${{ github.event.label.name == 'test-additional-notebooks' }}
138140
# runs-on: ubuntu-20.04-4core
139-
runs-on: ubuntu-20.04-4core
141+
runs-on: ubuntu-20.04-4core-gpu
140142

141143
steps:
142144
- name: Checkout code
@@ -174,9 +176,17 @@ jobs:
174176
python-version: '3.9'
175177
cache: 'pip' # caching pip dependencies
176178

179+
- name: Setup NVidia GPU environment for KinD
180+
uses: ./common/github-actions/nvidia-gpu-setup
181+
177182
- name: Setup and start KinD cluster
178183
uses: ./common/github-actions/kind
179184

185+
- name: Install NVidia GPU operator for KinD
186+
uses: ./common/github-actions/nvidia-gpu-operator
187+
with:
188+
enable-time-slicing: 'true'
189+
180190
- name: Deploy CodeFlare stack
181191
id: deploy
182192
run: |
@@ -188,6 +198,11 @@ jobs:
188198
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
189199
cd ..
190200
201+
- name: Install MINIO
202+
run: |
203+
kubectl apply -f ./tests/e2e/minio_deployment.yaml
204+
kubectl wait --timeout=120s --for=condition=Available=true deployment -n default minio
205+
191206
- name: Setup Additional demo notebooks execution
192207
run: |
193208
echo "Installing papermill and dependencies..."
@@ -205,14 +220,41 @@ jobs:
205220
# Remove login/logout cells, as KinD doesn't support authentication using token
206221
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object and log in to desired user account")))' local_interactive.ipynb > local_interactive.ipynb.tmp && mv local_interactive.ipynb.tmp local_interactive.ipynb
207222
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' local_interactive.ipynb > local_interactive.ipynb.tmp && mv local_interactive.ipynb.tmp local_interactive.ipynb
223+
# Rewrite cluster_uri() to local_client_url() to get a client URL reachable from outside the cluster, since the test runs outside the cluster
224+
sed -i "s/cluster_uri()/local_client_url()/" local_interactive.ipynb
208225
# Replace async logs with waiting for job to finish, async logs don't work properly in papermill
209226
JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json)
210227
jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' local_interactive.ipynb > local_interactive.ipynb.tmp && mv local_interactive.ipynb.tmp local_interactive.ipynb
211228
# Set explicit namespace, as the SDK (currently) needs it to resolve local queues
212-
sed -i "s/worker_cpu_requests=1,/worker_cpu_requests=1, namespace='default',/" local_interactive.ipynb
229+
sed -i "s/worker_cpu_requests=1,/worker_cpu_requests='250m', namespace='default',/" local_interactive.ipynb
213230
# Run notebook
214-
poetry run papermill local_interactive.ipynb hf_interactive_out.ipynb --log-output --execution-timeout 1200
231+
poetry run papermill local_interactive.ipynb local_interactive_out.ipynb --log-output --execution-timeout 1200
232+
env:
233+
GRPC_DNS_RESOLVER: "native"
215234
working-directory: demo-notebooks/additional-demos
235+
# - name: Run 2_basic_interactive.ipynb
236+
# run: |
237+
# set -euo pipefail
238+
239+
# # Remove login/logout cells, as KinD doesn't support authentication using token
240+
# jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb
241+
# jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb
242+
# # Rewrite cluster_uri() to local_client_url() to get a client URL reachable from outside the cluster, since the test runs outside the cluster
243+
# sed -i "s/cluster_uri()/local_client_url()/" 2_basic_interactive.ipynb
244+
# # Set explicit namespace, as the SDK (currently) needs it to resolve local queues
245+
# sed -i "s/head_cpu_limits=1,/head_cpu_limits=1, namespace='default',/" 2_basic_interactive.ipynb
246+
# # Add MINIO related modules to runtime environment
247+
# sed -i "s/\\\\\"transformers/\\\\\"s3fs\\\\\", \\\\\"pyarrow\\\\\", \\\\\"transformers/" 2_basic_interactive.ipynb
248+
# # Replace markdown cell with remote configuration for MINIO
249+
# MINIO_CONFIG=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/minio_remote_config_cell.json)
250+
# jq --argjson minio_config "$MINIO_CONFIG" -r '(.cells[] | select(.source[] | contains("Now that we are connected"))) |= $minio_config' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb
251+
# # Configure persistent storage for Ray trainer
252+
# sed -i -E "s/# run_config.*\)/, run_config=ray.get(get_minio_run_config.remote())/" 2_basic_interactive.ipynb
253+
# # Run notebook
254+
# poetry run papermill 2_basic_interactive.ipynb 2_basic_interactive_out.ipynb --log-output --execution-timeout 1200
255+
# env:
256+
# GRPC_DNS_RESOLVER: "native"
257+
# working-directory: demo-notebooks/guided-demos
216258

217259
- name: Print CodeFlare operator logs
218260
if: always() && steps.deploy.outcome == 'success'
@@ -249,7 +291,7 @@ jobs:
249291
${{ env.TEMP_DIR }}/**/*.log
250292
251293
verify-ray_job_client:
252-
# if: ${{ github.event.label.name == 'test-additional-notebooks' }}
294+
if: ${{ github.event.label.name == 'test' }}
253295
runs-on: ubuntu-20.04-4core
254296

255297
steps:
@@ -319,6 +361,8 @@ jobs:
319361
# Remove login/logout cells, as KinD doesn't support authentication using token
320362
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' ray_job_client.ipynb > ray_job_client.ipynb.tmp && mv ray_job_client.ipynb.tmp ray_job_client.ipynb
321363
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' ray_job_client.ipynb > ray_job_client.ipynb.tmp && mv ray_job_client.ipynb.tmp ray_job_client.ipynb
364+
# Rewrite cluster_uri() to local_client_url() to get a client URL reachable from outside the cluster, since the test runs outside the cluster
365+
sed -i "s/cluster_uri()/local_client_url()/g" ray_job_client.ipynb
322366
# Replace async logs with waiting for job to finish, async logs don't work properly in papermill
323367
JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json)
324368
jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' ray_job_client.ipynb > ray_job_client.ipynb.tmp && mv ray_job_client.ipynb.tmp ray_job_client.ipynb

.github/workflows/guided_notebook_tests.yaml

+2-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ name: Guided notebooks tests
33
on:
44
pull_request:
55
branches: [ main ]
6+
workflow_dispatch:
67

78
concurrency:
89
group: ${{ github.head_ref }}-${{ github.workflow }}
@@ -246,7 +247,7 @@ jobs:
246247
${{ env.TEMP_DIR }}/**/*.log
247248
248249
verify-2_basic_interactive:
249-
if: ${{ contains(github.event.pull_request.labels.*.name, 'test-guided-notebooks') }}
250+
#if: ${{ contains(github.event.pull_request.labels.*.name, 'test-guided-notebooks') }}
250251
runs-on: ubuntu-20.04-4core-gpu
251252

252253
steps:

demo-notebooks/additional-demos/local_interactive.ipynb

+38-3
Original file line numberDiff line numberDiff line change
@@ -54,12 +54,16 @@
5454
"source": [
5555
"# Create and submit our cluster\n",
5656
"# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
57-
"cluster_name = \"hfgputest-1\"\n",
57+
"cluster_name = \"hfgputest\"\n",
5858
"\n",
5959
"cluster = Cluster(ClusterConfiguration(\n",
6060
" name=cluster_name,\n",
6161
" head_extended_resource_requests={'nvidia.com/gpu':0}, # For GPU enabled workloads set the head_extended_resource_requests and worker_extended_resource_requests\n",
6262
" worker_extended_resource_requests={'nvidia.com/gpu':0},\n",
63+
" head_cpu_requests=1,\n",
64+
" head_cpu_limits=1,\n",
65+
" head_memory_requests=6,\n",
66+
" head_memory_limits=6,\n",
6367
" num_workers=1,\n",
6468
" worker_cpu_requests=1,\n",
6569
" worker_cpu_limits=1,\n",
@@ -95,6 +99,16 @@
9599
"cluster.wait_ready()"
96100
]
97101
},
102+
{
103+
"cell_type": "code",
104+
"execution_count": null,
105+
"id": "2c36c2b5",
106+
"metadata": {},
107+
"outputs": [],
108+
"source": [
109+
"cluster.details()"
110+
]
111+
},
98112
{
99113
"cell_type": "markdown",
100114
"id": "12eef53c",
@@ -127,8 +141,29 @@
127141
"source": [
128142
"import ray\n",
129143
"\n",
130-
"ray.shutdown()\n",
131-
"ray.init(address=cluster.local_client_url(), logging_level=\"DEBUG\")"
144+
"ray.shutdown()\n"
145+
]
146+
},
147+
{
148+
"cell_type": "code",
149+
"execution_count": null,
150+
"id": "dd56c33c",
151+
"metadata": {},
152+
"outputs": [],
153+
"source": [
154+
"ray_cluster_uri = cluster.cluster_uri()\n",
155+
"print(f\"Ray cluster URI: {ray_cluster_uri}\")\n"
156+
]
157+
},
158+
{
159+
"cell_type": "code",
160+
"execution_count": null,
161+
"id": "3dab4124",
162+
"metadata": {},
163+
"outputs": [],
164+
"source": [
165+
"runtime_env = {\"pip\": [\"transformers==4.41.2\", \"datasets==2.17.0\", \"accelerate==0.31.0\", \"scikit-learn==1.5.0\"]}\n",
166+
"ray.init(address=ray_cluster_uri, runtime_env=runtime_env)"
132167
]
133168
},
134169
{

0 commit comments

Comments
 (0)