Skip to content

Commit 564fbbb

Browse files
committed
Debug
1 parent 1965a8e commit 564fbbb

File tree

3 files changed

+93
-16
lines changed

3 files changed

+93
-16
lines changed

.github/workflows/additional_demo_notebook_tests.yaml

+54-12
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ env:
1414

1515
jobs:
1616
verify-hf_interactive:
17-
# if: ${{ github.event.label.name == 'test-additional-notebooks' }}
17+
if: ${{ github.event.label.name == 'test' }}
1818
runs-on: ubuntu-20.04-4core
1919

2020
steps:
@@ -85,6 +85,8 @@ jobs:
8585
# Remove login/logout cells, as KinD doesn't support authentication using token
8686
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' hf_interactive.ipynb > hf_interactive.ipynb.tmp && mv hf_interactive.ipynb.tmp hf_interactive.ipynb
8787
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' hf_interactive.ipynb > hf_interactive.ipynb.tmp && mv hf_interactive.ipynb.tmp hf_interactive.ipynb
88+
# Rewrite cluster_uri() to local_client_url() to obtain a client URL reachable from outside the cluster, as the test is executed outside the cluster
89+
sed -i "s/cluster_uri()/local_client_url()/g" hf_interactive.ipynb
8890
# Replace async logs with waiting for job to finish, async logs don't work properly in papermill
8991
JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json)
9092
jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' hf_interactive.ipynb > hf_interactive.ipynb.tmp && mv hf_interactive.ipynb.tmp hf_interactive.ipynb
@@ -136,7 +138,7 @@ jobs:
136138
verify-local_interactive:
137139
# if: ${{ github.event.label.name == 'test-additional-notebooks' }}
138140
# runs-on: ubuntu-20.04-4core
139-
runs-on: ubuntu-20.04-4core
141+
runs-on: ubuntu-20.04-4core-gpu
140142

141143
steps:
142144
- name: Checkout code
@@ -174,9 +176,17 @@ jobs:
174176
python-version: '3.9'
175177
cache: 'pip' # caching pip dependencies
176178

179+
- name: Setup NVidia GPU environment for KinD
180+
uses: ./common/github-actions/nvidia-gpu-setup
181+
177182
- name: Setup and start KinD cluster
178183
uses: ./common/github-actions/kind
179184

185+
- name: Install NVidia GPU operator for KinD
186+
uses: ./common/github-actions/nvidia-gpu-operator
187+
with:
188+
enable-time-slicing: 'true'
189+
180190
- name: Deploy CodeFlare stack
181191
id: deploy
182192
run: |
@@ -188,6 +198,11 @@ jobs:
188198
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
189199
cd ..
190200
201+
- name: Install MINIO
202+
run: |
203+
kubectl apply -f ./tests/e2e/minio_deployment.yaml
204+
kubectl wait --timeout=120s --for=condition=Available=true deployment -n default minio
205+
191206
- name: Setup Additional demo notebooks execution
192207
run: |
193208
echo "Installing papermill and dependencies..."
@@ -198,21 +213,46 @@ jobs:
198213
echo "Installing SDK..."
199214
poetry install --with test,docs
200215
201-
- name: Run local_interactive.ipynb
216+
# - name: Run local_interactive.ipynb
217+
# run: |
218+
# set -euo pipefail
219+
220+
# # Remove login/logout cells, as KinD doesn't support authentication using token
221+
# jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object and log in to desired user account")))' local_interactive.ipynb > local_interactive.ipynb.tmp && mv local_interactive.ipynb.tmp local_interactive.ipynb
222+
# jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' local_interactive.ipynb > local_interactive.ipynb.tmp && mv local_interactive.ipynb.tmp local_interactive.ipynb
223+
# # Rewrite cluster_uri() to local_client_url() to obtain a client URL reachable from outside the cluster, as the test is executed outside the cluster
224+
# sed -i "s/cluster_uri()/local_client_url()/g" local_interactive.ipynb
225+
# # Replace async logs with waiting for job to finish, async logs don't work properly in papermill
226+
# JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json)
227+
# jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' local_interactive.ipynb > local_interactive.ipynb.tmp && mv local_interactive.ipynb.tmp local_interactive.ipynb
228+
# # Set explicit namespace, as the SDK (currently) needs it to resolve local queues
229+
# sed -i "s/worker_cpu_requests=1,/worker_cpu_requests='250m', namespace='default',/" local_interactive.ipynb
230+
# # Run notebook
231+
# poetry run papermill local_interactive.ipynb local_interactive_out.ipynb --log-output --execution-timeout 1200
232+
# working-directory: demo-notebooks/additional-demos
233+
- name: Run 2_basic_interactive.ipynb
202234
run: |
203235
set -euo pipefail
204236
205237
# Remove login/logout cells, as KinD doesn't support authentication using token
206-
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object and log in to desired user account")))' local_interactive.ipynb > local_interactive.ipynb.tmp && mv local_interactive.ipynb.tmp local_interactive.ipynb
207-
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' local_interactive.ipynb > local_interactive.ipynb.tmp && mv local_interactive.ipynb.tmp local_interactive.ipynb
208-
# Replace async logs with waiting for job to finish, async logs don't work properly in papermill
209-
JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json)
210-
jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' local_interactive.ipynb > local_interactive.ipynb.tmp && mv local_interactive.ipynb.tmp local_interactive.ipynb
238+
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb
239+
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb
240+
# Rewrite cluster_uri() to local_client_url() to obtain a client URL reachable from outside the cluster, as the test is executed outside the cluster
241+
sed -i "s/cluster_uri()/local_client_url()/" 2_basic_interactive.ipynb
211242
# Set explicit namespace, as the SDK (currently) needs it to resolve local queues
212-
sed -i "s/worker_cpu_requests=1,/worker_cpu_requests=1, namespace='default',/" local_interactive.ipynb
243+
sed -i "s/head_cpu_limits=1,/head_cpu_limits=1, namespace='default',/" 2_basic_interactive.ipynb
244+
# Add MINIO related modules to runtime environment
245+
sed -i "s/\\\\\"transformers/\\\\\"s3fs\\\\\", \\\\\"pyarrow\\\\\", \\\\\"transformers/" 2_basic_interactive.ipynb
246+
# Replace markdown cell with remote configuration for MINIO
247+
MINIO_CONFIG=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/minio_remote_config_cell.json)
248+
jq --argjson minio_config "$MINIO_CONFIG" -r '(.cells[] | select(.source[] | contains("Now that we are connected"))) |= $minio_config' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb
249+
# Configure persistent storage for Ray trainer
250+
sed -i -E "s/# run_config.*\)/, run_config=ray.get(get_minio_run_config.remote())/" 2_basic_interactive.ipynb
213251
# Run notebook
214-
poetry run papermill local_interactive.ipynb hf_interactive_out.ipynb --log-output --execution-timeout 1200
215-
working-directory: demo-notebooks/additional-demos
252+
poetry run papermill 2_basic_interactive.ipynb 2_basic_interactive_out.ipynb --log-output --execution-timeout 1200
253+
env:
254+
GRPC_DNS_RESOLVER: "native"
255+
working-directory: demo-notebooks/guided-demos
216256

217257
- name: Print CodeFlare operator logs
218258
if: always() && steps.deploy.outcome == 'success'
@@ -249,7 +289,7 @@ jobs:
249289
${{ env.TEMP_DIR }}/**/*.log
250290
251291
verify-ray_job_client:
252-
# if: ${{ github.event.label.name == 'test-additional-notebooks' }}
292+
if: ${{ github.event.label.name == 'test' }}
253293
runs-on: ubuntu-20.04-4core
254294

255295
steps:
@@ -319,6 +359,8 @@ jobs:
319359
# Remove login/logout cells, as KinD doesn't support authentication using token
320360
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' ray_job_client.ipynb > ray_job_client.ipynb.tmp && mv ray_job_client.ipynb.tmp ray_job_client.ipynb
321361
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' ray_job_client.ipynb > ray_job_client.ipynb.tmp && mv ray_job_client.ipynb.tmp ray_job_client.ipynb
362+
# Rewrite cluster_uri() to local_client_url() to obtain a client URL reachable from outside the cluster, as the test is executed outside the cluster
363+
sed -i "s/cluster_uri()/local_client_url()/g" ray_job_client.ipynb
322364
# Replace async logs with waiting for job to finish, async logs don't work properly in papermill
323365
JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json)
324366
jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' ray_job_client.ipynb > ray_job_client.ipynb.tmp && mv ray_job_client.ipynb.tmp ray_job_client.ipynb

.github/workflows/guided_notebook_tests.yaml

+2-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ name: Guided notebooks tests
33
on:
44
pull_request:
55
branches: [ main ]
6+
workflow_dispatch:
67

78
concurrency:
89
group: ${{ github.head_ref }}-${{ github.workflow }}
@@ -246,7 +247,7 @@ jobs:
246247
${{ env.TEMP_DIR }}/**/*.log
247248
248249
verify-2_basic_interactive:
249-
if: ${{ contains(github.event.pull_request.labels.*.name, 'test-guided-notebooks') }}
250+
#if: ${{ contains(github.event.pull_request.labels.*.name, 'test-guided-notebooks') }}
250251
runs-on: ubuntu-20.04-4core-gpu
251252

252253
steps:

demo-notebooks/additional-demos/local_interactive.ipynb

+37-3
Original file line numberDiff line numberDiff line change
@@ -54,12 +54,16 @@
5454
"source": [
5555
"# Create and submit our cluster\n",
5656
"# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
57-
"cluster_name = \"hfgputest-1\"\n",
57+
"cluster_name = \"hfgputest\"\n",
5858
"\n",
5959
"cluster = Cluster(ClusterConfiguration(\n",
6060
" name=cluster_name,\n",
6161
" head_extended_resource_requests={'nvidia.com/gpu':0}, # For GPU enabled workloads set the head_extended_resource_requests and worker_extended_resource_requests\n",
6262
" worker_extended_resource_requests={'nvidia.com/gpu':0},\n",
63+
" head_cpu_requests=1,\n",
64+
" head_cpu_limits=1,\n",
65+
" head_memory_requests=6,\n",
66+
" head_memory_limits=6,\n",
6367
" num_workers=1,\n",
6468
" worker_cpu_requests=1,\n",
6569
" worker_cpu_limits=1,\n",
@@ -95,6 +99,16 @@
9599
"cluster.wait_ready()"
96100
]
97101
},
102+
{
103+
"cell_type": "code",
104+
"execution_count": null,
105+
"id": "2c36c2b5",
106+
"metadata": {},
107+
"outputs": [],
108+
"source": [
109+
"cluster.details()"
110+
]
111+
},
98112
{
99113
"cell_type": "markdown",
100114
"id": "12eef53c",
@@ -127,8 +141,28 @@
127141
"source": [
128142
"import ray\n",
129143
"\n",
130-
"ray.shutdown()\n",
131-
"ray.init(address=cluster.local_client_url(), logging_level=\"DEBUG\")"
144+
"ray.shutdown()\n"
145+
]
146+
},
147+
{
148+
"cell_type": "code",
149+
"execution_count": null,
150+
"id": "dd56c33c",
151+
"metadata": {},
152+
"outputs": [],
153+
"source": [
154+
"ray_cluster_uri = cluster.cluster_uri()\n",
155+
"print(f\"Ray cluster URI: {ray_cluster_uri}\")\n"
156+
]
157+
},
158+
{
159+
"cell_type": "code",
160+
"execution_count": null,
161+
"id": "3dab4124",
162+
"metadata": {},
163+
"outputs": [],
164+
"source": [
165+
"ray.init(address=ray_cluster_uri, logging_level=\"DEBUG\")"
132166
]
133167
},
134168
{

0 commit comments

Comments
 (0)