Skip to content

Commit ea743be

Browse files
committed
Debug
1 parent 1965a8e commit ea743be

File tree

3 files changed

+31
-11
lines changed

3 files changed

+31
-11
lines changed

.github/workflows/additional_demo_notebook_tests.yaml

+27-9
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ env:
1414

1515
jobs:
1616
verify-hf_interactive:
17-
# if: ${{ github.event.label.name == 'test-additional-notebooks' }}
17+
if: ${{ github.event.label.name == 'test-additional-notebooks' }}
1818
runs-on: ubuntu-20.04-4core
1919

2020
steps:
@@ -85,18 +85,24 @@ jobs:
8585
# Remove login/logout cells, as KinD doesn't support authentication using token
8686
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' hf_interactive.ipynb > hf_interactive.ipynb.tmp && mv hf_interactive.ipynb.tmp hf_interactive.ipynb
8787
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' hf_interactive.ipynb > hf_interactive.ipynb.tmp && mv hf_interactive.ipynb.tmp hf_interactive.ipynb
88+
# Rewrite cluster_uri() to local_client_url() to get a client URL reachable from outside the cluster, as the test is executed outside the cluster
89+
sed -i "s/cluster_uri()/local_client_url()/g" hf_interactive.ipynb
8890
# Replace async logs with waiting for the job to finish, as async logs don't work properly in papermill
8991
JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json)
9092
jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' hf_interactive.ipynb > hf_interactive.ipynb.tmp && mv hf_interactive.ipynb.tmp hf_interactive.ipynb
9193
# Set an explicit namespace, as the SDK (currently) needs it to resolve local queues
92-
sed -i "s/worker_cpu_requests=8,/worker_cpu_requests=1, namespace='default',/" hf_interactive.ipynb
9394
# Change cluster parameters (they need to be decreased)
9495
sed -i "s/{'nvidia.com\/gpu':1}/{'nvidia.com\/gpu':0}/g" hf_interactive.ipynb
95-
sed -i "s/worker_cpu_limits=8,/worker_cpu_limits=1,/" hf_interactive.ipynb
96-
sed -i "s/worker_memory_requests=16,/worker_memory_requests=4,/" hf_interactive.ipynb
97-
sed -i "s/worker_memory_limits=8,/worker_memory_limits=4,/" hf_interactive.ipynb
96+
sed -i "s/worker_cpu_requests=8,/worker_cpu_requests='250m', namespace='default',/" hf_interactive.ipynb
97+
sed -i "s/worker_cpu_limits=8,/worker_cpu_limits=4,/" hf_interactive.ipynb
98+
sed -i "s/worker_memory_requests=16,/worker_memory_requests=12,/" hf_interactive.ipynb
99+
sed -i "s/worker_memory_limits=16,/worker_memory_limits=12,/" hf_interactive.ipynb
100+
sed -i "s/use_gpu=True/use_gpu=False/" hf_interactive.ipynb
101+
cat hf_interactive.ipynb
98102
# Run notebook
99103
poetry run papermill hf_interactive.ipynb hf_interactive_out.ipynb --log-output --execution-timeout 1200
104+
env:
105+
GRPC_DNS_RESOLVER: "native"
100106
working-directory: demo-notebooks/additional-demos
101107

102108
- name: Print CodeFlare operator logs
@@ -135,7 +141,6 @@ jobs:
135141
136142
verify-local_interactive:
137143
# if: ${{ github.event.label.name == 'test-additional-notebooks' }}
138-
# runs-on: ubuntu-20.04-4core
139144
runs-on: ubuntu-20.04-4core
140145

141146
steps:
@@ -205,13 +210,18 @@ jobs:
205210
# Remove login/logout cells, as KinD doesn't support authentication using token
206211
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object and log in to desired user account")))' local_interactive.ipynb > local_interactive.ipynb.tmp && mv local_interactive.ipynb.tmp local_interactive.ipynb
207212
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' local_interactive.ipynb > local_interactive.ipynb.tmp && mv local_interactive.ipynb.tmp local_interactive.ipynb
213+
# Rewrite cluster_uri() to local_client_url() to get a client URL reachable from outside the cluster, as the test is executed outside the cluster
214+
sed -i "s/cluster_uri()/local_client_url()/g" local_interactive.ipynb
208215
# Replace async logs with waiting for the job to finish, as async logs don't work properly in papermill
209216
JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json)
210217
jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' local_interactive.ipynb > local_interactive.ipynb.tmp && mv local_interactive.ipynb.tmp local_interactive.ipynb
211218
# Set an explicit namespace, as the SDK (currently) needs it to resolve local queues
212-
sed -i "s/worker_cpu_requests=1,/worker_cpu_requests=1, namespace='default',/" local_interactive.ipynb
219+
sed -i "s/worker_cpu_requests=1,/worker_cpu_requests='250m', namespace='default',/" local_interactive.ipynb
220+
cat local_interactive.ipynb
213221
# Run notebook
214-
poetry run papermill local_interactive.ipynb hf_interactive_out.ipynb --log-output --execution-timeout 1200
222+
poetry run papermill local_interactive.ipynb local_interactive_out.ipynb --log-output --execution-timeout 1200
223+
env:
224+
GRPC_DNS_RESOLVER: "native"
215225
working-directory: demo-notebooks/additional-demos
216226

217227
- name: Print CodeFlare operator logs
@@ -319,13 +329,21 @@ jobs:
319329
# Remove login/logout cells, as KinD doesn't support authentication using token
320330
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' ray_job_client.ipynb > ray_job_client.ipynb.tmp && mv ray_job_client.ipynb.tmp ray_job_client.ipynb
321331
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' ray_job_client.ipynb > ray_job_client.ipynb.tmp && mv ray_job_client.ipynb.tmp ray_job_client.ipynb
332+
# Rewrite cluster_uri() to local_client_url() to get a client URL reachable from outside the cluster, as the test is executed outside the cluster
333+
sed -i "s/cluster_uri()/local_client_url()/g" ray_job_client.ipynb
322334
# Replace async logs with waiting for the job to finish, as async logs don't work properly in papermill
323335
JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json)
324336
jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' ray_job_client.ipynb > ray_job_client.ipynb.tmp && mv ray_job_client.ipynb.tmp ray_job_client.ipynb
325337
# Set an explicit namespace, as the SDK (currently) needs it to resolve local queues
326-
sed -i "s/worker_cpu_requests=1,/worker_cpu_requests=1, namespace='default',/" ray_job_client.ipynb
338+
sed -i "s/worker_cpu_requests=1,/worker_cpu_requests='250m', namespace='default',/" ray_job_client.ipynb
339+
sed -i "s/worker_memory_requests=4,/worker_memory_requests=1,/" ray_job_client.ipynb
340+
sed -i "s/worker_memory_limits=4,/worker_memory_limits=1,/" ray_job_client.ipynb
341+
sed -i "s/RayJobClient(address=ray_dashboard, headers=header, verify=True)/RayJobClient(address=ray_dashboard, verify=False)/" ray_job_client.ipynb
342+
cat ray_job_client.ipynb
327343
# Run notebook
328344
poetry run papermill ray_job_client.ipynb hf_interactive_out.ipynb --log-output --execution-timeout 1200
345+
env:
346+
GRPC_DNS_RESOLVER: "native"
329347
working-directory: demo-notebooks/additional-demos
330348

331349
- name: Print CodeFlare operator logs

demo-notebooks/additional-demos/hf_interactive.ipynb

+3-1
Original file line numberDiff line numberDiff line change
@@ -367,7 +367,9 @@
367367
"outputs": [],
368368
"source": [
369369
"#call the above cell as a remote ray function\n",
370-
"ray.get(train_fn.remote())"
370+
"print(\"Start training:\")\n",
371+
"ray.get(train_fn.remote())\n",
372+
"print(\"End training:\")"
371373
]
372374
},
373375
{

demo-notebooks/additional-demos/ray_job_client.ipynb

+1-1
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@
6161
" name='jobtest',\n",
6262
" head_extended_resource_requests={'nvidia.com/gpu':0}, # For GPU enabled workloads set the head_extended_resource_requests and worker_extended_resource_requests\n",
6363
" worker_extended_resource_requests={'nvidia.com/gpu':0},\n",
64-
" num_workers=2,\n",
64+
" num_workers=1,\n",
6565
" worker_cpu_requests=1,\n",
6666
" worker_cpu_limits=1,\n",
6767
" worker_memory_requests=4,\n",

0 commit comments

Comments
 (0)