Skip to content

Commit c2166cb

Browse files
committed
Debug
1 parent 1965a8e commit c2166cb

File tree

2 files changed

+37
-10
lines changed

2 files changed

+37
-10
lines changed

.github/workflows/additional_demo_notebook_tests.yaml

+34-9
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ env:
1515
jobs:
1616
verify-hf_interactive:
1717
# if: ${{ github.event.label.name == 'test-additional-notebooks' }}
18-
runs-on: ubuntu-20.04-4core
18+
runs-on: ubuntu-20.04-4core-gpu
1919

2020
steps:
2121
- name: Checkout code
@@ -53,9 +53,17 @@ jobs:
5353
python-version: '3.9'
5454
cache: 'pip' # caching pip dependencies
5555

56+
- name: Setup NVidia GPU environment for KinD
57+
uses: ./common/github-actions/nvidia-gpu-setup
58+
5659
- name: Setup and start KinD cluster
5760
uses: ./common/github-actions/kind
5861

62+
- name: Install NVidia GPU operator for KinD
63+
uses: ./common/github-actions/nvidia-gpu-operator
64+
with:
65+
enable-time-slicing: 'true'
66+
5967
- name: Deploy CodeFlare stack
6068
id: deploy
6169
run: |
@@ -85,18 +93,24 @@ jobs:
8593
# Remove login/logout cells, as KinD doesn't support authentication using token
8694
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' hf_interactive.ipynb > hf_interactive.ipynb.tmp && mv hf_interactive.ipynb.tmp hf_interactive.ipynb
8795
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' hf_interactive.ipynb > hf_interactive.ipynb.tmp && mv hf_interactive.ipynb.tmp hf_interactive.ipynb
96+
# Rewrite cluster_uri() to local_client_url() to retrieve a client URL that is available outside the cluster, as the test is executed outside the cluster
97+
sed -i "s/cluster_uri()/local_client_url()/g" hf_interactive.ipynb
8898
# Replace async logs with waiting for the job to finish, as async logs don't work properly in papermill
8999
JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json)
90100
jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' hf_interactive.ipynb > hf_interactive.ipynb.tmp && mv hf_interactive.ipynb.tmp hf_interactive.ipynb
91101
# Set explicit namespace, as the SDK (currently) needs it to resolve local queues
92-
sed -i "s/worker_cpu_requests=8,/worker_cpu_requests=1, namespace='default',/" hf_interactive.ipynb
93102
# Change cluster parameters (they need to be decreased)
94103
sed -i "s/{'nvidia.com\/gpu':1}/{'nvidia.com\/gpu':0}/g" hf_interactive.ipynb
95-
sed -i "s/worker_cpu_limits=8,/worker_cpu_limits=1,/" hf_interactive.ipynb
96-
sed -i "s/worker_memory_requests=16,/worker_memory_requests=4,/" hf_interactive.ipynb
97-
sed -i "s/worker_memory_limits=8,/worker_memory_limits=4,/" hf_interactive.ipynb
104+
sed -i "s/worker_cpu_requests=8,/worker_cpu_requests='250m', namespace='default',/" hf_interactive.ipynb
105+
sed -i "s/worker_cpu_limits=8,/worker_cpu_limits=4,/" hf_interactive.ipynb
106+
#sed -i "s/worker_memory_requests=16,/worker_memory_requests=6,/" hf_interactive.ipynb
107+
#sed -i "s/worker_memory_limits=16,/worker_memory_limits=6,/" hf_interactive.ipynb
108+
sed -i "s/use_gpu=True/use_gpu=False/" hf_interactive.ipynb
109+
cat hf_interactive.ipynb
98110
# Run notebook
99111
poetry run papermill hf_interactive.ipynb hf_interactive_out.ipynb --log-output --execution-timeout 1200
112+
env:
113+
GRPC_DNS_RESOLVER: "native"
100114
working-directory: demo-notebooks/additional-demos
101115

102116
- name: Print CodeFlare operator logs
@@ -135,7 +149,6 @@ jobs:
135149
136150
verify-local_interactive:
137151
# if: ${{ github.event.label.name == 'test-additional-notebooks' }}
138-
# runs-on: ubuntu-20.04-4core
139152
runs-on: ubuntu-20.04-4core
140153

141154
steps:
@@ -205,13 +218,18 @@ jobs:
205218
# Remove login/logout cells, as KinD doesn't support authentication using token
206219
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object and log in to desired user account")))' local_interactive.ipynb > local_interactive.ipynb.tmp && mv local_interactive.ipynb.tmp local_interactive.ipynb
207220
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' local_interactive.ipynb > local_interactive.ipynb.tmp && mv local_interactive.ipynb.tmp local_interactive.ipynb
221+
# Rewrite cluster_uri() to local_client_url() to retrieve a client URL that is available outside the cluster, as the test is executed outside the cluster
222+
sed -i "s/cluster_uri()/local_client_url()/g" local_interactive.ipynb
208223
# Replace async logs with waiting for the job to finish, as async logs don't work properly in papermill
209224
JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json)
210225
jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' local_interactive.ipynb > local_interactive.ipynb.tmp && mv local_interactive.ipynb.tmp local_interactive.ipynb
211226
# Set explicit namespace, as the SDK (currently) needs it to resolve local queues
212-
sed -i "s/worker_cpu_requests=1,/worker_cpu_requests=1, namespace='default',/" local_interactive.ipynb
227+
sed -i "s/worker_cpu_requests=1,/worker_cpu_requests='250m', namespace='default',/" local_interactive.ipynb
228+
cat local_interactive.ipynb
213229
# Run notebook
214-
poetry run papermill local_interactive.ipynb hf_interactive_out.ipynb --log-output --execution-timeout 1200
230+
poetry run papermill local_interactive.ipynb local_interactive_out.ipynb --log-output --execution-timeout 1200
231+
env:
232+
GRPC_DNS_RESOLVER: "native"
215233
working-directory: demo-notebooks/additional-demos
216234

217235
- name: Print CodeFlare operator logs
@@ -319,13 +337,20 @@ jobs:
319337
# Remove login/logout cells, as KinD doesn't support authentication using token
320338
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' ray_job_client.ipynb > ray_job_client.ipynb.tmp && mv ray_job_client.ipynb.tmp ray_job_client.ipynb
321339
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' ray_job_client.ipynb > ray_job_client.ipynb.tmp && mv ray_job_client.ipynb.tmp ray_job_client.ipynb
340+
# Rewrite cluster_uri() to local_client_url() to retrieve a client URL that is available outside the cluster, as the test is executed outside the cluster
341+
sed -i "s/cluster_uri()/local_client_url()/g" ray_job_client.ipynb
322342
# Replace async logs with waiting for the job to finish, as async logs don't work properly in papermill
323343
JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json)
324344
jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' ray_job_client.ipynb > ray_job_client.ipynb.tmp && mv ray_job_client.ipynb.tmp ray_job_client.ipynb
325345
# Set explicit namespace, as the SDK (currently) needs it to resolve local queues
326-
sed -i "s/worker_cpu_requests=1,/worker_cpu_requests=1, namespace='default',/" ray_job_client.ipynb
346+
sed -i "s/worker_cpu_requests=1,/worker_cpu_requests='250m', namespace='default',/" ray_job_client.ipynb
347+
sed -i "s/worker_memory_requests=4,/worker_memory_requests=2,/" ray_job_client.ipynb
348+
sed -i "s/worker_memory_limits=4,/worker_memory_limits=2,/" ray_job_client.ipynb
349+
cat ray_job_client.ipynb
327350
# Run notebook
328351
poetry run papermill ray_job_client.ipynb hf_interactive_out.ipynb --log-output --execution-timeout 1200
352+
env:
353+
GRPC_DNS_RESOLVER: "native"
329354
working-directory: demo-notebooks/additional-demos
330355

331356
- name: Print CodeFlare operator logs

demo-notebooks/additional-demos/hf_interactive.ipynb

+3-1
Original file line number | Diff line number | Diff line change
@@ -367,7 +367,9 @@
367367
"outputs": [],
368368
"source": [
369369
"#call the above cell as a remote ray function\n",
370-
"ray.get(train_fn.remote())"
370+
"print(\"Start training:\")\n",
371+
"ray.get(train_fn.remote())\n",
372+
"print(\"End training:\")"
371373
]
372374
},
373375
{

0 commit comments

Comments
 (0)