|
15 | 15 | jobs:
|
16 | 16 | verify-hf_interactive:
|
17 | 17 | # if: ${{ github.event.label.name == 'test-additional-notebooks' }}
|
18 |
| - runs-on: ubuntu-20.04-4core |
| 18 | + runs-on: ubuntu-20.04-4core-gpu |
19 | 19 |
|
20 | 20 | steps:
|
21 | 21 | - name: Checkout code
|
|
53 | 53 | python-version: '3.9'
|
54 | 54 | cache: 'pip' # caching pip dependencies
|
55 | 55 |
|
| 56 | + - name: Setup NVidia GPU environment for KinD |
| 57 | + uses: ./common/github-actions/nvidia-gpu-setup |
| 58 | + |
56 | 59 | - name: Setup and start KinD cluster
|
57 | 60 | uses: ./common/github-actions/kind
|
58 | 61 |
|
| 62 | + - name: Install NVidia GPU operator for KinD |
| 63 | + uses: ./common/github-actions/nvidia-gpu-operator |
| 64 | + with: |
| 65 | + enable-time-slicing: 'true' |
| 66 | + |
59 | 67 | - name: Deploy CodeFlare stack
|
60 | 68 | id: deploy
|
61 | 69 | run: |
|
@@ -85,18 +93,24 @@ jobs:
|
85 | 93 | # Remove login/logout cells, as KinD doesn't support authentication using token
|
86 | 94 | jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' hf_interactive.ipynb > hf_interactive.ipynb.tmp && mv hf_interactive.ipynb.tmp hf_interactive.ipynb
|
87 | 95 | jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' hf_interactive.ipynb > hf_interactive.ipynb.tmp && mv hf_interactive.ipynb.tmp hf_interactive.ipynb
|
| 96 | + # Rewrite cluster_uri() to local_client_url() to retrieve a client URL that is reachable from outside the cluster, as the test is executed outside of the cluster |
| 97 | + sed -i "s/cluster_uri()/local_client_url()/g" hf_interactive.ipynb |
88 | 98 | # Replace async logs with waiting for job to finish, async logs don't work properly in papermill
|
89 | 99 | JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json)
|
90 | 100 | jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' hf_interactive.ipynb > hf_interactive.ipynb.tmp && mv hf_interactive.ipynb.tmp hf_interactive.ipynb
|
91 | 101 | # Set explicit namespace as the SDK (currently) needs it to resolve local queues
|
92 |
| - sed -i "s/worker_cpu_requests=8,/worker_cpu_requests=1, namespace='default',/" hf_interactive.ipynb |
93 | 102 | # Change cluster parameters (need to decrease)
|
94 | 103 | sed -i "s/{'nvidia.com\/gpu':1}/{'nvidia.com\/gpu':0}/g" hf_interactive.ipynb
|
95 |
| - sed -i "s/worker_cpu_limits=8,/worker_cpu_limits=1,/" hf_interactive.ipynb |
96 |
| - sed -i "s/worker_memory_requests=16,/worker_memory_requests=4,/" hf_interactive.ipynb |
97 |
| - sed -i "s/worker_memory_limits=8,/worker_memory_limits=4,/" hf_interactive.ipynb |
| 104 | + sed -i "s/worker_cpu_requests=8,/worker_cpu_requests='250m', namespace='default',/" hf_interactive.ipynb |
| 105 | + sed -i "s/worker_cpu_limits=8,/worker_cpu_limits=4,/" hf_interactive.ipynb |
| 106 | + #sed -i "s/worker_memory_requests=16,/worker_memory_requests=6,/" hf_interactive.ipynb |
| 107 | + #sed -i "s/worker_memory_limits=16,/worker_memory_limits=6,/" hf_interactive.ipynb |
| 108 | + sed -i "s/use_gpu=True/use_gpu=False/" hf_interactive.ipynb |
| 109 | + cat hf_interactive.ipynb |
98 | 110 | # Run notebook
|
99 | 111 | poetry run papermill hf_interactive.ipynb hf_interactive_out.ipynb --log-output --execution-timeout 1200
|
| 112 | + env: |
| 113 | + GRPC_DNS_RESOLVER: "native" |
100 | 114 | working-directory: demo-notebooks/additional-demos
|
101 | 115 |
|
102 | 116 | - name: Print CodeFlare operator logs
|
@@ -135,7 +149,6 @@ jobs:
|
135 | 149 |
|
136 | 150 | verify-local_interactive:
|
137 | 151 | # if: ${{ github.event.label.name == 'test-additional-notebooks' }}
|
138 |
| - # runs-on: ubuntu-20.04-4core |
139 | 152 | runs-on: ubuntu-20.04-4core
|
140 | 153 |
|
141 | 154 | steps:
|
@@ -205,13 +218,18 @@ jobs:
|
205 | 218 | # Remove login/logout cells, as KinD doesn't support authentication using token
|
206 | 219 | jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object and log in to desired user account")))' local_interactive.ipynb > local_interactive.ipynb.tmp && mv local_interactive.ipynb.tmp local_interactive.ipynb
|
207 | 220 | jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' local_interactive.ipynb > local_interactive.ipynb.tmp && mv local_interactive.ipynb.tmp local_interactive.ipynb
|
| 221 | + # Rewrite cluster_uri() to local_client_url() to retrieve a client URL that is reachable from outside the cluster, as the test is executed outside of the cluster |
| 222 | + sed -i "s/cluster_uri()/local_client_url()/g" local_interactive.ipynb |
208 | 223 | # Replace async logs with waiting for job to finish, async logs don't work properly in papermill
|
209 | 224 | JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json)
|
210 | 225 | jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' local_interactive.ipynb > local_interactive.ipynb.tmp && mv local_interactive.ipynb.tmp local_interactive.ipynb
|
211 | 226 | # Set explicit namespace as the SDK (currently) needs it to resolve local queues
|
212 |
| - sed -i "s/worker_cpu_requests=1,/worker_cpu_requests=1, namespace='default',/" local_interactive.ipynb |
| 227 | + sed -i "s/worker_cpu_requests=1,/worker_cpu_requests='250m', namespace='default',/" local_interactive.ipynb |
| 228 | + cat local_interactive.ipynb |
213 | 229 | # Run notebook
|
214 |
| - poetry run papermill local_interactive.ipynb hf_interactive_out.ipynb --log-output --execution-timeout 1200 |
| 230 | + poetry run papermill local_interactive.ipynb local_interactive_out.ipynb --log-output --execution-timeout 1200 |
| 231 | + env: |
| 232 | + GRPC_DNS_RESOLVER: "native" |
215 | 233 | working-directory: demo-notebooks/additional-demos
|
216 | 234 |
|
217 | 235 | - name: Print CodeFlare operator logs
|
@@ -319,13 +337,20 @@ jobs:
|
319 | 337 | # Remove login/logout cells, as KinD doesn't support authentication using token
|
320 | 338 | jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' ray_job_client.ipynb > ray_job_client.ipynb.tmp && mv ray_job_client.ipynb.tmp ray_job_client.ipynb
|
321 | 339 | jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' ray_job_client.ipynb > ray_job_client.ipynb.tmp && mv ray_job_client.ipynb.tmp ray_job_client.ipynb
|
| 340 | + # Rewrite cluster_uri() to local_client_url() to retrieve a client URL that is reachable from outside the cluster, as the test is executed outside of the cluster |
| 341 | + sed -i "s/cluster_uri()/local_client_url()/g" ray_job_client.ipynb |
322 | 342 | # Replace async logs with waiting for job to finish, async logs don't work properly in papermill
|
323 | 343 | JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json)
|
324 | 344 | jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' ray_job_client.ipynb > ray_job_client.ipynb.tmp && mv ray_job_client.ipynb.tmp ray_job_client.ipynb
|
325 | 345 | # Set explicit namespace as the SDK (currently) needs it to resolve local queues
|
326 |
| - sed -i "s/worker_cpu_requests=1,/worker_cpu_requests=1, namespace='default',/" ray_job_client.ipynb |
| 346 | + sed -i "s/worker_cpu_requests=1,/worker_cpu_requests='250m', namespace='default',/" ray_job_client.ipynb |
| 347 | + sed -i "s/worker_memory_requests=4,/worker_memory_requests=2,/" ray_job_client.ipynb |
| 348 | + sed -i "s/worker_memory_limits=4,/worker_memory_limits=2,/" ray_job_client.ipynb |
| 349 | + cat ray_job_client.ipynb |
327 | 350 | # Run notebook
|
328 | 351 | poetry run papermill ray_job_client.ipynb hf_interactive_out.ipynb --log-output --execution-timeout 1200
|
| 352 | + env: |
| 353 | + GRPC_DNS_RESOLVER: "native" |
329 | 354 | working-directory: demo-notebooks/additional-demos
|
330 | 355 |
|
331 | 356 | - name: Print CodeFlare operator logs
|
|
0 commit comments