|
19 | 19 | # # Load image to local docker registry -> on head node, or new compute/build node.
|
20 | 20 | # docker load < /fsx/nvidia-pt-od__latest.tar
|
21 | 21 | ####################################################################################################
|
22 |
| -FROM nvcr.io/nvidia/pytorch:23.12-py3 |
| 22 | +# Check https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html for the base image contents |
| 23 | +# 24.06 comes with NCCL 2.21.5 |
| 24 | +FROM nvcr.io/nvidia/pytorch:24.06-py3 |
23 | 25 | ENV DEBIAN_FRONTEND=noninteractive
|
24 | 26 |
|
25 | 27 | # The three must-be-built packages.
|
26 | 28 | # Efa-installer>=1.29.1 required for nccl>=2.19.0 to avoid libfabric NCCL error.
|
27 |
| -ENV EFA_INSTALLER_VERSION=1.30.0 |
28 |
| -ENV AWS_OFI_NCCL_VERSION=1.8.1-aws |
| 29 | +ENV EFA_INSTALLER_VERSION=1.33.0 |
| 30 | +ENV AWS_OFI_NCCL_VERSION=v1.9.2-aws |
| 31 | +ENV NCCL_VERSION=v2.21.5-1 |
29 | 32 | ENV NCCL_TESTS_VERSION=master
|
30 | 33 |
|
31 | 34 | ## Uncomment below when this Dockerfile builds a container image with efa-installer<1.29.1 and
|
@@ -111,16 +114,15 @@ ENV PATH=/opt/amazon/efa/bin:/opt/amazon/openmpi/bin:$PATH
|
111 | 114 | # NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi before building the
|
112 | 115 | # aws-ofi-nccl.
|
113 | 116 | ####################################################################################################
|
114 |
| -ENV NCCL_VERSION=2.19.3-1 |
115 |
| -RUN apt-get remove -y libnccl2 libnccl-dev \ |
116 |
| - && cd /tmp \ |
117 |
| - && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ |
118 |
| - && cd nccl \ |
119 |
| - && make -j src.build BUILDDIR=/usr \ |
120 |
| - # Build for p4 & p5. |
121 |
| - NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90, -gencode=arch=compute_80,code=sm_80" \ |
122 |
| - && rm -rf /tmp/nccl \ |
123 |
| - && echo NCCL_SOCKET_IFNAME=^docker0,lo >> /etc/nccl.conf |
| 117 | +#RUN apt-get remove -y libnccl2 libnccl-dev \ |
| 118 | +# && cd /tmp \ |
| 119 | +# && git clone https://github.com/NVIDIA/nccl.git -b ${NCCL_VERSION} \ |
| 120 | +# && cd nccl \ |
| 121 | +# && make -j src.build BUILDDIR=/usr \ |
| 122 | +# # Build for p4 & p5. |
| 123 | +# NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90, -gencode=arch=compute_80,code=sm_80" \ |
| 124 | +# && rm -rf /tmp/nccl \ |
| 125 | +# && echo NCCL_SOCKET_IFNAME=^docker0,lo >> /etc/nccl.conf |
124 | 126 |
|
125 | 127 |
|
126 | 128 | ####################################################################################################
|
@@ -180,7 +182,7 @@ RUN rm -fr ${OPEN_MPI_PATH} \
|
180 | 182 | # NCCL EFA Plugin
|
181 | 183 | RUN mkdir -p /tmp && \
|
182 | 184 | cd /tmp && \
|
183 |
| - curl -LO https://github.com/aws/aws-ofi-nccl/archive/refs/tags/v${AWS_OFI_NCCL_VERSION}.tar.gz && \ |
| 185 | + curl -LO https://github.com/aws/aws-ofi-nccl/archive/refs/tags/${AWS_OFI_NCCL_VERSION}.tar.gz && \ |
184 | 186 | tar -xzf /tmp/${AWS_OFI_NCCL_VERSION}.tar.gz && \
|
185 | 187 | rm /tmp/${AWS_OFI_NCCL_VERSION}.tar.gz && \
|
186 | 188 | mv aws-ofi-nccl-${AWS_OFI_NCCL_VERSION#v} aws-ofi-nccl && \
|
@@ -228,14 +230,14 @@ RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
|
228 | 230 | # its own pt + cuda.
|
229 | 231 | #
|
230 | 232 | # Pre-requisite: build node has enough memory to compile xformers. More info on the stanza.
|
231 |
| -RUN export TORCH_CUDA_ARCH_LIST="8.0;9.0+PTX" && \ |
232 |
| - # On p4de.24xlarge: |
233 |
| - # - MAX_JOBS=16 => 145GB memory |
234 |
| - # - MAX_JOBS=32 => 241GB memory |
235 |
| - # - MAX_JOBS=48 => 243GB memory, 542.5s |
236 |
| - # |
237 |
| - # NOTE: must export MAX_JOBS. For some reason, `MAX_JOBS=16 pip install ...` doesn't seem to |
238 |
| - # work to prevent OOM. |
239 |
| - export MAX_JOBS=32 && \ |
240 |
| - export NVCC_PREPEND_FLAGS="-t 32" && \ |
241 |
| - pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers |
| 233 | +#RUN export TORCH_CUDA_ARCH_LIST="8.0;9.0+PTX" && \ |
| 234 | +# # On p4de.24xlarge: |
| 235 | +# # - MAX_JOBS=16 => 145GB memory |
| 236 | +# # - MAX_JOBS=32 => 241GB memory |
| 237 | +# # - MAX_JOBS=48 => 243GB memory, 542.5s |
| 238 | +# # |
| 239 | +# # NOTE: must export MAX_JOBS. For some reason, `MAX_JOBS=16 pip install ...` doesn't seem to |
| 240 | +# # work to prevent OOM. |
| 241 | +# export MAX_JOBS=32 && \ |
| 242 | +# export NVCC_PREPEND_FLAGS="-t 32" && \ |
| 243 | +# pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers |
0 commit comments