Skip to content

Commit 9cef2a2

Browse files
committed
update dependencies
1 parent 080aff2 commit 9cef2a2

File tree

1 file changed

+27
-25
lines changed

1 file changed

+27
-25
lines changed

2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile

+27-25
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,16 @@
1919
# # Load image to local docker registry -> on head node, or new compute/build node.
2020
# docker load < /fsx/nvidia-pt-od__latest.tar
2121
####################################################################################################
22-
FROM nvcr.io/nvidia/pytorch:23.12-py3
22+
# Check https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html for the base image contents
23+
# 24.06 comes with NCCL 2.21.5
24+
FROM nvcr.io/nvidia/pytorch:24.06-py3
2325
ENV DEBIAN_FRONTEND=noninteractive
2426

2527
# The three must-be-built packages.
2628
# Efa-installer>=1.29.1 required for nccl>=2.19.0 to avoid libfabric NCCL error.
27-
ENV EFA_INSTALLER_VERSION=1.30.0
28-
ENV AWS_OFI_NCCL_VERSION=1.8.1-aws
29+
ENV EFA_INSTALLER_VERSION=1.33.0
30+
ENV AWS_OFI_NCCL_VERSION=v1.9.2-aws
31+
ENV NCCL_VERSION=v2.21.5-1
2932
ENV NCCL_TESTS_VERSION=master
3033

3134
## Uncomment below when this Dockerfile builds a container image with efa-installer<1.29.1 and
@@ -111,16 +114,15 @@ ENV PATH=/opt/amazon/efa/bin:/opt/amazon/openmpi/bin:$PATH
111114
# NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi before building the
112115
# aws-ofi-nccl.
113116
####################################################################################################
114-
ENV NCCL_VERSION=2.19.3-1
115-
RUN apt-get remove -y libnccl2 libnccl-dev \
116-
&& cd /tmp \
117-
&& git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \
118-
&& cd nccl \
119-
&& make -j src.build BUILDDIR=/usr \
120-
# Build for p4 & p5.
121-
NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90, -gencode=arch=compute_80,code=sm_80" \
122-
&& rm -rf /tmp/nccl \
123-
&& echo NCCL_SOCKET_IFNAME=^docker0,lo >> /etc/nccl.conf
117+
#RUN apt-get remove -y libnccl2 libnccl-dev \
118+
# && cd /tmp \
119+
#     && git clone https://github.com/NVIDIA/nccl.git -b ${NCCL_VERSION} \
120+
# && cd nccl \
121+
# && make -j src.build BUILDDIR=/usr \
122+
# # Build for p4 & p5.
123+
# NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90, -gencode=arch=compute_80,code=sm_80" \
124+
# && rm -rf /tmp/nccl \
125+
# && echo NCCL_SOCKET_IFNAME=^docker0,lo >> /etc/nccl.conf
124126

125127

126128
####################################################################################################
@@ -180,7 +182,7 @@ RUN rm -fr ${OPEN_MPI_PATH} \
180182
# NCCL EFA Plugin
181183
RUN mkdir -p /tmp && \
182184
cd /tmp && \
183-
curl -LO https://github.com/aws/aws-ofi-nccl/archive/refs/tags/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
185+
curl -LO https://github.com/aws/aws-ofi-nccl/archive/refs/tags/${AWS_OFI_NCCL_VERSION}.tar.gz && \
184186
tar -xzf /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
185187
rm /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
186188
mv aws-ofi-nccl-${AWS_OFI_NCCL_VERSION} aws-ofi-nccl && \
@@ -228,14 +230,14 @@ RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
228230
# its own pt + cuda.
229231
#
230232
# Pre-requisite: build node has enough memory to compile xformers. More info on the stanza.
231-
RUN export TORCH_CUDA_ARCH_LIST="8.0;9.0+PTX" && \
232-
# On p4de.24xlarge:
233-
# - MAX_JOBS=16 => 145GB memory
234-
# - MAX_JOBS=32 => 241GB memory
235-
# - MAX_JOBS=48 => 243GB memory, 542.5s
236-
#
237-
# NOTE: must export MAX_JOBS. For some reason, `MAX_JOBS=16 pip install ...` doesn't seem to
238-
# work to prevent OOM.
239-
export MAX_JOBS=32 && \
240-
export NVCC_PREPEND_FLAGS="-t 32" && \
241-
pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
233+
#RUN export TORCH_CUDA_ARCH_LIST="8.0;9.0+PTX" && \
234+
# # On p4de.24xlarge:
235+
# # - MAX_JOBS=16 => 145GB memory
236+
# # - MAX_JOBS=32 => 241GB memory
237+
# # - MAX_JOBS=48 => 243GB memory, 542.5s
238+
# #
239+
# # NOTE: must export MAX_JOBS. For some reason, `MAX_JOBS=16 pip install ...` doesn't seem to
240+
# # work to prevent OOM.
241+
# export MAX_JOBS=32 && \
242+
# export NVCC_PREPEND_FLAGS="-t 32" && \
243+
# pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers

0 commit comments

Comments
 (0)