1313# limitations under the License.
1414
1515ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.07-tf2-py3
16-
17- # #####
18- # Tokenizers is only available pre-built on x86
19- #
20- FROM ${FROM_IMAGE_NAME} AS tokenizers_amd64
21- WORKDIR /wheelhouse
22- RUN pip download tokenizers==0.7.0
23-
24- FROM quay.io/pypa/manylinux2014_aarch64 as tokenizers_arm64
25- ARG PYVER=38
26- RUN yum install -y openssl-devel
27- RUN curl https://sh.rustup.rs -sSf | sh -s -- --default-toolchain nightly-2019-11-01 -y
28- ENV PATH="/root/.cargo/bin:$PATH"
29- ENV PYBIN=/opt/python/cp${PYVER}-cp${PYVER}/bin
30- ENV PYTHON_SYS_EXECUTABLE="$PYBIN/python"
31- RUN git clone -b python-v0.8.0 https://github.com/huggingface/tokenizers.git /opt/tokenizers
32- WORKDIR /opt/tokenizers/bindings/python
33- RUN "${PYBIN}/pip" install setuptools-rust \
34- && "${PYBIN}/python" setup.py bdist_wheel \
35- && rm -rf build/* \
36- && for whl in dist/*.whl; do \
37- auditwheel repair "$whl" -w dist/; \
38- done \
39- && rm dist/*-linux_* \
40- && mkdir -p /wheelhouse \
41- && mv dist/*.whl /wheelhouse
42-
43- ARG TARGETARCH
44- FROM tokenizers_${TARGETARCH} AS tokenizers
45- #
46- # ####
47-
48-
4916FROM ${FROM_IMAGE_NAME}
5017RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract
5118
52- RUN --mount=from=tokenizers,source=/wheelhouse,target=/tmp/wheelhouse \
53- pip install --no-cache-dir /tmp/wheelhouse/tokenizers*.whl
54-
5519ENV DATA_PREP_WORKING_DIR /workspace/electra/data
5620WORKDIR /workspace
5721RUN git clone https://github.com/attardi/wikiextractor.git && cd wikiextractor && git checkout 6408a430fc504a38b04d37ce5e7fc740191dee16 && cd ..
@@ -61,7 +25,7 @@ WORKDIR /workspace/electra
6125
6226RUN pip install --no-cache-dir tqdm boto3 requests six ipdb h5py nltk progressbar filelock \
6327 git+https://github.com/NVIDIA/dllogger \
64- nvidia-ml-py3==7.352.0
28+ nvidia-ml-py3==7.352.0 tokenizers==0.11.0
6529
6630RUN apt-get install -y iputils-ping
6731COPY . .
0 commit comments