none
committed on
Commit
·
e235434
1
Parent(s):
3592a0d
wip
Browse files
- Dockerfile +3 -2
- learn.py +13 -2
- start.sh +6 -2
Dockerfile
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
# build with: docker build . --tag sctg/roco-idefics3:0.0.5 --tag sctg/roco-idefics3:latest --push
|
| 2 |
# run with
|
| 3 |
-
# docker run --gpus all --user=42420:42420 -e HF_TOKEN=hf_TOKEN -it sctg/roco-idefics3:0.0.5 bash -i /start.sh hf_TOKEN
|
|
|
|
| 4 |
FROM nvidia/cuda:11.6.1-devel-ubuntu20.04
|
| 5 |
# FROM nvidia/cuda:11.0.3-devel-ubuntu20.04
|
| 6 |
# RUN mkdir -p /workspace
|
|
@@ -9,7 +10,6 @@ RUN /usr/sbin/useradd -u 42420 --gid 42420 -m -d /workspace -s /bin/bash ovh
|
|
| 9 |
RUN apt update -y && apt-get install -y curl git git-lfs screen
|
| 10 |
COPY --chmod=777 start.sh /start.sh
|
| 11 |
COPY learn.py /learn.py
|
| 12 |
-
COPY preload.py /preload.py
|
| 13 |
# Mandatory to run the jobs in rootless mode
|
| 14 |
USER root
|
| 15 |
RUN chown -R 42420:42420 /workspace
|
|
@@ -25,6 +25,7 @@ RUN . /workspace/.miniconda3/bin/activate \
|
|
| 25 |
&& pip install -U Pillow \
|
| 26 |
&& pip install -U torchvision torchaudio
|
| 27 |
RUN . /workspace/.miniconda3/bin/activate && conda install -y jupyter
|
|
|
|
| 28 |
# Mandatory to run the jobs in rootless mode
|
| 29 |
# USER root
|
| 30 |
# RUN chown -R 42420:42420 /workspace
|
|
|
|
| 1 |
# build with: docker build . --tag sctg/roco-idefics3:0.0.5 --tag sctg/roco-idefics3:latest --push
|
| 2 |
# run with
|
| 3 |
+
# docker run --gpus all --user=42420:42420 -p 8080:8080 -e HF_TOKEN=hf_TOKEN -it sctg/roco-idefics3:0.0.5 bash -i /start.sh sleep infinity hf_TOKEN
|
| 4 |
+
# docker run --gpus all --user=42420:42420 -p 8080:8080 -it sctg/roco-idefics3:0.0.5 bash -i /start.sh python /learn.py hf_...
|
| 5 |
FROM nvidia/cuda:11.6.1-devel-ubuntu20.04
|
| 6 |
# FROM nvidia/cuda:11.0.3-devel-ubuntu20.04
|
| 7 |
# RUN mkdir -p /workspace
|
|
|
|
| 10 |
RUN apt update -y && apt-get install -y curl git git-lfs screen
|
| 11 |
COPY --chmod=777 start.sh /start.sh
|
| 12 |
COPY learn.py /learn.py
|
|
|
|
| 13 |
# Mandatory to run the jobs in rootless mode
|
| 14 |
USER root
|
| 15 |
RUN chown -R 42420:42420 /workspace
|
|
|
|
| 25 |
&& pip install -U Pillow \
|
| 26 |
&& pip install -U torchvision torchaudio
|
| 27 |
RUN . /workspace/.miniconda3/bin/activate && conda install -y jupyter
|
| 28 |
+
RUN rm -f /workspace/miniconda.sh
|
| 29 |
# Mandatory to run the jobs in rootless mode
|
| 30 |
# USER root
|
| 31 |
# RUN chown -R 42420:42420 /workspace
|
learn.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
| 2 |
# License: Apache License 2.0
|
| 3 |
# Description: Train the model on the dataset
|
| 4 |
import os
|
|
|
|
| 5 |
import torch
|
| 6 |
|
| 7 |
from huggingface_hub import login as hf_login
|
|
@@ -12,11 +13,17 @@ from datasets.utils.logging import disable_progress_bar
|
|
| 12 |
disable_progress_bar()
|
| 13 |
|
| 14 |
HF_TOKEN = ""
|
|
|
|
| 15 |
|
| 16 |
if os.environ.get('HF_TOKEN') is not None:
|
| 17 |
HF_TOKEN = os.environ.get('HF_TOKEN')
|
| 18 |
print(f"Hugging Face token found in environment variable")
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
hf_login(
|
| 21 |
token=HF_TOKEN,
|
| 22 |
add_to_git_credential=True
|
|
@@ -27,7 +34,10 @@ source_model_id = "HuggingFaceM4/Idefics3-8B-Llama3"
|
|
| 27 |
destination_model_id = "eltorio/IDEFICS3_ROCOv2"
|
| 28 |
output_dir = "IDEFICS3_ROCOv2"
|
| 29 |
cache_dir = "/workspace/data"
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
DEVICE = "cuda:0"
|
| 33 |
USE_LORA = False
|
|
@@ -127,7 +137,7 @@ training_args = TrainingArguments(
|
|
| 127 |
gradient_accumulation_steps = 8,
|
| 128 |
dataloader_pin_memory = False,
|
| 129 |
save_total_limit = 3,
|
| 130 |
-
|
| 131 |
save_strategy = "steps",
|
| 132 |
eval_steps = 100,
|
| 133 |
save_steps = 10, # checkpoint each 10 steps
|
|
@@ -146,6 +156,7 @@ trainer = Trainer(
|
|
| 146 |
args = training_args,
|
| 147 |
data_collator = data_collator,
|
| 148 |
train_dataset = train_dataset,
|
|
|
|
| 149 |
)
|
| 150 |
|
| 151 |
trainer.train()
|
|
|
|
| 2 |
# License: Apache License 2.0
|
| 3 |
# Description: Train the model on the dataset
|
| 4 |
import os
|
| 5 |
+
import sys
|
| 6 |
import torch
|
| 7 |
|
| 8 |
from huggingface_hub import login as hf_login
|
|
|
|
| 13 |
disable_progress_bar()
|
| 14 |
|
| 15 |
HF_TOKEN = ""
|
| 16 |
+
arguments = sys.argv[1:]
|
| 17 |
|
| 18 |
if os.environ.get('HF_TOKEN') is not None:
|
| 19 |
HF_TOKEN = os.environ.get('HF_TOKEN')
|
| 20 |
print(f"Hugging Face token found in environment variable")
|
| 21 |
|
| 22 |
+
# If HF_TOKEN is empty, check whether the first argument looks like a token (i.e. starts with "hf_")
|
| 23 |
+
if not HF_TOKEN and arguments and arguments[0].startswith("hf_"):
|
| 24 |
+
HF_TOKEN = arguments[0]
|
| 25 |
+
print(f"Hugging Face token found in script arguments")
|
| 26 |
+
|
| 27 |
hf_login(
|
| 28 |
token=HF_TOKEN,
|
| 29 |
add_to_git_credential=True
|
|
|
|
| 34 |
destination_model_id = "eltorio/IDEFICS3_ROCOv2"
|
| 35 |
output_dir = "IDEFICS3_ROCOv2"
|
| 36 |
cache_dir = "/workspace/data"
|
| 37 |
+
|
| 38 |
+
full_dataset = load_dataset(dataset_id,keep_in_memory=False)
|
| 39 |
+
train_dataset = full_dataset["train"]
|
| 40 |
+
eval_dataset = full_dataset["validation"]
|
| 41 |
|
| 42 |
DEVICE = "cuda:0"
|
| 43 |
USE_LORA = False
|
|
|
|
| 137 |
gradient_accumulation_steps = 8,
|
| 138 |
dataloader_pin_memory = False,
|
| 139 |
save_total_limit = 3,
|
| 140 |
+
eval_strategy = "steps",
|
| 141 |
save_strategy = "steps",
|
| 142 |
eval_steps = 100,
|
| 143 |
save_steps = 10, # checkpoint each 10 steps
|
|
|
|
| 156 |
args = training_args,
|
| 157 |
data_collator = data_collator,
|
| 158 |
train_dataset = train_dataset,
|
| 159 |
+
eval_dataset = train_dataset,
|
| 160 |
)
|
| 161 |
|
| 162 |
trainer.train()
|
start.sh
CHANGED
|
@@ -6,14 +6,18 @@ export HOME=/workspace
|
|
| 6 |
cd /workspace
|
| 7 |
git lfs install
|
| 8 |
if [[ -z "$HF_TOKEN" || ! "$HF_TOKEN" =~ ^hf_ ]]; then
|
| 9 |
-
export HF_TOKEN=$
|
| 10 |
-
unset $
|
| 11 |
fi
|
| 12 |
|
| 13 |
echo "HF_TOKEN: $HF_TOKEN"
|
| 14 |
. /workspace/.bashrc
|
| 15 |
. /workspace/.miniconda3/bin/activate
|
|
|
|
|
|
|
| 16 |
git clone https://huggingface.co/eltorio/IDEFICS3_ROCOv2
|
|
|
|
|
|
|
| 17 |
git config --global credential.helper store
|
| 18 |
|
| 19 |
huggingface-cli login --add-to-git-credential --token $HF_TOKEN
|
|
|
|
| 6 |
cd /workspace
|
| 7 |
git lfs install
|
| 8 |
if [[ -z "$HF_TOKEN" || ! "$HF_TOKEN" =~ ^hf_ ]]; then
|
| 9 |
+
export HF_TOKEN=${!#}
|
| 10 |
+
unset ${!#}
|
| 11 |
fi
|
| 12 |
|
| 13 |
echo "HF_TOKEN: $HF_TOKEN"
|
| 14 |
. /workspace/.bashrc
|
| 15 |
. /workspace/.miniconda3/bin/activate
|
| 16 |
+
export SHELL=/bin/bash
|
| 17 |
+
|
| 18 |
git clone https://huggingface.co/eltorio/IDEFICS3_ROCOv2
|
| 19 |
+
git config --global user.email "[email protected]"
|
| 20 |
+
git config --global user.name "[email protected]"
|
| 21 |
git config --global credential.helper store
|
| 22 |
|
| 23 |
huggingface-cli login --add-to-git-credential --token $HF_TOKEN
|