---
library_name: peft
license: apache-2.0
base_model: Qwen/Qwen2.5-32B
tags:
- axolotl
- generated_from_trainer
datasets:
- ToastyPigeon/story-samples
model-index:
- name: qwen32-story-ws-v2
  results: []
---

<!-- This model card has been generated automatically according to the information the Trainer had access to. You
should probably proofread and complete it, then remove this comment. -->

[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
<details><summary>See axolotl config</summary>

axolotl version: `0.6.0`
```yaml
# git clone https://github.com/axolotl-ai-cloud/axolotl
# cd axolotl
# git checkout d425d5d3c3ca7644a9da8ed93c3d03f4be0c4854
# pip3 install packaging ninja huggingface_hub[cli]
# pip install "cut-cross-entropy[transformers] @ git+https://github.com/apple/ml-cross-entropy.git"
# pip3 install -e '.[flash-attn,deepspeed]'
# apt update && apt install libopenmpi-dev
# pip install mpi4py
# huggingface-cli login --token $hf_key && wandb login $wandb_key
# python -m axolotl.cli.preprocess qwen-32b-story.yml
# accelerate launch -m axolotl.cli.train qwen-32b-story.yml
# python -m axolotl.cli.merge_lora qwen-32b-story.yml
# huggingface-cli upload ToastyPigeon/new-ms-rp-test-v0-v3 train-workspace/merged . --exclude "*.md"

# git clone https://github.com/axolotl-ai-cloud/axolotl && cd axolotl && git checkout d8b4027200de0fe60f4ae0a71272c1a8cb2888f7 && pip3 install packaging ninja huggingface_hub[cli,hf_transfer] && pip3 install -e '.[flash-attn,deepspeed]' && cd .. && huggingface-cli login --token $hf_key && wandb login $wandb_key

# Model
base_model: Qwen/Qwen2.5-32B
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

load_in_8bit: false
load_in_4bit: true
strict: false
bf16: true
fp16:
tf32: false
flash_attention: true
special_tokens:

# Output
output_dir: ./train-workspace
hub_model_id: ToastyPigeon/qwen32-story-ws-v2
hub_strategy: "checkpoint"
resume_from_checkpoint:
saves_per_epoch: 4

# Data
sequence_len: 4096 # fits
min_sample_len: 128
dataset_prepared_path: last_run_prepared
datasets:
  - path: ToastyPigeon/story-samples
    type: completion
    field: text
    split: train[:1500]
warmup_ratio: 0.05
shuffle_merged_datasets: true
sample_packing: true
#pad_to_sequence_len: true

# Batching
num_epochs: 1
gradient_accumulation_steps: 4
micro_batch_size: 1
eval_batch_size: 1

# Evaluation
val_set_size: 200
evals_per_epoch: 10
eval_table_size:
eval_max_new_tokens: 256
eval_sample_packing: true

save_safetensors: true

# WandB
wandb_project: Qwen-Test
#wandb_entity:

gradient_checkpointing: 'unsloth'
#gradient_checkpointing_kwargs:
#  use_reentrant: false

unsloth_cross_entropy_loss: true
#unsloth_lora_mlp: true
#unsloth_lora_qkv: true
#unsloth_lora_o: true

# LoRA
adapter: qlora
lora_model_dir:
lora_r: 32
lora_alpha: 32
lora_dropout: 0.5
lora_target_linear:
lora_fan_in_fan_out:
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj
lora_modules_to_save:
#peft_layers_to_transform: [35,36,37,38,39]

# Optimizer
optimizer: paged_ademamix_8bit # adamw_8bit
lr_scheduler: cosine
learning_rate: 5e-5
cosine_min_lr_ratio: 0.5
weight_decay: 0.01
max_grad_norm: 1.0

# Misc
train_on_inputs: false
#group_by_length: true
early_stopping_patience:
local_rank:
logging_steps: 1
xformers_attention:
debug:
deepspeed: /workspace/axolotl/deepspeed_configs/zero3_bf16.json # previously blank
fsdp:
fsdp_config:

plugins:
  - axolotl.integrations.liger.LigerPlugin
#  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
#cut_cross_entropy: true
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: true

gc_steps: 10
seed: 69
```

</details><br>

# qwen32-story-ws-v2

This model is a fine-tuned version of [Qwen/Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B) on the ToastyPigeon/story-samples dataset.
It achieves the following results on the evaluation set:
- Loss: 2.2790
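
For context, that loss corresponds to a token-level perplexity of roughly exp(2.2790) ≈ 9.8 on the validation set (a quick conversion from the reported loss, not a separately measured number):

```python
import math

val_loss = 2.2790  # final validation loss reported above
print(f"validation perplexity ≈ {math.exp(val_loss):.2f}")  # ≈ 9.77
```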

## Model description

This is a rank-32 QLoRA adapter (alpha 32, dropout 0.5) for [Qwen/Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B), trained with axolotl as a plain completion model on long-form story text. The adapter targets the attention and MLP projections (q/k/v/o_proj and gate/up/down_proj) and was trained on top of the 4-bit quantized base model.
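
For readers more used to peft directly, that adapter shape corresponds roughly to the `LoraConfig` below (an illustrative sketch, not the exact object axolotl constructs internally):

```python
from peft import LoraConfig

# Approximate peft equivalent of the axolotl LoRA settings in the config above.
lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    lora_dropout=0.5,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)
```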

## Intended uses & limitations

This is a base-model completion tune for free-form story and narrative continuation, not an instruct or chat model, so prompt it with raw prose rather than a chat template. It has not been evaluated beyond the held-out validation loss reported above. A minimal loading sketch is shown below.
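
A minimal sketch of loading the adapter for inference, assuming you attach it to the 4-bit quantized base model as during training (the quantization settings and sample prompt here are illustrative assumptions, not part of the training config):

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

base_id = "Qwen/Qwen2.5-32B"
adapter_id = "ToastyPigeon/qwen32-story-ws-v2"

# 4-bit NF4 loading mirrors the QLoRA training setup; bf16 also works if you have the VRAM.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(base_id)
model = AutoModelForCausalLM.from_pretrained(base_id, quantization_config=bnb_config, device_map="auto")
model = PeftModel.from_pretrained(model, adapter_id)

prompt = "The lighthouse keeper had not spoken to another soul in three years, until"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.8)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```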

## Training and evaluation data

Training used the first 1,500 samples of [ToastyPigeon/story-samples](https://huggingface.co/datasets/ToastyPigeon/story-samples) (`split: train[:1500]`), treated as plain-text completion data from the `text` field and packed into 4096-token sequences. 200 samples were held out as the validation set (`val_set_size: 200`).
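
A quick way to inspect the same slice of the dataset (a sketch; any columns other than `text` are whatever the dataset ships with):

```python
from datasets import load_dataset

# Same slice the config trains on: the first 1,500 rows of the train split.
ds = load_dataset("ToastyPigeon/story-samples", split="train[:1500]")
print(ds)
print(ds[0]["text"][:500])  # peek at the start of the first story sample
```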

## Training procedure

### Training hyperparameters

The following hyperparameters were used during training:
- learning_rate: 5e-05
- train_batch_size: 1
- eval_batch_size: 1
- seed: 69
- distributed_type: multi-GPU
- num_devices: 4
- gradient_accumulation_steps: 4
- total_train_batch_size: 16
- total_eval_batch_size: 4
- optimizer: paged_ademamix_8bit (no additional optimizer arguments)
- lr_scheduler_type: cosine
- lr_scheduler_warmup_steps: 5
- num_epochs: 1
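
The total train batch size above follows directly from the per-device settings; the tokens-per-step figure below additionally assumes every packed sequence is filled to the full 4096 tokens:

```python
micro_batch_size = 1
gradient_accumulation_steps = 4
num_devices = 4
sequence_len = 4096

total_train_batch_size = micro_batch_size * gradient_accumulation_steps * num_devices
print(total_train_batch_size)                 # 16, matching the value above
print(total_train_batch_size * sequence_len)  # ~65k tokens per optimizer step if packing fills every sequence
```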

### Training results

| Training Loss | Epoch  | Step | Validation Loss |
|:-------------:|:------:|:----:|:---------------:|
| 2.1763        | 0.0092 | 1    | 2.3021          |
| 2.129         | 0.1014 | 11   | 2.2997          |
| 2.2385        | 0.2028 | 22   | 2.2945          |
| 2.233         | 0.3041 | 33   | 2.2906          |
| 2.0907        | 0.4055 | 44   | 2.2874          |
| 2.2263        | 0.5069 | 55   | 2.2848          |
| 2.2703        | 0.6083 | 66   | 2.2828          |
| 2.4101        | 0.7097 | 77   | 2.2813          |
| 2.2473        | 0.8111 | 88   | 2.2800          |
| 2.1912        | 0.9124 | 99   | 2.2790          |

### Framework versions

- PEFT 0.14.0
- Transformers 4.47.1
- PyTorch 2.5.1+cu124
- Datasets 3.2.0
- Tokenizers 0.21.0