This is the centrally trained version of Hermes-4.3-36B, released as a research artifact. It was trained on Nous Research's fork of Torchtitan, commit d91ee11d6d5717c95daefcd789e4616ad82b7477.
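For quick experimentation, here is a minimal inference sketch using Hugging Face Transformers. The repository id and the chat-template call below are illustrative assumptions; this card does not specify them.

```python
# Hypothetical usage sketch; the repo id is an assumption, not confirmed by this card.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "NousResearch/Hermes-4.3-36B"  # assumed repository id

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16,  # checkpoint is exported as bfloat16 (see export_dtype below)
    device_map="auto",
)

messages = [{"role": "user", "content": "Briefly explain gradient checkpointing."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output = model.generate(input_ids, max_new_tokens=256)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```

The torchtitan configuration used for the run follows.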
```toml
# torchtitan Config.toml
# NOTE: this toml config is a preset for 64 A100 GPUs.

[job]
dump_folder = "./outputs"
description = "Seed 36B training"

[profiling]
enable_profiling = false
save_traces_folder = "profile_trace"
profile_freq = 100

[metrics]
log_freq = 1
enable_tensorboard = false
save_tb_folder = "tb"
enable_wandb = true

[model]
name = "llama3"
flavor = "36B_seed_flex_attn"
tokenizer_path = "./assets/tokenizer/original/tokenizer.model"
# converters = ["float8"]

[optimizer]
name = "AdamW"
lr = 2.5e-5
eps = 1e-8
weight_decay = 0.01
beta1 = 0.9
beta2 = 0.999

[lr_scheduler]
warmup_steps = 300 # lr scheduler warm up
decay_type = "cosine"

[training]
local_batch_size = 2
global_batch_size = 384
seq_len = 131072
max_norm = 1.0 # grad norm clipping
# steps = 1000
epochs = 4
dataset = "hermes-4"
dataset_type = "preprocessed"
dataset_path = "/home/emozilla/preprocessed-datasets/Hermes-4.3-ByteDance-Seed-OSS-24K"

[compile]
enable = false
components = ["model", "loss"]
#components = ["loss"]

[parallelism]
data_parallel_replicate_degree = 1
data_parallel_shard_degree = -1
tensor_parallel_degree = 1
enable_async_tensor_parallel = true
pipeline_parallel_degree = 1
context_parallel_degree = 1

[checkpoint]
enable = true
folder = "/home/emozilla/dcp/hermes4.3-36b"
interval = 1000
last_save_model_only = true
last_save_in_hf = true
export_dtype = "bfloat16"
async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"]
initial_load_path = "/home/emozilla/dcp/Seed-OSS-36B-Base"

[activation_checkpoint]
mode = "full"
#mode = "selective" # ["none", "selective", "full"]
#selective_ac_option = "op" # "int" = ac every positive int layer or 'op', ac based on ops policy

[quantize.linear.float8]
enable_fsdp_float8_all_gather = false
precompute_float8_dynamic_scale_for_fsdp = false
filter_fqns = ["output"]
```
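As a sanity check on the batch arithmetic implied by the config, here is a short worked example. The 64-GPU count comes from the preset note at the top of the config; treating the gap between local and global batch as gradient accumulation is an assumption about the layout, not something this card states.

```python
# Derive effective batch sizes from the config values above.
num_gpus = 64            # per the "preset for 64 A100 GPUs" note
local_batch_size = 2     # sequences per GPU per micro-batch
global_batch_size = 384  # sequences per optimizer step
seq_len = 131072         # 128K-token context window

# Micro-steps of gradient accumulation needed to reach the global batch,
# assuming the batch is sharded purely data-parallel across all GPUs.
grad_accum = global_batch_size // (local_batch_size * num_gpus)

# Total tokens processed per optimizer step at full context length.
tokens_per_step = global_batch_size * seq_len

print(grad_accum)       # 3
print(tokens_per_step)  # 50331648 (~50M tokens per optimizer step)
```

Under these assumptions, each optimizer step covers roughly 50M tokens at the full 131,072-token sequence length, with 3 gradient-accumulation micro-steps per GPU.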