Upload checkpoint, sanitized config, and transcripts for ctc-baseline_xlsr_set_4

Files changed (5) hide show

README.md +41 -0
config.yaml +342 -0
hyp.trn +0 -0
ref.trn +0 -0
valid.loss.best.pth +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,41 @@

+---
+title: "CTC-Baseline XLSR-based ASR model - set 4"
+language: multilingual
+tags:
+  - asr
+  - ctc-dro
+  - XLSR
+license: cc-by-nc-4.0
+---
+# CTC-Baseline XLSR-based ASR model - set 4
+This repository contains a CTC-Baseline XLSR-based automatic speech recognition (ASR) model trained with ESPnet.
+The model was trained on balanced training data from set 4.
+## Intended Use
+This model is intended for ASR. Users can run inference using the provided checkpoint (`valid.loss.best.pth`) and configuration file (`config.yaml`):
+```python
+import soundfile as sf
+from espnet2.bin.asr_inference import Speech2Text
+asr_train_config = "ctc-baseline_xlsr_set_4/config.yaml"
+asr_model_file = "ctc-baseline_xlsr_set_4/valid.loss.best.pth"
+model = Speech2Text.from_pretrained(
+    asr_train_config=asr_train_config,
+    asr_model_file=asr_model_file
+)
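+# The frontend is configured with `fs: 16k` in config.yaml; resample input.wav to 16 kHz first if its rate differs.
+speech, _ = sf.read("input.wav")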
+text, *_ = model(speech)[0]
+print("Recognized text:", text)
+```
+## How to Use
+1. Clone this repository.
+2. Use ESPnet’s inference scripts with the provided `config.yaml` and checkpoint file.
+3. Ensure any external resources referenced in `config.yaml` are available at the indicated relative paths (for example, `non_linguistic_symbols: ./nlsyms.txt` and the S3PRL frontend `download_dir: ./hub`).

config.yaml ADDED Viewed

	@@ -0,0 +1,342 @@

+accum_grad: 16
+adapter: lora
+adapter_conf: {}
+allow_multi_rates: false
+allow_variable_data_keys: false
+aux_ctc_tasks: []
+batch_bins: 1000000
+batch_size: 4
+batch_type: duration_language
+best_model_criterion:
+- - valid
+  - loss
+  - min
+bpemodel: null
+chunk_default_fs: null
+chunk_excluded_key_prefixes: []
+chunk_length: 500
+chunk_shift_ratio: 0.5
+cleaner: null
+collect_stats: false
+create_graph_in_tensorboard: false
+ctc_conf:
+  ctc_type: builtin
+cudnn_benchmark: false
+cudnn_deterministic: true
+cudnn_enabled: true
+decoder: null
+decoder_conf: {}
+detect_anomaly: false
+distributed: false
+drop_last_iter: false
+dry_run: false
+duration_batch_length: -1
+early_stopping_criterion:
+- valid
+- loss
+- min
+encoder: transformer
+encoder_conf:
+  attention_dropout_rate: 0.1
+  attention_heads: 8
+  dropout_rate: 0.1
+  input_layer: conv2d2
+  linear_units: 1024
+  normalize_before: true
+  num_blocks: 2
+  output_size: 256
+  positional_dropout_rate: 0.1
+exclude_weight_decay: false
+exclude_weight_decay_conf: {}
+fold_length:
+- 80000
+- 150
+freeze_param: []
+frontend: s3prl
+frontend_conf:
+  download_dir: ./hub
+  frontend_conf:
+    upstream: xls_r_300m
+  fs: 16k
+  multilayer_feature: true
+g2p: null
+grad_clip: 5.0
+grad_clip_type: 2.0
+grad_noise: false
+ignore_init_mismatch: false
+init: xavier_uniform
+init_param: []
+input_size: null
+iterator_type: sequence
+joint_net_conf: null
+keep_nbest_models: 3
+log_interval: null
+log_level: INFO
+max_cache_fd: 32
+max_cache_size: 0.0
+max_epoch: 40
+model: espnet
+model_conf:
+  ctc_weight: 1.0
+multiple_iterator: false
+multiprocessing_distributed: false
+nbest_averaging_interval: 0
+ngpu: 1
+no_forward_run: false
+noise_apply_prob: 1.0
+noise_db_range: '13_15'
+noise_scp: null
+non_linguistic_symbols: ./nlsyms.txt
+normalize: utterance_mvn
+normalize_conf: {}
+num_att_plot: 3
+num_cache_chunks: 1024
+num_iters_per_epoch: 1200
+num_workers: 4
+optim: adam
+optim_conf:
+  lr: 0.0001
+  weight_decay: 1.0e-06
+output_dir: ./inference_results
+patience: null
+postencoder: null
+postencoder_conf: {}
+preencoder: linear
+preencoder_conf:
+  input_size: 1024
+  output_size: 80
+preprocessor: default
+preprocessor_conf: {}
+pretrain_path: null
+print_config: false
+required:
+- output_dir
+- token_list
+resume: true
+rir_apply_prob: 1.0
+rir_scp: null
+save_strategy: all
+scheduler: null
+scheduler_conf: {}
+seed: 0
+sharded_ddp: false
+short_noise_thres: 0.5
+shuffle_within_batch: false
+sort_batch: descending
+sort_in_batch: descending
+specaug: specaug
+specaug_conf:
+  apply_freq_mask: true
+  apply_time_mask: true
+  apply_time_warp: true
+  freq_mask_width_range:
+  - 0
+  - 27
+  num_freq_mask: 2
+  num_time_mask: 10
+  time_mask_width_ratio_range:
+  - 0.0
+  - 0.05
+  time_warp_mode: bicubic
+  time_warp_window: 5
+speech_volume_normalize: null
+token_list:
+- <blank>
+- <unk>
+- <space>
+- E
+- A
+- O
+- N
+- S
+- I
+- ا
+- L
+- T
+- R
+- و
+- D
+- ن
+- ر
+- ی
+- ي
+- M
+- U
+- H
+- P
+- ک
+- م
+- C
+- А
+- Ӹ
+- Н
+- B
+- ت
+- س
+- ل
+- J
+- K
+- ہ
+- Т
+- ے
+- G
+- Ш
+- К
+- Е
+- Л
+- Ы
+- V
+- М
+- ج
+- Ӓ
+- ه
+- ب
+- د
+- О
+- Y
+- '[slv]'
+- Р
+- ڪ
+- پ
+- Z
+- '[mrj]'
+- F
+- گ
+- И
+- В
+- ئ
+- Д
+- '[sot]'
+- ں
+- '[spa]'
+- W
+- Q
+- П
+- Г
+- ف
+- ق
+- С
+- ع
+- ش
+- Ж
+- ز
+- ھ
+- آ
+- Č
+- Í
+- У
+- ح
+- '[urd]'
+- Š
+- ٹ
+- چ
+- Ь
+- ٽ
+- '[snd]'
+- ڻ
+- Й
+- ط
+- ص
+- ٿ
+- Ц
+- خ
+- Ó
+- Я
+- Á
+- É
+- Ч
+- ۾
+- '0'
+- Ž
+- З
+- '1'
+- ۽
+- –
+- ڏ
+- Э
+- ڊ
+- —
+- ڈ
+- ء
+- Ñ
+- ڙ
+- ِ
+- '2'
+- ٻ
+- Х
+- Ӱ
+- ظ
+- ض
+- ث
+- ڳ
+- ،
+- X
+- ¡
+- غ
+- ڑ
+- Ӧ
+- ذ
+- ¿
+- '5'
+- ڌ
+- '3'
+- ڀ
+- ُ
+- '9'
+- Ú
+- '4'
+- '8'
+- ۔
+- '6'
+- ٺ
+- Ю
+- »
+- Б
+- «
+- ڇ
+- ً
+- ڃ
+- '7'
+- ڄ
+- ؤ
+- ڍ
+- Ф
+- َ
+- ٰ
+- ّ
+- ڱ
+- ”
+- ژ
+- ڦ
+- Ё
+- ؛
+- ٍ
+- Щ
+- ؟
+- ’
+- ‘
+- °
+- ۃ
+- إ
+- Ć
+- <sos/eos>
+token_type: char
+train_dtype: float32
+unused_parameters: true
+use_adapter: false
+use_amp: false
+use_lang_prompt: false
+use_matplotlib: true
+use_nlp_prompt: false
+use_preprocessor: true
+use_tensorboard: true
+val_scheduler_criterion:
+- valid
+- loss
+valid_batch_bins: null
+valid_batch_size: null
+valid_batch_type: null
+valid_iterator_type: null
+valid_max_cache_size: null
+version: '202402'
+write_collected_feats: false

hyp.trn ADDED Viewed

The diff for this file is too large to render. See raw diff

ref.trn ADDED Viewed

The diff for this file is too large to render. See raw diff

valid.loss.best.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6ea23bca7b78e6588073f319b3b5fe03d7560f607fa29eddd13369f1b032fe13
+size 1288666400