---
# ASR fine-tuning configuration: model, dataset, tokenizer, optimizer,
# trainer and regime sections for a single training run.
#
# NOTE(review): the previous text of this file had every line wrapped in
# table pipes ("| key: value | |") with all indentation stripped, which is
# not loadable as YAML. The nesting below is reconstructed from key order
# and naming -- confirm it against the schema of the recipe that reads it.
#
# Integers such as 16_000 use underscore digit separators. That is a
# YAML 1.1 feature (accepted by e.g. PyYAML); a strict YAML 1.2 loader would
# read them as strings.

model:
  name: "p017_omni_ctc_300m_v2_fleurs_fa_ir_final"

dataset:
  name: "thomcles_persian_farsi_speech"
  train_split: "train"
  valid_split: "dev"
  storage_mode: "MIXTURE_PARQUET"
  task_mode: "ASR"
  mixture_parquet_storage_config:
    # NOTE(review): absolute, machine-specific path -- it must exist on the
    # host that runs the job; adjust when running elsewhere.
    dataset_summary_path: "/home/simon/github/peacock-asr/projects/P017-Persian-Evals/data/thomcles_persian_omni/language_distribution_0.tsv"
    beta_corpus: 0.5
    beta_language: 0.5
    fragment_loading:
      cache: true
  asr_task_config:
    # NOTE(review): units of the length limits are not stated here
    # (presumably audio samples) -- confirm with the data pipeline.
    min_audio_len: 16_000
    max_audio_len: 960_000
    # Same value as max_audio_len above.
    max_num_elements: 960_000
    batch_shuffle_window: 1
    normalize_audio: true
    example_shuffle_window: 1

tokenizer:
  name: "omniASR_tokenizer_written_v2"

optimizer:
  config:
    # Written with a mantissa dot so that YAML 1.1 loaders (e.g. PyYAML)
    # resolve it as a float; the dot-less spelling "1e-05" is loaded as a
    # string by those parsers.
    lr: 1.0e-05

trainer:
  freeze_encoder_for_n_steps: 0
  mixed_precision:
    dtype: "torch.bfloat16"
  grad_accumulation:
    num_batches: 8
  activation_checkpointing:
    mode: "layerwise"
    every_nth_layer: 1
  gc_every_n_steps: 100

regime:
  num_steps: 5_000
  # NOTE(review): the *_after_n_steps thresholds are set to 499, one below
  # the 500-step interval -- presumably so the first validation/checkpoint
  # happens at step 500; verify against the trainer's semantics.
  validate_every_n_steps: 500
  validate_after_n_steps: 499
  checkpoint_every_n_steps: 500
  checkpoint_after_n_steps: 499
  save_model_only: "all_but_last"
  keep_last_n_checkpoints: 2
  publish_metrics_every_n_steps: 1
  publish_metrics_after_n_steps: 0