ToastyPigeon committed
Commit 00b097b · verified · 1 parent: e6e8e7b

Training in progress, step 108, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:39a9749ffef3c830174bf40404cc485fd4c5663eea6479de77c33a29ef1deb34
+ oid sha256:c094006aff2544c518cad78661fa2d417641d73d218ae414283344824b1062ab
  size 536992880
last-checkpoint/global_step108/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:993f47f680b9c7f6c4b4911b8a984dc9f216a9258a0d0dfdf9ba5fdf99d7a9a8
+ size 474418016
last-checkpoint/global_step108/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2317c4a1a8b5e45e3d00f5de8aad40af1bd4e404f24a766bcbbbce7598206a2e
+ size 474418016
last-checkpoint/global_step108/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eff2d4c6ef7d103246066c2d03f1c4a652233b498c769fa6d433898efb7dd732
+ size 474418016
last-checkpoint/global_step108/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ee011b1c3b3976e689095614cfa291679912347413aaea2362e43751e3dfc375
+ size 474418016
last-checkpoint/global_step108/zero_pp_rank_0_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c17f05463192f4886edc573fd4c4e4f6cb6202e4deaa036fdab7eb4cff6e586
+ size 497569222
last-checkpoint/global_step108/zero_pp_rank_1_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7fd88fdb5135c44e803a5da8c2d4e0d208c71102a545e4728399dd8950b42c0f
+ size 497569222
last-checkpoint/global_step108/zero_pp_rank_2_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b4373a57b2c2cf30deab569dc4584821bc97574328a1e515b5e032ead0f92c1d
+ size 497569222
last-checkpoint/global_step108/zero_pp_rank_3_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bd8662b9062b826b371c261d71462248c98a4810485455b150211d5755dbbace
+ size 497569222
last-checkpoint/latest CHANGED
@@ -1 +1 @@
- global_step81
+ global_step108
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:cc34b9b7583664e073536f09b431c2900517f2eae037be0f350bebadc76f3a7a
+ oid sha256:1d4a604acb40801033a18ef26f6d4904ccae9e17235a30689cbdd297cbf635ba
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:670ee22b03cb2f6e494846f535e536868078808871c8e82661cead03e336da23
+ oid sha256:85ba958b693b3be1b1488820ef712ba2c6d3640a856dcd2409bea15cae78d6d8
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ce6f51e01ab770161e01a525ecba8e0e6f6221b9b8dacc64707f0f41b381007a
+ oid sha256:9d77af35e19f2f989f3468f3f525e24fb127360a01895d63c49b57c6fbe31341
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d55b1f769d6bd103cedb3990998f0b34e4e1d63c6ffd0c2ac3a5f8c4526e3b06
+ oid sha256:24628b38b313ad9b720f780a5da8d10d2b58d9282e312631fe7ef1740e6d74fb
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b0b842b024e7609e993d6c174def3f8f329d0b2e8d4f00c44396e44f9f3e3cda
+ oid sha256:347e78783bdffbe59a6a30fe7d177e31fdbdc48384eb3094510fabb6fef09779
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.7465437788018433,
+ "epoch": 0.9953917050691244,
  "eval_steps": 11,
- "global_step": 81,
+ "global_step": 108,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -638,6 +638,211 @@
  "learning_rate": 2.9004497406582e-05,
  "loss": 2.1402,
  "step": 81
+ },
+ {
+ "epoch": 0.7557603686635944,
+ "grad_norm": 0.009968014397025708,
+ "learning_rate": 2.8728819646461336e-05,
+ "loss": 2.2383,
+ "step": 82
+ },
+ {
+ "epoch": 0.7649769585253456,
+ "grad_norm": 0.009620205315061477,
+ "learning_rate": 2.8461301125590613e-05,
+ "loss": 2.2639,
+ "step": 83
+ },
+ {
+ "epoch": 0.7741935483870968,
+ "grad_norm": 0.00977544354748898,
+ "learning_rate": 2.8202190698460053e-05,
+ "loss": 2.265,
+ "step": 84
+ },
+ {
+ "epoch": 0.783410138248848,
+ "grad_norm": 0.010307755448269416,
+ "learning_rate": 2.7951729398075695e-05,
+ "loss": 2.2436,
+ "step": 85
+ },
+ {
+ "epoch": 0.7926267281105991,
+ "grad_norm": 0.009422457517611288,
+ "learning_rate": 2.7710150211742585e-05,
+ "loss": 2.25,
+ "step": 86
+ },
+ {
+ "epoch": 0.8018433179723502,
+ "grad_norm": 0.009754638407058579,
+ "learning_rate": 2.7477677864332375e-05,
+ "loss": 2.2758,
+ "step": 87
+ },
+ {
+ "epoch": 0.8110599078341014,
+ "grad_norm": 0.009561231269697724,
+ "learning_rate": 2.7254528609236844e-05,
+ "loss": 2.2473,
+ "step": 88
+ },
+ {
+ "epoch": 0.8110599078341014,
+ "eval_loss": 2.2800304889678955,
+ "eval_runtime": 296.664,
+ "eval_samples_per_second": 0.674,
+ "eval_steps_per_second": 0.169,
+ "step": 88
+ },
+ {
+ "epoch": 0.8202764976958525,
+ "grad_norm": 0.01112849185110947,
+ "learning_rate": 2.7040910027201976e-05,
+ "loss": 2.3045,
+ "step": 89
+ },
+ {
+ "epoch": 0.8294930875576036,
+ "grad_norm": 0.010506389237890506,
+ "learning_rate": 2.683702083322948e-05,
+ "loss": 2.2741,
+ "step": 90
+ },
+ {
+ "epoch": 0.8387096774193549,
+ "grad_norm": 0.009768468870786754,
+ "learning_rate": 2.6643050691725647e-05,
+ "loss": 2.2703,
+ "step": 91
+ },
+ {
+ "epoch": 0.847926267281106,
+ "grad_norm": 0.010141192796013403,
+ "learning_rate": 2.6459180040069244e-05,
+ "loss": 2.2548,
+ "step": 92
+ },
+ {
+ "epoch": 0.8571428571428571,
+ "grad_norm": 0.010229279382807752,
+ "learning_rate": 2.6285579920762826e-05,
+ "loss": 2.2488,
+ "step": 93
+ },
+ {
+ "epoch": 0.8663594470046083,
+ "grad_norm": 0.010307954340086967,
+ "learning_rate": 2.6122411822323345e-05,
+ "loss": 2.275,
+ "step": 94
+ },
+ {
+ "epoch": 0.8755760368663594,
+ "grad_norm": 0.011123883009152893,
+ "learning_rate": 2.5969827529060266e-05,
+ "loss": 2.124,
+ "step": 95
+ },
+ {
+ "epoch": 0.8847926267281107,
+ "grad_norm": 0.009608681495586172,
+ "learning_rate": 2.5827968979880805e-05,
+ "loss": 2.3229,
+ "step": 96
+ },
+ {
+ "epoch": 0.8940092165898618,
+ "grad_norm": 0.009782962640744528,
+ "learning_rate": 2.5696968136253768e-05,
+ "loss": 2.2763,
+ "step": 97
+ },
+ {
+ "epoch": 0.9032258064516129,
+ "grad_norm": 0.009605554717833914,
+ "learning_rate": 2.5576946859454592e-05,
+ "loss": 2.2603,
+ "step": 98
+ },
+ {
+ "epoch": 0.9124423963133641,
+ "grad_norm": 0.009456935936538857,
+ "learning_rate": 2.5468016797206052e-05,
+ "loss": 2.1912,
+ "step": 99
+ },
+ {
+ "epoch": 0.9124423963133641,
+ "eval_loss": 2.27895188331604,
+ "eval_runtime": 291.7402,
+ "eval_samples_per_second": 0.686,
+ "eval_steps_per_second": 0.171,
+ "step": 99
+ },
+ {
+ "epoch": 0.9216589861751152,
+ "grad_norm": 0.010595387456993979,
+ "learning_rate": 2.5370279279819864e-05,
+ "loss": 2.2735,
+ "step": 100
+ },
+ {
+ "epoch": 0.9308755760368663,
+ "grad_norm": 0.010469741198338179,
+ "learning_rate": 2.528382522593594e-05,
+ "loss": 2.2018,
+ "step": 101
+ },
+ {
+ "epoch": 0.9400921658986175,
+ "grad_norm": 0.01162265413802059,
+ "learning_rate": 2.5208735057946846e-05,
+ "loss": 2.3279,
+ "step": 102
+ },
+ {
+ "epoch": 0.9493087557603687,
+ "grad_norm": 0.0109629214839692,
+ "learning_rate": 2.5145078627186302e-05,
+ "loss": 2.2708,
+ "step": 103
+ },
+ {
+ "epoch": 0.9585253456221198,
+ "grad_norm": 0.010413150712905432,
+ "learning_rate": 2.5092915148951153e-05,
+ "loss": 2.3774,
+ "step": 104
+ },
+ {
+ "epoch": 0.967741935483871,
+ "grad_norm": 0.010610525090597078,
+ "learning_rate": 2.5052293147417382e-05,
+ "loss": 2.231,
+ "step": 105
+ },
+ {
+ "epoch": 0.9769585253456221,
+ "grad_norm": 0.010306572358828672,
+ "learning_rate": 2.5023250410501333e-05,
+ "loss": 2.4008,
+ "step": 106
+ },
+ {
+ "epoch": 0.9861751152073732,
+ "grad_norm": 0.010261194384190835,
+ "learning_rate": 2.5005813954708107e-05,
+ "loss": 2.3173,
+ "step": 107
+ },
+ {
+ "epoch": 0.9953917050691244,
+ "grad_norm": 0.010312253992880615,
+ "learning_rate": 2.5e-05,
+ "loss": 2.3206,
+ "step": 108
  }
  ],
  "logging_steps": 1,
@@ -652,12 +857,12 @@
  "should_evaluate": false,
  "should_log": false,
  "should_save": true,
- "should_training_stop": false
+ "should_training_stop": true
  },
  "attributes": {}
  }
  },
- "total_flos": 1.5219590543966208e+16,
+ "total_flos": 2.037172357247795e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null