llama_vs_gemma_tokenizer

Created Diff never expires
16 removals
50 lines
227 additions
262 lines
normalizer_spec {
normalizer_spec {
name: "identity"
name: "identity"
precompiled_charsmap: ""
precompiled_charsmap: ""
add_dummy_prefix: true
add_dummy_prefix: false
remove_extra_whitespaces: false
remove_extra_whitespaces: false
normalization_rule_tsv: ""
normalization_rule_tsv: ""
}
}


trainer_spec {
trainer_spec {
input: "/large_experiments/theorem/datasets/MERGED/all.test1.merged"
model_prefix: "/cns/mf-d/home/gemini-data-access/tokenizers/final_v1_51GB_run1/bpe_coverage_0_999995_v5/255969"
model_prefix: "spm_model_32k_200M_charcov099995_allowWSO__v2"
model_type: BPE
model_type: BPE
vocab_size: 32000
vocab_size: 256000
self_test_sample_size: 0
self_test_sample_size: 0
input_format: "text"
input_format: ""
character_coverage: 0.99995
character_coverage: 0.999995
input_sentence_size: 200000000
input_sentence_size: 2000000000
seed_sentencepiece_size: 1000000
seed_sentencepiece_size: 1000000
shrinking_factor: 0.75
shrinking_factor: 0.75
num_threads: 80
num_threads: 16
num_sub_iterations: 2
num_sub_iterations: 2
max_sentence_length: 4192
max_sentence_length: 4192
shuffle_input_sentence: true
shuffle_input_sentence: true
max_sentencepiece_length: 16
max_sentencepiece_length: 16
split_by_unicode_script: true
split_by_unicode_script: true
split_by_whitespace: true
split_by_whitespace: true
split_by_number: true
split_by_number: true
treat_whitespace_as_suffix: false
treat_whitespace_as_suffix: false
split_digits: true
split_digits: true
allow_whitespace_only_pieces: true
allow_whitespace_only_pieces: true
user_defined_symbols: "<mask>"
user_defined_symbols: "<2mass>"
user_defined_symbols: "[@BOS@]"
user_defined_symbols: "<unused0>"
user_defined_symbols: "<unused1>"
user_defined_symbols: "<unused2>"
user_defined_symbols: "<unused3>"
user_defined_symbols: "<unused4>"
user_defined_symbols: "<unused5>"
user_defined_symbols: "<unused6>"
user_defined_symbols: "<unused7>"
user_defined_symbols: "<unused8>"
user_defined_symbols: "<unused9>"
user_defined_symbols: "<unused10>"
user_defined_symbols: "<unused11>"
user_defined_symbols: "<unused12>"
user_defined_symbols: "<unused13>"
user_defined_symbols: "<unused14>"
user_defined_symbols: "<unused15>"
user_defined_symbols: "<unused16>"
user_defined_symbols: "<unused17>"
user_defined_symbols: "<unused18>"
user_defined_symbols: "<unused19>"
user_defined_symbols: "<unused20>"
user_defined_symbols: "<unused21>"
user_defined_symbols: "<unused22>"
user_defined_symbols: "<unused23>"
user_defined_symbols: "<unused24>"
user_defined_symbols: "<unused25>"
user_defined_symbols: "<unused26>"
user_defined_symbols: "<unused27>"
user_defined_symbols: "<unused28>"
user_defined_symbols: "<unused29>"
user_defined_symbols: "<unused30>"
user_defined_symbols: "<unused31>"
user_defined_symbols: "<unused32>"
user_defined_symbols: "<unused33>"
user_defined_symbols: "<unused34>"
user_defined_symbols: "<unused35>"
user_defined_symbols: "<unused36>"
user_defined_symbols: "<unused37>"
user_defined_symbols: "<unused38>"
user_defined_symbols: "<unused39>"
user_defined_symbols: "<unused40>"
user_defined_symbols: "<unused41>"
user_defined_symbols: "<unused42>"
user_defined_symbols: "<unused43>"
user_defined_symbols: "<unused44>"
user_defined_symbols: "<unused45>"
user_defined_symbols: "<unused46>"
user_defined_symbols: "<unused47>"
user_defined_symbols: "<unused48>"
user_defined_symbols: "<unused49>"
user_defined_symbols: "<unused50>"
user_defined_symbols: "<unused51>"
user_defined_symbols: "<unused52>"
user_defined_symbols: "<unused53>"
user_defined_symbols: "<unused54>"
user_defined_symbols: "<unused55>"
user_defined_symbols: "<unused56>"
user_defined_symbols: "<unused57>"
user_defined_symbols: "<unused58>"
user_defined_symbols: "<unused59>"
user_defined_symbols: "<unused60>"
user_defined_symbols: "<unused61>"
user_defined_symbols: "<unused62>"
user_defined_symbols: "<unused63>"
user_defined_symbols: "<unused64>"
user_defined_symbols: "<unused65>"
user_defined_symbols: "<unused66>"
user_defined_symbols: "<unused67>"
user_defined_symbols: "<unused68>"
user_defined_symbols: "<unused69>"
user_defined_symbols: "<unused70>"
user_defined_symbols: "<unused71>"
user_defined_symbols: "<unused72>"
user_defined_symbols: "<unused73>"
user_defined_symbols: "<unused74>"
user_defined_symbols: "<unused75>"
user_defined_symbols: "<unused76>"
user_defined_symbols: "<unused77>"
user_defined_symbols: "<unused78>"
user_defined_symbols: "<unused79>"
user_defined_symbols: "<unused80>"
user_defined_symbols: "<unused81>"
user_defined_symbols: "<unused82>"
user_defined_symbols: "<unused83>"
user_defined_symbols: "<unused84>"
user_defined_symbols: "<unused85>"
user_defined_symbols: "<unused86>"
user_defined_symbols: "<unused87>"
user_defined_symbols: "<unused88>"
user_defined_symbols: "<unused89>"
user_defined_symbols: "<unused90>"
user_defined_symbols: "<unused91>"
user_defined_symbols: "<unused92>"
user_defined_symbols: "<unused93>"
user_defined_symbols: "<unused94>"
user_defined_symbols: "<unused95>"
user_defined_symbols: "<unused96>"
user_defined_symbols: "<unused97>"
user_defined_symbols: "<unused98>"
user_defined_symbols: "<start_of_turn>"
user_defined_symbols: "<end_of_turn>"
user_defined_symbols: "\n"
user_defined_symbols: "\n\n"
user_defined_symbols: "\n\n\n"
user_defined_symbols: "\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n\n\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n\n\n\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n\n\n\n\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n\n\n\n\n\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
user_defined_symbols: "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
user_defined_symbols: "\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201\342\226\201"
user_defined_symbols: "<table>"
user_defined_symbols: "<caption>"
user_defined_symbols: "<thead>"
user_defined_symbols: "<tbody>"
user_defined_symbols: "<tfoot>"
user_defined_symbols: "<tr>"
user_defined_symbols: "<th>"
user_defined_symbols: "<td>"
user_defined_symbols: "</table>"
user_defined_symbols: "</caption>"
user_defined_symbols: "</thead>"
user_defined_symbols: "</tbody>"
user_defined_symbols: "</tfoot>"
user_defined_symbols: "</tr>"
user_defined_symbols: "</th>"
user_defined_symbols: "</td>"
user_defined_symbols: "<h1>"
user_defined_symbols: "<h2>"
user_defined_symbols: "<h3>"
user_defined_symbols: "<h4>"
user_defined_symbols: "<h5>"
user_defined_symbols: "<h6>"
user_defined_symbols: "<blockquote>"
user_defined_symbols: "</h1>"
user_defined_symbols: "</h2>"
user_defined_symbols: "</h3>"
user_defined_symbols: "</h4>"
user_defined_symbols: "</h5>"
user_defined_symbols: "</h6>"
user_defined_symbols: "</blockquote>"
user_defined_symbols: "<strong>"
user_defined_symbols: "<em>"
user_defined_symbols: "<b>"
user_defined_symbols: "<i>"
user_defined_symbols: "<u>"
user_defined_symbols: "<s>"
user_defined_symbols: "<sub>"
user_defined_symbols: "<sup>"
user_defined_symbols: "<code>"
user_defined_symbols: "</strong>"
user_defined_symbols: "</em>"
user_defined_symbols: "</b>"
user_defined_symbols: "</i>"
user_defined_symbols: "</u>"
user_defined_symbols: "</s>"
user_defined_symbols: "</sub>"
user_defined_symbols: "</sup>"
user_defined_symbols: "</code>"
vocabulary_output_piece_score: true
vocabulary_output_piece_score: true
hard_vocab_limit: true
hard_vocab_limit: true
use_all_vocab: false
use_all_vocab: false
byte_fallback: true
byte_fallback: true
required_chars: ""
required_chars: ""
unk_id: 0
unk_id: 3
bos_id: 1
bos_id: 2
eos_id: 2
eos_id: 1
pad_id: -1
pad_id: 0
unk_surface: " \342\201\207 "
unk_surface: " \342\201\207 "
unk_piece: "<unk>"
unk_piece: "<unk>"
bos_piece: "<s>"
bos_piece: "<bos>"
eos_piece: "</s>"
eos_piece: "<eos>"
pad_piece: "<pad>"
pad_piece: "<pad>"
train_extremely_large_corpus: false
train_extremely_large_corpus: true
enable_differential_privacy: false
enable_differential_privacy: false
differential_privacy_noise_level: 0.0
differential_privacy_noise_level: 0.0
differential_privacy_clipping_threshold: 0
differential_privacy_clipping_threshold: 0
}
}


normalizer_spec {
name: "identity"
precompiled_charsmap: ""
add_dummy_prefix: true
remove_extra_whitespaces: false
normalization_rule_tsv: ""
}

trainer_spec {
input: "/large_experiments/theorem/datasets/MERGED/all.test1.merged"
model_prefix: "spm_model_32k_200M_charcov099995_allowWSO__v2"
model_type: BPE
vocab_size: 32000
self_test_sample_size: 0
input_format: "text"
character_coverage: 0.99995
input_sentence_size: 200000000
seed_sentencepiece_size: 1000000
shrinking_factor: 0.75
num_threads: 80
num_sub_iterations: 2
max_sentence_length: 4192
shuffle_input_sentence: true
max_sentencepiece_length: 16
split_by_unicode_script: true
split_by_whitespace: true
split_by_number: true
treat_whitespace_as_suffix: false
split_digits: true
allow_whitespace_only_pieces: true
vocabulary_output_piece_score: true
hard_vocab_limit: true
use_all_vocab: false
byte_fallback: true
required_chars: ""
unk_id: 0
normalizer_spec {
name: "identity"
precompiled_charsmap: ""
add_dummy_prefix: false
remove_extra_whitespaces: false
normalization_rule_tsv: ""
}

trainer_spec {
model_prefix: "/cns/mf-d/home/gemini-data-access/tokenizers/final_v1_51GB_run1/bpe_coverage_0_999995_v5/255969"
model_type: BPE
vocab_size: 256000
self_test_sample_size: 0
input_format: ""
character_coverage: 0.999995
input_sentence_size: 2000000000
seed_sentencepiece_size: 1000000
shrinking_factor: 0.75
num_threads: 16
num_sub_iterations: 2
max_sentence_length: 4192
shuffle_input_sentence: true
max_sentencepiece_length: 16
split_by_unicode_script: true
split_by_whitespace: true
split_by_number: true
treat_whitespace_as_suffix: false
split_digits: true
allow_whitespace_only_pieces: true
user_defined_symbols: "<mask>"
user_defined_symbols: "<2mass>"
user_defined_symbols: "[@BOS@]"
user_defined_symbols: "<unused0>"
user_defined_symbols: "<unused1>"
user_defined_symbols: "<unused2>"
user_defined_symbols: "<unused3>"