🪐 A family of small models with 135M, 360M, and 1.7B parameters, trained on a new high-quality dataset.
135m
360m
1.7b
84.7K Pulls Updated 3 months ago
998ab8ecd7e9 · 88MB
-
general.architecturellama
-
general.base_model.0.nameSmolLM 135M
-
general.base_model.0.organizationHuggingFaceTB
-
general.base_model.0.repo_urlhttps://huggingface.co/HuggingFaceTB/SmolLM-135M
-
general.base_model.count1
-
general.basenameSmolLM
-
general.datasets[Magpie-Align/Magpie-Pro-300K-Filtered, bigcode/self-oss-instruct-sc2-exec-filter-50k, teknium/OpenHermes-2.5, HuggingFaceTB/everyday-conversations-llama3.1-2k]
-
general.file_type10
-
general.finetuneInstruct
-
general.languages[en]
-
general.licenseapache-2.0
-
general.nameSmolLM 135M
-
general.organizationHuggingFaceTB
-
general.quantization_version2
-
general.size_label135M
-
general.tags[alignment-handbook, trl, sft]
-
general.typemodel
-
llama.attention.head_count9
-
llama.attention.head_count_kv3
-
llama.attention.layer_norm_rms_epsilon1e-05
-
llama.block_count30
-
llama.context_length2048
-
llama.embedding_length576
-
llama.feed_forward_length1536
-
llama.rope.dimension_count64
-
llama.rope.freq_base10000
-
llama.vocab_size49152
-
tokenizer.ggml.add_bos_tokenfalse
-
tokenizer.ggml.add_space_prefixfalse
-
tokenizer.ggml.bos_token_id1
-
tokenizer.ggml.eos_token_id2
-
tokenizer.ggml.merges[Ġ t, Ġ a, i n, h e, Ġ Ġ, ...]
-
tokenizer.ggml.modelgpt2
-
tokenizer.ggml.padding_token_id2
-
tokenizer.ggml.presmollm
-
tokenizer.ggml.token_type[3, 3, 3, 3, 3, ...]
-
tokenizer.ggml.tokens[<|endoftext|>, <|im_start|>, <|im_end|>, <repo_name>, <reponame>, ...]
-
tokenizer.ggml.unknown_token_id0
-
NameTypeShape
-
token_embd.weightQ8_0[576, 49152]
-
blk.0.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.0.attn_norm.weightF32[576]
-
blk.0.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.0.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.0.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.0.ffn_down.weightQ3_K[1536, 576]
-
blk.0.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.0.ffn_norm.weightF32[576]
-
blk.0.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.1.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.1.attn_norm.weightF32[576]
-
blk.1.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.1.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.1.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.1.ffn_down.weightQ3_K[1536, 576]
-
blk.1.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.1.ffn_norm.weightF32[576]
-
blk.1.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.2.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.2.attn_norm.weightF32[576]
-
blk.2.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.2.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.2.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.2.ffn_down.weightQ3_K[1536, 576]
-
blk.2.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.2.ffn_norm.weightF32[576]
-
blk.2.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.3.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.3.attn_norm.weightF32[576]
-
blk.3.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.3.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.3.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.3.ffn_down.weightQ3_K[1536, 576]
-
blk.3.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.3.ffn_norm.weightF32[576]
-
blk.3.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.4.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.4.attn_norm.weightF32[576]
-
blk.4.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.4.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.4.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.4.ffn_down.weightQ3_K[1536, 576]
-
blk.4.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.4.ffn_norm.weightF32[576]
-
blk.4.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.5.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.5.attn_norm.weightF32[576]
-
blk.5.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.5.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.5.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.5.ffn_down.weightQ3_K[1536, 576]
-
blk.5.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.5.ffn_norm.weightF32[576]
-
blk.5.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.6.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.6.attn_norm.weightF32[576]
-
blk.6.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.6.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.6.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.6.ffn_down.weightQ3_K[1536, 576]
-
blk.6.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.6.ffn_norm.weightF32[576]
-
blk.6.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.7.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.7.attn_norm.weightF32[576]
-
blk.7.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.7.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.7.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.7.ffn_down.weightQ3_K[1536, 576]
-
blk.7.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.7.ffn_norm.weightF32[576]
-
blk.7.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.8.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.8.attn_norm.weightF32[576]
-
blk.8.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.8.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.8.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.8.ffn_down.weightQ3_K[1536, 576]
-
blk.8.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.8.ffn_norm.weightF32[576]
-
blk.8.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.9.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.9.attn_norm.weightF32[576]
-
blk.9.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.9.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.9.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.9.ffn_down.weightQ3_K[1536, 576]
-
blk.9.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.9.ffn_norm.weightF32[576]
-
blk.9.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.10.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.10.attn_norm.weightF32[576]
-
blk.10.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.10.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.10.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.10.ffn_down.weightQ3_K[1536, 576]
-
blk.10.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.10.ffn_norm.weightF32[576]
-
blk.10.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.11.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.11.attn_norm.weightF32[576]
-
blk.11.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.11.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.11.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.11.ffn_down.weightQ3_K[1536, 576]
-
blk.11.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.11.ffn_norm.weightF32[576]
-
blk.11.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.12.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.12.attn_norm.weightF32[576]
-
blk.12.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.12.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.12.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.12.ffn_down.weightQ3_K[1536, 576]
-
blk.12.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.12.ffn_norm.weightF32[576]
-
blk.12.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.13.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.13.attn_norm.weightF32[576]
-
blk.13.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.13.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.13.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.13.ffn_down.weightQ3_K[1536, 576]
-
blk.13.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.13.ffn_norm.weightF32[576]
-
blk.13.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.14.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.14.attn_norm.weightF32[576]
-
blk.14.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.14.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.14.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.14.ffn_down.weightQ3_K[1536, 576]
-
blk.14.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.14.ffn_norm.weightF32[576]
-
blk.14.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.15.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.15.attn_norm.weightF32[576]
-
blk.15.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.15.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.15.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.15.ffn_down.weightQ3_K[1536, 576]
-
blk.15.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.15.ffn_norm.weightF32[576]
-
blk.15.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.16.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.16.attn_norm.weightF32[576]
-
blk.16.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.16.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.16.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.16.ffn_down.weightQ3_K[1536, 576]
-
blk.16.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.16.ffn_norm.weightF32[576]
-
blk.16.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.17.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.17.attn_norm.weightF32[576]
-
blk.17.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.17.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.17.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.17.ffn_down.weightQ3_K[1536, 576]
-
blk.17.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.17.ffn_norm.weightF32[576]
-
blk.17.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.18.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.18.attn_norm.weightF32[576]
-
blk.18.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.18.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.18.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.18.ffn_down.weightQ3_K[1536, 576]
-
blk.18.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.18.ffn_norm.weightF32[576]
-
blk.18.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.19.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.19.attn_norm.weightF32[576]
-
blk.19.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.19.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.19.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.19.ffn_down.weightQ3_K[1536, 576]
-
blk.19.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.19.ffn_norm.weightF32[576]
-
blk.19.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.20.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.20.attn_norm.weightF32[576]
-
blk.20.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.20.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.20.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.20.ffn_down.weightQ3_K[1536, 576]
-
blk.20.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.20.ffn_norm.weightF32[576]
-
blk.20.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.21.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.21.attn_norm.weightF32[576]
-
blk.21.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.21.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.21.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.21.ffn_down.weightQ3_K[1536, 576]
-
blk.21.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.21.ffn_norm.weightF32[576]
-
blk.21.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.22.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.22.attn_norm.weightF32[576]
-
blk.22.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.22.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.22.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.22.ffn_down.weightQ3_K[1536, 576]
-
blk.22.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.22.ffn_norm.weightF32[576]
-
blk.22.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.23.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.23.attn_norm.weightF32[576]
-
blk.23.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.23.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.23.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.23.ffn_down.weightQ3_K[1536, 576]
-
blk.23.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.23.ffn_norm.weightF32[576]
-
blk.23.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.24.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.24.attn_norm.weightF32[576]
-
blk.24.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.24.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.24.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.24.ffn_down.weightQ3_K[1536, 576]
-
blk.24.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.24.ffn_norm.weightF32[576]
-
blk.24.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.25.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.25.attn_norm.weightF32[576]
-
blk.25.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.25.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.25.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.25.ffn_down.weightQ3_K[1536, 576]
-
blk.25.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.25.ffn_norm.weightF32[576]
-
blk.25.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.26.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.26.attn_norm.weightF32[576]
-
blk.26.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.26.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.26.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.26.ffn_down.weightQ3_K[1536, 576]
-
blk.26.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.26.ffn_norm.weightF32[576]
-
blk.26.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.27.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.27.attn_norm.weightF32[576]
-
blk.27.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.27.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.27.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.27.ffn_down.weightQ3_K[1536, 576]
-
blk.27.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.27.ffn_norm.weightF32[576]
-
blk.27.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.28.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.28.attn_norm.weightF32[576]
-
blk.28.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.28.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.28.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.28.ffn_down.weightQ3_K[1536, 576]
-
blk.28.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.28.ffn_norm.weightF32[576]
-
blk.28.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
blk.29.attn_k.weight(!unknown_type 20!)[576, 192]
-
blk.29.attn_norm.weightF32[576]
-
blk.29.attn_output.weight(!unknown_type 20!)[576, 576]
-
blk.29.attn_q.weight(!unknown_type 20!)[576, 576]
-
blk.29.attn_v.weight(!unknown_type 20!)[576, 192]
-
blk.29.ffn_down.weightQ3_K[1536, 576]
-
blk.29.ffn_gate.weight(!unknown_type 20!)[576, 1536]
-
blk.29.ffn_norm.weightF32[576]
-
blk.29.ffn_up.weight(!unknown_type 20!)[576, 1536]
-
output_norm.weightF32[576]
Metadata
Tensor
blk.0
blk.1
blk.2
blk.3
blk.4
blk.5
blk.6
blk.7
blk.8
blk.9
blk.10
blk.11
blk.12
blk.13
blk.14
blk.15
blk.16
blk.17
blk.18
blk.19
blk.20
blk.21
blk.22
blk.23
blk.24
blk.25
blk.26
blk.27
blk.28
blk.29