🪐 A family of small models with 135M, 360M, and 1.7B parameters, trained on a new high-quality dataset.
135m
360m
1.7b
84.7K Pulls Updated 3 months ago
3b19ffe4b729 · 219MB
-
general.architecturellama
-
general.basenameSmolLM
-
general.datasets[HuggingFaceTB/smollm-corpus]
-
general.file_type10
-
general.languages[en]
-
general.licenseapache-2.0
-
general.nameSmolLM 360M
-
general.quantization_version2
-
general.size_label360M
-
general.typemodel
-
llama.attention.head_count15
-
llama.attention.head_count_kv5
-
llama.attention.layer_norm_rms_epsilon1e-05
-
llama.block_count32
-
llama.context_length2048
-
llama.embedding_length960
-
llama.feed_forward_length2560
-
llama.rope.dimension_count64
-
llama.rope.freq_base10000
-
llama.vocab_size49152
-
tokenizer.ggml.add_bos_tokenfalse
-
tokenizer.ggml.add_space_prefixfalse
-
tokenizer.ggml.bos_token_id0
-
tokenizer.ggml.eos_token_id0
-
tokenizer.ggml.merges[Ġ t, Ġ a, i n, h e, Ġ Ġ, ...]
-
tokenizer.ggml.modelgpt2
-
tokenizer.ggml.presmollm
-
tokenizer.ggml.token_type[3, 3, 3, 3, 3, ...]
-
tokenizer.ggml.tokens[<|endoftext|>, <|im_start|>, <|im_end|>, <repo_name>, <reponame>, ...]
-
tokenizer.ggml.unknown_token_id0
-
NameTypeShape
-
token_embd.weightQ8_0[960, 49152]
-
blk.0.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.0.attn_norm.weightF32[960]
-
blk.0.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.0.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.0.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.0.ffn_down.weightQ3_K[2560, 960]
-
blk.0.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.0.ffn_norm.weightF32[960]
-
blk.0.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.1.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.1.attn_norm.weightF32[960]
-
blk.1.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.1.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.1.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.1.ffn_down.weightQ3_K[2560, 960]
-
blk.1.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.1.ffn_norm.weightF32[960]
-
blk.1.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.2.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.2.attn_norm.weightF32[960]
-
blk.2.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.2.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.2.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.2.ffn_down.weightQ3_K[2560, 960]
-
blk.2.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.2.ffn_norm.weightF32[960]
-
blk.2.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.3.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.3.attn_norm.weightF32[960]
-
blk.3.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.3.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.3.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.3.ffn_down.weightQ3_K[2560, 960]
-
blk.3.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.3.ffn_norm.weightF32[960]
-
blk.3.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.4.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.4.attn_norm.weightF32[960]
-
blk.4.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.4.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.4.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.4.ffn_down.weightQ3_K[2560, 960]
-
blk.4.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.4.ffn_norm.weightF32[960]
-
blk.4.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.5.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.5.attn_norm.weightF32[960]
-
blk.5.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.5.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.5.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.5.ffn_down.weightQ3_K[2560, 960]
-
blk.5.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.5.ffn_norm.weightF32[960]
-
blk.5.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.6.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.6.attn_norm.weightF32[960]
-
blk.6.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.6.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.6.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.6.ffn_down.weightQ3_K[2560, 960]
-
blk.6.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.6.ffn_norm.weightF32[960]
-
blk.6.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.7.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.7.attn_norm.weightF32[960]
-
blk.7.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.7.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.7.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.7.ffn_down.weightQ3_K[2560, 960]
-
blk.7.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.7.ffn_norm.weightF32[960]
-
blk.7.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.8.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.8.attn_norm.weightF32[960]
-
blk.8.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.8.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.8.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.8.ffn_down.weightQ3_K[2560, 960]
-
blk.8.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.8.ffn_norm.weightF32[960]
-
blk.8.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.9.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.9.attn_norm.weightF32[960]
-
blk.9.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.9.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.9.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.9.ffn_down.weightQ3_K[2560, 960]
-
blk.9.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.9.ffn_norm.weightF32[960]
-
blk.9.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.10.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.10.attn_norm.weightF32[960]
-
blk.10.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.10.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.10.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.10.ffn_down.weightQ3_K[2560, 960]
-
blk.10.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.10.ffn_norm.weightF32[960]
-
blk.10.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.11.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.11.attn_norm.weightF32[960]
-
blk.11.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.11.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.11.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.11.ffn_down.weightQ3_K[2560, 960]
-
blk.11.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.11.ffn_norm.weightF32[960]
-
blk.11.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.12.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.12.attn_norm.weightF32[960]
-
blk.12.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.12.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.12.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.12.ffn_down.weightQ3_K[2560, 960]
-
blk.12.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.12.ffn_norm.weightF32[960]
-
blk.12.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.13.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.13.attn_norm.weightF32[960]
-
blk.13.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.13.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.13.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.13.ffn_down.weightQ3_K[2560, 960]
-
blk.13.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.13.ffn_norm.weightF32[960]
-
blk.13.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.14.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.14.attn_norm.weightF32[960]
-
blk.14.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.14.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.14.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.14.ffn_down.weightQ3_K[2560, 960]
-
blk.14.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.14.ffn_norm.weightF32[960]
-
blk.14.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.15.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.15.attn_norm.weightF32[960]
-
blk.15.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.15.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.15.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.15.ffn_down.weightQ3_K[2560, 960]
-
blk.15.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.15.ffn_norm.weightF32[960]
-
blk.15.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.16.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.16.attn_norm.weightF32[960]
-
blk.16.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.16.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.16.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.16.ffn_down.weightQ3_K[2560, 960]
-
blk.16.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.16.ffn_norm.weightF32[960]
-
blk.16.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.17.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.17.attn_norm.weightF32[960]
-
blk.17.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.17.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.17.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.17.ffn_down.weightQ3_K[2560, 960]
-
blk.17.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.17.ffn_norm.weightF32[960]
-
blk.17.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.18.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.18.attn_norm.weightF32[960]
-
blk.18.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.18.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.18.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.18.ffn_down.weightQ3_K[2560, 960]
-
blk.18.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.18.ffn_norm.weightF32[960]
-
blk.18.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.19.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.19.attn_norm.weightF32[960]
-
blk.19.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.19.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.19.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.19.ffn_down.weightQ3_K[2560, 960]
-
blk.19.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.19.ffn_norm.weightF32[960]
-
blk.19.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.20.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.20.attn_norm.weightF32[960]
-
blk.20.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.20.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.20.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.20.ffn_down.weightQ3_K[2560, 960]
-
blk.20.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.20.ffn_norm.weightF32[960]
-
blk.20.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.21.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.21.attn_norm.weightF32[960]
-
blk.21.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.21.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.21.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.21.ffn_down.weightQ3_K[2560, 960]
-
blk.21.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.21.ffn_norm.weightF32[960]
-
blk.21.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.22.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.22.attn_norm.weightF32[960]
-
blk.22.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.22.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.22.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.22.ffn_down.weightQ3_K[2560, 960]
-
blk.22.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.22.ffn_norm.weightF32[960]
-
blk.22.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.23.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.23.attn_norm.weightF32[960]
-
blk.23.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.23.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.23.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.23.ffn_down.weightQ3_K[2560, 960]
-
blk.23.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.23.ffn_norm.weightF32[960]
-
blk.23.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.24.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.24.attn_norm.weightF32[960]
-
blk.24.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.24.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.24.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.24.ffn_down.weightQ3_K[2560, 960]
-
blk.24.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.24.ffn_norm.weightF32[960]
-
blk.24.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.25.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.25.attn_norm.weightF32[960]
-
blk.25.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.25.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.25.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.25.ffn_down.weightQ3_K[2560, 960]
-
blk.25.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.25.ffn_norm.weightF32[960]
-
blk.25.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.26.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.26.attn_norm.weightF32[960]
-
blk.26.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.26.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.26.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.26.ffn_down.weightQ3_K[2560, 960]
-
blk.26.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.26.ffn_norm.weightF32[960]
-
blk.26.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.27.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.27.attn_norm.weightF32[960]
-
blk.27.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.27.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.27.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.27.ffn_down.weightQ3_K[2560, 960]
-
blk.27.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.27.ffn_norm.weightF32[960]
-
blk.27.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.28.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.28.attn_norm.weightF32[960]
-
blk.28.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.28.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.28.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.28.ffn_down.weightQ3_K[2560, 960]
-
blk.28.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.28.ffn_norm.weightF32[960]
-
blk.28.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.29.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.29.attn_norm.weightF32[960]
-
blk.29.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.29.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.29.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.29.ffn_down.weightQ3_K[2560, 960]
-
blk.29.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.29.ffn_norm.weightF32[960]
-
blk.29.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.30.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.30.attn_norm.weightF32[960]
-
blk.30.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.30.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.30.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.30.ffn_down.weightQ3_K[2560, 960]
-
blk.30.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.30.ffn_norm.weightF32[960]
-
blk.30.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
blk.31.attn_k.weight(!unknown_type 20!)[960, 320]
-
blk.31.attn_norm.weightF32[960]
-
blk.31.attn_output.weight(!unknown_type 20!)[960, 960]
-
blk.31.attn_q.weight(!unknown_type 20!)[960, 960]
-
blk.31.attn_v.weight(!unknown_type 20!)[960, 320]
-
blk.31.ffn_down.weightQ3_K[2560, 960]
-
blk.31.ffn_gate.weight(!unknown_type 20!)[960, 2560]
-
blk.31.ffn_norm.weightF32[960]
-
blk.31.ffn_up.weight(!unknown_type 20!)[960, 2560]
-
output_norm.weightF32[960]
Metadata
Tensor
blk.0
blk.1
blk.2
blk.3
blk.4
blk.5
blk.6
blk.7
blk.8
blk.9
blk.10
blk.11
blk.12
blk.13
blk.14
blk.15
blk.16
blk.17
blk.18
blk.19
blk.20
blk.21
blk.22
blk.23
blk.24
blk.25
blk.26
blk.27
blk.28
blk.29
blk.30
blk.31