def get_num_params(self, non_embedding=True):
    """
    Return the number of parameters in the model.
    For non-embedding count (default), the position embeddings get subtracted.
    The token embeddings would too, except due to the parameter sharing these
    params are actually used as weights in the final layer, so we include them.
    """
    # count every parameter in the model
    n_params = sum(p.numel() for p in self.parameters())
    if non_embedding:
        n_params -= self.transformer.wpe.weight.numel()
    return n_params
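As a quick sanity check of that counting logic, here is a minimal standalone sketch. The GPT-2-small-like sizes (n_embd=768, block_size=1024, vocab_size=50304) are illustrative assumptions, not values taken from this section.

import torch.nn as nn

# illustrative GPT-2-small-like sizes (assumption, not from the source)
n_embd, block_size, vocab_size = 768, 1024, 50304

wte = nn.Embedding(vocab_size, n_embd)  # token embeddings (weight-tied to the final layer, so kept)
wpe = nn.Embedding(block_size, n_embd)  # position embeddings (subtracted)

total = wte.weight.numel() + wpe.weight.numel()
non_embedding = total - wpe.weight.numel()  # same subtraction as get_num_params
print(wpe.weight.numel())  # 786432 position-embedding params removed from the count
print(non_embedding)       # 38633472 token-embedding params remain, due to weight tying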
Symbols used in the FLOPs estimate below: L = cfg.n_layer (number of Transformer layers), H = cfg.n_head (number of attention heads), Q = cfg.n_embd // cfg.n_head (dimension of each head), T = cfg.block_size (sequence length), and N is the total parameter count.

def estimate_mfu(self, fwdbwd_per_iter, dt):
    """ estimate model flops utilization (MFU) in units of A100 bfloat16 peak FLOPS """
    # first estimate the number of flops we do per iteration.
    # see PaLM paper Appendix B as ref: https://arxiv.org/abs/2204.02311
    N = self.get_num_params()
    cfg = self.config
    L, H, Q, T = cfg.n_layer, cfg.n_head, cfg.n_embd//cfg.n_head, cfg.block_size
    flops_per_token = 6*N + 12*L*H*Q*T
    flops_per_fwdbwd = flops_per_token * T
    flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
    # express our flops throughput as ratio of A100 bfloat16 peak flops
    flops_achieved = flops_per_iter * (1.0/dt)  # per second
    flops_promised = 312e12  # A100 GPU bfloat16 peak flops is 312 TFLOPS
    mfu = flops_achieved / flops_promised
    return mfu
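To make the arithmetic concrete, here is a worked instance of the same formula outside the class. The shapes are GPT-2-small-like, and the per-iteration batch (fwdbwd_per_iter) and step time (dt) are purely illustrative assumptions, not measurements from the source.

# worked MFU example with illustrative numbers (all values are assumptions)
N = 124e6                      # non-embedding parameter count, GPT-2-small-like
L, H, Q, T = 12, 12, 64, 1024  # layers, heads, head dim, sequence length

fwdbwd_per_iter = 40           # sequences per optimizer step (batch * grad accum), assumed
dt = 0.28                      # measured seconds per iteration, assumed

flops_per_token = 6 * N + 12 * L * H * Q * T   # ~8.57e8 FLOPs per token
flops_per_fwdbwd = flops_per_token * T         # one full forward+backward pass
flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter

flops_achieved = flops_per_iter / dt           # FLOPs per second actually sustained
flops_promised = 312e12                        # A100 bfloat16 peak
print(f"MFU = {flops_achieved / flops_promised:.1%}")  # ~40.2% with these numbers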