Home

Awesome

ShortGPT

Unofficial implementations of:

To Use

Details

Results

Comparison of ShortGPT layers removed on Llama-2-7B (9 least important layers):

Paper: [27, 26, 25, 28, 24, 29, 23, 21, 22]
This Implementation: [25, 27, 24, 26, 28, 29, 23, 22, 21]

The same layers are identified, only in a different order of importance.

TODO:

Citations

@misc{men2024shortgpt,
    author        = {Men, Xin and Xu, Mingyu and Zhang, Qingyu and Wang, Bingning and Lin, Hongyu and Lu, Yaojie and Han, Xianpei and Chen, Weipeng},
    title         = {{ShortGPT}: Layers in Large Language Models are More Redundant Than You Expect},
    year          = {2024},
    eprint        = {2403.03853},
    archiveprefix = {arXiv},
    primaryclass  = {cs.CL},
}

@misc{gromov2024unreasonable,
    author        = {Gromov, Andrey and Tirumala, Kushal and Shapourian, Hassan and Glorioso, Paolo and Roberts, Daniel A.},
    title         = {The Unreasonable Ineffectiveness of the Deeper Layers},
    year          = {2024},
    eprint        = {2403.17887},
    archiveprefix = {arXiv},
    primaryclass  = {cs.CL},
}

@misc{song2024sleb,
    author        = {Song, Jiwon and Oh, Kyungseok and Kim, Taesu and Kim, Hyungjun and Kim, Yulhwa and Kim, Jae-Joon},
    title         = {{SLEB}: Streamlining {LLMs} through Redundancy Verification and Elimination of {Transformer} Blocks},
    year          = {2024},
    eprint        = {2402.09025},
    archiveprefix = {arXiv},
    primaryclass  = {cs.CL},
}

@misc{raecompressive2019,
    author        = {Rae, Jack W. and Potapenko, Anna and Jayakumar, Siddhant M. and Hillier, Chloe and Lillicrap, Timothy P.},
    title         = {Compressive {Transformers} for Long-Range Sequence Modelling},
    year          = {2019},
    eprint        = {1911.05507},
    archiveprefix = {arXiv},
    url           = {https://arxiv.org/abs/1911.05507},
}