{"id":133,"date":"2026-06-13T19:56:31","date_gmt":"2026-06-13T19:56:31","guid":{"rendered":"https:\/\/blog.joezhou.top\/ai-3\/"},"modified":"2026-06-13T19:56:31","modified_gmt":"2026-06-13T19:56:31","slug":"ai-3","status":"publish","type":"post","link":"https:\/\/www.joezhou.top\/?p=133","title":{"rendered":"AI \u6a21\u578b\u4f18\u5316\u6280\u672f\u5168\u666f\uff1a\u4ece\u91cf\u5316\u84b8\u998f\u5230\u63a8\u7406\u52a0\u901f\u7684\u5de5\u7a0b\u5b9e\u6218"},"content":{"rendered":"<p>2026\u5e74\uff0c\u5927\u6a21\u578b\u7684\u90e8\u7f72\u5df2\u7ecf\u4ece&quot;\u80fd\u4e0d\u80fd\u8dd1&quot;\u7684\u95ee\u9898\uff0c\u8f6c\u53d8\u4e3a&quot;\u5982\u4f55\u8dd1\u5f97\u66f4\u5feb\u3001\u66f4\u4fbf\u5b9c\u3001\u66f4\u7a33\u5b9a&quot;\u3002\u968f\u7740\u5343\u4ebf\u53c2\u6570\u6a21\u578b\u4e0d\u65ad\u6d8c\u73b0\uff0c\u6a21\u578b\u4f18\u5316\u6280\u672f\u6210\u4e3a\u4e86AI\u5de5\u7a0b\u5316\u7684\u6838\u5fc3\u8d5b\u9053\u3002\u672c\u6587\u5c06\u5168\u9762\u68b3\u7406\u5f53\u524d\u4e3b\u6d41\u7684\u6a21\u578b\u4f18\u5316\u6280\u672f\u8def\u7ebf\uff0c\u4ece\u91cf\u5316\u3001\u84b8\u998f\u3001\u526a\u679d\u5230\u63a8\u7406\u5f15\u64ce\u4f18\u5316\uff0c\u4e3aAI\u5de5\u7a0b\u5e08\u63d0\u4f9b\u4e00\u4efd\u5b9e\u7528\u7684\u6280\u672f\u5bfc\u822a\u3002<\/p>\n<h2>\u4e00\u3001\u91cf\u5316\uff1a\u7528\u7cbe\u5ea6\u6362\u901f\u5ea6\uff0c\u4f46\u7cbe\u5ea6\u4e0d\u80fd\u4e22\u592a\u591a<\/h2>\n<p>\u91cf\u5316\uff08Quantization\uff09\u662f\u76ee\u524d\u5e94\u7528\u6700\u5e7f\u6cdb\u7684\u6a21\u578b\u52a0\u901f\u6280\u672f\uff0c\u5176\u6838\u5fc3\u903b\u8f91\u5f88\u7b80\u5355\uff1a\u5c06\u6a21\u578b\u53c2\u6570\u4ece\u9ad8\u7cbe\u5ea6\u6d6e\u70b9\u6570\uff08FP32\/FP16\uff09\u8f6c\u6362\u4e3a\u8f83\u4f4e\u7cbe\u5ea6\uff08INT8\/INT4\uff09\uff0c\u4ece\u800c\u663e\u8457\u51cf\u5c11\u6a21\u578b\u4f53\u79ef\u548c\u8ba1\u7b97\u5f00\u9500\u3002<\/p>\n<h3>1.1 
\u91cf\u5316\u7cbe\u5ea6\u5bf9\u6bd4<\/h3>\n<table>\n<thead>\n<tr>\n<th>\u7cbe\u5ea6\u7c7b\u578b<\/th>\n<th>\u5185\u5b58\u5360\u7528\uff08\u76f8\u5bf9FP32\uff09<\/th>\n<th>\u63a8\u7406\u52a0\u901f\u6bd4\uff08\u5178\u578b\u503c\uff09<\/th>\n<th>\u9002\u7528\u573a\u666f<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td>FP32<\/td>\n<td>100%<\/td>\n<td>1x<\/td>\n<td>\u8bad\u7ec3\/\u7cbe\u5ea6\u654f\u611f<\/td>\n<\/tr>\n<tr>\n<td>FP16\/BF16<\/td>\n<td>50%<\/td>\n<td>1.5-2x<\/td>\n<td>GPU\u63a8\u7406\u4f18\u9009<\/td>\n<\/tr>\n<tr>\n<td>INT8<\/td>\n<td>25%<\/td>\n<td>2-4x<\/td>\n<td>CPU\/GPU\u901a\u7528<\/td>\n<\/tr>\n<tr>\n<td>INT4<\/td>\n<td>12.5%<\/td>\n<td>3-6x<\/td>\n<td>\u8fb9\u7f18\u8bbe\u5907<\/td>\n<\/tr>\n<tr>\n<td>NF4<\/td>\n<td>12.5%<\/td>\n<td>3-5x<\/td>\n<td>\u91cf\u5316\u654f\u611f\u6a21\u578b<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<h3>1.2 \u91cf\u5316\u65b9\u5f0f\u7684\u9009\u62e9<\/h3>\n<p><strong>\u8bad\u7ec3\u540e\u91cf\u5316\uff08PTQ\uff09<\/strong> \u662f\u6700\u7b80\u5355\u7684\u65b9\u5f0f\uff0c\u76f4\u63a5\u628a\u8bad\u7ec3\u597d\u7684\u6a21\u578b\u505a\u91cf\u5316\u6821\u51c6\uff0c\u4e0d\u9700\u8981\u91cd\u65b0\u8bad\u7ec3\u3002\u4ee5Intel\u7684NNCF\u548cNVIDIA\u7684TensorRT\u4e3a\u4ee3\u8868\uff0cPTQ\u5df2\u7ecf\u6210\u4e3a\u591a\u6570\u5382\u5546\u7684\u9996\u9009\u65b9\u6848\u3002\u4f18\u70b9\u662f\u5feb\uff0c\u7f3a\u70b9\u662f\u6709\u4e00\u5b9a\u7cbe\u5ea6\u635f\u5931\u3002<\/p>\n<p><strong>\u91cf\u5316\u611f\u77e5\u8bad\u7ec3\uff08QAT\uff09<\/strong> \u5219\u662f\u5728\u8bad\u7ec3\u8fc7\u7a0b\u4e2d\u6a21\u62df\u91cf\u5316\u6548\u679c\uff0c\u8ba9\u6a21\u578b\u53c2\u6570\u81ea\u9002\u5e94\u4f4e\u7cbe\u5ea6\u8868\u793a\u3002QAT\u4e00\u822c\u6bd4PTQ\u7cbe\u5ea6\u9ad80.5-2%\uff0c\u4f46\u9700\u8981\u989d\u5916\u7684\u8bad\u7ec3\u5468\u671f\u3002\u5bf9\u4e8e\u7cbe\u5ea6\u8981\u6c42\u6781\u9ad8\u7684\u573a\u666f\uff08\u5982\u533b\u7597\u3001\u91d1\u878d\uff09\uff0cQAT\u662f\u66f4\u597d\u7684\u9009\u62e9\u3002<\/p>\n<p><strong>AWQ\uff08Activation-aware 
Weight Quantization\uff09<\/strong> \u662f2024\u5e74MIT\u63d0\u51fa\u7684\u6539\u8fdb\u65b9\u6848\uff0c\u901a\u8fc7\u5bf9\u6fc0\u6d3b\u503c\u7684\u611f\u77e5\u6765\u52a8\u6001\u51b3\u5b9a\u54ea\u4e9b\u6743\u91cd\u901a\u9053\u4fdd\u7559\u9ad8\u7cbe\u5ea6\u3002\u5728LLM\u63a8\u7406\u4e2d\uff0cAWQ\u6bd4\u4f20\u7edfINT4\u91cf\u5316\u65b9\u6848\u4fdd\u7559\u66f4\u591a\u91cd\u8981\u4fe1\u606f\uff0c\u5df2\u6210\u4e3a\u4e3b\u6d41\u5f00\u6e90\u65b9\u6848\u4e4b\u4e00\u3002<\/p>\n<h3>1.3 \u91cf\u5316\u5b9e\u6218\u5efa\u8bae<\/h3>\n<p>\u5982\u679c\u4f60\u6b63\u5728\u505aLLM\u843d\u5730\u90e8\u7f72\uff0c\u5efa\u8bae\u8fd9\u6837\u9009\u62e9\u91cf\u5316\u7b56\u7565\uff1a<\/p>\n<ul>\n<li>GPU\u63a8\u7406\u7528FP16+INT8\u6df7\u5408\u91cf\u5316\uff08TensorRT-LLM\u65b9\u6848\uff09<\/li>\n<li>CPU\u63a8\u7406\u7528INT4\u91cf\u5316\uff08\u7ed3\u5408llama.cpp\u6216MLC-LLM\uff09<\/li>\n<li>\u8fb9\u7f18\u8bbe\u5907\u7528NF4+\u7a00\u758f\u5316\u7ec4\u5408<\/li>\n<li>\u7cbe\u5ea6\u654f\u611f\u573a\u666f\u9009QAT\u6216AWQ<\/li>\n<\/ul>\n<h2>\u4e8c\u3001\u84b8\u998f\uff1a\u8ba9\u5b66\u751f\u6a21\u578b\u7ee7\u627f\u6559\u5e08\u6a21\u578b\u7684\u667a\u6167<\/h2>\n<p>\u77e5\u8bc6\u84b8\u998f\uff08Knowledge Distillation\uff09\u7684\u6838\u5fc3\u601d\u60f3\u662f\uff1a\u7528\u4e00\u4e2a\u66f4\u5927\u7684&quot;\u6559\u5e08\u6a21\u578b&quot;\u6307\u5bfc\u4e00\u4e2a\u66f4\u5c0f\u7684&quot;\u5b66\u751f\u6a21\u578b&quot;\u5b66\u4e60\u3002\u5b66\u751f\u6a21\u578b\u4e0d\u9700\u8981\u8fbe\u5230\u6559\u5e08\u6a21\u578b\u7684\u7406\u8bba\u4e0a\u9650\uff0c\u53ea\u9700\u8981\u5728\u7279\u5b9a\u4efb\u52a1\u4e0a&quot;\u5b66\u5f97\u50cf&quot;\u5c31\u591f\u4e86\u3002<\/p>\n<h3>2.1 
\u84b8\u998f\u7684\u4e09\u5c42\u5883\u754c<\/h3>\n<p><strong>\u7b2c\u4e00\u5c42\uff1a\u8f93\u51fa\u5c42\u84b8\u998f<\/strong>\u2014\u2014\u6700\u7b80\u5355\u7684\u84b8\u998f\u65b9\u5f0f\uff0c\u8ba9\u5b66\u751f\u6a21\u578b\u5b66\u4e60\u6559\u5e08\u6a21\u578b\u7684softmax\u8f93\u51fa\u5206\u5e03\u3002\u901a\u8fc7\u6e29\u5ea6\u53c2\u6570\u63a7\u5236softmax\u7684\u5e73\u6ed1\u7a0b\u5ea6\uff0c\u8ba9\u7c7b\u522b\u4e4b\u95f4\u7684\u76f8\u5bf9\u5173\u7cfb\u66f4\u660e\u663e\u3002\u4f8b\u5982\uff0c\u6559\u5e08\u6a21\u578b\u8bf4&quot;\u732b&quot;\u7684\u6982\u7387\u662f0.8\u3001&quot;\u72d7&quot;\u662f0.15\uff0c\u8fd9\u4e2a\u4fe1\u606f\u6bd4\u5355\u7eaf\u7684\u6807\u7b7e&quot;\u732b&quot;\u4e30\u5bcc\u5f97\u591a\u3002<\/p>\n<p><strong>\u7b2c\u4e8c\u5c42\uff1a\u4e2d\u95f4\u5c42\u84b8\u998f<\/strong>\u2014\u2014\u8ba9\u5b66\u751f\u7684\u4e2d\u95f4\u5c42\u7279\u5f81\u8868\u793a\u4e5f\u8d34\u8fd1\u6559\u5e08\u7684\u5bf9\u5e94\u5c42\u3002\u8fd9\u9700\u8981\u8bbe\u8ba1\u5408\u9002\u7684\u5bf9\u9f50\u51fd\u6570\uff0c\u5e38\u89c1\u7684\u6709MSE Loss\u548c\u5bf9\u6bd4\u5b66\u4e60Loss\u3002\u5bf9\u4e8eTransformer\u67b6\u6784\uff0c\u53ef\u4ee5\u5728\u7279\u5b9a\u5c42\u4e4b\u95f4\u505a\u7279\u5f81\u5bf9\u9f50\u3002<\/p>\n<p><strong>\u7b2c\u4e09\u5c42\uff1a\u5173\u7cfb\u84b8\u998f<\/strong>\u2014\u2014\u4e0d\u518d\u662f\u5355\u70b9\u5bf9\u5e94\uff0c\u800c\u662f\u5b66\u4e60\u6837\u672c\u4e4b\u95f4\u7684\u5173\u7cfb\u7ed3\u6784\u3002\u6bd4\u5982\uff0c\u6559\u5e08\u6a21\u578b\u4e2d&quot;A\u6bd4B\u66f4\u50cfC&quot;\u7684\u5173\u7cfb\uff0c\u5b66\u751f\u6a21\u578b\u4e5f\u8981\u5b66\u4f1a\u3002\u8fd9\u5bf9\u5c0f\u6837\u672c\u573a\u666f\u7279\u522b\u6709\u6548\u3002<\/p>\n<h3>2.2 
\u84b8\u998f\u7684\u5178\u578b\u6848\u4f8b<\/h3>\n<p>DistilBERT\u6210\u529f\u5c06BERT\u6a21\u578b\u538b\u7f29\u4e8640%\uff0c\u4fdd\u7559\u4e8697%\u7684\u6027\u80fd\uff0c\u4f46\u63a8\u7406\u901f\u5ea6\u5feb\u4e8660%\u3002\u6700\u8fd1\u6d8c\u73b0\u7684Phi\u7cfb\u5217\u6a21\u578b\uff08\u5982Phi-3\uff09\u4e5f\u5927\u91cf\u4f7f\u7528\u4e86\u84b8\u998f\u6280\u672f\uff0c\u4ee53.8B\u53c2\u6570\u5b9e\u73b0\u4e86\u63a5\u8fd17B\u6a21\u578b\u7684\u6548\u679c\u3002<\/p>\n<p>\u5bf9\u4e8eLLM\u84b8\u998f\uff0c\u76ee\u524d\u8fd8\u6709\u4e24\u4e2a\u503c\u5f97\u5173\u6ce8\u7684\u65b9\u5411\uff1a<\/p>\n<ul>\n<li><strong>\u6e10\u8fdb\u5f0f\u84b8\u998f<\/strong>\uff1a\u5148\u84b8\u998f\u5230\u8f83\u5c0f\u6a21\u578b\uff08\u5982\u4ece70B\u523013B\uff09\uff0c\u518d\u8fdb\u4e00\u6b65\u84b8\u998f\u52306B<\/li>\n<li><strong>\u5bf9\u6297\u6027\u84b8\u998f<\/strong>\uff1a\u5f15\u5165\u5224\u522b\u5668\uff0c\u5206\u8fa8\u8f93\u51fa\u6765\u81ea\u6559\u5e08\u8fd8\u662f\u5b66\u751f\uff0c\u5012\u903c\u5b66\u751f\u6a21\u578b\u5b66\u4e60\u66f4\u771f\u5b9e\u7684\u5206\u5e03<\/li>\n<\/ul>\n<h3>2.3 
\u84b8\u998f\u7684\u5c40\u9650\u6027<\/h3>\n<p>\u84b8\u998f\u5e76\u975e\u4e07\u80fd\u3002\u5f53\u6559\u5e08\u6a21\u578b\u672c\u8eab\u5b58\u5728\u5e7b\u89c9\u6216\u504f\u89c1\u65f6\uff0c\u5b66\u751f\u6a21\u578b\u4f1a\u7ee7\u627f\u8fd9\u4e9b\u7f3a\u9677\u3002\u6b64\u5916\uff0c\u84b8\u998f\u5728\u7279\u5b9a\u4efb\u52a1\u4e0a\u7684\u6cdb\u5316\u80fd\u529b\u5f80\u5f80\u4e0d\u5982\u539f\u59cb\u5c0f\u6a21\u578b\u72ec\u7acb\u8bad\u7ec3\u3002\u56e0\u6b64\uff0c\u84b8\u998f\u66f4\u9002\u5408\u4f5c\u4e3a\u5fae\u8c03\u7684\u524d\u7f6e\u6b65\u9aa4\uff0c\u800c\u4e0d\u662f\u552f\u4e00\u7684\u4f18\u5316\u624b\u6bb5\u3002<\/p>\n<h2>\u4e09\u3001\u526a\u679d\uff1a\u7ed9\u795e\u7ecf\u7f51\u7edc&quot;\u65ad\u820d\u79bb&quot;<\/h2>\n<p>\u526a\u679d\uff08Pruning\uff09\u7684\u903b\u8f91\u662f\uff1a\u4e00\u4e2a\u8bad\u7ec3\u597d\u7684\u795e\u7ecf\u7f51\u7edc\u4e2d\uff0c\u5927\u91cf\u53c2\u6570\u5176\u5b9e\u63a5\u8fd1\u96f6\u6216\u5bf9\u6700\u7ec8\u8f93\u51fa\u8d21\u732e\u5f88\u5c0f\uff0c\u53ef\u4ee5\u76f4\u63a5\u53bb\u6389\u6216\u5f52\u96f6\u3002<\/p>\n<h3>3.1 \u7ed3\u6784\u526a\u679d vs 
\u975e\u7ed3\u6784\u526a\u679d<\/h3>\n<p><strong>\u975e\u7ed3\u6784\u526a\u679d<\/strong>\u662f\u5c06\u6743\u91cd\u77e9\u9635\u4e2d\u4f4e\u4e8e\u9608\u503c\u7684\u53c2\u6570\u76f4\u63a5\u7f6e\u96f6\uff0c\u4ea7\u751f\u7a00\u758f\u77e9\u9635\u3002\u8fd9\u79cd\u65b9\u6cd5\u7406\u8bba\u4e0a\u538b\u7f29\u7387\u9ad8\uff0c\u4f46\u5b9e\u9645\u63a8\u7406\u52a0\u901f\u9700\u8981\u786c\u4ef6\u5bf9\u7a00\u758f\u8ba1\u7b97\u6709\u7279\u6b8a\u652f\u6301\uff08\u5982NVIDIA\u5b89\u57f9\u67b6\u6784\u76842:4\u7a00\u758f\uff09\u3002<\/p>\n<p><strong>\u7ed3\u6784\u526a\u679d<\/strong>\u5219\u662f\u76f4\u63a5\u79fb\u9664\u6574\u4e2a\u901a\u9053\u3001\u5934\u6216\u5c42\u3002\u5bf9\u4e8eTransformer\u6a21\u578b\uff0c\u53ef\u4ee5\u79fb\u9664\u67d0\u4e9b\u6ce8\u610f\u529b\u5934\uff0c\u751a\u81f3\u88c1\u526a\u6574\u4e2aTransformer\u5c42\u3002\u7ed3\u6784\u526a\u679d\u7684\u597d\u5904\u662f\uff1a\u538b\u7f29\u540e\u7684\u6a21\u578b\u53ef\u4ee5\u76f4\u63a5\u5728\u73b0\u6709\u63a8\u7406\u5f15\u64ce\u4e0a\u8fd0\u884c\uff0c\u4e0d\u9700\u8981\u7279\u6b8a\u7684\u7a00\u758f\u8ba1\u7b97\u652f\u6301\u3002\u7f3a\u70b9\u662f\u53ef\u80fd\u5bf9\u6a21\u578b\u7cbe\u5ea6\u9020\u6210\u8f83\u5927\u51b2\u51fb\u3002<\/p>\n<h3>3.2 
\u526a\u679d\u7684\u6700\u65b0\u8fdb\u5c55<\/h3>\n<p>SparseGPT\u662f\u8fd1\u5e74\u6765\u6700\u77a9\u76ee\u7684\u526a\u679d\u6210\u679c\u4e4b\u4e00\uff0c\u65e0\u9700\u5fae\u8c03\u5373\u53ef\u5b9e\u73b050%\u7684\u975e\u7ed3\u6784\u5316\u526a\u679d\u4e14\u4fdd\u6301LLM\u7684\u96f6\u6837\u672c\u6027\u80fd\u3002\u5176\u6838\u5fc3\u5728\u4e8e\u901a\u8fc7\u8fd1\u4f3c\u6700\u4f18\u7684\u526a\u679d\u987a\u5e8f\u6765\u6700\u5c0f\u5316\u635f\u5931\u3002<\/p>\n<p>LLM-Pruner\u5219\u5c06\u526a\u679d\u4e0e\u77e5\u8bc6\u84b8\u998f\u7ed3\u5408\uff1a\u5148\u526a\u679d\uff0c\u518d\u7528\u6559\u5e08\u6a21\u578b\u7684\u8f93\u51fa\u505a\u84b8\u998f\u6062\u590d\u3002\u8fd9\u79cd&quot;\u5148\u526a\u540e\u84b8&quot;\u7684\u7ec4\u5408\u5df2\u88ab\u9a8c\u8bc1\u5728\u591a\u6570\u573a\u666f\u4e0b\u6548\u679c\u4f18\u4e8e\u5355\u4e00\u7b56\u7565\u3002<\/p>\n<p>\u4e00\u4e2a\u5b9e\u7528\u5efa\u8bae\uff1a\u5bf9\u4e8e10B\u4ee5\u4e0b\u7684\u5c0f\u6a21\u578b\uff0c\u7ed3\u6784\u526a\u679d\u7684\u6548\u679c\u6709\u9650\uff0c\u5efa\u8bae\u4f18\u5148\u4f7f\u7528\u84b8\u998f\u6216\u91cf\u5316\u3002\u5bf9\u4e8e30B\u4ee5\u4e0a\u7684\u5927\u6a21\u578b\uff0c\u7ed3\u6784\u526a\u679d\u5e26\u6765\u7684\u63a8\u7406\u52a0\u901f\u975e\u5e38\u663e\u8457\u3002<\/p>\n<h2>\u56db\u3001\u63a8\u7406\u5f15\u64ce\u4f18\u5316\uff1a\u4ece\u5de5\u7a0b\u89d2\u5ea6\u69a8\u5e72\u6bcf\u4e00\u5206\u6027\u80fd<\/h2>\n<p>\u6a21\u578b\u5c42\u9762\u7684\u4f18\u5316\u505a\u597d\u4e86\uff0c\u8fd8\u9700\u8981\u4e00\u4e2a\u597d\u7684\u63a8\u7406\u5f15\u64ce\u6765\u6267\u884c\u3002\u5f53\u524d\u4e3b\u6d41\u7684\u63a8\u7406\u5f15\u64ce\u5404\u6709\u4fa7\u91cd\u3002<\/p>\n<h3>4.1 \u4e3b\u6d41\u63a8\u7406\u5f15\u64ce\u5bf9\u6bd4<\/h3>\n<table>\n<thead>\n<tr>\n<th>\u5f15\u64ce<\/th>\n<th>\u786c\u4ef6\u652f\u6301<\/th>\n<th>\u7279\u8272\u529f\u80fd<\/th>\n<th>\u5f31\u70b9<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td>TensorRT-LLM<\/td>\n<td>NVIDIA 
GPU<\/td>\n<td>FlashAttention\u3001Inflight\u6279\u5904\u7406<\/td>\n<td>\u7ed1\u5b9aNVIDIA\u786c\u4ef6<\/td>\n<\/tr>\n<tr>\n<td>vLLM<\/td>\n<td>NVIDIA\/AMD GPU<\/td>\n<td>PagedAttention\u3001\u8fde\u7eed\u6279\u5904\u7406<\/td>\n<td>\u751f\u6001\u76f8\u5bf9\u8f83\u65b0<\/td>\n<\/tr>\n<tr>\n<td>llama.cpp<\/td>\n<td>CPU\/GPU<\/td>\n<td>\u7eafCPU\u53ef\u8dd1\u3001\u91cf\u5316\u652f\u6301\u597d<\/td>\n<td>GPU\u4f18\u5316\u4e0d\u5982TensorRT<\/td>\n<\/tr>\n<tr>\n<td>ONNX Runtime<\/td>\n<td>\u5168\u5e73\u53f0<\/td>\n<td>\u8de8\u5e73\u53f0\u80fd\u529b\u6700\u5f3a<\/td>\n<td>\u5927\u6a21\u578b\u652f\u6301\u4e0d\u591f\u6210\u719f<\/td>\n<\/tr>\n<tr>\n<td>MLC-LLM<\/td>\n<td>\u5168\u5e73\u53f0\uff08\u542b\u624b\u673a\uff09<\/td>\n<td>\u7aef\u4fa7\u90e8\u7f72\u6700\u4f18<\/td>\n<td>\u793e\u533a\u89c4\u6a21\u8f83\u5c0f<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<h3>4.2 \u63a8\u7406\u4f18\u5316\u7684\u5173\u952e\u6280\u5de7<\/h3>\n<p><strong>KV Cache\u7ba1\u7406<\/strong>\u662fLLM\u63a8\u7406\u4f18\u5316\u7684\u6838\u5fc3\u3002Transformer\u7684\u81ea\u56de\u5f52\u7279\u6027\u9700\u8981\u7f13\u5b58\u5386\u53f2Key\u548cValue\uff0c\u968f\u5e8f\u5217\u589e\u957f\u5185\u5b58\u66b4\u6da8\u3002vLLM\u7684PagedAttention\u65b9\u6848\u501f\u9274\u4e86\u64cd\u4f5c\u7cfb\u7edf\u5206\u9875\u7ba1\u7406\u601d\u60f3\uff0c\u5c06KV Cache\u5206\u9875\u7ba1\u7406\uff0c\u663e\u8457\u63d0\u5347\u4e86GPU\u663e\u5b58\u5229\u7528\u7387\u3002<\/p>\n<p><strong>\u8fde\u7eed\u6279\u5904\u7406\uff08Continuous Batching\uff09<\/strong> \u7a81\u7834\u4e86\u4f20\u7edf\u6279\u5904\u7406\u7684\u9650\u5236\uff1a\u4e0d\u9700\u8981\u7b49\u6574\u4e2a\u6279\u6b21\u5168\u90e8\u751f\u6210\u5b8c\u624d\u5904\u7406\u65b0\u8bf7\u6c42\uff0c\u800c\u662f\u91c7\u7528&quot;\u6765\u4e00\u4e2a\u5904\u7406\u4e00\u4e2a&quot;\u7684\u7b56\u7565\u3002\u5728\u4e2d\u7b49\u8d1f\u8f7d\u573a\u666f\u4e0b\uff0c\u8fde\u7eed\u6279\u5904\u7406\u53ef\u5c06\u541e\u5410\u91cf\u63d0\u53473-5\u500d\u3002<\/p>\n<p><strong>Speculative 
Decoding\uff08\u63a8\u6d4b\u89e3\u7801\uff09<\/strong> \u662f\u6700\u8fd1\u975e\u5e38\u70ed\u95e8\u7684\u52a0\u901f\u65b9\u6cd5\uff1a\u7528\u4e00\u4e2a\u66f4\u5c0f\u7684\u8349\u7a3f\u6a21\u578b\u751f\u6210\u5019\u9009\u5e8f\u5217\uff0c\u7136\u540e\u7528\u5927\u6a21\u578b\u8fdb\u884c\u5e76\u884c\u9a8c\u8bc1\u3002\u5728\u751f\u6210\u8d28\u91cf\u51e0\u4e4e\u65e0\u635f\u7684\u524d\u63d0\u4e0b\uff0c\u63a8\u7406\u901f\u5ea6\u53ef\u4ee5\u63d0\u53472-3\u500d\u3002<\/p>\n<h2>\u4e94\u3001\u4f18\u5316\u7ec4\u5408\u62f3\uff1a\u4e0d\u540c\u573a\u666f\u7684\u6700\u4f73\u5b9e\u8df5<\/h2>\n<p>\u6ca1\u6709\u4efb\u4f55\u4e00\u79cd\u4f18\u5316\u6280\u672f\u662f\u4e07\u80fd\u7684\uff0c\u771f\u6b63\u9ad8\u6548\u7684\u90e8\u7f72\u65b9\u6848\u9700\u8981\u6253\u7ec4\u5408\u62f3\u3002<\/p>\n<p><strong>\u4e91\u7aef\u9ad8\u5e76\u53d1\u573a\u666f\uff08\u5982AI\u804a\u5929\u673a\u5668\u4eba\uff09\uff1a<\/strong><br \/>GPU\u63a8\u7406 + FP16\/INT8\u91cf\u5316 + \u8fde\u7eed\u6279\u5904\u7406 + KV Cache\u4f18\u5316 + TensorRT-LLM\/vLLM<\/p>\n<p><strong>\u8fb9\u7f18\u8bbe\u5907\u573a\u666f\uff08\u5982\u624b\u673a\u7aef\u672c\u5730\u63a8\u7406\uff09\uff1a<\/strong><br \/>INT4\/NF4\u91cf\u5316 + \u7ed3\u6784\u526a\u679d\uff08\u79fb\u9664\u5197\u4f59\u5c42\uff09 + \u84b8\u998f\uff08\u7f29\u5c0f\u6a21\u578b\u5e95\u5ea7\uff09<br \/>\u4f8b\u5982\uff1a\u4eceLlama-3-8B \u2192 \u91cf\u5316+\u526a\u679d \u2192 2-3B\u7b49\u6548\u6a21\u578b \u2192 \u624b\u673a\u7aef\u8fd0\u884c<\/p>\n<p><strong>\u4f01\u4e1a\u5185\u90e8\u79c1\u6709\u5316\u90e8\u7f72\uff08\u5982\u77e5\u8bc6\u5e93RAG\uff09\uff1a<\/strong><br \/>FP16\u63a8\u7406 + \u957f\u4e0a\u4e0b\u6587\u4f18\u5316 + \u77e5\u8bc6\u84b8\u998f\uff08\u5c0f\u6a21\u578b\u4e13\u7cbe\u4f01\u4e1a\u6570\u636e\uff09<br \/>\u5173\u952e\u5728\u4e8e\uff1a\u4e0d\u8ffd\u6c42\u5168\u624d\uff0c\u53ea\u8ffd\u6c42\u5728\u5782\u76f4\u573a\u666f\u7684\u7cbe\u51c6\u5ea6<\/p>\n<p><strong>\u5d4c\u5165\u5f0f\u786c\u4ef6\uff08\u5982AIoT\u8bbe\u5907\uff09\uff1a<\/strong><br 
\/>\u5fc5\u9009\uff1aINT4\u91cf\u5316 + \u6781\u7aef\u526a\u679d<br \/>\u53ef\u9009\uff1a\u4e8c\u503c\u5316\u7f51\u7edc\uff08BNN\uff09\u2014\u2014\u6a21\u578b\u6743\u91cd\u53ea\u6709+1\u548c-1\uff0c\u6781\u7aef\u4f46\u6709\u6548<\/p>\n<h2>\u516d\u3001\u5c55\u671b\uff1a2026\u5e74\u6a21\u578b\u4f18\u5316\u7684\u8d8b\u52bf<\/h2>\n<p>\u8fdb\u51652026\u5e74\u4e0b\u534a\u5e74\uff0c\u6a21\u578b\u4f18\u5316\u5448\u73b0\u51e0\u4e2a\u660e\u663e\u8d8b\u52bf\uff1a<\/p>\n<p>\u7b2c\u4e00\uff0c<strong>\u786c\u4ef6-\u7b97\u6cd5\u534f\u540c\u4f18\u5316<\/strong>\u4e0d\u518d\u662f\u53e3\u53f7\u3002NVIDIA Blackwell\u67b6\u6784\u5bf9FP4\u7684\u786c\u4ef6\u652f\u6301\u3001AMD\u7684ROCm\u5bf9\u63a8\u7406\u5f15\u64ce\u7684\u52a0\u901f\u9002\u914d\u3001Apple\u7684ANE\uff08\u795e\u7ecf\u7f51\u7edc\u5f15\u64ce\uff09\u5bf9\u7aef\u4fa7Transformer\u7684\u4e13\u5c5e\u4f18\u5316\uff0c\u90fd\u5728\u8ba9\u8f6f\u786c\u4ef6\u534f\u540c\u6210\u4e3a\u73b0\u5b9e\u3002<\/p>\n<p>\u7b2c\u4e8c\uff0c<strong>\u591a\u6a21\u6001\u6a21\u578b\u7684\u4f18\u5316<\/strong>\u6210\u4e3a\u65b0\u70ed\u70b9\u3002\u89c6\u89c9\u8bed\u8a00\u6a21\u578b\uff08VLM\uff09\u7684\u4f18\u5316\u4e0d\u540c\u4e8e\u7eaf\u6587\u672c\u6a21\u578b\uff0c\u9700\u8981\u540c\u65f6\u5904\u7406\u89c6\u89c9\u7f16\u7801\u5668\u548c\u8bed\u8a00\u89e3\u7801\u5668\u7684\u52a0\u901f\u3002\u6700\u8fd1\u51fa\u73b0\u7684\u5bf9\u89c6\u89c9Token\u8fdb\u884c\u538b\u7f29\u7684\u6280\u672f\uff08\u5982Token Merging\uff09\uff0c\u53ef\u4ee5\u5c06\u89c6\u89c9Token\u6570\u91cf\u51cf\u5c1175%\uff0c\u5927\u5e45\u964d\u4f4e\u8ba1\u7b97\u5f00\u9500\u3002<\/p>\n<p>\u7b2c\u4e09\uff0c<strong>\u81ea\u52a8\u5316\u4f18\u5316\u7ba1\u9053\uff08Auto-Opt\uff09<\/strong> 
\u9010\u6e10\u6210\u719f\u3002\u5de5\u7a0b\u5e08\u4e0d\u518d\u9700\u8981\u624b\u52a8\u8c03\u53c2\uff0c\u800c\u662f\u901a\u8fc7\u81ea\u52a8\u5316\u5de5\u5177\u6765\u641c\u7d22\u6700\u4f18\u7684\u91cf\u5316\u4f4d\u5bbd\u3001\u526a\u679d\u6bd4\u4f8b\u548c\u84b8\u998f\u7b56\u7565\u7ec4\u5408\u3002\u7c7b\u4f3cAutoML\u5728\u8bad\u7ec3\u9636\u6bb5\u7684\u4f5c\u7528\uff0cAuto-Opt\u6b63\u5728\u6539\u53d8\u63a8\u7406\u4f18\u5316\u7684\u8303\u5f0f\u3002<\/p>\n<p>\u6a21\u578b\u4f18\u5316\u7684\u672c\u8d28\u662f\u5728\u7cbe\u5ea6\u3001\u901f\u5ea6\u548c\u53c2\u6570\u91cf\u4e4b\u95f4\u5bfb\u627e\u6700\u4f73\u5e73\u8861\u3002\u6ca1\u6709\u94f6\u5f39\uff0c\u53ea\u6709\u6839\u636e\u5177\u4f53\u573a\u666f\u9009\u62e9\u9002\u5408\u7684\u6280\u672f\u7ec4\u5408\u3002\u5bf9\u4e8eAI\u5de5\u7a0b\u5e08\u6765\u8bf4\uff0c\u7406\u89e3\u5404\u6280\u672f\u7684\u4f18\u52a3\u8fb9\u754c\u548c\u9002\u7528\u6761\u4ef6\uff0c\u6bd4\u76f2\u76ee\u8ffd\u6c42\u538b\u7f29\u7387\u91cd\u8981\u5f97\u591a\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"<p>2026\u5e74\uff0c\u5927\u6a21\u578b\u7684\u90e8\u7f72\u5df2\u7ecf\u4ece&quot;\u80fd\u4e0d\u80fd\u8dd1&quot;\u7684\u95ee\u9898\uff0c\u8f6c\u53d8\u4e3a&quot;\u5982\u4f55\u8dd1\u5f97\u66f4\u5feb\u3001\u66f4\u4fbf\u5b9c\u3001 
[&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[5],"tags":[18,26,25,27],"class_list":["post-133","post","type-post","status-publish","format-standard","hentry","category-ai","tag-ai","tag-26","tag-25","tag-27"],"_links":{"self":[{"href":"https:\/\/www.joezhou.top\/index.php?rest_route=\/wp\/v2\/posts\/133","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.joezhou.top\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.joezhou.top\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.joezhou.top\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.joezhou.top\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=133"}],"version-history":[{"count":0,"href":"https:\/\/www.joezhou.top\/index.php?rest_route=\/wp\/v2\/posts\/133\/revisions"}],"wp:attachment":[{"href":"https:\/\/www.joezhou.top\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=133"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.joezhou.top\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=133"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.joezhou.top\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=133"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}