[{"data":1,"prerenderedAt":205},["ShallowReactive",2],{"DlFXI4Eibt_Bn9lrEZz1TYbHCWFZj3IvqwHQSEW-Exc":3,"GR0-FlfOKLnHpaophg893jxn0jMHRsHe1zqTPGD28Tg":194},{"code":4,"msg":5,"data":6},0,"",{"category":7,"tag":11,"hot":39,"new":78,"banner":118,"data":143,"cache":193},[8,9,10],"Agent","OpenAI","LLM",[12,14,17,20,23,25,27,30,33,36],{"title":8,"total":13},39,{"title":15,"total":16},"Google",44,{"title":18,"total":19},"Nvidia",13,{"title":21,"total":22},"Claude",11,{"title":9,"total":24},35,{"title":10,"total":26},85,{"title":28,"total":29},"DeepSeek",9,{"title":31,"total":32},"OCR",1,{"title":34,"total":35},"Chat",7,{"title":37,"total":38},"Generator",116,[40,48,55,64,71],{"id":41,"publish_date":42,"is_original":4,"collection":5,"cover_url":43,"cover_url_1_1":44,"title":45,"summary":46,"author":47},557,"2022-04-29","article_res/cover/7a9b1375ed9bb298154981bae42b794d.jpeg","article_res/cover/afa281dd52bc0454e6735daa8e6b0706.jpeg","Translation and summary of Messari Report [2.8 Kristin Smith, Blockchain Association and Katie Haun, a16z]","We need unity and speed right now.","Translation",{"id":49,"publish_date":50,"is_original":4,"collection":5,"cover_url":51,"cover_url_1_1":52,"title":53,"summary":54,"author":47},531,"2022-05-25","article_res/cover/e8362057f8fa189594c60afdfaaeb6e5.jpeg","article_res/cover/8ea08d0d6fa7eee6b57ed4ec61b61ad6.jpeg","Decentralized Society: Finding Web3’s Soul / Decentralized Society: Finding the Soul of Web3 -7","Decentralization through Pluralism When analyzing ecosystems, it's desirable to measure how decentralized it is.",{"id":56,"publish_date":57,"is_original":32,"collection":58,"cover_url":59,"cover_url_1_1":60,"title":61,"summary":62,"author":63},127,"2024-11-14","#Google #AI Game #World Model #AI Story","article_res/cover/0233a875b7ec2debf59779e311547569.jpeg","article_res/cover/6ffddb6ae4914b3c699493311aa9f198.jpeg","Google Launches \"Unbounded\": A Generative Infinite Character Life Simulation Game","Unbounded: A Generative Infinite Game of Character Life Simulation","Renee's Entrepreneurial Journey",{"id":13,"publish_date":65,"is_original":32,"collection":66,"cover_url":67,"cover_url_1_1":68,"title":69,"summary":70,"author":63},"2025-02-14","#Deep Dive into LLMs #Andrej Karpathy #LLM #Tool Use #Hallucination","article_res/cover/11e858ad6b74dfa80f923d549b62855c.jpeg","article_res/cover/615e1b320f1fc163edc1d2d154a6de33.jpeg","Andrej Karpathy's in-depth explanation of LLM (Part 4): Hallucinations","hallucinations, tool use, knowledge/working memory",{"id":72,"publish_date":73,"is_original":4,"collection":5,"cover_url":74,"cover_url_1_1":75,"title":76,"summary":77,"author":47},579,"2022-04-07","article_res/cover/39387376ba28447af1eb40576b9df215.jpeg","article_res/cover/02727ede8551ed49901d0abe6d6305b7.jpeg","Messari Report Translation and Summary 【1-7 Surviving the Winter】","I’d be more cautious here: 10 year and 10 hour thinking only.",[79,87,95,103,111],{"id":80,"publish_date":81,"is_original":32,"collection":82,"cover_url":83,"cover_url_1_1":84,"title":85,"summary":86,"author":63},627,"2025-03-20","#AI Avatar #AI Video Generation","article_res/cover/d95481358f73924989f8c4ee9c75d1c8.jpeg","article_res/cover/b74bc0fab01f8b6a6aa87696c0c3ed8b.jpeg","DisPose: Generating Animated Videos by Driving Video with Reference Images","DisPose is a controllable human image animation method that enhances video generation.",{"id":88,"publish_date":89,"is_original":32,"collection":90,"cover_url":91,"cover_url_1_1":92,"title":93,"summary":94,"author":63},626,"2025-03-21","#Deep Dive into LLMs #LLM #RL #Andrej Karpathy #AlphaGo","article_res/cover/446553a5c8f8f2f07d97b20eaee84e56.jpeg","article_res/cover/e6c2823409c9b34624064b9acbaca6f1.jpeg","AlphaGo and the Power of Reinforcement Learning - Andrej Karpathy's Deep Dive on LLMs (Part 9)","Simply learning from humans will never surpass human capabilities.",{"id":96,"publish_date":97,"is_original":32,"collection":98,"cover_url":99,"cover_url_1_1":100,"title":101,"summary":102,"author":63},625,"2025-03-22","#Deep Dive into LLMs #LLM #RL #RLHF #Andrej Karpathy","article_res/cover/8da81d38b1e5cf558a164710fd8a5389.jpeg","article_res/cover/96f028d76c362a99a0dd56389e8f7a9b.jpeg","Reinforcement Learning from Human Feedback (RLHF) - Andrej Karpathy's Deep Dive on LLMs (Part 10)","Fine-Tuning Language Models from Human Preferences",{"id":104,"publish_date":105,"is_original":32,"collection":106,"cover_url":107,"cover_url_1_1":108,"title":109,"summary":110,"author":63},624,"2025-03-23","#Deep Dive into LLMs #LLM #Andrej Karpathy #AI Agent #MMM","article_res/cover/a5e7c3d48bb09109684d6513287c661d.jpeg","article_res/cover/d3f22b7c0ab8d82fd2da457a299e0773.jpeg","The Future of Large Language Models - Andrej Karpathy's In-Depth Explanation of LLM (Part 11)","preview of things to come",{"id":112,"publish_date":105,"is_original":32,"collection":113,"cover_url":114,"cover_url_1_1":115,"title":116,"summary":117,"author":63},623,"#Google #Voe #AI Video Generation","article_res/cover/c44062fea0f336c2b96b3928292392c2.jpeg","article_res/cover/a041041c69092ad3db191c5bf3ff981b.jpeg","Trial of Google's video generation model VOE2","Our state-of-the-art video generation model",[119,127,135],{"id":120,"publish_date":121,"is_original":32,"collection":122,"cover_url":123,"cover_url_1_1":124,"title":125,"summary":126,"author":63},160,"2024-10-04","#Philosophy","article_res/cover/496990c49211e8b7f996b7d39c18168e.jpeg","article_res/cover/14dbaa1ade9cb4316d5829423a900362.jpeg","Time","The fungus of the morning does not know the waxing and waning of the moon, and the cicada does not know the seasons; this is a short life. To the south of the state of Chu there is a dark spirit which regards five hundred years as spring and five hundred years as autumn. In ancient times there was a great tree called the Ming which regarded eight thousand years as spring and eight thousand years as autumn; this is a long life.",{"id":128,"publish_date":129,"is_original":32,"collection":130,"cover_url":131,"cover_url_1_1":132,"title":133,"summary":134,"author":63},98,"2024-12-17","#AI Video Generator #Sora #Pika","article_res/cover/3b86e85d03fff4f356a3e4cf2bb329c9.jpeg","article_res/cover/5fa5c20ad0b40f8f544d257c0ef02938.jpeg","Pika 2.0 video generation officially released: effect comparison with Sora","今天，我们推出了Pika 2.0模型。卓越的文字对齐效果。惊人的视觉表现。还有✨场景成分✨",{"id":136,"publish_date":137,"is_original":32,"collection":138,"cover_url":139,"cover_url_1_1":140,"title":141,"summary":142,"author":63},71,"2025-01-14","#Nvidia #World Foundation Model #Cosmos #Physical AI #Embodied AI","article_res/cover/feddf8c832dfb45d28804291f6a42a9e.jpeg","article_res/cover/d6bc2f1186d96b78228c2283a17a3645.jpeg","NVIDIA's Cosmos World Model","Cosmos World Foundation Model Platform for Physical AI",[144,163,188],{"title":8,"items":145},[146,147,155],{"id":104,"publish_date":105,"is_original":32,"collection":106,"cover_url":107,"cover_url_1_1":108,"title":109,"summary":110,"author":63},{"id":148,"publish_date":149,"is_original":32,"collection":150,"cover_url":151,"cover_url_1_1":152,"title":153,"summary":154,"author":63},622,"2025-03-24","#OWL #AI Agent #MAS #MCP #CUA","article_res/cover/cb50ca7f2bf4d1ed50202d7406e1c19a.jpeg","article_res/cover/4aa7aa3badfacf3cc84121334f1050dd.jpeg","OWL: Multi-agent collaboration","OWL: Optimized Workforce Learning for General Multi-Agent Assistance in Real-World Task Automation",{"id":156,"publish_date":157,"is_original":32,"collection":158,"cover_url":159,"cover_url_1_1":160,"title":161,"summary":162,"author":63},620,"2025-03-26","#LLM #Google #Gemini #AI Agent","article_res/cover/53751a6dbbe990b1eb0b63f3b062aed4.jpeg","article_res/cover/031344981f0a212ff82d1f3a64aa5756.jpeg","Gemini 2.5 Pro, claimed to be far ahead of the competition, has been released with great fanfare: comprehensively surpassing other LLMs and topping the global rankings","Gemini 2.5: Our most intelligent AI model",{"title":9,"items":164},[165,172,180],{"id":166,"publish_date":157,"is_original":32,"collection":167,"cover_url":168,"cover_url_1_1":169,"title":170,"summary":171,"author":63},619,"#OpenAI #AI Image Generator #4o #MMM #AR Transformer","article_res/cover/2faffc97fcecf3151552cb0fd3206d89.jpeg","article_res/cover/1133cb4948af44cee2e7fbe79efb69e5.jpeg","The native image function of GPT-4o is officially launched","Introducing 4o Image Generation",{"id":173,"publish_date":174,"is_original":4,"collection":175,"cover_url":176,"cover_url_1_1":177,"title":178,"summary":179,"author":63},434,"2023-07-15","#Anthropic #OpenAI #Google #AI Code Generator #Claude","article_res/cover/e1b6f600a2b9f262a4392684e5f2ce25.jpeg","article_res/cover/6e1772e83f78f9a351ab23d3e414adee.jpeg","Latest Updates on Google Bard /Anthropic Claude2 / ChatGPT Code Interpreter","We want our models to use their programming skills to provide more natural interfaces to the basic functions of our computers.  \n - OpenAI",{"id":181,"publish_date":182,"is_original":4,"collection":183,"cover_url":184,"cover_url_1_1":185,"title":186,"summary":187,"author":63},417,"2023-08-24","#OpenAI","article_res/cover/bccf897d50a88b18364e35f7466387e0.jpeg","article_res/cover/2f871085c1073717c1703ae86e18056f.jpeg","The GPT-3.5 Turbo fine-tuning (fine-tuning function) has been released～","Developers can now bring their own data to customize GPT-3.5 Turbo for their use cases.",{"title":10,"items":189},[190,191,192],{"id":88,"publish_date":89,"is_original":32,"collection":90,"cover_url":91,"cover_url_1_1":92,"title":93,"summary":94,"author":63},{"id":96,"publish_date":97,"is_original":32,"collection":98,"cover_url":99,"cover_url_1_1":100,"title":101,"summary":102,"author":63},{"id":104,"publish_date":105,"is_original":32,"collection":106,"cover_url":107,"cover_url_1_1":108,"title":109,"summary":110,"author":63},true,{"code":4,"msg":5,"data":195},{"id":196,"publish_date":197,"is_original":32,"collection":198,"articles_id":199,"cover_url":200,"cover_url_1_1":201,"title":202,"summary":203,"author":63,"content":204},59,"2025-01-25","#LLM #DeepSeek #RL #Distillation","RMKVC6kK5Bzw1fQy7D0Zww","article_res/cover/110330c9174424ce76666ff1ebfe0b67.jpeg","article_res/cover/6c8fe943cee312750ef7c49d673f63f8.jpeg","Paper of DeepSeek-R1: Exploration and Breakthrough of the New Generation Inference Model","DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","\u003Cdiv class=\"rich_media_content js_underline_content\n                       autoTypeSetting24psection\n            \" id=\"js_content\">\u003Cp style='margin-bottom: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;background-color: rgb(255, 255, 255);'>论文是1月22号 前天发表的，文章介绍了\u003Cstrong style=\";background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">DeepSeek-R1-Zero\u003C/strong>与\u003Cstrong style=\";background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">DeepSeek-R1\u003C/strong>两种新型推理模型，展示了推理性能提升背后的技术创新与实验成果。\u003C/p>\u003Ch2 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\";font-size: 22px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">深入了解 DeepSeek-R1\u003C/span>\u003C/h2>\u003Ch3 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\";font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">DeepSeek-R1-Zero：纯强化学习的尝试\u003C/span>\u003C/h3>\u003Cp style='margin-bottom: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;background-color: rgb(255, 255, 255);'>DeepSeek-R1-Zero 是一个完全通过\u003Cstrong style=\";background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">大规模强化学习（RL）\u003C/strong>训练的模型，没有采用监督微调（SFT）作为初始步骤。这一策略使其在推理任务中展现了显著能力，同时也自然地表现出一些有趣而强大的推理行为。然而，该模型也面临一定的挑战，如\u003Cstrong style=\";background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">语言可读性较差\u003C/strong>及\u003Cstrong style=\";background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">语言混用\u003C/strong>等问题。\u003C/p>\u003Ch3 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\";font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">DeepSeek-R1：多阶段训练的创新\u003C/span>\u003C/h3>\u003Cp style='margin-bottom: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;background-color: rgb(255, 255, 255);'>为了解决 DeepSeek-R1-Zero 的局限性，研究团队引入了改进版本 DeepSeek-R1。\u003C/p>\u003Col style='margin-top: 8px;margin-bottom: 8px;;padding-left: 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);' class=\"list-paddingleft-1\">\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">冷启动数据预训练\u003C/strong>：使用少量冷启动数据对基础模型 DeepSeek-V3-Base 进行微调。\u003C/section>\u003C/li>\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">强化学习优化\u003C/strong>：类似于 DeepSeek-R1-Zero 的强化学习过程。\u003C/section>\u003C/li>\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">监督微调与拒绝采样\u003C/strong>：结合 RL 中生成的新数据与领域数据（如写作、问答、自我认知）进行监督微调。\u003C/section>\u003C/li>\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">全场景提示训练\u003C/strong>：在所有场景下对模型进行额外优化。\u003C/section>\u003C/li>\u003C/ol>\u003Cp style='margin-bottom: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;background-color: rgb(255, 255, 255);'>通过这一流程，DeepSeek-R1 在推理基准上的表现达到与 OpenAI o1-1217 相当的水平。\u003C/p>\u003Cp style=\"text-align: center;\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-imgfileid=\"100009520\" data-ratio=\"0.655608214849921\" data-s=\"300,640\" data-type=\"png\" data-w=\"633\" style=\"\" src=\"https://res.cooltool.vip/article_res/assets/17423771739880.08622890947025752.png\">\u003C/p>\u003Ch2 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\";font-size: 22px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">蒸馏与模型轻量化\u003C/span>\u003C/h2>\u003Cp style='margin-bottom: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;background-color: rgb(255, 255, 255);'>为支持更广泛的应用场景，研究团队进一步从 DeepSeek-R1 蒸馏出小规模密集模型，取得了显著成果。例如：\u003C/p>\u003Cul style='margin-top: 8px;margin-bottom: 8px;;padding-left: 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);' class=\"list-paddingleft-1\">\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">蒸馏后的 \u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">14B 参数模型\u003C/strong> 超越了同等规模的 QwQ-32B-Preview；\u003C/section>\u003C/li>\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">蒸馏的 \u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">32B 和 70B 模型\u003C/strong> 在推理基准上刷新了密集模型的表现纪录。\u003C/section>\u003C/li>\u003C/ul>\u003Ch2 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\";font-size: 22px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">评估结果概览\u003C/span>\u003C/h2>\u003Cp style=\"text-align: center;\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-imgfileid=\"100009532\" data-ratio=\"0.7936507936507936\" data-s=\"300,640\" data-type=\"png\" data-w=\"567\" style=\"\" src=\"https://res.cooltool.vip/article_res/assets/17423771740740.3110154096332831.png\">\u003C/p>\u003Ch3 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\";font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">1. 推理任务\u003C/span>\u003C/h3>\u003Cul style='margin-top: 8px;margin-bottom: 8px;;padding-left: 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);' class=\"list-paddingleft-1\">\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">AIME 2024\u003C/strong>：DeepSeek-R1 在 Pass@1 上达到 79.8%，略超 OpenAI-o1-1217。\u003C/section>\u003C/li>\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">MATH-500\u003C/strong>：表现尤为突出，取得 97.3%，与 OpenAI-o1-1217 相当，远超其他模型。\u003C/section>\u003C/li>\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">编码任务\u003C/strong>：在 Codeforces 编程竞赛中，DeepSeek-R1 获得 2,029 Elo 积分，击败 96.3% 的人类参赛者。\u003C/section>\u003C/li>\u003C/ul>\u003Ch3 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\";font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">2. 知识任务\u003C/span>\u003C/h3>\u003Cul style='margin-top: 8px;margin-bottom: 8px;;padding-left: 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);' class=\"list-paddingleft-1\">\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">多任务基准测试\u003C/strong>：在 MMLU（90.8%）、MMLU-Pro（84.0%）和 GPQA Diamond（71.5%）等基准上，DeepSeek-R1 的表现显著优于 DeepSeek-V3，并在教育任务中展示了强大的竞争力。\u003C/section>\u003C/li>\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">事实问答\u003C/strong>：在 SimpleQA 数据集上表现卓越，进一步证明其处理事实性查询的能力。\u003C/section>\u003C/li>\u003C/ul>\u003Ch3 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\";font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">3. 长上下文与非考试任务\u003C/span>\u003C/h3>\u003Cul style='margin-top: 8px;margin-bottom: 8px;;padding-left: 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);' class=\"list-paddingleft-1\">\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">长上下文理解\u003C/strong>：DeepSeek-R1 在 AlpacaEval 2.0 和 ArenaHard 等长上下文基准上展现了强大的性能，分别达到 87.6% 和 92.3% 的胜率。\u003C/section>\u003C/li>\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">多样化任务\u003C/strong>：在创意写作、问答、编辑和总结任务中，DeepSeek-R1 同样表现优异，进一步凸显了其广泛的适用性。\u003C/section>\u003C/li>\u003C/ul>\u003Ch2 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\";font-size: 22px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">DeepSeek-R1-Zero 方法概览\u003C/span>\u003C/h2>\u003Ch3 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\";font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">强化学习算法\u003C/span>\u003C/h3>\u003Cp style='margin-bottom: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;background-color: rgb(255, 255, 255);'>为优化训练成本，采用了 \u003Cstrong style=\";background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">Group Relative Policy Optimization (GRPO)\u003C/strong> 算法，该算法避免了传统强化学习中的评论者模型（critic model），通过组内得分估计基线，显著降低了训练资源需求。具体而言：\u003C/p>\u003Cul style='margin-top: 8px;margin-bottom: 8px;;padding-left: 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);' class=\"list-paddingleft-1\">\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">组采样与策略优化\u003C/strong>：对于每个问题 (q)，GRPO 从旧策略中采样多个输出，并优化目标函数，最大化策略性能，同时引入 KL 散度正则化项以保证模型生成的多样性。\u003C/section>\u003C/li>\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">奖励机制\u003C/strong>：通过组内奖励差异（advantage）计算优势值，进一步指导模型优化。\u003C/section>\u003C/li>\u003C/ul>\u003Cp style='margin-bottom: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;background-color: rgb(255, 255, 255);'>公式如下：\u003C/p>\u003Cp style=\"text-align: center;\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-imgfileid=\"100009521\" data-ratio=\"0.3220338983050847\" data-s=\"300,640\" data-type=\"png\" data-w=\"649\" style=\"\" src=\"https://res.cooltool.vip/article_res/assets/17423771739980.7521207400073837.png\">\u003C/p>\u003Ch3 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\";font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">奖励建模\u003C/span>\u003C/h3>\u003Cp style='margin-bottom: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;background-color: rgb(255, 255, 255);'>奖励建模是强化学习的关键，用以引导模型的优化方向。团队设计了两种奖励模型：\u003C/p>\u003Col style='margin-top: 8px;margin-bottom: 8px;;padding-left: 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);' class=\"list-paddingleft-1\">\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">准确性奖励\u003C/strong>：通过规则评估模型的答案是否正确，例如数学问题中使用明确格式的答案，以便于验证；在编程任务中，利用编译器反馈测试用例的正确性。\u003C/section>\u003C/li>\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">格式奖励\u003C/strong>：鼓励模型在输出中包含清晰的推理过程，并以特定格式标注思考过程和答案，例如用 \u003Ccode style=\"font-family: Consolas, Monaco, Menlo, monospace;font-size: 14px;;color: rgb(30, 107, 184);line-height: 1.8em;letter-spacing: 0em;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(27, 31, 35, 0.05);width: auto;margin-right: 2px;margin-left: 2px;padding: 2px 4px;border-style: none;border-width: 3px;border-color: rgb(0, 0, 0) rgba(0, 0, 0, 0.4) rgba(0, 0, 0, 0.4);border-radius: 4px;word-break: break-all;\">&lt;think&gt;\u003C/code> 和 \u003Ccode style=\"font-family: Consolas, Monaco, Menlo, monospace;font-size: 14px;;color: rgb(30, 107, 184);line-height: 1.8em;letter-spacing: 0em;background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(27, 31, 35, 0.05);width: auto;margin-right: 2px;margin-left: 2px;padding: 2px 4px;border-style: none;border-width: 3px;border-color: rgb(0, 0, 0) rgba(0, 0, 0, 0.4) rgba(0, 0, 0, 0.4);border-radius: 4px;word-break: break-all;\">&lt;/think&gt;\u003C/code> 标签包裹推理过程。\u003C/section>\u003C/li>\u003C/ol>\u003Cp style='margin-bottom: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;background-color: rgb(255, 255, 255);'>这种奖励设计避免了神经奖励模型可能带来的“奖励劫持”问题，同时简化了训练流程。\u003C/p>\u003Ch3 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\";font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">训练模板与模型表现\u003C/span>\u003C/h3>\u003Cp style='margin-bottom: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;background-color: rgb(255, 255, 255);'>在训练中，团队设计了一种简单的模板，要求模型生成\u003Cstrong style=\";background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">推理过程\u003C/strong>并给出\u003Cstrong style=\";background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">最终答案\u003C/strong>。这一结构化格式允许团队准确观察模型在强化学习过程中的自然演化，而无需人为加入特定问题解决策略的偏倚。\u003C/p>\u003Ch4 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\";font-size: 18px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">表现提升与“自我进化”\u003C/span>\u003C/h4>\u003Cp style='margin-bottom: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;background-color: rgb(255, 255, 255);'>DeepSeek-R1-Zero 在 AIME 2024 基准上的 Pass@1 分数从初始的 15.6% 稳步提升至 71.0%，通过多数投票更是进一步提升至 86.7%，超越 OpenAI-o1-0912 的表现。这表明，纯 RL 方法能够显著提升模型推理能力。\u003C/p>\u003Cp style=\"text-align: center;\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-imgfileid=\"100009522\" data-ratio=\"0.6237424547283702\" data-s=\"300,640\" data-type=\"png\" data-w=\"497\" style=\"\" src=\"https://res.cooltool.vip/article_res/assets/17423771739980.033790215034910975.png\">\u003C/p>\u003Ch4 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\";font-size: 18px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">自主行为与“顿悟时刻”\u003C/span>\u003C/h4>\u003Cp style='margin-bottom: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;background-color: rgb(255, 255, 255);'>在强化学习过程中，DeepSeek-R1-Zero 展现了一系列自然演化的行为，例如：\u003C/p>\u003Cul style='margin-top: 8px;margin-bottom: 8px;;padding-left: 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);' class=\"list-paddingleft-1\">\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">反思能力\u003C/strong>：模型能够重新评估先前步骤并探索替代解决方案。\u003C/section>\u003C/li>\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">长链推理\u003C/strong>：通过生成更长的推理链解决复杂问题。\u003C/section>\u003Cp style=\"text-align: center;\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-imgfileid=\"100009527\" data-ratio=\"0.6130346232179226\" data-s=\"300,640\" data-type=\"png\" data-w=\"491\" style=\"\" src=\"https://res.cooltool.vip/article_res/assets/17423771739990.08130608358732183.png\">\u003C/p>\u003C/li>\u003C/ul>\u003Cp style='margin-bottom: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;background-color: rgb(255, 255, 255);'>一个令人兴奋的现象是模型的“顿悟时刻”，即中间版本的模型通过自我调整分配更多的思考时间，优化了解决问题的策略。这种行为未经过显式编程，而是完全依赖于强化学习环境中的自然发展。\u003C/p>\u003Cp style=\"text-align: center;\">\u003Cimg class=\"rich_pages wxw-img js_insertlocalimg\" data-imgfileid=\"100009528\" data-ratio=\"0.5855379188712522\" data-s=\"300,640\" data-type=\"png\" data-w=\"567\" style=\"\" src=\"https://res.cooltool.vip/article_res/assets/17423771750090.8675517517570832.png\">\u003C/p>\u003Ch2 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\";font-size: 22px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">DeepSeek-R1 的训练方法\u003C/span>\u003C/h2>\u003Ch3 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\";font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">冷启动：优化强化学习的初始阶段\u003C/span>\u003C/h3>\u003Cp style='margin-bottom: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;background-color: rgb(255, 255, 255);'>为了避免 DeepSeek-R1-Zero 中早期 RL 训练的不稳定性，团队为 DeepSeek-R1 构建并收集了少量长链式思维（CoT）数据，用于对基础模型（DeepSeek-V3-Base）进行微调作为初始 RL 模型。这些数据的来源包括：\u003C/p>\u003Cul style='margin-top: 8px;margin-bottom: 8px;;padding-left: 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);' class=\"list-paddingleft-1\">\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">少样本提示\u003C/strong>：使用长链式思维示例指导生成；\u003C/section>\u003C/li>\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">直接提示\u003C/strong>：让模型生成包含反思与验证的详细答案；\u003C/section>\u003C/li>\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">DeepSeek-R1-Zero 输出\u003C/strong>：采用人类注释和后处理优化输出质量。\u003C/section>\u003C/li>\u003C/ul>\u003Cp style='margin-bottom: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;background-color: rgb(255, 255, 255);'>通过这些方法，收集了数千条冷启动数据进行微调，相较于 DeepSeek-R1-Zero，该阶段的冷启动数据有以下优点：\u003C/p>\u003Cul style='margin-top: 8px;margin-bottom: 8px;;padding-left: 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);' class=\"list-paddingleft-1\">\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">可读性\u003C/strong>：冷启动数据采用设计良好的输出格式，包括推理过程和总结部分，以提高用户体验。\u003C/section>\u003C/li>\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">性能潜力\u003C/strong>：冷启动数据为模型提供了更强的初始推理能力，从而实现了更快的性能提升。\u003C/section>\u003C/li>\u003C/ul>\u003Ch3 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\";font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">推理导向的强化学习\u003C/span>\u003C/h3>\u003Cp style='margin-bottom: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;background-color: rgb(255, 255, 255);'>完成冷启动数据微调后，对模型进行与 DeepSeek-R1-Zero 相同的大规模 RL 训练，但在 DeepSeek-R1 中引入了语言一致性奖励，旨在解决 CoT 输出中出现的语言混用问题：\u003C/p>\u003Cul style='margin-top: 8px;margin-bottom: 8px;;padding-left: 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);' class=\"list-paddingleft-1\">\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">语言一致性奖励\u003C/strong>：计算目标语言单词占比，将其作为奖励信号，尽管该方法可能会略微降低模型性能，但它提高了输出的可读性。\u003C/section>\u003C/li>\u003C/ul>\u003Cp style='margin-bottom: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;background-color: rgb(255, 255, 255);'>最终，通过将推理任务准确性奖励与语言一致性奖励相结合，在多种推理密集型任务（如数学、编程、科学和逻辑推理）上训练模型直至收敛。\u003C/p>\u003Ch3 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\";font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">拒绝采样与监督微调\u003C/span>\u003C/h3>\u003Cp style='margin-bottom: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;background-color: rgb(255, 255, 255);'>当推理导向的 RL 收敛后，利用生成的检查点数据开展监督微调（SFT）以进一步提升模型的通用能力：\u003C/p>\u003Cul style='margin-top: 8px;margin-bottom: 8px;;padding-left: 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);' class=\"list-paddingleft-1\">\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">推理数据\u003C/strong>：通过拒绝采样扩展数据集，仅保留正确且高质量的推理轨迹。最终收集了约 60 万条推理相关样本。\u003C/section>\u003C/li>\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">非推理数据\u003C/strong>：包括写作、事实问答、自我认知等其他任务，主要通过调用 DeepSeek-V3 生成。最终收集了约 20 万条非推理样本。\u003C/section>\u003C/li>\u003C/ul>\u003Cp style='margin-bottom: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;background-color: rgb(255, 255, 255);'>使用上述约 80 万条样本数据对 DeepSeek-V3-Base 进行两轮微调，大幅提升了模型的推理和非推理能力。\u003C/p>\u003Ch3 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\";font-size: 20px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">全场景强化学习\u003C/span>\u003C/h3>\u003Cp style='margin-bottom: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;background-color: rgb(255, 255, 255);'>为了进一步优化模型对人类偏好的适配性，在所有场景下引入了次级 RL 阶段，以改进模型的\u003Cstrong style=\";background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">帮助性\u003C/strong>与\u003Cstrong style=\";background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">无害性\u003C/strong>：\u003C/p>\u003Cul style='margin-top: 8px;margin-bottom: 8px;;padding-left: 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);' class=\"list-paddingleft-1\">\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">帮助性\u003C/strong>：专注于评估总结部分的实用性和相关性，确保输出能够满足用户需求。\u003C/section>\u003C/li>\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">无害性\u003C/strong>：评估模型完整响应，避免潜在风险、偏见或有害内容。\u003C/section>\u003C/li>\u003C/ul>\u003Cp style='margin-bottom: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;background-color: rgb(255, 255, 255);'>这一阶段结合了规则奖励和多样化提示分布，通过强化学习提升模型在推理、生成、写作等广泛任务中的表现。\u003C/p>\u003Cp style='margin-bottom: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;background-color: rgb(255, 255, 255);'>以下是 \u003Cstrong style=\";background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">结论、局限性与未来工作\u003C/strong>部分报告：\u003C/p>\u003Ch2 style='margin-top: 30px;margin-bottom: 15px;color: rgba(0, 0, 0, 0.85);;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);'>\u003Cspan style=\";font-size: 22px;color: rgb(0, 0, 0);line-height: 1.5em;letter-spacing: 0em;font-weight: bold;display: block;\">未来工作\u003C/span>\u003C/h2>\u003Cp style='margin-bottom: 0px;;color: rgb(0, 0, 0);font-size: 16px;line-height: 1.8em;letter-spacing: normal;text-align: left;padding-top: 8px;padding-bottom: 8px;font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;background-color: rgb(255, 255, 255);'>为进一步提升 DeepSeek-R1 的能力，团队计划在以下方向进行深入研究：\u003C/p>\u003Col style='margin-top: 8px;margin-bottom: 8px;;padding-left: 25px;color: rgb(0, 0, 0);font-family: Optima, \"Microsoft YaHei\", PingFangSC-regular, serif;font-size: 16px;letter-spacing: normal;text-align: left;background-color: rgb(255, 255, 255);' class=\"list-paddingleft-1\">\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">提升通用能力\u003C/strong>：研究如何利用长链式思维提升函数调用、多轮对话等任务的表现，从而实现更全面的通用能力。\u003C/section>\u003C/li>\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">优化语言支持\u003C/strong>：解决语言混用问题，增强对多语言场景的支持，确保模型在多语言输入时的响应一致性。\u003C/section>\u003C/li>\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">改进提示工程\u003C/strong>：研究更鲁棒的提示策略，减少模型对提示变化的敏感性，进一步优化用户体验。\u003C/section>\u003C/li>\u003Cli style=\";\">\u003Csection style=\";margin-top: 5px;margin-bottom: 5px;color: rgb(1, 1, 1);line-height: 1.8em;letter-spacing: 0em;\">\u003Cstrong style=\";color: rgb(0, 0, 0);background: none 0% 0% / auto no-repeat scroll padding-box border-box rgba(0, 0, 0, 0);width: auto;height: auto;border-style: none;border-width: 3px;border-color: rgba(0, 0, 0, 0.4);border-radius: 0px;\">软件工程任务优化\u003C/strong>：引入拒绝采样（rejection sampling）和异步评价方法，提升 RL 在软件工程任务上的效率，并推动模型性能提升。\u003C/section>\u003C/li>\u003C/ol>\u003Cp style=\"display: none;\">\u003Cmp-style-type data-value=\"3\">\u003C/mp-style-type>\u003C/p>\u003C/div>",1752585425272]