[{"data":1,"prerenderedAt":206},["ShallowReactive",2],{"DlFXI4Eibt_Bn9lrEZz1TYbHCWFZj3IvqwHQSEW-Exc":3,"_qqy1BOGp0pfpYoDWCgOIRcTNZZ_aIyQUJT2nvQJDMg":194},{"code":4,"msg":5,"data":6},0,"",{"category":7,"tag":11,"hot":39,"new":78,"banner":118,"data":143,"cache":193},[8,9,10],"Agent","OpenAI","LLM",[12,14,17,20,23,25,27,30,33,36],{"title":8,"total":13},39,{"title":15,"total":16},"Google",44,{"title":18,"total":19},"Nvidia",13,{"title":21,"total":22},"Claude",11,{"title":9,"total":24},35,{"title":10,"total":26},85,{"title":28,"total":29},"DeepSeek",9,{"title":31,"total":32},"OCR",1,{"title":34,"total":35},"Chat",7,{"title":37,"total":38},"Generator",116,[40,48,55,64,71],{"id":41,"publish_date":42,"is_original":4,"collection":5,"cover_url":43,"cover_url_1_1":44,"title":45,"summary":46,"author":47},557,"2022-04-29","article_res/cover/7a9b1375ed9bb298154981bae42b794d.jpeg","article_res/cover/afa281dd52bc0454e6735daa8e6b0706.jpeg","Translation and summary of Messari Report [2.8 Kristin Smith, Blockchain Association and Katie Haun, a16z]","We need unity and speed right now.","Translation",{"id":49,"publish_date":50,"is_original":4,"collection":5,"cover_url":51,"cover_url_1_1":52,"title":53,"summary":54,"author":47},531,"2022-05-25","article_res/cover/e8362057f8fa189594c60afdfaaeb6e5.jpeg","article_res/cover/8ea08d0d6fa7eee6b57ed4ec61b61ad6.jpeg","Decentralized Society: Finding Web3’s Soul / Decentralized Society: Finding the Soul of Web3 -7","Decentralization through Pluralism When analyzing ecosystems, it's desirable to measure how decentralized it is.",{"id":56,"publish_date":57,"is_original":32,"collection":58,"cover_url":59,"cover_url_1_1":60,"title":61,"summary":62,"author":63},127,"2024-11-14","#Google #AI Game #World Model #AI Story","article_res/cover/0233a875b7ec2debf59779e311547569.jpeg","article_res/cover/6ffddb6ae4914b3c699493311aa9f198.jpeg","Google Launches \"Unbounded\": A Generative Infinite Character Life Simulation Game","Unbounded: A Generative Infinite Game of Character Life Simulation","Renee's Entrepreneurial Journey",{"id":13,"publish_date":65,"is_original":32,"collection":66,"cover_url":67,"cover_url_1_1":68,"title":69,"summary":70,"author":63},"2025-02-14","#Deep Dive into LLMs #Andrej Karpathy #LLM #Tool Use #Hallucination","article_res/cover/11e858ad6b74dfa80f923d549b62855c.jpeg","article_res/cover/615e1b320f1fc163edc1d2d154a6de33.jpeg","Andrej Karpathy's in-depth explanation of LLM (Part 4): Hallucinations","hallucinations, tool use, knowledge/working memory",{"id":72,"publish_date":73,"is_original":4,"collection":5,"cover_url":74,"cover_url_1_1":75,"title":76,"summary":77,"author":47},579,"2022-04-07","article_res/cover/39387376ba28447af1eb40576b9df215.jpeg","article_res/cover/02727ede8551ed49901d0abe6d6305b7.jpeg","Messari Report Translation and Summary 【1-7 Surviving the Winter】","I’d be more cautious here: 10 year and 10 hour thinking only.",[79,87,95,103,111],{"id":80,"publish_date":81,"is_original":32,"collection":82,"cover_url":83,"cover_url_1_1":84,"title":85,"summary":86,"author":63},627,"2025-03-20","#AI Avatar #AI Video Generation","article_res/cover/d95481358f73924989f8c4ee9c75d1c8.jpeg","article_res/cover/b74bc0fab01f8b6a6aa87696c0c3ed8b.jpeg","DisPose: Generating Animated Videos by Driving Video with Reference Images","DisPose is a controllable human image animation method that enhances video generation.",{"id":88,"publish_date":89,"is_original":32,"collection":90,"cover_url":91,"cover_url_1_1":92,"title":93,"summary":94,"author":63},626,"2025-03-21","#Deep 
Dive into LLMs #LLM #RL #Andrej Karpathy #AlphaGo","article_res/cover/446553a5c8f8f2f07d97b20eaee84e56.jpeg","article_res/cover/e6c2823409c9b34624064b9acbaca6f1.jpeg","AlphaGo and the Power of Reinforcement Learning - Andrej Karpathy's Deep Dive on LLMs (Part 9)","Simply learning from humans will never surpass human capabilities.",{"id":96,"publish_date":97,"is_original":32,"collection":98,"cover_url":99,"cover_url_1_1":100,"title":101,"summary":102,"author":63},625,"2025-03-22","#Deep Dive into LLMs #LLM #RL #RLHF #Andrej Karpathy","article_res/cover/8da81d38b1e5cf558a164710fd8a5389.jpeg","article_res/cover/96f028d76c362a99a0dd56389e8f7a9b.jpeg","Reinforcement Learning from Human Feedback (RLHF) - Andrej Karpathy's Deep Dive on LLMs (Part 10)","Fine-Tuning Language Models from Human Preferences",{"id":104,"publish_date":105,"is_original":32,"collection":106,"cover_url":107,"cover_url_1_1":108,"title":109,"summary":110,"author":63},624,"2025-03-23","#Deep Dive into LLMs #LLM #Andrej Karpathy #AI Agent #MMM","article_res/cover/a5e7c3d48bb09109684d6513287c661d.jpeg","article_res/cover/d3f22b7c0ab8d82fd2da457a299e0773.jpeg","The Future of Large Language Models - Andrej Karpathy's In-Depth Explanation of LLM (Part 11)","preview of things to come",{"id":112,"publish_date":105,"is_original":32,"collection":113,"cover_url":114,"cover_url_1_1":115,"title":116,"summary":117,"author":63},623,"#Google #Voe #AI Video Generation","article_res/cover/c44062fea0f336c2b96b3928292392c2.jpeg","article_res/cover/a041041c69092ad3db191c5bf3ff981b.jpeg","Trial of Google's video generation model VOE2","Our state-of-the-art video generation model",[119,127,135],{"id":120,"publish_date":121,"is_original":32,"collection":122,"cover_url":123,"cover_url_1_1":124,"title":125,"summary":126,"author":63},160,"2024-10-04","#Philosophy","article_res/cover/496990c49211e8b7f996b7d39c18168e.jpeg","article_res/cover/14dbaa1ade9cb4316d5829423a900362.jpeg","Time","The fungus of the morning does not know the waxing and waning of the moon, and the cicada does not know the seasons; this is a short life. To the south of the state of Chu there is a dark spirit which regards five hundred years as spring and five hundred years as autumn. 
In ancient times there was a great tree called the Ming which regarded eight thousand years as spring and eight thousand years as autumn; this is a long life.",{"id":128,"publish_date":129,"is_original":32,"collection":130,"cover_url":131,"cover_url_1_1":132,"title":133,"summary":134,"author":63},98,"2024-12-17","#AI Video Generator #Sora #Pika","article_res/cover/3b86e85d03fff4f356a3e4cf2bb329c9.jpeg","article_res/cover/5fa5c20ad0b40f8f544d257c0ef02938.jpeg","Pika 2.0 video generation officially released: effect comparison with Sora","今天，我们推出了Pika 2.0模型。卓越的文字对齐效果。惊人的视觉表现。还有✨场景成分✨",{"id":136,"publish_date":137,"is_original":32,"collection":138,"cover_url":139,"cover_url_1_1":140,"title":141,"summary":142,"author":63},71,"2025-01-14","#Nvidia #World Foundation Model #Cosmos #Physical AI #Embodied AI","article_res/cover/feddf8c832dfb45d28804291f6a42a9e.jpeg","article_res/cover/d6bc2f1186d96b78228c2283a17a3645.jpeg","NVIDIA's Cosmos World Model","Cosmos World Foundation Model Platform for Physical AI",[144,163,188],{"title":8,"items":145},[146,147,155],{"id":104,"publish_date":105,"is_original":32,"collection":106,"cover_url":107,"cover_url_1_1":108,"title":109,"summary":110,"author":63},{"id":148,"publish_date":149,"is_original":32,"collection":150,"cover_url":151,"cover_url_1_1":152,"title":153,"summary":154,"author":63},622,"2025-03-24","#OWL #AI Agent #MAS #MCP #CUA","article_res/cover/cb50ca7f2bf4d1ed50202d7406e1c19a.jpeg","article_res/cover/4aa7aa3badfacf3cc84121334f1050dd.jpeg","OWL: Multi-agent collaboration","OWL: Optimized Workforce Learning for General Multi-Agent Assistance in Real-World Task Automation",{"id":156,"publish_date":157,"is_original":32,"collection":158,"cover_url":159,"cover_url_1_1":160,"title":161,"summary":162,"author":63},620,"2025-03-26","#LLM #Google #Gemini #AI Agent","article_res/cover/53751a6dbbe990b1eb0b63f3b062aed4.jpeg","article_res/cover/031344981f0a212ff82d1f3a64aa5756.jpeg","Gemini 2.5 Pro, claimed to be far ahead of the competition, has been released with great fanfare: comprehensively surpassing other LLMs and topping the global rankings","Gemini 2.5: Our most intelligent AI model",{"title":9,"items":164},[165,172,180],{"id":166,"publish_date":157,"is_original":32,"collection":167,"cover_url":168,"cover_url_1_1":169,"title":170,"summary":171,"author":63},619,"#OpenAI #AI Image Generator #4o #MMM #AR Transformer","article_res/cover/2faffc97fcecf3151552cb0fd3206d89.jpeg","article_res/cover/1133cb4948af44cee2e7fbe79efb69e5.jpeg","The native image function of GPT-4o is officially launched","Introducing 4o Image Generation",{"id":173,"publish_date":174,"is_original":4,"collection":175,"cover_url":176,"cover_url_1_1":177,"title":178,"summary":179,"author":63},434,"2023-07-15","#Anthropic #OpenAI #Google #AI Code Generator #Claude","article_res/cover/e1b6f600a2b9f262a4392684e5f2ce25.jpeg","article_res/cover/6e1772e83f78f9a351ab23d3e414adee.jpeg","Latest Updates on Google Bard /Anthropic Claude2 / ChatGPT Code Interpreter","We want our models to use their programming skills to provide more natural interfaces to the basic functions of our computers.  
\n - OpenAI",{"id":181,"publish_date":182,"is_original":4,"collection":183,"cover_url":184,"cover_url_1_1":185,"title":186,"summary":187,"author":63},417,"2023-08-24","#OpenAI","article_res/cover/bccf897d50a88b18364e35f7466387e0.jpeg","article_res/cover/2f871085c1073717c1703ae86e18056f.jpeg","The GPT-3.5 Turbo fine-tuning (fine-tuning function) has been released～","Developers can now bring their own data to customize GPT-3.5 Turbo for their use cases.",{"title":10,"items":189},[190,191,192],{"id":88,"publish_date":89,"is_original":32,"collection":90,"cover_url":91,"cover_url_1_1":92,"title":93,"summary":94,"author":63},{"id":96,"publish_date":97,"is_original":32,"collection":98,"cover_url":99,"cover_url_1_1":100,"title":101,"summary":102,"author":63},{"id":104,"publish_date":105,"is_original":32,"collection":106,"cover_url":107,"cover_url_1_1":108,"title":109,"summary":110,"author":63},true,{"code":4,"msg":5,"data":195},{"id":196,"publish_date":197,"is_original":4,"collection":198,"articles_id":199,"cover_url":200,"cover_url_1_1":201,"title":202,"summary":203,"author":204,"content":205},422,"2023-08-09","#LLM","wNL1CkpnEH8oq3_DATqA_g","article_res/cover/333388145626f84ced5762dba425a64d.jpeg","article_res/cover/27249377525a79a36fc10bd7c09ee374.jpeg","DPO vs RLHF","DPO is able to bypass the reward modeling phase and optimize directly for the preferences expressed in the preference data.","--","\u003Cdiv class=\"rich_media_content js_underline_content\n                       defaultNoSetting\n            \" id=\"js_content\">\u003Csection data-tool=\"markdown编辑器\" data-website=\"https://markdown.com.cn/editor\" style='font-size: 16px;font-style: normal;font-variant-caps: normal;font-weight: 400;letter-spacing: 0px;orphans: auto;text-indent: 0px;text-transform: none;white-space: normal;widows: auto;word-spacing: 0px;-webkit-tap-highlight-color: rgba(26, 26, 26, 0.3);-webkit-text-size-adjust: auto;-webkit-text-stroke-width: 0px;text-decoration: none;color: black;padding: 25px 30px;line-height: 1.6;word-break: break-word;overflow-wrap: break-word;text-align: justify;font-family: Optima-Regular, Optima, PingFangSC-light, PingFangTC-light, \"PingFang SC\", Cambria, Cochin, Georgia, Times, \"Times New Roman\", serif;margin-top: -10px;'>\u003Cp data-tool=\"markdown.com.cn编辑器\" style=\"font-size: 16px;padding-top: 8px;padding-bottom: 8px;margin: 0px;line-height: 26px;color: black;\">is Direct Preference Optimization\u003C/p>\u003Cp data-tool=\"markdown.com.cn编辑器\" style=\"font-size: 16px;padding-top: 8px;padding-bottom: 8px;margin: 0px;line-height: 26px;color: black;\">is Reinforcement Learning from Human Feedback\u003C/p>\u003Ch2 data-tool=\"markdown.com.cn编辑器\" style=\"margin-top: 30px;margin-bottom: 15px;font-weight: bold;color: black;font-size: 22px;\">\u003Cspan class=\"content\">Let's first talk about RLHF\u003C/span>\u003Cspan class=\"suffix\">\u003C/span>\u003C/h2>\u003Cp data-tool=\"markdown.com.cn编辑器\" style=\"font-size: 16px;padding-top: 8px;padding-bottom: 8px;margin: 0px;line-height: 26px;color: black;\">OpenAI GPT relies on a new large language model (LLM) training paradigm: namely RLHF (Reinforcement Learning from Human Feedback). 
Then RLHF was introduced. This strategy uses human feedback on generated text as the evaluation signal and folds that feedback into the loss used to optimize the model. Put simply, it uses reinforcement learning to directly optimize a language model against human feedback, so that the model aligns better with complex human values.

RLHF mainly consists of three steps (a sketch of steps 2 and 3 follows the figure):

1. Pre-training a language model (LM).
2. Aggregating comparison (question-and-answer) data and using it to train a reward model (RM).
3. Fine-tuning the LM with reinforcement learning (RL).

![The three-step RLHF pipeline](https://res.cooltool.vip/article_res/assets/17434959630470.9399600891244193.jpeg)
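To make steps 2 and 3 concrete, here is a rough sketch under my own assumptions (the `rm` reward model, tensor shapes, and `beta` are illustrative, not from the article). The reward model is typically fit on pairwise human comparisons with a Bradley-Terry-style loss; the RL stage then maximizes that learned reward minus a KL penalty that keeps the policy close to the pretrained model:

```python
import torch.nn.functional as F

def reward_model_loss(rm, chosen_ids, rejected_ids):
    """Step 2: fit a reward model on human comparisons.

    For each prompt the labeler preferred `chosen` over `rejected`;
    this Bradley-Terry loss pushes r(chosen) above r(rejected).
    `rm` is assumed to return one scalar reward per sequence.
    """
    r_chosen = rm(chosen_ids)
    r_rejected = rm(rejected_ids)
    return -F.logsigmoid(r_chosen - r_rejected).mean()

def rl_objective(reward, logp_policy, logp_ref, beta=0.1):
    """Step 3: the per-sample quantity the RL step (e.g. PPO) maximizes.

    `logp_policy` and `logp_ref` are per-token log-probs of the sampled
    response under the tuned policy and the frozen pretrained model;
    their difference, summed over tokens, is a standard sample-based
    KL estimate that penalizes drifting from the original model.
    """
    kl = (logp_policy - logp_ref).sum(-1)
    return reward - beta * kl
```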
## Let's talk about DPO

Although RLHF introduced human preferences and a way to combine reinforcement learning with large language models, it is often complex and unstable in practice. It works by first fitting a reward model to capture human preferences, then fine-tuning the large unsupervised model with RL to maximize that reward while staying as close as possible to the original model.

To address these issues, researchers proposed the DPO algorithm. DPO exploits the direct mapping between reward functions and optimal policies, and shows that the constrained reward-maximization problem can be solved exactly with a single stage of policy training. In essence, DPO turns alignment into a classification problem over human-preference data.

![DPO compared with RLHF](https://res.cooltool.vip/article_res/assets/17434959630430.8616027496594765.jpeg)
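Concretely, the DPO objective is a logistic loss over preference pairs:

$$\mathcal{L}_{\mathrm{DPO}} = -\,\mathbb{E}_{(x,\,y_w,\,y_l)}\left[\log \sigma\!\left(\beta \log \frac{\pi_\theta(y_w \mid x)}{\pi_{\mathrm{ref}}(y_w \mid x)} - \beta \log \frac{\pi_\theta(y_l \mid x)}{\pi_{\mathrm{ref}}(y_l \mid x)}\right)\right]$$

A minimal sketch of this loss (names are illustrative; it assumes each argument is a response's log-probability already summed over tokens):

```python
import torch.nn.functional as F

def dpo_loss(logp_policy_chosen, logp_policy_rejected,
             logp_ref_chosen, logp_ref_rejected, beta=0.1):
    """DPO's single-stage objective on preference pairs.

    The implicit reward of a response is
        beta * (log pi_theta(y|x) - log pi_ref(y|x)),
    and the loss is a logistic (classification) loss asking the chosen
    response to out-score the rejected one -- no separate reward model,
    no sampling, no PPO loop.
    """
    chosen_margin = logp_policy_chosen - logp_ref_chosen
    rejected_margin = logp_policy_rejected - logp_ref_rejected
    return -F.logsigmoid(beta * (chosen_margin - rejected_margin)).mean()
```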
Compared with RLHF, DPO has several advantages:

1. It is more stable and more computationally efficient.
2. It requires no reward-model fitting and no sampling during fine-tuning.
3. It depends on far fewer hyperparameters.
4. It fine-tunes LMs to align with human preferences more effectively, often surpassing existing methods.
5. DPO-fine-tuned models are better at controlling the sentiment of generated text, and improve the quality of summaries and single-turn dialogue responses.

You can read the DPO paper at https://arxiv.org/abs/2305.18290

Below is a performance comparison between DPO and RLHF (PPO is the reinforcement learning algorithm used in the RLHF pipeline):

![Performance comparison of DPO and PPO-based RLHF](https://res.cooltool.vip/article_res/assets/17434959636090.2335555212334901.jpeg)

Reinforcement learning is a difficult and unstable method, and so far only OpenAI and Anthropic have made it work well; many open-source models have seen no significant gains from RLHF. With the emergence of new methods like DPO, however, reinforcement learning is no longer the only option.